In [105]:
import pandas as pd
import zipfile

zip_file = zipfile.ZipFile('titanic.zip')
train_data = zip_file.open('train.csv')
test_data = zip_file.open('test.csv')

train_df = pd.read_csv(train_data)
test_df = pd.read_csv(test_data)

print(train_df.shape)
print(train_df.head())

(891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN    

In [106]:
print(train_df.isnull().sum())  # Cabin has too many Nan values

# drop columns Cabin, name
train_df = train_df.drop(['Name', 'Cabin', 'Ticket'], axis=1)
test_df = test_df.drop(['Name', 'Cabin', 'Ticket'], axis=1)

# make apssengerId as index
train_df = train_df.set_index('PassengerId')
test_df = test_df.set_index('PassengerId')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [107]:
train_df.head()
test_df.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S


In [108]:
# Build a logistic regression model to predict 'Survived'
from sklearn.linear_model import LogisticRegression

# drop rows with missing values
train_df = train_df.dropna()
# test_df = test_df.dropna()

# convert categorical variables to dummy variables
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

# split the data into train and test
from sklearn.model_selection import train_test_split

X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# fit the model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# predict on test data, make sure y_pred keeps the same index as y_test
y_pred = logreg.predict(X_test)
y_pred = pd.Series(y_pred, index=y_test.index)

# evaluate the model
from sklearn import metrics

print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))


0.794392523364486
[[111  23]
 [ 21  59]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       134
           1       0.72      0.74      0.73        80

    accuracy                           0.79       214
   macro avg       0.78      0.78      0.78       214
weighted avg       0.80      0.79      0.79       214



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [109]:
# retrain with all trainning data
logreg.fit(X, y)

# test data has missing values
# replace missing values with median value
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# predict on test data, keep PassengerId as index
y_pred = logreg.predict(test_df)
y_pred = pd.DataFrame(y_pred, columns=['Survived'], index=test_df.index)
y_pred.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [110]:
y_pred.to_csv('titanic_pred.csv', index=True)