In [186]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [187]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
combine = [df_train, df_test]

In [188]:
print(df_train.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [189]:
print(df_test.columns.values)

['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']


In [190]:
df_train['PassengerId'].unique()
df_train['Pclass'].unique
df_train['SibSp'].unique()
df_train['Fare'].unique()
df_train['Ticket'].unique()
df_train['Parch'].unique()
df_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [192]:
df_test.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Cornell, Mrs. Robert Clifford (Malvina Helen L...",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


## Modeling using Scikit Learn
---

In [185]:
# //Feature Engineeering - Cabin//
def parse_cabin(cabin_string):
    cabin_string = str(cabin_string)
    if cabin_string[0] == 'N':
        return 0
    else:
        return 1

df_train['Has_Cabin'] = df_train['Cabin'].apply(parse_cabin)

KeyError: 'Cabin'

In [193]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [140]:
# //Feature Engineeering - Cabin//
def parse_cabin(cabin_string):
    cabin_string = str(cabin_string)
    return cabin_string[0]

df_train['Type_Cabin'] = df_train['Cabin'].apply(parse_cabin)

In [141]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Has_Cabin,Type_Cabin
0,0,3,male,22.0,1,0,7.2500,,S,0,n
1,1,1,female,38.0,1,0,71.2833,C85,C,0,C
2,1,3,female,26.0,0,0,7.9250,,S,0,n
3,1,1,female,35.0,1,0,53.1000,C123,S,0,C
4,0,3,male,35.0,0,0,8.0500,,S,0,n
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S,0,n
887,1,1,female,19.0,0,0,30.0000,B42,S,0,B
888,0,3,female,28.0,1,2,23.4500,,S,0,n
889,1,1,male,26.0,0,0,30.0000,C148,C,0,C


In [143]:
df_train['Type_Cabin'].unique()

array(['n', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [152]:
# //Feature Engineeering - Cabin//
cabin_mapping = {"A":1, "B":2, "C":3, "D":4, "E":5, "F":6, "G":7, "T":8, "n":0}.astype(int)
for dataset in df_train:
    dataset['Type_Cabin'] = dataset['Type_Cabin'].map(cabin_mapping)
    dataset['Type_Cabin'] = dataset['Type_Cabin'].fillna(0)

AttributeError: 'dict' object has no attribute 'astype'

In [133]:
# //Feature Engineeering - Age//
df_train.loc[df_train['Age'].isna(),:]
df_train['Age'].fillna(df_train['Age'].median(), inplace=True)

In [149]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Has_Cabin,Type_Cabin
0,0,3,male,22.0,1,0,7.2500,,S,0,n
1,1,1,female,38.0,1,0,71.2833,C85,C,0,C
2,1,3,female,26.0,0,0,7.9250,,S,0,n
3,1,1,female,35.0,1,0,53.1000,C123,S,0,C
4,0,3,male,35.0,0,0,8.0500,,S,0,n
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S,0,n
887,1,1,female,19.0,0,0,30.0000,B42,S,0,B
888,0,3,female,28.0,1,2,23.4500,,S,0,n
889,1,1,male,26.0,0,0,30.0000,C148,C,0,C


In [150]:
df_train['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

In [151]:
df_train['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [154]:
# //Feature Engineeering - SinSp and Parch//
df_train['Group_Index']=df_train['SibSp']+df_train['Parch']

In [155]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Has_Cabin,Type_Cabin,Group_Index
0,0,3,male,22.0,1,0,7.2500,,S,0,n,1
1,1,1,female,38.0,1,0,71.2833,C85,C,0,C,1
2,1,3,female,26.0,0,0,7.9250,,S,0,n,0
3,1,1,female,35.0,1,0,53.1000,C123,S,0,C,1
4,0,3,male,35.0,0,0,8.0500,,S,0,n,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S,0,n,0
887,1,1,female,19.0,0,0,30.0000,B42,S,0,B,0
888,0,3,female,28.0,1,2,23.4500,,S,0,n,3
889,1,1,male,26.0,0,0,30.0000,C148,C,0,C,0


In [159]:
df_train = df_train.drop(['Cabin','SibSp','Parch'], axis=1)

In [160]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Has_Cabin,Type_Cabin,Group_Index
0,0,3,male,22.0,7.2500,S,0,n,1
1,1,1,female,38.0,71.2833,C,0,C,1
2,1,3,female,26.0,7.9250,S,0,n,0
3,1,1,female,35.0,53.1000,S,0,C,1
4,0,3,male,35.0,8.0500,S,0,n,0
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,S,0,n,0
887,1,1,female,19.0,30.0000,S,0,B,0
888,0,3,female,28.0,23.4500,S,0,n,3
889,1,1,male,26.0,30.0000,C,0,C,0


In [161]:
df_train = pd.get_dummies(df_train, columns=["Embarked", "Sex","Pclass","Type_Cabin"])

In [162]:
df_train

Unnamed: 0,Survived,Age,Fare,Has_Cabin,Group_Index,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,...,Pclass_3,Type_Cabin_A,Type_Cabin_B,Type_Cabin_C,Type_Cabin_D,Type_Cabin_E,Type_Cabin_F,Type_Cabin_G,Type_Cabin_T,Type_Cabin_n
0,0,22.0,7.2500,0,1,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,1,26.0,7.9250,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1000,0,1,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,35.0,8.0500,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,13.0000,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
887,1,19.0,30.0000,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
888,0,28.0,23.4500,0,3,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
889,1,26.0,30.0000,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [163]:
x = df_train.loc[:,df_train.columns!='Survived']
y = df_train.loc[:,df_train.columns=='Survived']

In [164]:
x.shape

(891, 21)

In [165]:
y.shape

(891, 1)

In [179]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=43)

In [180]:
classifier = LogisticRegression()

In [181]:
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [182]:
classifier.score(x_test, y_test)

0.7892376681614349

In [183]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       276
           1       0.74      0.68      0.71       170

    accuracy                           0.79       446
   macro avg       0.78      0.77      0.77       446
weighted avg       0.79      0.79      0.79       446



In [174]:
confusion_matrix(y_test,y_pred)

array([[93, 17],
       [29, 40]], dtype=int64)

In [None]:
Experiment 2 
