In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

## Cleaning the file

In [2]:
# Load the dataset file into "titanic" object
df = pd.read_csv("../data/titanic-1309-rows-biostatvanderbilt.csv")
# Let's have a look at the dataset attributes
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
df.drop(['name','ticket'],axis=1,inplace=True)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
0,1,1,female,29.0,0,0,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,male,0.92,1,2,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
df.groupby('survived').mean()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,body
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.500618,30.545363,0.521632,0.328801,23.353831,160.809917
1,1.962,28.918244,0.462,0.476,49.361184,


In [5]:
df['sibsp'].value_counts()

0    891
1    319
2     42
4     22
3     20
8      9
5      6
Name: sibsp, dtype: int64

In [6]:
df['sex'].value_counts()

male      843
female    466
Name: sex, dtype: int64

In [7]:
df.isnull().sum()>(len(df)/2)

pclass       False
survived     False
sex          False
age          False
sibsp        False
parch        False
fare         False
cabin         True
embarked     False
boat          True
body          True
home.dest    False
dtype: bool

In [8]:
df.drop(['cabin','home.dest','boat','body'],axis=1,inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   object 
 3   age       1046 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1308 non-null   float64
 7   embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 81.9+ KB


In [10]:
df['age'].isnull().value_counts()

False    1046
True      263
Name: age, dtype: int64

In [11]:
df.groupby('sex')['age'].median()

sex
female    27.0
male      28.0
Name: age, dtype: float64

In [12]:
df['age'] = df.groupby('sex')['age'].apply(lambda x: x.fillna(x.median()))

In [13]:
df.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        1
embarked    2
dtype: int64

In [14]:
df['embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [15]:
df['embarked'].fillna(df['embarked'].value_counts().idxmax(), inplace=True)
df['embarked'].value_counts()

S    916
C    270
Q    123
Name: embarked, dtype: int64

In [16]:
df.isnull().sum()>(len(df)/2)

pclass      False
survived    False
sex         False
age         False
sibsp       False
parch       False
fare        False
embarked    False
dtype: bool

In [17]:
df = pd.get_dummies(data=df, columns=['sex'],drop_first=True)
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,embarked,sex_male
0,1,1,29.0,0,0,211.3375,S,0
1,1,1,0.92,1,2,151.55,S,1
2,1,0,2.0,1,2,151.55,S,0
3,1,0,30.0,1,2,151.55,S,1
4,1,0,25.0,1,2,151.55,S,0


In [18]:
df.corr()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male
pclass,1.0,-0.312469,-0.379761,0.060832,0.018322,-0.558629,0.124617
survived,-0.312469,1.0,-0.045882,-0.027825,0.08266,0.244265,-0.528693
age,-0.379761,-0.045882,1.0,-0.19132,-0.126307,0.179805,0.059834
sibsp,0.060832,-0.027825,-0.19132,1.0,0.373587,0.160238,-0.109609
parch,0.018322,0.08266,-0.126307,0.373587,1.0,0.221539,-0.213125
fare,-0.558629,0.244265,0.179805,0.160238,0.221539,1.0,-0.185523
sex_male,0.124617,-0.528693,0.059834,-0.109609,-0.213125,-0.185523,1.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   age       1309 non-null   float64
 3   sibsp     1309 non-null   int64  
 4   parch     1309 non-null   int64  
 5   fare      1308 non-null   float64
 6   embarked  1309 non-null   object 
 7   sex_male  1309 non-null   uint8  
dtypes: float64(2), int64(4), object(1), uint8(1)
memory usage: 73.0+ KB


In [20]:
df['fare'].isnull().sum()

1

In [21]:
df['fare'].fillna(df['fare'].value_counts().idxmax(), inplace=True)

In [22]:
df['fare'].isnull().sum()

0

In [23]:
df.drop('pclass',axis=1,inplace=True)

In [26]:
df.drop('embarked',axis=1,inplace=True)

## Preparing data for training and testing

In [27]:
X = df.drop(['survived'],axis=1)
y = df['survived']

In [28]:
X.info()
y.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1309 non-null   float64
 1   sibsp     1309 non-null   int64  
 2   parch     1309 non-null   int64  
 3   fare      1309 non-null   float64
 4   sex_male  1309 non-null   uint8  
dtypes: float64(2), int64(2), uint8(1)
memory usage: 42.3 KB


0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=67)

In [30]:
y_test

920     0
540     1
1207    0
1098    0
951     0
       ..
390     0
1139    0
1000    1
549     1
165     1
Name: survived, Length: 393, dtype: int64

## Applying Linear Regression

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [32]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

In [33]:
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
predictions = lr.predict(X_test)

In [35]:
predictions

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,

In [36]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [37]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       235
           1       0.70      0.67      0.69       158

    accuracy                           0.75       393
   macro avg       0.74      0.74      0.74       393
weighted avg       0.75      0.75      0.75       393



In [38]:
print(confusion_matrix(y_test,predictions))

[[190  45]
 [ 52 106]]


In [39]:
pd.DataFrame(confusion_matrix(y_test, predictions), columns=['true survived', 'true not survived'], index=['predicted survived', 'predicted not survived'])

Unnamed: 0,true survived,true not survived
predicted survived,190,45
predicted not survived,52,106


In [40]:
print(accuracy_score(y_test,predictions))

0.7531806615776081


## Applying Decision Tree

In [41]:
from sklearn import tree

In [42]:
tr = tree.DecisionTreeClassifier()

In [43]:
tr.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [44]:
tr_predictions = tr.predict(X_test)

In [None]:
pd.DataFrame(confusion_matrix(y_test, tr_predictions), 
             columns=['true survived', 'True Not Survived'], 
             index=['Predicted Survived', 'Predicted Not Survived'])