# Titanic: Machine Learning from Disaster

In [1]:
#Importing Required libraries and then importing the training dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training CSV into a DataFrame and preview the first five rows.
train_csv_path = r'C:\Users\Abhishek Purohit\Downloads\Titanic_train.csv'
Titanic_data = pd.read_csv(train_csv_path)
Titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Let's perform some data wrangling and drop the columns that are unlikely to help predict survival.

In [2]:
# Remove the Name, Ticket and PassengerId columns from the training frame.
Titanic_data.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

In [3]:
# Show the dtype of each remaining column; `object` columns still need encoding.
Titanic_data.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [4]:
# Count the missing values in each column.
Titanic_data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [5]:
# (rows, columns) of the frame, for comparison with the per-column null counts.
Titanic_data.shape

(891, 9)

Comparing the number of null values in the 'Cabin' column (687) with the total number of rows in the dataframe (891), I've decided to drop the 'Cabin' column:

In [6]:
# Cabin is missing for most rows, so remove it entirely.
Titanic_data.drop(columns='Cabin', inplace=True)

In [7]:
# Fill NaN values in the Age column with the mean age.
# The result is assigned back rather than calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place form is deprecated in
# pandas 2.x and leaves the parent frame unchanged under Copy-on-Write.
Titanic_data['Age'] = Titanic_data['Age'].fillna(Titanic_data['Age'].mean())

In [8]:
# Frequency of each Embarked value; 'S' turns out to be the most common one.
Titanic_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
# Replace NaN values in the Embarked column with the most frequent value ('S').
# Assigned back instead of using a chained fillna(inplace=True), which is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
Titanic_data['Embarked'] = Titanic_data['Embarked'].fillna('S')

In [10]:
# Re-count the missing values per column; every count should now be zero.
Titanic_data.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [11]:
# Preview the cleaned frame before encoding the categorical columns.
Titanic_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Now let's create dummy variables for the categorical columns, dropping the first level of each to avoid the dummy variable trap.

In [12]:
# One-hot encode Sex, Embarked and Pclass, dropping the first level of each.
# dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default, and
# bool columns mixed with the float/int columns make `.values` an object-dtype
# array, which numeric routines downstream may reject.
Sex_dummy=pd.get_dummies(Titanic_data.Sex,drop_first=True,dtype=np.uint8)
Embarked_dummy=pd.get_dummies(Titanic_data.Embarked,drop_first=True,dtype=np.uint8)
PClass_dummy=pd.get_dummies(Titanic_data.Pclass,drop_first=True,dtype=np.uint8)

In [13]:
# The original categorical columns are replaced by their dummy encodings.
Titanic_data.drop(columns=['Sex', 'Pclass', 'Embarked'], inplace=True)

In [14]:
# Append the dummy columns to the right of the remaining numeric columns.
encoded_parts = [Titanic_data, Sex_dummy, Embarked_dummy, PClass_dummy]
Titanic_data = pd.concat(encoded_parts, axis='columns')

In [15]:
# Preview the fully numeric training frame.
Titanic_data.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,male,Q,S,2,3
0,0,22.0,1,0,7.25,1,0,1,0,1
1,1,38.0,1,0,71.2833,0,0,0,0,0
2,1,26.0,0,0,7.925,0,0,1,0,1
3,1,35.0,1,0,53.1,0,0,1,0,0
4,0,35.0,0,0,8.05,1,0,1,0,1


In [16]:
# Separate the target column (Survived) from the feature columns.
y = Titanic_data['Survived']
X = Titanic_data.drop(columns='Survived')

Now let us employ the Backward Elimination method to remove the variables that aren't needed as features.

In [17]:
import statsmodels.api as sm


In [18]:
# Convert the target to NumPy and build the design matrix: a leading column of
# ones (intercept term) followed by the feature values.
y1 = y.values

intercept_col = np.ones((Titanic_data.shape[0], 1)).astype(int)
x1 = np.concatenate((intercept_col, X.values), axis=1)

In [19]:
# Columns kept after elimination: 0 = intercept, 1 = Age, 2 = SibSp, 5 = male,
# 7 = S, 8 = Pclass 2, 9 = Pclass 3 (Parch, Fare and Q are left out).
kept_columns = [0, 1, 2, 5, 7, 8, 9]
x_opt = x1[:, kept_columns]

In [20]:
# Fit ordinary least squares of the target on the selected columns; the fit is
# used only to inspect coefficient p-values in the summary.
ols_model = sm.OLS(y1, x_opt)
regressor_OLS = ols_model.fit()

In [21]:
# Display the fit statistics and per-coefficient p-values (P>|t|) of the OLS fit.
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.397
Model:,OLS,Adj. R-squared:,0.393
Method:,Least Squares,F-statistic:,97.16
Date:,"Tue, 28 Apr 2020",Prob (F-statistic):,9.31e-94
Time:,00:35:34,Log-Likelihood:,-396.32
No. Observations:,891,AIC:,806.6
Df Residuals:,884,BIC:,840.2
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.1884,0.052,23.014,0.000,1.087,1.290
x1,-0.0058,0.001,-5.417,0.000,-0.008,-0.004
x2,-0.0427,0.012,-3.565,0.000,-0.066,-0.019
x3,-0.5007,0.027,-18.235,0.000,-0.555,-0.447
x4,-0.0725,0.030,-2.459,0.014,-0.130,-0.015
x5,-0.1666,0.040,-4.187,0.000,-0.245,-0.089
x6,-0.3606,0.033,-10.786,0.000,-0.426,-0.295

0,1,2,3
Omnibus:,37.954,Durbin-Watson:,1.93
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41.82
Skew:,0.527,Prob(JB):,8.3e-10
Kurtosis:,3.131,Cond. No.,156.0


In [22]:
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold splitter used to cross-validate the candidate models.
n_folds = 10
folds = StratifiedKFold(n_splits=n_folds)

In [23]:
def get_score(model,X_train,X_test,y_train,y_test):
    """Fit `model` on the training split and return its score on the test split.

    `model` must expose `fit(X, y)` and `score(X, y)`; whatever `score`
    returns is passed back to the caller unchanged.
    """
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score

In [64]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Per-fold scores for logistic regression, random forest and SVM, collected
# over the stratified folds.
scores_lr=[]
scores_rf=[]
scores_svm=[]

for train_index,test_index in folds.split(x_opt,y):
    X_train,X_test=x_opt[train_index],x_opt[test_index]
    # The splitter yields positional indices, so select target rows with .iloc;
    # plain y[...] looks up labels and only works while the index is 0..n-1.
    y_train,y_test=y.iloc[train_index],y.iloc[test_index]

    scores_lr.append(get_score(LogisticRegression(C=0.07,solver='liblinear'), X_train,X_test,y_train,y_test))
    scores_svm.append(get_score(SVC(), X_train,X_test,y_train,y_test))
    # Fixed seed so the forest's score is reproducible from run to run.
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=50,random_state=42), X_train,X_test,y_train,y_test))

In [65]:
# Mean cross-validated score of the logistic regression model.
np.mean(scores_lr)

0.8136704119850187

In [66]:
# Mean cross-validated score of the random forest model.
np.mean(scores_rf)

0.7969662921348315

In [67]:
# Mean cross-validated score of the SVM model.
np.mean(scores_svm)

0.6386267166042447

In [68]:
# Read the test CSV into a DataFrame.
test_csv_path = r'C:\Users\Abhishek Purohit\Downloads\Titanic_test.csv'
Titanic_test = pd.read_csv(test_csv_path)

In [69]:
# Drop the same columns that were removed from the training frame.
Titanic_test.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'], inplace=True)

In [70]:
# Fill missing test ages with the test-set mean age.
# Assigned back instead of using a chained fillna(inplace=True), which is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
# NOTE(review): this uses the test set's own mean, not the training mean the
# model was fitted with; consider reusing the training statistic.
Titanic_test['Age'] = Titanic_test['Age'].fillna(Titanic_test['Age'].mean())


In [71]:
# Fill missing test fares with the test-set mean fare.
# Assigned back instead of using a chained fillna(inplace=True), which is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
Titanic_test['Fare'] = Titanic_test['Fare'].fillna(Titanic_test['Fare'].mean())

In [72]:
# Count the missing values per test column; every count should be zero.
Titanic_test.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [73]:
# One-hot encode the test categoricals the same way as the training ones,
# dropping the first level of each.
# dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default, and
# bool columns mixed with the float/int columns make `.values` an object-dtype
# array, which numeric routines downstream may reject.
# NOTE(review): the encoding is derived from the test data alone; the column
# layout matches training only if both sets contain the same category levels.
Sex_dummy1=pd.get_dummies(Titanic_test.Sex,drop_first=True,dtype=np.uint8)
Embarked_dummy1=pd.get_dummies(Titanic_test.Embarked,drop_first=True,dtype=np.uint8)
PClass_dummy1=pd.get_dummies(Titanic_test.Pclass,drop_first=True,dtype=np.uint8)

In [74]:
# The original categorical columns are replaced by their dummy encodings.
Titanic_test.drop(columns=['Sex', 'Pclass', 'Embarked'], inplace=True)

In [75]:
# Append the dummy columns to the right of the remaining numeric test columns.
test_parts = [Titanic_test, Sex_dummy1, Embarked_dummy1, PClass_dummy1]
Titanic_test = pd.concat(test_parts, axis='columns')

In [76]:
# Confirm that every test column is numeric after encoding.
Titanic_test.dtypes

Age      float64
SibSp      int64
Parch      int64
Fare     float64
male       uint8
Q          uint8
S          uint8
2          uint8
3          uint8
dtype: object

In [77]:
# NumPy view of the encoded test features.
X2 = Titanic_test.to_numpy()

In [78]:
# Preview the encoded test frame; its column order should mirror the training features.
Titanic_test.head(n=5)

Unnamed: 0,Age,SibSp,Parch,Fare,male,Q,S,2,3
0,34.5,0,0,7.8292,1,1,0,0,1
1,47.0,1,0,7.0,0,0,1,0,1
2,62.0,0,0,9.6875,1,1,0,1,0
3,27.0,0,0,8.6625,1,0,1,0,1
4,22.0,1,1,12.2875,0,0,1,0,1


In [79]:
# Prepend the intercept column of ones, then keep the same column positions
# that were selected for the training matrix.
test_intercept = np.ones((Titanic_test.shape[0], 1)).astype(int)
X2 = np.concatenate((test_intercept, X2), axis=1)
x2_opt = X2[:, [0, 1, 2, 5, 7, 8, 9]]


In [80]:
# Train logistic regression on the full training matrix and predict the test rows.
lr = LogisticRegression(C=0.07, solver='liblinear').fit(x_opt, y1)
y_pred_test = lr.predict(x2_opt)

In [81]:
# Number of predictions produced; compare with the test frame's row count.
y_pred_test.shape

(418,)

In [82]:
# (rows, columns) of the encoded test frame; rows should equal the prediction count.
Titanic_test.shape

(418, 9)

In [83]:
# Wrap the predictions in a Series and re-read the raw test CSV, since
# PassengerId was dropped from the working test frame.
survived_prediction = pd.Series(y_pred_test)
Titanic_test1 = pd.read_csv(r'C:\Users\Abhishek Purohit\Downloads\Titanic_test.csv')

In [84]:
# Pair each PassengerId with its prediction; the unnamed prediction Series
# arrives as column 0 and is renamed to 'Survived'.
submission_parts = [Titanic_test1['PassengerId'], survived_prediction]
df = pd.concat(submission_parts, axis='columns').rename(columns={0: 'Survived'})

In [85]:
# Write the submission file. index=False keeps the DataFrame's row index out of
# the CSV so the file contains only the PassengerId and Survived columns.
df.to_csv(r'C:\Users\Abhishek Purohit\Downloads\Titanic_submission3_kaggle.csv',index=False)