In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [2]:
# Read the training and test splits from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Only the last expression of a cell is displayed, i.e. the test preview.
train_data.head()
test_data.head()

In [26]:
# Add a "person" feature: the passenger's Sex, except that passengers under 10 are labelled "child"
def male_female_child(passenger):
    """Label a passenger as 'child' or by their sex.

    Parameters
    ----------
    passenger : two-item sequence
        Unpacked as ``(age, sex)``.

    Returns
    -------
    'child' when ``age < 10``; otherwise ``sex`` unchanged. A missing (NaN)
    age fails the comparison, so such passengers keep their sex label.
    """
    age, sex = passenger
    return 'child' if age < 10 else sex

# Derive the 'person' column row by row from (Age, Sex) on both splits.
train_data['person'] = train_data[['Age', 'Sex']].apply(male_female_child, axis=1)
test_data['person'] = test_data[['Age', 'Sex']].apply(male_female_child, axis=1)

# seaborn >= 0.12 only accepts x/y as keywords, and `factorplot` was removed in
# favour of `catplot`; kind='point' reproduces factorplot's default plot type.
sns.countplot(x='person', data=train_data, palette='Blues')
sns.catplot(x='person', y='Survived', data=train_data, kind='point')

In [30]:
#Find family size and add aloneness
#https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
def add_family_features(frame):
    """Add 'Family_Size' and 'Alone' columns to ``frame`` in place.

    'Family_Size' is ``SibSp + Parch``. 'Alone' is 1 where 'Family_Size' is 0
    and 0 where it is greater than 0.

    Returns the same ``frame`` object for convenience.
    """
    frame['Family_Size'] = frame['SibSp'] + frame['Parch']
    frame['Alone'] = frame['Family_Size']
    # Assign through a single .loc call on the frame itself. The chained form
    # frame['Alone'].loc[mask] = value may write to a temporary copy and is a
    # no-op under pandas copy-on-write.
    frame.loc[frame['Family_Size'] > 0, 'Alone'] = 0
    frame.loc[frame['Family_Size'] == 0, 'Alone'] = 1
    return frame


add_family_features(train_data)
train_data.head()

add_family_features(test_data)
test_data.head()

# seaborn >= 0.12 only accepts x/y as keywords, and `factorplot` was removed in
# favour of `catplot`; kind='point' reproduces factorplot's default plot type.
sns.countplot(x='Family_Size', data=train_data, palette='Blues')
sns.catplot(x='Family_Size', y='Survived', data=train_data, kind='point')

# Use the figure-level count plot here: an axes-level countplot would draw on
# the current axes, which at this point belong to the catplot figure above.
sns.catplot(x='Alone', data=train_data, kind='count', palette='Blues')
sns.catplot(x='Alone', y='Survived', data=train_data, kind='point')

In [31]:
#Fill embarked
train_data[train_data['Embarked'].isnull()] # Selects the training rows with no Embarked value (author's note: only one person lacks it).
                                            #https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html
                                            #She embarked at Southampton
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# a column selection is chained in-place mutation, which pandas deprecates and
# which leaves the frame unchanged under copy-on-write.
train_data['Embarked'] = train_data['Embarked'].fillna('S')

test_data[test_data['Embarked'].isnull()]
test_data['Embarked'] = test_data['Embarked'].fillna('S')

# seaborn >= 0.12 only accepts x/y as keywords, and `factorplot` was removed in
# favour of `catplot`; kind='point' reproduces factorplot's default plot type.
sns.countplot(x='Embarked', data=train_data, palette='Blues')
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='point')

In [35]:
#Fill age by median
# Filling the missing values in Age with the medians of the (person, Pclass) groups
#https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial/notebook?cellIds=14&kernelSessionId=27280410
#sns.lmplot('Age','Survived',hue='Sex',data=titanic_df)

# Select 'Age' before aggregating: taking the median of every column fails on
# the string columns in pandas >= 2.0.
age_by_pclass_sex = train_data.groupby(['person', 'Pclass'])['Age'].median()
# transform('median') returns one value per row, indexed like the frame, so the
# fill stays aligned; groupby.apply can return a group-keyed index instead.
train_data['Age'] = train_data['Age'].fillna(
    train_data.groupby(['person', 'Pclass'])['Age'].transform('median')
)

# The test split is imputed from its own group medians.
age_by_pclass_sex = test_data.groupby(['person', 'Pclass'])['Age'].median()
test_data['Age'] = test_data['Age'].fillna(
    test_data.groupby(['person', 'Pclass'])['Age'].transform('median')
)

train_data.head()
test_data.head()

# seaborn >= 0.12 only accepts x/y as keywords.
sns.lmplot(x='Age', y='Survived', hue='person', data=train_data, palette='winter')
sns.lmplot(x='Age', y='Survived', hue='Pclass', data=train_data, palette='winter')

In [40]:
#Categorize by fare prices
#https://www.kaggle.com/sid321axn/titanic-survival-eda-feature-engineering/notebook
#temp = train_data
#temp.loc[temp.Fare > 200, 'Fare']=np.nan # 
#temp.loc[temp.Fare > 105, 'Fare']=np.nan 
#temp.loc[temp.Fare > 45, 'Fare']=np.nan
#sns.lmplot('Fare','Survived',hue='person',data=temp,palette='winter')

def Class(Fare):
    """Bucket a ticket fare into a letter category.

    Returns 'A' for ``Fare <= 45``, 'B' for ``45 < Fare <= 105``, 'C' for
    ``105 < Fare <= 200`` and 'D' for ``Fare > 200``. A value that satisfies
    none of the comparisons (e.g. NaN) yields ``None``.

    NOTE: a later cell defines another function with this same name for age
    buckets, which replaces this one once that cell has run.
    """
    # Upper bounds are inclusive and checked in ascending order.
    for upper_bound, label in ((45, 'A'), (105, 'B'), (200, 'C')):
        if Fare <= upper_bound:
            return label
    return 'D' if Fare > 200 else None

# Bucket every fare into its letter category on both splits.
train_data['Class'] = train_data['Fare'].apply(Class)
train_data.head()

test_data['Class'] = test_data['Fare'].apply(Class)
test_data.head()

# seaborn >= 0.12 only accepts x/y as keywords, and `factorplot` was removed in
# favour of `catplot`; kind='point' reproduces factorplot's default plot type.
sns.lmplot(x='Fare', y='Survived', data=train_data, palette='winter')
sns.countplot(x='Class', data=train_data, palette='Blues')
sns.catplot(x='Class', y='Survived', data=train_data, kind='point')

In [41]:
#Categorize by Ages

def Class(Age):
    """Bucket an age into a letter category.

    Returns 'C' for ``Age <= 10``, 'M' for ``10 < Age <= 60`` and 'O' for
    ``Age > 60``. A value that satisfies none of the comparisons (e.g. NaN)
    yields ``None``.

    NOTE: this reuses the name of the fare-bucketing function defined in an
    earlier cell and replaces it once this cell has run.
    """
    if Age <= 10:
        return 'C'
    if Age <= 60:
        return 'M'
    return 'O' if Age > 60 else None
    
# Bucket every age into its letter category on both splits.
train_data['Age_Cate'] = train_data['Age'].apply(Class)
train_data.head()

test_data['Age_Cate'] = test_data['Age'].apply(Class)
test_data.head()

# seaborn >= 0.12 only accepts x/y as keywords, and `factorplot` was removed in
# favour of `catplot`; kind='point' reproduces factorplot's default plot type.
sns.lmplot(x='Age', y='Survived', data=train_data, palette='winter')
sns.countplot(x='Age_Cate', data=train_data, palette='Blues')
sns.catplot(x='Age_Cate', y='Survived', data=train_data, kind='point')

In [43]:
#https://www.kaggle.com/anandhuh/titanic-simple-solution-top-9/notebook
# Maps the raw honorific parsed from a passenger name to a coarser title group.
# Titles missing from this mapping become NaN when applied with Series.map.
Title_Dictionary = {"Capt": "Officer","Col": "Officer","Major": "Officer","Jonkheer": "Royalty",
                    "Don": "Royalty","Sir" : "Royalty","Dr": "Officer","Rev": "Officer",
                    "the Countess":"Royalty","Mme": "Mrs","Mlle": "Miss","Ms": "Mrs",
                    "Mr" : "Mr","Mrs" : "Mrs","Miss" : "Miss","Master" : "Master","Lady" : "Royalty",
                    # NOTE(review): "Dona" (feminine of "Don") is believed to occur in the
                    # Kaggle test split; without an entry it maps to NaN. Confirm against
                    # the `titles` set collected from test_data.
                    "Dona": "Royalty"}

def _title_from_name(name):
    """Return the stripped text after the first comma of ``name``, up to the next period."""
    return name.split(',')[1].split('.')[0].strip()


# Parse the raw title, then collapse it into the groups of Title_Dictionary.
train_data['Title'] = train_data['Name'].map(_title_from_name)
train_data['Title'] = train_data.Title.map(Title_Dictionary)
train_data.head()

# Raw (unmapped) titles present in the test split, collected for inspection.
titles = {_title_from_name(name) for name in test_data['Name']}

test_data['Title'] = test_data['Name'].map(_title_from_name)
test_data['Title'] = test_data.Title.map(Title_Dictionary)
test_data.head()

# seaborn >= 0.12 only accepts x/y as keywords, and `factorplot` was removed in
# favour of `catplot`; kind='point' reproduces factorplot's default plot type.
sns.countplot(x='Title', data=train_data, palette='Blues')
sns.catplot(x='Title', y='Survived', data=train_data, kind='point')

Model Building

In [12]:
# Target vector and one-hot encoded design matrix for the training split.
y = train_data["Survived"]
features = ["Pclass", "person", "Embarked",'Alone','Age_Cate','Class', 'Family_Size','Title']
X = pd.get_dummies(train_data[features])

# Align the test matrix to the training columns. Any dummy column the test
# split lacks (e.g. 'Title_Royalty') is added as all zeros, any column unseen
# in training is dropped, and the column order matches X. This replaces a
# hard-coded patch for one missing column.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# NOTE: no scaling is applied; X_train_scaled is an alias of the same frame as X_train.
X_train_scaled = X
X_train = X
y_train = y

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [14]:
from sklearn.svm import LinearSVC

# Linear SVM: print the 5 per-fold cross-validation scores, then their mean.
lsvc = LinearSVC(tol=1e-5, random_state=1)
cv = cross_val_score(lsvc, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [15]:
# Gaussian Naive Bayes, which the author uses as a baseline for classification tasks.
# Prints the 5 per-fold cross-validation scores, then their mean.
gnb = GaussianNB()
cv = cross_val_score(gnb, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [16]:
# Logistic regression (max_iter=2000): 5 per-fold CV scores, then their mean.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(lr, X_train, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [17]:
# Decision tree with a fixed seed: 5 per-fold CV scores, then their mean.
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(dt, X_train, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [18]:
# k-nearest neighbours with default settings: 5 per-fold CV scores, then their mean.
knn = KNeighborsClassifier()
cv = cross_val_score(knn, X_train, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [19]:
# Random forest with a fixed seed: 5 per-fold CV scores, then their mean.
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(rf, X_train, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [20]:
# SVC with probability estimates enabled: 5 per-fold CV scores, then their mean.
svc = SVC(probability=True)
cv = cross_val_score(svc, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [21]:
from xgboost import XGBClassifier

# XGBoost classifier with a fixed seed: 5 per-fold CV scores, then their mean.
xgb = XGBClassifier(random_state=1)
cv = cross_val_score(xgb, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [22]:
# A voting classifier combines the predictions of all its member estimators.
# With "hard" voting each classifier casts one vote ("yes" or "no") and the
# majority wins, so an odd number of members is generally preferable.
# With "soft" voting the members' confidences are averaged; if the average
# confidence that the label is 1 exceeds 50%, the sample is counted as a 1.
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('knn', knn),
        ('rf', rf),
        ('gnb', gnb),
        ('svc', svc),
        ('xgb', xgb),
    ],
    voting='soft',
)

In [23]:
# Soft-voting ensemble: 5 per-fold CV scores, then their mean.
cv = cross_val_score(voting_clf, X_train_scaled, y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [24]:
# Fit the final SVC on the full training matrix (fit returns the estimator
# itself) and predict the test split.
model = SVC(probability=True).fit(X, y)
predictions = model.predict(X_test)

# Write the PassengerId / Survived pairs as the submission file.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")