In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Intro

After importing the basic libraries, I like to start the analysis by reading the train and test CSV files into variables. I also print the head and the info of both to get a quick overview of what I'm working with.

In [3]:
# Load the Titanic train and test splits and preview the first rows of each.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
print(train_data.head())

test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
print(test_data.head())

# Tag every row with its origin (1 = train, 0 = test) so the frames can be
# concatenated for preprocessing and separated again afterwards.
train_data = train_data.assign(train_test=1)
test_data = test_data.assign(train_test=0)

# Column dtypes and non-null counts for both splits.
for split in (train_data, test_data):
    split.info()



In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training
# data; with pandas' defaults only the numeric columns are summarised.
train_data.describe()

In [5]:
# Separate the training columns into a numeric group and a categorical group.
numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_cols = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

numeric_train = train_data.loc[:, numeric_cols]
catagoric_train = train_data.loc[:, categorical_cols]

# Pairwise correlation between the numeric variables, shown both as a table
# and as a heatmap.
correleate = numeric_train.corr()
print(correleate)
sns.heatmap(correleate)

# Feature Engineering

1. Analyze how family size affected survival rate and the average age of survivors

2. Check how many people had multiple cabins and if that impacted survival rates

3. See how ticket types had an impact on survival rates

In [6]:
# Mean Age and mean SibSp (siblings/spouses aboard) for each survival outcome.
train_data.pivot_table(index='Survived', values=['Age', 'SibSp'])

In [7]:
# Number of cabins listed for each passenger: 0 when Cabin is missing,
# otherwise the count of space-separated entries in the Cabin string.
train_data['cabin_multiple'] = train_data.Cabin.apply(
    lambda x: 0 if pd.isna(x) else len(x.split(' '))
)

# Only the last expression of a cell is rendered, so the distribution has to
# be printed explicitly; as a bare mid-cell expression it was silently dropped.
print(train_data['cabin_multiple'].value_counts())

# Passenger counts broken down by survival outcome and number of cabins.
# 'Ticket' is only used as a column to count.
pd.pivot_table(train_data, index='Survived', columns='cabin_multiple',
               values='Ticket', aggfunc='count')

In [8]:
# Cabin letter = first character of the Cabin value. A missing Cabin is turned
# into the string 'nan' by str(), so missing cabins show up under 'n'.
train_data['cabin_adv'] = train_data.Cabin.map(lambda cabin: str(cabin)[0])

# How many passengers fall under each cabin letter.
print(train_data['cabin_adv'].value_counts())

# Passenger counts by survival outcome and cabin letter ('Name' is just the
# column being counted).
train_data.pivot_table(index='Survived', columns='cabin_adv',
                       values='Name', aggfunc='count')

In [9]:
# Flag tickets whose identifier is purely numeric (1) versus those that
# contain letters or other characters (0).
train_data['numeric_ticket'] = train_data.Ticket.map(
    lambda ticket: int(ticket.isnumeric())
)

# Passenger counts by survival outcome and ticket type.
train_data.pivot_table(index='Survived', columns='numeric_ticket',
                       values='Ticket', aggfunc='count')

In [10]:
# Recompute the cabin letter (first character of the stringified Cabin value)
# and tabulate passenger counts per survival outcome and cabin letter.
cabin_as_text = train_data.Cabin.map(str)
train_data['cabin_adv'] = cabin_as_text.map(lambda text: text[0])

train_data.pivot_table(index='Survived', columns='cabin_adv',
                       values='Name', aggfunc='count')

# Data Preprocessing for Model

1. Use only relevant variables 

2. Get our training and test data to have the same columns

3. Fill in any null values we find with a randomization of the mean +/- standard deviation, or with the median

4. Normalize fare data value 

5. Then split train and test values of x and y

In [None]:
# Stack train and test so every engineered feature and dummy column is built
# identically for both splits. ignore_index=True gives each row a unique
# label; without it the test rows reuse the row labels of the train frame,
# which makes any later label-based alignment or assignment ambiguous.
all_data = pd.concat([train_data, test_data], ignore_index=True)

# Rebuild the engineered features on the combined frame (only train_data
# received them in the exploration cells above).
all_data['cabin_multiple'] = all_data.Cabin.apply(
    lambda x: 0 if pd.isna(x) else len(x.split(' '))
)
all_data['cabin_adv'] = all_data.Cabin.apply(lambda x: str(x)[0])
all_data['numeric_ticket'] = all_data.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)

# Ticket prefix: everything before the last space-separated token, with '.'
# and '/' removed and lower-cased; 0 when the ticket has no prefix.
all_data['ticket_letters'] = all_data.Ticket.apply(
    lambda x: ''.join(x.split(' ')[:-1]).replace('.', '').replace('/', '').lower()
    if len(x.split(' ')[:-1]) > 0 else 0
)

# Impute missing Age/Fare with medians computed on the training split only.
all_data['Age'] = all_data['Age'].fillna(train_data.Age.median())
all_data['Fare'] = all_data['Fare'].fillna(train_data.Fare.median())

# Log-transform Fare (+1 so that zero fares stay finite) and inspect its shape.
all_data['norm_fare'] = np.log(all_data.Fare + 1)
all_data['norm_fare'].hist()

# One-hot encode the categorical columns among the selected model features.
all_dummies = pd.get_dummies(all_data[['Pclass', 'Sex', 'Age', 'Parch', 'norm_fare',
                                       'cabin_adv', 'cabin_multiple',
                                       'numeric_ticket', 'train_test']])

# Split back into train/test feature matrices and the training target.
X_train = all_dummies[all_dummies.train_test == 1].drop(['train_test'], axis=1)
X_test = all_dummies[all_dummies.train_test == 0].drop(['train_test'], axis=1)

y_train = all_data[all_data.train_test == 1].Survived

# Sanity checks: every row landed in exactly one split and the target lines
# up with the training features.
assert len(X_train) + len(X_test) == len(all_data)
assert len(X_train) == len(y_train)

In [29]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features. The scaler is fitted on the training
# rows only and then applied to every row, so that the mean/variance used for
# scaling do not leak information from the test split.
scaled_cols = ['Age', 'Parch', 'norm_fare']
train_rows = all_dummies.train_test == 1

scale = StandardScaler()
scale.fit(all_dummies.loc[train_rows, scaled_cols])

all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[scaled_cols] = scale.transform(all_dummies[scaled_cols])

# Split the scaled frame back into train/test feature matrices.
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'],
                                                                             axis=1)

X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'],
                                                                           axis=1)
y_train = all_data[all_data.train_test == 1].Survived

# **Model Building**

We are going to test multiple models to see which one works best for this specific data set.
Just because one model tends to produce the best results doesn't mean it will be the best choice for every data set.

In [44]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Baseline estimators with (mostly) default hyper-parameters. Stochastic
# models are seeded so the reported scores are reproducible across runs.
gnb = GaussianNB()
lr = LogisticRegression(max_iter = 2000)
dt = tree.DecisionTreeClassifier(random_state = 1)
knn = KNeighborsClassifier()
svc = SVC(probability = True)
rf = RandomForestClassifier(random_state = 1)

baseline_models = [
    ("Naive Bayes", gnb),
    ("Logistic Regression", lr),
    ("Decision Tree Classifier", dt),
    ("K Neighbor Classifier", knn),
    ("SVC", svc),
    ("Random Forest", rf),
]

# Score every baseline with the same 5-fold cross-validation on the same
# (scaled) feature matrix so the accuracies are directly comparable.
for model_name, model in baseline_models:
    cv = cross_val_score(model, X_train_scaled, y_train, cv=5)

    print(f"Testing {model_name}:")
    print(cv)
    print(cv.mean(), "\n")

In [42]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

def clf_performance(classifier, model_name):
    """Print the best cross-validated score and parameters of a fitted search.

    Parameters
    ----------
    classifier : object
        Any object exposing ``best_score_`` and ``best_params_`` attributes
        (the cells below pass fitted ``GridSearchCV`` instances).
    model_name : str
        Label printed on the first line to identify the model.

    Returns
    -------
    None
    """
    report_lines = (
        model_name,
        f'Best Score: {classifier.best_score_!s}',
        f'Best Parameters: {classifier.best_params_!s}',
    )
    for line in report_lines:
        print(line)
    

# Exhaustive 5-fold grid search over the K-Nearest-Neighbours hyper-parameters:
# neighbourhood size, vote weighting, search algorithm and Minkowski power p.
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    p=[1, 2],
)
clf_knn = GridSearchCV(
    knn,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,  # use every available core
)
best_clf_knn = clf_knn.fit(X_train_scaled, y_train)
clf_performance(best_clf_knn, 'KNN')

In [47]:
# 5-fold grid search for the support-vector classifier. Each kernel gets its
# own sub-grid because the relevant hyper-parameters differ per kernel.
svc = SVC(probability = True)

c_values = [.1, 1, 10, 100, 1000]
rbf_grid = {'kernel': ['rbf'], 'gamma': [.1, .5, 1, 2, 5, 10], 'C': c_values}
linear_grid = {'kernel': ['linear'], 'C': c_values}
poly_grid = {'kernel': ['poly'], 'degree': [2, 3, 4, 5], 'C': c_values}

tuned_parameters = [rbf_grid, linear_grid, poly_grid]
param_grid = tuned_parameters

clf_svc = GridSearchCV(
    svc,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,  # use every available core
)
best_clf_svc = clf_svc.fit(X_train_scaled, y_train)
clf_performance(best_clf_svc, 'SVC')

In [46]:
# 5-fold grid search for the random forest (seeded for reproducibility).
rf = RandomForestClassifier(random_state = 1)

# NOTE: 'auto' is intentionally absent from max_features. For classifiers it
# was only an alias of 'sqrt' (so it doubled the search for no gain), and it
# was removed in scikit-learn 1.3, where it makes every such fit fail.
param_grid = {'n_estimators': [400, 450, 500, 550],
              'criterion': ['gini', 'entropy'],
              'bootstrap': [True],
              'max_depth': [15, 20, 25],
              'max_features': ['sqrt', 10],
              'min_samples_leaf': [2, 3],
              'min_samples_split': [2, 3]}

clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train_scaled, y_train)
clf_performance(best_clf_rf, 'Random Forest')

In [48]:
# Refit the best random forest found by the grid search on the full training
# set (fit() returns the estimator itself) and plot its 20 most important
# features as a horizontal bar chart.
best_rf = best_clf_rf.best_estimator_
best_rf = best_rf.fit(X_train_scaled, y_train)

feat_importances = pd.Series(
    data=best_rf.feature_importances_,
    index=X_train_scaled.columns,
)
top_features = feat_importances.nlargest(20)
top_features.plot(kind='barh')

# Final Submission

1. Built hard- and soft-voting ensembles of the tuned KNN, RF and SVC models
2. Finalized the final data
3. Placed that data into a dataframe
4. Submitted the final answer

In [54]:
from sklearn.ensemble import VotingClassifier

# Best estimators found by the grid searches above.
best_knn = best_clf_knn.best_estimator_
best_svc = best_clf_svc.best_estimator_
best_rf = best_clf_rf.best_estimator_

tuned_estimators = [('knn', best_knn), ('rf', best_rf), ('svc', best_svc)]

# Hard voting takes the majority class; soft voting averages predicted
# probabilities. voting_clf_all is configured identically to voting_clf_soft.
voting_clf_hard = VotingClassifier(estimators = list(tuned_estimators), voting = 'hard')
voting_clf_all = VotingClassifier(estimators = list(tuned_estimators), voting = 'soft')
voting_clf_soft = VotingClassifier(estimators = list(tuned_estimators), voting = 'soft')

# Cross-validate on the *scaled* features: the member models were tuned on
# X_train_scaled and the ensembles are fitted on it for the final prediction,
# so scoring them on the unscaled matrix would not measure the submitted model.
# Each ensemble is cross-validated once and the mean is taken from those same
# fold scores instead of running the whole CV a second time.
for label, ensemble in [('voting_clf_hard', voting_clf_hard),
                        ('voting_clf_soft', voting_clf_soft),
                        ('voting_clf_all', voting_clf_all)]:
    fold_scores = cross_val_score(ensemble, X_train_scaled, y_train, cv=5)
    print(f'{label} :', fold_scores)
    print(f'{label} mean :', fold_scores.mean())

In [55]:
# Fit each voting ensemble, then the stand-alone random forest, on the full
# scaled training set.
for ensemble in (voting_clf_hard, voting_clf_soft, voting_clf_all):
    ensemble.fit(X_train_scaled, y_train)

best_rf.fit(X_train_scaled, y_train)

# Predict the test set with every model; the labels are cast to int for the
# submission file.
hard_votes = voting_clf_hard.predict(X_test_scaled)
y_hat_vc_hard = hard_votes.astype(int)

rf_votes = best_rf.predict(X_test_scaled)
y_hat_rf = rf_votes.astype(int)

soft_votes = voting_clf_soft.predict(X_test_scaled)
y_hat_vc_soft = soft_votes.astype(int)

all_votes = voting_clf_all.predict(X_test_scaled)
y_hat_vc_all = all_votes.astype(int)

In [57]:
# Assemble the submission table: one row per test passenger with the label
# predicted by the voting_clf_all ensemble.
final_data = dict(
    PassengerId=test_data.PassengerId,
    Survived=y_hat_vc_all,
)
submission = pd.DataFrame(final_data)

# Write the file expected by the competition (no index column).
submission.to_csv('submission.csv', index=False)