In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy

In [2]:
testSet = pd.read_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic\test.csv")
trainSet = pd.read_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic\train.csv")

trainSet = trainSet.rename(columns={"Pclass": "TClass", "SibSp": "Sibl_Sp", "Parch": "ParChl"})
trainSet = trainSet.drop(axis=1, columns=["Ticket", "Cabin", "Name", "PassengerId"])

testSet = testSet.rename(columns={"Pclass": "TClass", "SibSp": "Sibl_Sp", "Parch": "ParChl"})
testSet = testSet.drop(axis=1, columns=["Ticket", "Cabin", "Name", "PassengerId"])

print(trainSet.shape)
print(testSet.shape)
testSet.head()

(891, 8)
(418, 7)


Unnamed: 0,TClass,Sex,Age,Sibl_Sp,ParChl,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [3]:
trainLabels = trainSet["Survived"].copy()
trainSet = trainSet.drop(axis=1, columns=["Survived"])

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameColSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names]

In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
valCols = ["TClass", "Age", "Sibl_Sp", "ParChl", "Fare"]
labelCols = ["Sex", "Embarked"]
valPipeline = Pipeline([
    ("valColSelect", DataFrameColSelector(valCols)),
    ("ageImputer", SimpleImputer())
])

labelPipeline = Pipeline([
    ("labelColSelect", DataFrameColSelector(labelCols)),
    ("labelEncoder", OneHotEncoder(sparse=False)),
])

fullPipeline = FeatureUnion(transformer_list=[
    ("val_Pl", valPipeline),
    ("cat_Pl", labelPipeline),   
])

In [6]:
prepTrainSet = fullPipeline.fit_transform(X=trainSet)
prepTestSet = fullPipeline.transform(X=testSet)
print(prepTrainSet.shape)
print(prepTestSet.shape)
prepTrainSet
# Note to self; Don't leave large IDs that will DEFINITELY skew your data.

(891, 11)
(418, 11)


array([[ 3.        , 22.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , 38.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.        , 26.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 3.        , 29.69911765,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , 26.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.        , 32.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ]])

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

KNNClf = KNeighborsClassifier()
KNNClf.fit(prepTrainSet, trainLabels)
KNNClf.predict(prepTestSet)

SVCClf = SVC(gamma="auto")
SVCClf.fit(prepTrainSet, trainLabels)
SVCClf.predict(prepTestSet)

RFCClf = RandomForestClassifier(n_estimators=70, max_features="log2")
RFCClf.fit(prepTrainSet, trainLabels)
RFCClf.predict(prepTestSet)

GBCClf = GradientBoostingClassifier()
GBCClf.fit(prepTrainSet, trainLabels)
GBCResults = GBCClf.predict(prepTestSet)

In [8]:
from sklearn.model_selection import cross_val_score

scoresKNN = cross_val_score(KNNClf, prepTrainSet, trainLabels, cv=10)
scoresSVC = cross_val_score(SVCClf, prepTrainSet, trainLabels, cv=10)
scoresRFC = cross_val_score(RFCClf, prepTrainSet, trainLabels, cv=10)
scoresGBC = cross_val_score(GBCClf, prepTrainSet, trainLabels, cv=10)
print("KNN Mean Scores:", scoresKNN.mean())
print("SVC Mean Scores:", scoresSVC.mean())
print("RFC Mean Scores:", scoresRFC.mean())
print("GBC Mean Scores:", scoresGBC.mean())

KNN Mean Scores: 0.7093757802746566
SVC Mean Scores: 0.7329712858926343
RFC Mean Scores: 0.8148813982521848
GBC Mean Scores: 0.8316729088639201


In [9]:
GBCClf.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [10]:
from sklearn.model_selection import GridSearchCV
RFCClf = GradientBoostingClassifier()
RFCGrid = GridSearchCV(estimator=RFCClf, param_grid=[
    {"n_estimators": [70, 75, 80, 100, 120]},
    {"max_features" :["sqrt", "log2", None]}
])

gridSearchResults = RFCGrid.fit(prepTrainSet, trainLabels)


In [11]:
gridSearchResults.best_params_

{'n_estimators': 100}

In [18]:
GBCClf = GradientBoostingClassifier()
GBCClf.fit(prepTrainSet, trainLabels)
GBCResults = GBCClf.predict(prepTestSet)

finalData = pd.DataFrame()

testSet = pd.read_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic\test.csv")
PassIDs = testSet["PassengerId"].copy()

finalData["PassengerId"] = PassIDs
finalData["Survived"] = pd.DataFrame(GBCResults)

finalData.to_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic\finalData.csv", index=False)