In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
%matplotlib inline

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
fullTrainSet = pd.read_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic Spaceship\cleanedUpTrain.csv")

xSet = fullTrainSet.iloc[:, :9].copy()
xSet = xSet.drop(["PassengerId", "Cabin", "Name"], axis=1)
ySet = fullTrainSet.iloc[:, 9].copy()
ySet = LabelBinarizer().fit_transform(ySet)

xTrain, xTest, yTrain, yTest = train_test_split(xSet,ySet, test_size=0.2)

In [21]:
corrMatrix = fullTrainSet.corr()
print(corrMatrix["Transported"].sort_values(ascending=True))
# For now I'll keep all features, but reminder to check heatmap using Random Forests and go from there

TotalExpenditure   -0.197107
Age                -0.074233
VIP                -0.037261
CryoSleep           0.458258
Transported         1.000000
Name: Transported, dtype: float64


In [22]:
# For a custom encoder that can encode multiple columns at once
from sklearn.preprocessing import LabelEncoder
classIndices = ["HomePlanet","CryoSleep","VIP","Destination"]
class MultiClassEncoder:
    def __init__(self, classIndices=None):
        self.classIndices = classIndices

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        xCopy = X.copy()
        for Indice in self.classIndices:
            xCopy.loc[:, Indice] = LabelEncoder.fit_transform(LabelEncoder, y=xCopy.loc[:, Indice])
        return xCopy
    
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [23]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
labelPipeline = Pipeline([
    ("LabelEncoder", MultiClassEncoder(classIndices)),
])
numPipeline = Pipeline([
    ("stdScaler", StandardScaler())
])

prepxTrain = labelPipeline.fit_transform(xTrain)
prepxTrain = numPipeline.fit_transform(prepxTrain)
prepyTrain = np.ravel(yTrain)

prepxTest = labelPipeline.fit_transform(xTest)
prepxTest = numPipeline.fit_transform(prepxTest)
prepyTest = np.ravel(yTest)

In [24]:
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

ridgeClf = RidgeClassifier(alpha=0.1)
ridgeClf.fit(prepxTrain,prepyTrain)
yPredRidge = ridgeClf.predict(prepxTest)
print(np.sqrt(accuracy_score(yTest,yPredRidge)))

decTreeClf = DecisionTreeClassifier()
decTreeClf.fit(prepxTrain,prepyTrain)
yPredTree = decTreeClf.predict(prepxTest)
print(np.sqrt(accuracy_score(yTest,yPredTree)))

SVCClf = LinearSVC(C=0.001)
SVCClf.fit(prepxTrain,prepyTrain)
yPredSVC = SVCClf.predict(prepxTest)
print(np.sqrt(accuracy_score(yTest,yPredSVC)))

logReg = LogisticRegression(C=50)
logReg.fit(prepxTrain,prepyTrain)
yPredLogReg = logReg.predict(prepxTest)
print(np.sqrt(accuracy_score(yTest,yPredLogReg)))

0.8461256967212962
0.8007328443254957
0.8461256967212962
0.8468050430344449


In [25]:
# Param Grid searching for a promising model for Logistic Regression
from sklearn.model_selection import GridSearchCV

myGrid = GridSearchCV(estimator=logReg ,param_grid={"C":[1,10,50,0.1,0.001,0.5], "solver": ["lbfgs", "liblinear"]})
myGrid.fit(prepxTrain,prepyTrain)
myGrid.best_params_

{'C': 0.001, 'solver': 'liblinear'}

In [26]:
from sklearn.base import clone, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
RFCClf = RandomForestClassifier(bootstrap=False)

class ClassifierEarlyStopping():
    def __init__(self, estimator, nEstimators):
        self.estimator = estimator
        self.nEstimators = nEstimators
    def estimatorFit(self):
        estimator = clone(self.estimator)
        estimator.n_estimators = 1
        estimator.warm_start = True
        return estimator
        
    def earlyStopping(self,X,y, xVal, yVal):
        est = self.estimatorFit()
        est.n_estimators = self.nEstimators

        errors= []
        for nEst in range(1, est.n_estimators+1):
            est.n_estimators = nEst
            est.fit(X,y)
            yPred = est.predict(xVal)
            errors.append(accuracy_score(yVal, yPred))
        bestError = np.amax(errors)
        return bestError

ae = ClassifierEarlyStopping(RFCClf, 200).earlyStopping(prepxTrain,prepyTrain, prepxTest,prepyTest)
print(ae)

0.6647498562392179


In [27]:
from sklearn.ensemble import GradientBoostingClassifier

GBReg = GradientBoostingClassifier(learning_rate=0.5, n_estimators=250, warm_start=True)
GBReg.fit(prepxTrain,prepyTrain)
errors = []
for yPred in GBReg.staged_predict(prepxTest):
    errors = errors + [np.sqrt(accuracy_score(prepyTest, yPred))]
bestModel = np.argmax(errors) + 1
print(bestModel)
bestGBRReg = GradientBoostingClassifier(learning_rate=0.5, n_estimators=bestModel)
bestGBRReg.fit(prepxTrain,prepyTrain)
yPredGBR = bestGBRReg.predict(prepxTest)
print(np.sqrt(accuracy_score(prepyTest, yPredGBR)))

3
0.8542421961772491


The best model seems to be a GradientBoostingRegressor so far.

In [28]:
fullTestSet = pd.read_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic Spaceship\cleanedUpTest.csv")
fullTestSet = fullTestSet.iloc[:, :13].copy()
passengerID = fullTestSet.iloc[:, 0].copy()
fullTestSet = fullTestSet.drop(["PassengerId", "Cabin", "Name"], axis=1)
prepTestSet = labelPipeline.fit_transform(fullTestSet)
prepTestSet = numPipeline.fit_transform(prepTestSet)
finalPreds = bestGBRReg.predict(prepTestSet)
print(finalPreds)
finalPreds = finalPreds.tolist()

for i in range(len(finalPreds)):
    if(finalPreds[i] == 1):
        finalPreds[i] = True
    elif(finalPreds[i] == 0):
        finalPreds[i] = False

[1 0 1 ... 1 0 1]


In [29]:
finalData = pd.DataFrame()
finalData["PassengerId"] = passengerID
finalData["Transported"] = pd.DataFrame(finalPreds)
finalData.to_csv(r"C:\Users\yug\Desktop\Projects\AIML\Titanic Spaceship\finalPreds.csv", index=False)