# KAGGLE COMPITITION - TITANIC: MACHINE LEARNING FROM DISASTER

# Classifier + Scaling + PCA + Hyperparameter Tuning

* BY ASHISH H GARUD

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings # current version of seaborn generates a bunch of warnings that will be ignore
warnings.filterwarnings('ignore')

# 1. DATASET PREPRATION AND PREPROCESSING

# Data Collection

In [35]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Data Preprocessing

* Data New Features Creation

In [36]:
#train_test_data = [train, test]
for dataset in [train]:
    train['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.')

for dataset in [test]:
    test['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.')

In [37]:
#train['Title'].nunique()
train['Title'].unique()

#train['Title'].value_counts()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [38]:
# master mister(mr.) miss(ms.) misses(mrs.) others
for dataset in [train]:
    train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')

    train['Title'] = train['Title'].replace('Mlle', 'Miss')
    train['Title'] = train['Title'].replace('Ms', 'Miss')
    train['Title'] = train['Title'].replace('Mme', 'Mrs')
    
for dataset in [test]:
    test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')

    test['Title'] = test['Title'].replace('Mlle', 'Miss')
    test['Title'] = test['Title'].replace('Ms', 'Miss')
    test['Title'] = test['Title'].replace('Mme', 'Mrs')

* Data Features Selection

In [39]:
train.drop(['PassengerId', 'Cabin', 'Name', 'Fare', 'Ticket'], axis=1, inplace=True)
test.drop(['Cabin', 'Name', 'Fare', 'Ticket'], axis=1, inplace=True)
#train.drop(['PassengerId', 'Cabin', 'Fare', 'Ticket'], axis=1, inplace=True)
#test.drop(['Cabin', 'Fare', 'Ticket'], axis=1, inplace=True)

* Data Features Conversion

In [40]:
genders = {"male": 0, "female": 1}
train['Sex'] = train['Sex'].map(genders)
test['Sex'] = test['Sex'].map(genders)

In [41]:
ports = {"S": 0, "C": 1, "Q": 2}
train['Embarked'] = train['Embarked'].map(ports)
test['Embarked'] = test['Embarked'].map(ports)

* Data Missing Handling

In [42]:
train.dropna(subset=['Embarked'], how='any', inplace=True)
test.dropna(subset=['Embarked'], how='any', inplace=True)

In [43]:
#train.groupby('Title').Age.value_counts()

xx = train[['Title', 'Age']].groupby(['Title'], as_index=False).mean()    #.set_index('Title')

#xx
#type(xx)
xx.to_dict()
#type(xx)
#xx

yy = {'Master': 4.57,
  'Miss': 21.73,
  'Mr': 32.36,
  'Mrs': 35.54,
  'Other': 45.54}

#train.Age=train.Age.fillna(value=train.Age.mean())
#test.Age=test.Age.fillna(value=train.Age.mean())
#train.Age=train.Age.fillna(value=train.Age.median())
#test.Age=test.Age.fillna(value=train.Age.median())

train['Age'] = train['Age'].mask(pd.isnull, train['Title'].map(yy))
test['Age'] = test['Age'].mask(pd.isnull, test['Title'].map(yy))
#train['Age'] = train['Age'].fillna(train['Title'].map(yy))
#df.loc[df['B'].isnull(), 'B'] = df['A'].map(dict)
#train['Age'] = train['Title'].map(yy).fillna(train['Age'])
#train['Age'] = test.Age.fillna(value = train['Title'].map(yy))

* 
 * Data Features Conversion

In [44]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for dataset in [train]:
    train['Title'] = dataset['Title'].map(title_mapping)
    train['Title'] = dataset['Title'].fillna(0)

for dataset in [test]:
    test['Title'] = dataset['Title'].map(title_mapping)
    test['Title'] = dataset['Title'].fillna(0)

* feature scaling (rescaling) - Data Standardization
  * Standardize features by removing the mean and scaling to unit variance
  * Normalization

In [45]:
from sklearn.preprocessing import StandardScaler

In [53]:
sc = StandardScaler()

In [48]:
train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked',
       'Title'],
      dtype='object')

In [49]:
test.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked',
       'Title'],
      dtype='object')

# 2.

In [60]:
from sklearn import model_selection

In [55]:
Xtrain = train.drop(['Survived'], axis=1)
ytrain = train['Survived']

Xtest = test.drop("PassengerId", axis=1)
#ytest

In [63]:
train_scaled = sc.fit_transform(Xtrain)
test_scaled = sc.transform(Xtest)

Xtrain = pd.DataFrame(train_scaled, columns = Xtrain.columns)
Xtest = pd.DataFrame(test_scaled, columns = Xtest.columns)

# 3. MODELING

In [57]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
#from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Model evaluation and testing

* Leave One Out (LOO) Cross Validation

In [61]:
from sklearn.model_selection import LeaveOneOut

In [64]:
# Spot check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('PCT', Perceptron()))
models.append(('RFC', RandomForestClassifier()))
#models.append(('MLP', MLPClassifier()))


# evaluate each model
results = []
names = []
results_mean = []
for name, model in models:
    loocv = model_selection.LeaveOneOut()
    cv_results = model_selection.cross_val_score(model, Xtrain, ytrain, cv = loocv, scoring = 'accuracy')  
    
    results.append(cv_results)
    names.append(name)
    results_mean.append(cv_results.mean())
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
    fitting  = model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtest)
    df = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions}).set_index('PassengerId')
    df.to_csv('%sAgeTitleMap00.csv' %name)

ValueError: Unknown label type: 'continuous'