# Attempt on 4/17/2018 to perform the following on Titanic Dataset
 - Data Imputing
 - Create Dummy variables
 - Scale/ Transform the data 
 - Model Selection
 - Build Classification Models 
 - Tune Hyperparameters using GridSearchCV
 - Assess the Model


#### Libraries for Data Imputing

In [139]:
from sklearn.preprocessing import Imputer


#### Libraries for Dummy Variable Creation

In [140]:
import pandas as pd # Pandas has getdummies() function which converts char attributes to numeric 
import numpy as np 

#### Libraries to Scale the Data 

In [141]:
from sklearn.preprocessing import StandardScaler # This function is used during pipeline process
from sklearn.preprocessing import scale


#### Libraries for model split and selection

In [142]:
from sklearn.model_selection import train_test_split # to split training and testing datasets
from sklearn.grid_search import GridSearchCV # Grid search for Hyperparameter tuning and Cross Validation

#### Library to Pipeline various steps

In [143]:
from sklearn.pipeline import Pipeline

#### Libraries to Import Machine Learning Models/Classifiers

In [144]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier  # Different types of Classifiers


#### Libraries to Import Metrics from Classification Models

In [145]:
from sklearn.metrics import classification_report,confusion_matrix

## Importing Data and Cleaning the data 

In [146]:
df_train = pd.read_csv('data/train.csv')

In [147]:
df_train.info() # Basic Understanding of Data Structure and Missing Data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


###### Please note there are missing values in Age and Cabin

In [148]:
df_train.head(10) # To have an idea on the data by looking at first 10 observations

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [149]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [150]:
df_train_2 = df_train.drop(["Cabin"],axis=1) # Droping Cabin attribute as it has <25 % of Data

In [151]:
df_train_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 76.6+ KB


In [152]:
df_train_3 = df_train_2.drop(["PassengerId","Name","Ticket"],axis=1)

In [153]:
df_train_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


##### Convert Character Attributes to Numeric  -- Dummy variables

In [154]:
df_train_4 = pd.get_dummies(df_train_3, drop_first=True)

In [155]:
df_train_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Age           714 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Sex_male      891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int64(4), uint8(3)
memory usage: 44.5 KB


In [156]:
df_train_4.head(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1
5,0,3,,0,0,8.4583,1,1,0
6,0,1,54.0,0,0,51.8625,1,0,1
7,0,3,2.0,3,1,21.075,1,0,1
8,1,3,27.0,0,2,11.1333,0,0,1
9,1,2,14.0,1,0,30.0708,0,0,0


##### Seperating Feature and Target Variables from Training dataset

In [157]:
y = df_train_4[["Survived"]]

In [158]:
type(y)

pandas.core.frame.DataFrame

In [159]:
X = df_train_4.drop(['Survived'], axis=1)

In [160]:
type(X)

pandas.core.frame.DataFrame

In [161]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass        891 non-null int64
Age           714 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Sex_male      891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int64(3), uint8(3)
memory usage: 37.5 KB


##### Imputing Mean Values for Age Attribute

In [162]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)

In [163]:
imp.fit(X)

Imputer(axis=1, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [164]:
X = imp.transform(X)

In [165]:
df_X = pd.DataFrame(X)

In [166]:
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
0    891 non-null float64
1    891 non-null float64
2    891 non-null float64
3    891 non-null float64
4    891 non-null float64
5    891 non-null float64
6    891 non-null float64
7    891 non-null float64
dtypes: float64(8)
memory usage: 55.8 KB


### Building Classification Models on the Training Dataset

#### Logistic Regression

In [167]:
logreg = LogisticRegression()

In [168]:
type(y)

pandas.core.frame.DataFrame

In [169]:
y = np.array(y)

In [170]:
type(y)

numpy.ndarray

In [171]:
type(X)

numpy.ndarray

In [172]:
logreg.fit(X, y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

##### Logistic Regression Model Performance on Training Data 

In [173]:
logreg.score(X, y)  

0.80134680134680136

#### KNN Classifier Model

In [174]:
knn = KNeighborsClassifier()

In [175]:
knn.fit(X, y)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

##### KNN Model Performance on Training Data 

In [176]:
knn.score(X, y)

0.80134680134680136

#### Decision Tree Classifier

In [177]:
decisionTree = DecisionTreeClassifier()

In [178]:
decisionTree.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

##### Decision Tree Performance on Training Data 

In [179]:
decisionTree.score(X, y)

0.98204264870931535

#### Support Vector Classifier 

In [180]:
svc = SVC()

In [181]:
svc.fit(X, y)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

##### Decision Tree Performance on Training Data 

In [182]:
svc.score(X, y)

0.88664421997755327

#### Random Forest Classification

In [183]:
randomForest = RandomForestClassifier()
randomForest.fit(X, y)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

##### Random Forest Performance on Training Data

In [184]:
randomForest.score(X, y)

0.96857463524130194

#### Scale/ Normalize the Training Data 

In [185]:
X_Scaled = scale(X)

#### Decision Tree on Scaled Data

In [186]:
decisionTree.fit(X_Scaled, y)
decisionTree.score(X_Scaled, y)

0.98204264870931535

### Predictions on Test Dataset using Decision Tree

In [187]:
df_test = pd.read_csv("data/test.csv")

In [188]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


#### Drop unused and high missing value attributes

In [189]:
df_test_1 = df_test.drop(["PassengerId","Name","Ticket","Cabin"], axis =1)

#### Create Dummy variables on Character Attributes - All the Features should be numeric

In [190]:
df_test_2 = pd.get_dummies(df_test_1, drop_first=True) # drop_first=true makes sure that one attribute from n dummies is droped so that the attribute doesn't have duplicate data

In [191]:
df_test_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass        418 non-null int64
Age           332 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          417 non-null float64
Sex_male      418 non-null uint8
Embarked_Q    418 non-null uint8
Embarked_S    418 non-null uint8
dtypes: float64(2), int64(3), uint8(3)
memory usage: 17.6 KB


#### Impute the missing data in Age attribute

In [192]:
Imp2 = Imputer(missing_values='NaN', strategy='mean', axis=1)

In [193]:
imp.fit(df_test_2)

Imputer(axis=1, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [194]:
X_test = imp.transform(df_test_2)

#### Scale/Normalize the test dataset

In [195]:
X_test_Scaled = scale(X_test) 

#### Decision Tree on Test Data and Predictions

In [196]:
decisionTree.fit(X_Scaled, y)
y_pred= decisionTree.predict(X_test_Scaled)

In [197]:
df_predict = pd.DataFrame(y_pred)

In [198]:
df_predict.to_csv('data/Predicted_Titanic_4_17_2018.csv')

#### Logistic Regression on Test Data and Predictions 

In [219]:
logreg.fit(X_Scaled, y)
y_pred_logreg = logreg.predict(X_test_Scaled)

In [220]:
df_predict_logreg = pd.DataFrame(y_pred_logreg)

In [221]:
df_predict_logreg.to_csv('data/Predicted_Titanic_4_17_2018_Sub2.csv')

#### Knn on Test Data and Predictions 

In [202]:
knn.fit(X_Scaled, y)
y_pred_knn = knn.predict(X_test_Scaled)

  """Entry point for launching an IPython kernel.


In [203]:
df_predict_knn = pd.DataFrame(y_pred_knn)
df_predict_knn.to_csv('data/Predicted_Titanic_4_17_2018_Sub3.csv')

#### Random Forest on Test Data and Predictions 

In [211]:
print('Type of the Target Attribute is : ',type(y))
print('Type of the Target Attribute is : ',type(X_Scaled))

Type of the Target Attribute is :  <class 'numpy.ndarray'>
Type of the Target Attribute is :  <class 'numpy.ndarray'>


In [215]:
y= np.ravel(y)

In [217]:
randomForest.fit(X_Scaled, y)
y_pred_Ranfor = randomForest.predict(X_test_Scaled)

In [218]:
df_predict_Ranfor = pd.DataFrame(y_pred_Ranfor)
df_predict_Ranfor.to_csv('data/Predicted_Titanic_4_17_2018_Sub4.csv')

- Next Steps are to perform Cross validation and Hyperparameter tuning and see the score
- May be not droping 'Cabin' attribute and trying to use it by imputing might improve the score

### Building Logistic Regression and Random Forest Models using Hyperparameter tuning and GridSearchCV

- Logistic Regression Hyperparameter Tuning 

In [223]:
# Creating hyperparameter Grid for Logistic Regression
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

In [224]:
# Instatntiate GridSearchCV for logistic Regression with 5- fold cross validation
logreg_cv = GridSearchCV(logreg, param_grid, cv = 5)

In [225]:
# Fit the Model on Scaled Training dataset and Predict the model on Scaled test dataset
logreg_cv.fit(X_Scaled, y)
y_pred_logreg_cv = logreg_cv.predict(X_test_Scaled)

In [226]:
# Save the csv file to submit 
df_predict_logreg_cv = pd.DataFrame(y_pred_logreg_cv)
df_predict_logreg_cv.to_csv('data/Predicted_Titanic_4_18_2018_logreg_cv_Sub5.csv')

In [227]:
# Best parameters for Logistic Regression on this Model 
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))


Tuned Logistic Regression Parameter: {'C': 0.0061054022965853268, 'penalty': 'l1'}
Tuned Logistic Regression Accuracy: 0.7867564534231201


- Random Forest Hyperparameter Tuning

In [249]:
# Creating hyperparameter Grid for Random forest. The below are few parameters that need to be optimized. There are many
param_grid_r = {'max_depth': np.arange(3, 10)}

In [250]:
# Instatntiate GridSearchCV for Random forest with 5- fold cross validation
randomForest_cv = GridSearchCV(randomForest, param_grid_r, cv = 5)

In [251]:
# Fit the Model on Scaled Training dataset and Predict the model on Scaled test dataset
randomForest_cv.fit(X_Scaled, y)
y_pred_randomForest_cv = randomForest_cv.predict(X_test_Scaled)

In [252]:
# Save the csv file to submit 
df_predict_RF_cv = pd.DataFrame(y_pred_randomForest_cv)
df_predict_RF_cv.to_csv('data/Predicted_Titanic_4_18_2018_RF_cv_Sub6.csv')

In [None]:
# Trying to tune n_estimators 

In [253]:
param_grid_r_2 = {"n_estimators": np.arange(5,15)}

In [254]:
# Instatntiate GridSearchCV for Random forest with 5- fold cross validation
randomForest_cv_2 = GridSearchCV(randomForest, param_grid_r_2, cv = 5)

In [255]:
# Fit the Model on Scaled Training dataset and Predict the model on Scaled test dataset
randomForest_cv_2.fit(X_Scaled, y)
y_pred_randomForest_cv_2 = randomForest_cv_2.predict(X_test_Scaled)

In [256]:
# Save the csv file to submit 
df_predict_RF_cv_2 = pd.DataFrame(y_pred_randomForest_cv_2)
df_predict_RF_cv_2.to_csv('data/Predicted_Titanic_4_18_2018_RF_cv_2_Sub7.csv')