In [16]:
# data analysis and wrangling
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# visualization
import pydotplus
from sklearn.tree import export_graphviz
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OrdinalEncoder


import warnings
warnings.filterwarnings('ignore')


# I. Acquire data

The Python Pandas packages helps us work with our datasets. We start by acquiring the training and testing datasets into Pandas DataFrames. We also combine these datasets to run certain operations on both datasets together.

In [17]:
train_data = pd.read_csv('../../dataset/titanic/train.csv')
test_data = pd.read_csv('../../dataset/titanic/test.csv')

# II. Exploratory Data Analysis

# III. Let's build our first model : baseline

# IV. Let's build improve our model



## a. Preprocessing

In [22]:
train_df = train_data.copy()
test_df = test_data.copy()

#Preprocessing : with mean for train
train_df['Age'].fillna(train_df['Age'].mean(),inplace=True)
train_df['Embarked'].fillna('X',inplace=True)
train_df['Cabin'].fillna('XX',inplace=True)
train_df['Sex'] = train_df['Sex'].map({'female':0,'male':1})

#categories1 = [['Sex']]
#for cat in categories1:
#lb = OrdinalEncoder()
#lb.fit(X[cat])

#X[cat] = lb.transform(X[cat])
#X_test[cat] = lb.transform(X_test[cat])

    
    
#Preprocessing : with mean for test (don't use the statistic of the test in the train!!!!!!!)
test_df['Age'].fillna(test_df['Age'].mean(),inplace=True)
test_df['Embarked'].fillna('X',inplace=True)
test_df['Cabin'].fillna('XX',inplace=True)
test_df['Sex'] = test_df['Sex'].map({'female':0,'male':1})
test_df['Fare'].fillna(test_df['Fare'].mean(),inplace=True)

## b. Feature encoding

In [23]:
#For difference between get_dummies and OneHotEncoder
#https://stackoverflow.com/questions/36631163/pandas-get-dummies-vs-sklearns-onehotencoder-what-are-the-pros-and-cons
#One Hot encoding
#categories2 = ['Embarked']
#print(pd.get_dummies(X, columns=categories2))

encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(train_df[['Embarked']])    # Assume for simplicity all features are categorical.
# Apply the encoder for train
a = encoder.transform(train_df[['Embarked']])
other = pd.DataFrame(data=a.toarray(),columns=['Embarked_1','Embarked_2','Embarked_3','Embarked_4'])
train_df = train_df.join(other,lsuffix='_caller', rsuffix='_other')

# Apply the encoder for test
a = encoder.transform(test_df[['Embarked']])
other = pd.DataFrame(data=a.toarray(),columns=['Embarked_1','Embarked_2','Embarked_3','Embarked_4'])
test_df = test_df.join(other,lsuffix='_caller', rsuffix='_other')

In [24]:
X = train_df[['Pclass', 'Sex','Age','SibSp','Parch','Fare',
              'Embarked_1','Embarked_2','Embarked_3','Embarked_4']]

sub_test = test_df[['Pclass', 'Sex','Age','SibSp','Parch','Fare',
              'Embarked_1','Embarked_2','Embarked_3','Embarked_4']]


## c. Feature Scaling
<a id="feature_scaling" ></a>
***
Feature scaling is an important concept of machine learning models. Often times a dataset contain features highly varying in magnitude and unit. For some machine learning models, it is not a problem. However, for many other ones, its quite a problem. Many machine learning algorithms uses euclidian distances to calculate the distance between two points, it is quite a problem. 

Some algorithms may not necessarily need feature scaling, like decision trees. In contrast, neural networks are trained via gradient-based algorithms, and so feature rescaling speeds up and stabilizes training by alleviating skew in the objective function contours that often accompanies features of varying magnitude.

There are multiple ways to do feature scaling. 
<ul>
    <li><b>MinMaxScaler</b>-Scales the data using the max and min values so that it fits between 0 and 1.</li>
    <li><b>StandardScaler</b>-Scales the data so that it has mean 0 and variance of 1.</li>
    <li><b>RobustScaler</b>-Scales the data similary to Standard Scaler, but makes use of the median and scales using the interquertile range so as to aviod issues with large outliers.</b>
 </ul>

So, data leakage is possible when data statistics are used

**Doc on line:**
- https://sebastianraschka.com/faq/docs/scale-training-test.html
- https://www.quora.com/Should-scaling-be-done-on-both-training-data-and-test-data-for-machine-learning-Can-one-do-scaling-on-only-the-training-data

In [10]:
X = train_df[['Fare','SibSp','Parch','Sex']].values
sub_test = test_df[['Fare', 'SibSp','Parch','Sex']].values
y = train_df['Survived']

In [11]:
# Preprcessing of train
scl = MinMaxScaler()
X_scaled_minmax = scl.fit_transform(X)

scl = StandardScaler()
X_scaled_std = scl.fit_transform(X)

#preprocessing of test
scl = MinMaxScaler()
X_test_scaled_minmax = scl.fit_transform(sub_test)

scl = StandardScaler()
X_test_scaled_std = scl.fit_transform(sub_test)

In [26]:
#X = X_scaled_std
#sub_test = X_scaled_std

cv = StratifiedKFold(n_splits=5,random_state=10,shuffle=True)

results_df_test = pd.DataFrame()
results_df_train = pd.DataFrame(data=np.zeros((5,2)),columns=['Train_error', 'Test_error'])

fold=0

for train_index, test_index in cv.split(X, y):
    X_train = X.loc[train_index]
    y_train = y.loc[train_index]

    X_test = X.loc[test_index]
    y_test = y.loc[test_index]

    model = KNeighborsClassifier()
    model.fit(X_train,  y_train)

    pred_sub   = model.predict_proba(sub_test)[:,1]
    results_df_test['fold_'+str(fold)] = pred_sub
    
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    
    score = 1 - accuracy_score(y_train,pred_train)
    results_df_train.loc[fold,'Train_error'] = round(score*100,2)
    
    score = 1 - accuracy_score(y_test,pred_test)    
    results_df_train.loc[fold,'Test_error'] = round(score*100,2)
    
    
    
    fold +=1

#Mean strategy
preds = (results_df_test.mean(axis=1) >=0.5).astype(int)

my_final_sub = pd.read_csv('../../dataset/titanic/test.csv')[['PassengerId']]
my_final_sub['Survived'] = preds

my_final_sub.to_csv('submission_knn.csv', index=False)
#Kaggle Score : 0.76076

In [27]:
1- model.score(X,y)

0.21773288439955107

In [28]:
results_df_train.mean()

Train_error    20.120
Test_error     30.974
dtype: float64

sans : 0.1728395061728395
minmax : 0.16049382716049387
std : 0.16273849607182944
