In [None]:
import math, time, random, datetime
%matplotlib inline
#Data Manipulation
import numpy as np
import pandas as pd

#Visualization
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# dropped the old names, so use whichever whitegrid style this installation provides.
_whitegrid_styles = [style for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                     if style in plt.style.available]
if _whitegrid_styles:
    plt.style.use(_whitegrid_styles[0])
else:
    # Neither matplotlib style sheet exists: fall back to seaborn's own whitegrid theme.
    sns.set_style('whitegrid')

#Preprocessing
from sklearn.preprocessing import (OneHotEncoder, 
LabelEncoder, label_binarize)

#Machine learning
import catboost
from sklearn.model_selection import train_test_split
from sklearn import (model_selection, tree, preprocessing,
metrics, linear_model)
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import (LinearRegression, 
LogisticRegression, SGDClassifier)
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv

#rebels and ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
#Import train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv') #e.g of what a submisision should look like

In [None]:
#View the training data
train.head(5)

In [None]:
train.Age.plot.hist()
#train.Fare.plot.hist()

In [None]:
#View the test data
test.head()

In [None]:
#View the example submission dataframe
gender_submission.head()

In [None]:
#see the data statistics
train.describe()

In [None]:
#graphic of missing values
missingno.matrix(train, figsize = (10,3))

In [None]:
train.columns

In [None]:
#to see the number of missing values
train.isnull().sum()

# To perform the data analysis, we are going to create two new dataframes:
    i. one to explore discretised (binned) variables
    ii. one to explore continuous variables

In [None]:
df_bin = pd.DataFrame() #for discretised continuous variables
df_con = pd.DataFrame() # for continuous variables

In [None]:
train.dtypes

In [None]:
train.head(2)

In [None]:
#get the info of the number of people survived
fig = plt.figure(figsize = (5,1))
sns.countplot(y ='Survived', data = train)
print(train.Survived.value_counts())

In [None]:
#adding  this to subset dataframes
df_bin['Survived'] = train['Survived']
df_con['Survived'] = train['Survived']

In [None]:
#Print the row count first so that the preview is the cell's last expression;
#a bare expression that is not last in a cell is evaluated but never displayed.
print(len(df_bin))
df_bin.head()

In [None]:
df_con.head()

In [None]:
#to get the idea of the data distribution according to the passenger  class
sns.distplot(train.Pclass)

In [None]:
#give the number of missing variables in Pclass
train.Pclass.isnull().sum()

In [None]:
#since there  is no missing  values adding Pclass in the sub-dataframes
df_bin['Pclass'] = train['Pclass']
df_con['Pclass'] = train['Pclass']

In [None]:
df_bin.head()

In [None]:
#Gives the total  number of different names in the data
train.Name.value_counts()

In [None]:
#gives the visualization of Sex distribution
plt.figure(figsize = (5,1))
sns.countplot(y = 'Sex', data = train)


In [None]:
#Checking the  missing value in the sex column
train.Sex.isnull().sum()

In [None]:
#Add Sex to the sub-dataframes
#df_bin stores it as a binary flag (1 where the value is 'female', 0 otherwise);
#df_con keeps the original column unchanged
df_bin['Sex'] = train['Sex']
df_bin['Sex'] = np.where(df_bin['Sex'] == 'female', 1,0)
df_con['Sex'] = train['Sex']

In [None]:
df_con.head()

In [None]:
#Visualise the distribution of the binary Sex flag from df_bin (1 = female)
#separately for rows with Survived == 1 and rows with Survived == 0, on the same axes
fig = plt.figure(figsize = (10,10))
sns.distplot(df_bin.loc[df_bin['Survived'] == 1]['Sex'], kde_kws = {'label':'Survived'})
sns.distplot(df_bin.loc[df_bin['Survived'] == 0]['Sex'], kde_kws = {'label':'Did not survived'})

In [None]:
#missing data in Age
train.Age.isnull().sum()

Function to create count and distribution visualizations

In [None]:
def plotCountDist(data, bin_df, label_column, target_column,
                  figsize, use_bin_df = False):
    """
    Plot the counts of a column next to its distribution split by a binary label.

    Left panel: count plot of ``target_column``.
    Right panel: two overlaid distribution plots of ``target_column`` taken from
    ``data``, one for rows where ``label_column == 1`` and one for rows where
    ``label_column == 0``.

    data = dataframe holding both label_column and target_column
    bin_df = binned dataframe, used for the count plot only when use_bin_df is True
    label_column = binary labelled column (values 1 and 0)
    target_column = column we want to view counts and distributions of
    figsize = (width, height) passed to plt.figure
    use_bin_df = whether to take the count plot from bin_df instead of data,
                 default False
    """
    # The two modes differ only in which frame feeds the count plot; the
    # distribution panel always reads the values from `data`.
    count_source = bin_df if use_bin_df else data

    plt.figure(figsize = figsize)

    plt.subplot(1, 2, 1)
    sns.countplot(y = target_column, data = count_source)

    plt.subplot(1, 2, 2)
    for label_value, legend_label in ((1, "survived"), (0, "Did not survived")):
        sns.distplot(data.loc[data[label_column] == label_value, target_column],
                     kde_kws = {"label": legend_label})


In [None]:
train.SibSp.isnull().sum()

In [None]:
train.SibSp.value_counts()

In [None]:
df_bin['SibSp'] = train['SibSp']
df_con['SibSp'] = train['SibSp']

In [None]:
#Visualise the counts of sibling and spouse and the distribution of the values
#against survived
plotCountDist(train, 
              bin_df = df_bin,
              label_column = 'Survived',
              target_column = 'SibSp',
              figsize = (10,5))

In [None]:
train.Parch.isnull().sum()

In [None]:
train.Parch.value_counts()

In [None]:
#add parch to the sub-dataframes
df_bin['Parch'] = train['Parch']
df_con['Parch'] = train['Parch']


In [None]:
#visualise the counts of Parch and the distribution of the values against survived
plotCountDist(train,
             bin_df = df_bin,
             label_column = 'Survived',
             target_column = 'Parch',
             figsize = (10,5))

In [None]:
train.head(2)

In [None]:
df_con.head()

In [None]:
train.Ticket.isnull().sum()

In [None]:
#how many types of tickets were there?
sns.countplot(y = 'Ticket', data = train)


In [None]:
train.Ticket.value_counts()

In [None]:
#Count the distinct values in the Ticket column and report the total
unique_ticket_count = len(train.Ticket.unique())
print("There are {} unique Tickets values.".format(unique_ticket_count))

In [None]:
train.Fare.isnull().sum()

In [None]:
sns.countplot(y='Fare', data = train)

In [None]:
train.Fare.dtype

In [None]:
len(train.Fare.unique())

In [None]:
#Add Fare to the sub-dataframes
#df_con keeps the raw fare values; df_bin gets the fare cut into 5 intervals by pd.cut
df_con['Fare'] = train['Fare']
df_bin['Fare'] = pd.cut(train['Fare'], bins = 5)

In [None]:
df_bin.head()

In [None]:
df_bin.Fare.value_counts()

In [None]:
df_con.head()

In [None]:
plotCountDist(data = train,
             bin_df = df_bin,
             label_column= 'Survived',
             target_column= 'Fare',
             figsize= (20,10),
             use_bin_df = True)

In [None]:
train.Cabin.isnull().sum()

In [None]:
train.Cabin.value_counts()

In [None]:
train.Embarked.isnull().sum()

In [None]:
train.Embarked.value_counts()

In [None]:
sns.countplot(y = 'Embarked', data = train)

In [None]:
#add Embarked to subdataframe
df_bin['Embarked'] = train['Embarked']
df_con['Embarked'] = train['Embarked']

In [None]:
#Drop the rows whose Embarked value is missing from both sub-dataframes
#The length of df_con is printed before and after to show how many rows were removed
#NOTE: this reassigns df_con / df_bin, so earlier previews of them are stale after this cell
print(len(df_con))
df_con = df_con.dropna(subset = ['Embarked'])
df_bin =df_bin.dropna(subset = ['Embarked'])
print(len(df_con))

In [None]:
#to drop a column from the dataframe in pandas
#df_bin.drop(['Name'], axis = 1)


In [None]:
df_bin.head()

In [None]:
df_con.head()

In [None]:
#One-hot encode binned variables
#Every column of df_bin except the target 'Survived' is expanded into dummy columns
one_hot_cols = df_bin.columns.tolist()
one_hot_cols.remove('Survived')
df_bin_enc = pd.get_dummies(df_bin, columns = one_hot_cols)
df_bin_enc.head()

In [None]:
df_con.head()

In [None]:
#Label-encode every column of df_con (fit_transform is applied column by column)
#NOTE: this only previews label encoding - df_con_enc is rebuilt from df_con with
#one-hot encoded columns a few cells below, which replaces this result
df_con_enc = df_con.apply(LabelEncoder().fit_transform)
df_con_enc.head()

In [None]:
#one hot enconde the categorical columns
df_embarked_one_hot = pd.get_dummies(df_con['Embarked'], prefix = 'embarked')
df_sex_one_hot = pd.get_dummies(df_con['Sex'], prefix = 'sex')
df_plcass_one_hot = pd.get_dummies(df_con['Pclass'], prefix = 'pclass')

In [None]:
#combine the one hot encoded columns with df_con_enc
df_con_enc = pd.concat([df_con,
                       df_embarked_one_hot,
                       df_sex_one_hot,
                       df_plcass_one_hot], axis = 1)
#Drop the original categorical columns (because now they've been one hot eoncoded)
df_con_enc = df_con_enc.drop(['Pclass','Sex','Embarked'], axis = 1)

In [None]:
df_con_enc.head()

Machine Learning Models


In [None]:
#Selecting the dataframe we want to use first for predictions
selected_df = df_con_enc

In [None]:
selected_df.head()

In [None]:
#Split the dataframe into data and lables
TrainData = selected_df.drop('Survived',axis = 1)
TrainLabels = selected_df.Survived

In [None]:
TrainData.shape

In [None]:
TrainData.head()

In [None]:
TrainLabels.shape, TrainData.shape

In [None]:
#Fit function that runs the requested algorithm and returns the accuracy metrics
from sklearn.model_selection import cross_val_predict
def fit_ml_algorithm(algorithm, TrainData, TrainLabels, cv):
    """
    Fit an estimator, print its training and cross-validated accuracy, and return both.

    algorithm: learning algorithm (estimator exposing fit / score)
    TrainData: Train Input Data
    TrainLabels: Train Output Data
    cv: cross-validation setting forwarded to cross_val_predict (e.g. the number of
        folds k). The training data is split into k smaller sets:
            - the model is trained on (k-1) of the folds as training data;
            - the resulting model is validated on the remaining part of the data
              (i.e. it is used to compute a performance measure such as accuracy).

    Cross-validation is always run with n_jobs = -1, i.e. on all processors.

    Returns (acc, acc_cv): accuracy on the training data and accuracy of the
    cross-validated predictions, both as percentages rounded to 2 decimals.
    """
    # Imported locally so the function does not depend on the notebook-level names
    # `metrics` / `model_selection`; a later cell rebinds `metrics` to a list, which
    # would otherwise break this function when a model cell is re-run.
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import cross_val_predict

    start_time = time.time()

    # Accuracy on the same data the model was fitted on (optimistic by construction)
    model = algorithm.fit(TrainData, TrainLabels)
    acc = round(model.score(TrainData, TrainLabels) * 100, 2)

    # Cross Validation: out-of-fold predictions for every training row
    train_pred = cross_val_predict(algorithm, TrainData, TrainLabels,
                                   cv = cv,
                                   n_jobs = -1)
    acc_cv = round(accuracy_score(TrainLabels, train_pred) * 100, 2)

    # The reported running time covers both the plain fit and the cross-validation
    log_time = time.time() - start_time

    print("Accuracy: ", acc)
    print("Accuracy CV {}-Fold: ".format(cv), acc_cv)
    print("Running Time: ", datetime.timedelta(seconds = log_time))

    return acc, acc_cv
    
    
    

In [None]:
#Logistic Regression
fit_ml_algorithm(LogisticRegression(),TrainData, TrainLabels,10)

In [None]:
#K-Nearest Neighbours
fit_ml_algorithm(KNeighborsClassifier(),TrainData, TrainLabels,10)

In [None]:
#Gaussian Naive Nayes
fit_ml_algorithm(GaussianNB(),TrainData, TrainLabels,10)

In [None]:
#Linear Support Vector Machines (SVC)
fit_ml_algorithm(LinearSVC(),TrainData, TrainLabels,10)

In [None]:
#Stochastic Gradient Descent
fit_ml_algorithm(SGDClassifier(),TrainData, TrainLabels,10)

In [None]:
#Decision Tree Classifier

fit_ml_algorithm(DecisionTreeClassifier(),TrainData, TrainLabels,10)

In [None]:
#Gradient Boost Trees
fit_ml_algorithm(GradientBoostingClassifier(),TrainData, TrainLabels,10)

In [None]:
#CatBoost ALgorithm
fit_ml_algorithm(CatBoostClassifier(),TrainData, TrainLabels,10)

In [None]:
# Positions of the columns whose dtype is not float; these are handed to CatBoost
# as categorical features. `np.float` was only an alias of the builtin `float` and
# was removed in NumPy 1.24, so the builtin is used directly.
cat_features = np.where(TrainData.dtypes != float)[0]
cat_features

In [None]:
# Bundle features, labels and the categorical column positions into a CatBoost Pool
train_pool = Pool(TrainData, TrainLabels, cat_features)

# Optimise Logloss for 1000 iterations and additionally track Accuracy during training
catboost_model = CatBoostClassifier(iterations=1000,
                                    custom_loss=['Accuracy'],
                                    loss_function='Logloss')

# Fit CatBoost model (plot=True requests CatBoost's training plot)
catboost_model.fit(train_pool,
                   plot=True)

# CatBoost accuracy on the training data itself, as a percentage rounded to 2 decimals
acc_catboost = round(catboost_model.score(TrainData, TrainLabels) * 100, 2)

In [None]:
# Start the timer for the cross-validation run
start_time = time.time()

# Reuse the parameters of the fitted model so the cross-validation matches it
cv_params = catboost_model.get_params()

# Run the cross-validation for 10-folds (same as the other models)
cv_data = cv(train_pool,
             cv_params,
             fold_count=10,
             plot=True)

# Elapsed wall-clock time of the cross-validation, in seconds
catboost_time = (time.time() - start_time)

# cv_data holds the CatBoost CV results; take the maximum of the 'test-Accuracy-mean'
# column as the reported score.
# NOTE(review): this is the best value found in that column, not the final one, so it
# is an optimistic figure compared with the single CV accuracy of the other models.
acc_cv_catboost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)

In [None]:
# Print out the CatBoost model metrics
print("---CatBoost Metrics---")
print("Accuracy: {}".format(acc_catboost))
print("Accuracy cross-validation 10-Fold: {}".format(acc_cv_catboost))
print("Running Time: {}".format(datetime.timedelta(seconds=catboost_time)))

We can see the CatBoost model has the best results. 

In [None]:
def feature_importance(model, data, top_n = 10, plot = True):
    """
    Rank the features of a fitted model by importance and optionally plot them.

    model: fitted model exposing `feature_importances_`
    data: dataframe whose columns are the features, in the order the model saw them
    top_n: number of most important features to keep (default 10, the previous
           hard-coded value); pass None to keep every feature
    plot: draw a horizontal bar chart of the kept features when True (default)

    Returns a dataframe with the columns 'importance' and 'features', sorted by
    ascending importance so the most important feature is the last row.
    """
    fea_imp = pd.DataFrame({'importance': model.feature_importances_,
                            'features': data.columns})
    # Ties in importance are ordered by descending feature name
    fea_imp = fea_imp.sort_values(['importance', 'features'], ascending = [True, False])
    if top_n is not None:
        # tail() keeps the last (largest) rows; unlike iloc[-n:] it handles n == 0
        fea_imp = fea_imp.tail(top_n)
    if plot:
        fea_imp.plot(kind = 'barh', x = 'features', y = 'importance', figsize = (20,10))
    return fea_imp
#plt.savefig('catboost_feature_importance.png')
feature_importance(catboost_model, TrainData)

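Note:
Precision: a metric that measures a model's ability to identify only the relevant instances (of everything predicted positive, how much is truly positive).
Recall: a metric that measures a model's ability to find all the relevant cases in a dataset (of everything truly positive, how much was found).
The F1 score combines precision and recall (it is their harmonic mean).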


In [None]:
metrics = ['Precision', 'Recall', 'F1', 'AUC']
eval_metrics = catboost_model.eval_metrics(train_pool, metrics = metrics, plot = True)

In [None]:
for metric in metrics:
    print(str(metric)+":{}".format(np.mean(eval_metrics[metric])))

Low recall means there is a higher number of false negatives (predicting that a passenger did not survive when they actually survived).

Higher precision means fewer false positives (i.e. predicting that a passenger survived when they actually did not).



Submission into kaggle:

In [None]:
TrainData.head(2)

In [None]:
test.head(1)

In [None]:
#One hot encode the columns of the test data
test_embarked_one_hot = pd.get_dummies(test['Embarked'],
                                       prefix = 'embarked')

test_sex_one_hot = pd.get_dummies(test['Sex'],
                                       prefix = 'sex')

test_pclass_one_hot = pd.get_dummies(test['Pclass'],
                                       prefix = 'pclass')

In [None]:
#Combine the test one hot encoded columns with the test data
test1 = pd.concat([test,
                 test_embarked_one_hot,
                  test_sex_one_hot,
                 test_pclass_one_hot], axis = 1)
test1.head(2)

In [None]:
# Keep the columns from 'SibSp' onwards, selected by label instead of by position.
# With iloc[:, 5:] every re-run of this cell stripped five more columns; the label
# slice gives the same result no matter how often the cell is executed.
# NOTE(review): assumes 'SibSp' is the sixth column of test.csv (so this matches the
# previous iloc[:, 5:]) - confirm against the file.
test1 = test1.loc[:, 'SibSp':]
test1.head(2)

In [None]:
# Remove the raw columns the model was not trained on, in a single drop call
unused_columns = ['Cabin', 'Embarked', 'Ticket']
test1 = test1.drop(columns = unused_columns)
test1.head(2)

In [None]:
TrainData.head(2)

Now the train and test datasets have the same type of data with an equal number of columns.

In [None]:
test1.shape, TrainData.shape

Now the test and train data are in the same format, so we can make predictions on the test data.

In [None]:
#making a prediction using the catboost model on the test dataset
predictions = catboost_model.predict(test1)
predictions

In [None]:
#Create a submission dataframe and append the relevant columns
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived']  = predictions #assigning the model predictions on the test dataset
submission.head()

In [None]:
#The submission data should look like the following
gender_submission.head()

In [None]:
#Converting submission dataframe 'Survived' column into integers
submission['Survived'] = submission['Survived'].astype(int)
submission.head()

In [None]:
#checking the test and submission dataframe has same length or no
if len(submission) == len(test):
    print("Submission dataframe has same length as test dataframe: {} rows".format(len(submission)))
else:
    print("Dataframe mismatched and  not be able to submit to Kaggle")

In [None]:
#converting submission dataframe into csv format
submission.to_csv('../catboost_sibmission.csv', index = False)
print("submissin CSV is ready")

In [None]:
#checking the submission csv to make it is in right format
submission_check = pd.read_csv("../catboost_sibmission.csv")
submission_check.head()