In [1]:
import pandas as pd 
import numpy as np 
from pycaret.classification import ClassificationExperiment

### Loading the training and testing datasets

In [10]:
train_df = pd.read_csv('train.csv') 
test_df = pd.read_csv('test.csv') 
print(f'Train set shape: {train_df.shape}')
print(f'Test set shape: {test_df.shape}')

Train set shape: (891, 12)
Test set shape: (418, 11)


Previewing the training set

In [11]:
train_df.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Getting a list of all the titles for the training set and storing it in a list 

In [13]:
of_title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']         
names = list(train_df['Name'])
title_list = [] 
for i in names: 
    title_list.append(i.split()[1][:-1])
print(title_list)


['Mr', 'Mrs', 'Miss', 'Mrs', 'Mr', 'Mr', 'Mr', 'Master', 'Mrs', 'Mrs', 'Miss', 'Miss', 'Mr', 'Mr', 'Miss', 'Mrs', 'Master', 'Mr', 'Planke', 'Mrs', 'Mr', 'Mr', 'Miss', 'Mr', 'Miss', 'Mrs', 'Mr', 'Mr', 'Miss', 'Mr', 'Don', 'Mrs', 'Miss', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Planke', 'Miss', 'Mrs', 'Mrs', 'Mr', 'Miss', 'Miss', 'Mr', 'Mr', 'Miss', 'Mr', 'Mrs', 'Master', 'Mr', 'Mrs', 'Mrs', 'Mr', 'Mr', 'Miss', 'Mr', 'Miss', 'Master', 'Mr', 'Miss', 'Mr', 'Master', 'Mr', 'Master', 'Mrs', 'Mr', 'Miss', 'Mr', 'Mr', 'Miss', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Master', 'Miss', 'Mr', 'Mr', 'Miss', 'Mr', 'Miss', 'Mrs', 'Mr', 'Mr', 'Miss', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Mrs', 'Mr', 'Miss', 'Mr', 'Mr', 'Mr', 'Mr', 'Mr', 'Miss', 'Mr', 'Mr', 'Miss', 'Mr', 'Miss', 'Mr', 'Miss', 'Miss', 'Mr', 'Mr', 'Mr', 'Mr', 'Miss', 'Mr', 'Mr', 'Mr', 'Miss', 'Mr', 'Master', 'Mr', 'Mr', 'Miss', 'Mr', 'Mr', 'Mr', 'Mrs', 'Mrs', 'Mr', 'Mr', 'Miss', 'Mr', 'Mr', 'Mr', 'Mrs', 'Miss', 'Mrs', 'Mr', 'Mr', 'Mr'

Replacing the individual names with the titles 

In [14]:
train_df['Name'] = title_list
train_df.rename(columns = {'Name':'Title'}, inplace = True,errors='ignore')
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,,S


Checking the number of unique titles in the dataframe 

In [15]:
print(f"Number of unique titles: {train_df['Title'].nunique()}")

Number of unique titles: 31


In [16]:
train_df['Title'].unique() 

array(['Mr', 'Mrs', 'Miss', 'Master', 'Planke', 'Don', 'Rev', 'Billiard',
       'de', 'Walle', 'Dr', 'Pelsmaeker', 'Mulder', '', 'Steen', 'Carlo',
       'Mme', 'Impe', 'Ms', 'Major', 'Gordon', 'Messemaeker', 'Mlle',
       'Col', 'Capt', 'Velde', 'th', 'Shawah', 'Jonkheer', 'Melkebeke',
       'Cruyssen'], dtype=object)

Filtering the training dataframe to only contain titles from the list

In [24]:
filtered_train_df = train_df[train_df['Title'].isin(of_title_list)]
print(f'Filtered train set shape: {filtered_train_df.shape}')
filtered_train_df['Title'].unique()

Filtered train set shape: (866, 12)


array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Mlle', 'Col', 'Capt', 'Jonkheer'], dtype=object)

In [30]:
filtered_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,,S


Reducing the number of titles down to four: Mr, Mrs, Miss, Master 

In [33]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    if title in ['Countess', 'Mme']:
        return 'Mrs'
    if title in ['Mlle', 'Ms']:
        return 'Miss'
    if title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
#Applying the function to the dataframe 
filtered_train_df['Title'] = filtered_train_df.apply(replace_titles,axis=1)
filtered_train_df['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master'], dtype=object)

In [34]:
filtered_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,,S


In [35]:
print(f"Number of unique titles: {filtered_train_df['Title'].nunique()}")

Number of unique titles: 4


For the cabins we have numbers, let's replace the individual unique cabin numbers to categories

In [42]:
filtered_train_df['Cabin'] = filtered_train_df['Cabin'].fillna('U')

In [45]:
cabin_list = list(filtered_train_df['Cabin'])
new_list = [] 
for i in cabin_list: 
    new_list.append(list(i)[0])
filtered_train_df['Cabin'] = new_list

In [46]:
filtered_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,U,S
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,U,S
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C,S
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,U,S


Getting the family size for every passenger in the dataframe

In [47]:
filtered_train_df['Family_Size']=filtered_train_df['SibSp']+filtered_train_df['Parch']
filtered_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,U,S,1
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C,C,1
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,U,S,0
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C,S,1
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,U,S,0


In [48]:
filtered_train_df['Age*Class']=filtered_train_df['Age']*filtered_train_df['Pclass']
filtered_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size,Age*Class
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,U,S,1,66.0
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C,C,1,38.0
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,U,S,0,78.0
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C,S,1,35.0
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,U,S,0,105.0


In [49]:
filtered_train_df['Fare_Per_Person']=filtered_train_df['Fare']/(filtered_train_df['Family_Size']+1)
filtered_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size,Age*Class,Fare_Per_Person
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,U,S,1,66.0,3.625
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C,C,1,38.0,35.64165
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,U,S,0,78.0,7.925
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C,S,1,35.0,26.55
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,U,S,0,105.0,8.05


Removing the unneeded columns, PassengerID and Ticket 

In [50]:
filtered_train_df.drop(columns=['PassengerId','Ticket'], inplace=True,errors='ignore')
filtered_train_df.head(5)

Unnamed: 0,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family_Size,Age*Class,Fare_Per_Person
0,0,3,Mr,male,22.0,1,0,7.25,U,S,1,66.0,3.625
1,1,1,Mrs,female,38.0,1,0,71.2833,C,C,1,38.0,35.64165
2,1,3,Miss,female,26.0,0,0,7.925,U,S,0,78.0,7.925
3,1,1,Mrs,female,35.0,1,0,53.1,C,S,1,35.0,26.55
4,0,3,Mr,male,35.0,0,0,8.05,U,S,0,105.0,8.05


Machine Learning!

In [51]:
s = ClassificationExperiment() 
s.setup(filtered_train_df,target='Survived',session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(866, 13)"
4,Transformed data shape,"(866, 26)"
5,Transformed train set shape,"(606, 26)"
6,Transformed test set shape,"(260, 26)"
7,Ordinal features,1
8,Numeric features,8
9,Categorical features,4


<pycaret.classification.oop.ClassificationExperiment at 0x2926a68c0>

In [52]:
best = s.compare_models() 

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8236,0.8656,0.7614,0.7813,0.7695,0.6268,0.6286,0.175
lda,Linear Discriminant Analysis,0.8203,0.8611,0.7658,0.7745,0.7679,0.6215,0.6238,0.02
ridge,Ridge Classifier,0.8187,0.0,0.7572,0.7752,0.7644,0.6171,0.6191,0.019
ada,Ada Boost Classifier,0.8136,0.8504,0.7697,0.7602,0.7615,0.6089,0.6127,0.028
gbc,Gradient Boosting Classifier,0.812,0.8644,0.7263,0.7798,0.7497,0.5995,0.6028,0.031
lightgbm,Light Gradient Boosting Machine,0.7937,0.8447,0.7047,0.7549,0.7249,0.5605,0.5649,0.303
xgboost,Extreme Gradient Boosting,0.7922,0.8468,0.7139,0.7454,0.7273,0.5598,0.5621,0.03
nb,Naive Bayes,0.7906,0.8248,0.8125,0.6972,0.7491,0.5713,0.5782,0.021
rf,Random Forest Classifier,0.7839,0.8568,0.7051,0.7326,0.7151,0.5417,0.5451,0.049
et,Extra Trees Classifier,0.7823,0.8323,0.7005,0.734,0.7136,0.5384,0.5419,0.046


In [None]:
s.evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [55]:
pred_holdout = s.predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8308,0.8805,0.8,0.7692,0.7843,0.6452,0.6455


In [56]:
s.save_model(best,'best_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/var/folders/_f/1mj9zrs94csb737st13h9w3c0000gn/T/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Pclass', 'Age', 'SibSp', 'Parch',
                                              'Fare', 'Family_Size', 'Age*Class',
                                              'Fare_Per_Person'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing...
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
                