In [None]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Show floats with two decimal places in DataFrame output.
# The fully-qualified key is used because option names are matched as patterns:
# a bare 'precision' can match more than one option in newer pandas releases
# and then raises OptionError.
pd.set_option('display.precision', 2)

In [None]:
# Load the Titanic training and test splits from the CSV files next to the notebook.
train_data, test_data = (pd.read_csv(csv_path)
                         for csv_path in ('./titanic_train.csv', './titanic_test.csv'))

In [None]:
# List the column names of the training frame.
train_data.columns.values

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
train_data.describe()

In [None]:
# Summary of the object-dtype (string) columns only.
train_data.describe(include = 'object')

In [None]:
# Mean of the known training ages; used below as the fill value for missing `Age`.
mean_age = train_data['Age'].mean()
def process_age(data):
    """Return a one-column `Age` frame with missing ages replaced by `mean_age`.

    `mean_age` is read from module scope, so every frame passed in is
    imputed with the same value. The index of `data` is preserved.
    """
    filled_ages = data['Age'].fillna(mean_age)
    return filled_ages.to_frame(name = 'Age')
# Impute the training ages and check that no values are missing any more.
new_age = process_age(train_data)
new_age.describe()

In [None]:
# Most frequent `Embarked` value in the training data; used to fill the missing entries.
mode_embarked = train_data['Embarked'].mode().iloc[0]
new_embarked = train_data['Embarked'].fillna(mode_embarked).to_frame(name = 'Embarked')
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Fit the `Embarked` encoders on the mode-imputed training column.
# LabelEncoder works on 1-D input, so it is given the column itself rather
# than the single-column frame.
embarked_label_encoder = LabelEncoder()
embarked_integer_encoded = embarked_label_encoder.fit_transform(new_embarked['Embarked'])
# OneHotEncoder needs a 2-D (n_samples, 1) column vector.
embarked_integer_encoded = embarked_integer_encoded.reshape(-1, 1)
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases — confirm against the version this notebook targets.
embarked_one_hot_encoder = OneHotEncoder(sparse = False)
embarked_one_hot_encoder.fit(embarked_integer_encoded)
def process_embarked(data):
    """One-hot encode `Embarked`, filling missing values with `mode_embarked`.

    Uses the already-fitted `embarked_label_encoder` and
    `embarked_one_hot_encoder`. Returns a frame with one column per encoder
    class, named `Embarked__0`, `Embarked__1`, ... and a fresh 0..n-1 index.
    """
    filled = data['Embarked'].fillna(mode_embarked)
    codes = embarked_label_encoder.transform(filled).reshape(-1, 1)
    n_classes = len(embarked_label_encoder.classes_)
    column_names = ['Embarked__%d' % k for k in range(n_classes)]
    one_hot = embarked_one_hot_encoder.transform(codes)
    return pd.DataFrame(one_hot, columns = column_names)
# Preview the one-hot encoded `Embarked` columns for the training data.
# NOTE(review): this rebinds `new_embarked`, which held the imputed single-column frame above.
new_embarked = process_embarked(train_data)
new_embarked.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Fit the `Sex` encoders on the training column: labels -> integer codes -> one-hot.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases — confirm against the version this notebook targets.
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse = False)
integer_encoded = label_encoder.fit_transform(train_data['Sex']).reshape(-1, 1)
onehot_encoded = one_hot_encoder.fit_transform(integer_encoded)
def process_sex(data):
    """One-hot encode `Sex` with the already-fitted `label_encoder` / `one_hot_encoder`.

    Returns a frame with one column per encoder class, named `Sex__0`,
    `Sex__1`, ... and a fresh 0..n-1 index.
    """
    sex_codes = label_encoder.transform(data['Sex']).reshape(-1, 1)
    column_names = ['Sex__%d' % k for k in range(len(label_encoder.classes_))]
    return pd.DataFrame(one_hot_encoder.transform(sex_codes), columns = column_names)
# Preview the one-hot encoded `Sex` columns for the training data.
process_sex(train_data).head()

In [None]:
def process(data):
    """Assemble the baseline feature matrix for `data`.

    Columns, in order: mean-imputed `Age`, one-hot `Embarked`, one-hot `Sex`,
    then the raw `Pclass`, `SibSp`, `Parch` and `Fare` columns.

    The pieces are concatenated column-wise; the encoded pieces carry a fresh
    0..n-1 index, so `data` is expected to have the default integer index.
    """
    feature_parts = [
        process_age(data),
        process_embarked(data),
        process_sex(data),
        data[['Pclass', 'SibSp', 'Parch', 'Fare']],
    ]
    # No empty seed frame is needed: concat accepts the list of pieces directly.
    return pd.concat(feature_parts, axis = 1)
# Build the baseline training features and pull out the target column.
train_processed = process(train_data)
train_labels = train_data['Survived']
train_processed.describe()

In [None]:
# Column names of the baseline feature matrix.
train_processed.columns.values

In [None]:
# Apply the same baseline preprocessing to the test split.
test_processed = process(test_data)
test_processed.describe()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Baseline: mean 5-fold cross-validation score of a default LogisticRegression.
cross_val_score(LogisticRegression(), train_processed, train_labels, cv = 5).mean()

To improve the performance of the model we need to scale the features, especially `Age` and `Fare`.
We have 3 options.
* ### standard scaling: assumes the data to be normally distributed
* ### min-max scaling: sensitive to outliers
* ### robust scaling: uses the interquartile range, less sensitive to outliers

Hence we need to find if the data is normally distributed or if there are outliers in the data

In [None]:
import seaborn as sb
import matplotlib.pyplot as plt
# Plot the distribution of the (mean-imputed) training ages.
# NOTE(review): `distplot` is deprecated in newer seaborn releases — confirm the target version.
sb.distplot(train_processed[['Age']])
plt.show()

`Age` does not seem to be normally distributed. It is worth noting that `Age` had missing values, which we imputed with the `mean`. The number of missing values was 891 - 714 = 177, which is a large proportion of the data, so blindly imputing with the mean might not be the best strategy. We will try to fix this later. Let us first check whether it has outliers.

In [None]:
# Box plot of the training ages to look for outliers.
sb.boxplot(train_processed[['Age']])
plt.show()

It seems there are a lot of outliers. Hence we should use the robust scaler.

In [None]:
from sklearn.preprocessing import RobustScaler
# Robust scaler for `Age`, fitted on the mean-imputed training ages.
age_scaler = RobustScaler().fit(train_processed[['Age']])
def process_age_2(data):
    """Scale the `Age` column of `data` with the already-fitted `age_scaler`.

    Returns a one-column `Age` frame with a fresh 0..n-1 index.
    """
    scaled_age = age_scaler.transform(data[['Age']])
    return pd.DataFrame(scaled_age, columns = ['Age'])
process_age_2(train_processed).describe()

    

This preprocessing makes some age values negative and centres the data so that the median age is 0 (the robust scaler subtracts the median, not the mean). This does not match the real-world meaning of age very well, but we will see whether the model is affected by it.

Let us turn to `Fare`

In [None]:
# Plot the distribution of the training fares.
# NOTE(review): `distplot` is deprecated in newer seaborn releases — confirm the target version.
sb.distplot(train_processed[['Fare']])

`Fare` seems to have a skewed distribution. Let's also look for the presence of outliers.

In [None]:
# Box plot of the training fares to look for outliers.
sb.boxplot(train_processed[['Fare']])

Clearly there are outliers in this data. Let us use robust scaling again and compare model performance.

In [None]:
# Robust scaler for `Fare`, fitted on the training fares.
fare_scaler = RobustScaler().fit(train_processed[['Fare']])

def process_fare_2(data):
    """Scale the `Fare` column of `data` with the already-fitted `fare_scaler`.

    Returns a one-column `Fare` frame with a fresh 0..n-1 index.
    """
    scaled_fare = fare_scaler.transform(data[['Fare']])
    return pd.DataFrame(scaled_fare, columns = ['Fare'])

process_fare_2(train_processed).describe()

In [None]:
# `mean_age` and `process_age` are defined once, in the age-imputation cell
# earlier in the notebook; they are reused here rather than redefined.

def process_2(data):
    """Assemble the feature matrix with robust-scaled `Age` and `Fare`.

    Columns, in order: mean-imputed then scaled `Age`, scaled `Fare`,
    one-hot `Embarked`, one-hot `Sex`, then the raw `Pclass`, `SibSp` and
    `Parch` columns.

    NOTE(review): `Fare` is only scaled here, not imputed, so a missing fare
    in `data` stays missing in the result.
    """
    imputed_age = process_age(data)
    feature_parts = [
        process_age_2(imputed_age),
        process_fare_2(data),
        process_embarked(data),
        process_sex(data),
        data[['Pclass', 'SibSp', 'Parch']],
    ]
    # No empty seed frame is needed: concat accepts the list of pieces directly.
    return pd.concat(feature_parts, axis = 1)

# Build the scaled feature matrices and re-run the 5-fold cross-validation on them.
train_processed_2 = process_2(train_data)
test_processed_2 = process_2(test_data)
print(train_processed_2.describe())
cross_val_score(LogisticRegression(), train_processed_2, train_labels, cv = 5).mean()

This processing has reduced the accuracy of our model. But remember that our imputation of the age may not have been right in the first place, and we have not removed the outliers.

We need to diagnose if model is underfitting or overfitting.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the baseline features as a validation set (fixed seed for reproducibility),
# fit a default LogisticRegression on the rest and compare the two scores.
trainX, valX, trainY, valY = train_test_split(train_processed, train_labels, test_size = 0.2, random_state = 73, shuffle = True)
model = LogisticRegression()
model.fit(trainX, trainY)
print('training score:', model.score(trainX, trainY))
print('validation score:', model.score(valX, valY))

from sklearn.metrics import classification_report
# Per-class precision / recall of the baseline model on the validation set.
print(classification_report(valY, model.predict(valX)))

The precision and recall for passengers who survived are low: the model is biased towards predicting that a passenger did not survive. The training score is similar to the validation score, which is a sign of underfitting.
Let us add more features.

In [None]:
# Raw columns available for building additional features.
train_data.columns.values

In [None]:
# Distinct titles in the training names (the text between the comma and the first period).
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
train_data['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip()).unique()
# Maps each raw title found in a passenger name to one of six title groups:
# Officer, Royalty, Mr, Mrs, Miss, Master.
title_dictionary = {
    # military / professional titles
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    # nobility
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    # French / alternative forms folded into the common titles
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    # common titles map to themselves
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

def get_title(name):
    """Return the title group for a name formatted as 'Surname, Title. Given names'.

    The raw title is the text between the first comma and the following
    period, stripped of surrounding whitespace.

    Raises:
        IndexError: if `name` contains no comma.
        KeyError: if the raw title is not a key of `title_dictionary`.
    """
    after_surname = name.split(',')[1]
    raw_title = after_surname.split('.')[0].strip()
    return title_dictionary[raw_title]

# Fit the title encoders on the training names: title group -> integer code -> one-hot.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases — confirm against the version this notebook targets.
title_label_encoder = LabelEncoder()
title_one_hot_encoder = OneHotEncoder(sparse = False)
title_integer_encoded = title_label_encoder.fit_transform(train_data['Name'].map(get_title)).reshape(-1, 1)
title_onehot_encoded = title_one_hot_encoder.fit_transform(title_integer_encoded)

def process_name(data):
    """One-hot encode the title group derived from each passenger's `Name`.

    Uses `get_title` plus the already-fitted `title_label_encoder` and
    `title_one_hot_encoder`. Returns a frame with one column per encoder
    class, named `Title__0`, `Title__1`, ... and a fresh 0..n-1 index.
    """
    title_codes = title_label_encoder.transform(data['Name'].map(get_title))
    title_matrix = title_one_hot_encoder.transform(title_codes.reshape(-1, 1))
    n_titles = len(title_label_encoder.classes_)
    return pd.DataFrame(title_matrix,
                        columns = ['Title__%d' % k for k in range(n_titles)])

# Preview the one-hot encoded title columns for the training data.
process_name(train_data).head()

In [None]:
# Map every training passenger to a title group once (instead of once per
# group), then average the known ages inside each group.
train_titles = train_data['Name'].map(get_title)

def _mean_age_for_title(title):
    """Mean training `Age` of the passengers whose title group is `title`."""
    return train_data.loc[train_titles == title, 'Age'].mean()

# Fill value for a missing age, keyed by title group.
title_to_mean_age = {
    title: _mean_age_for_title(title)
    for title in ('Mr', 'Mrs', 'Miss', 'Royalty', 'Officer', 'Master')
}

# Individual names kept for any code that still refers to them.
mean_age_mr = title_to_mean_age['Mr']
mean_age_mrs = title_to_mean_age['Mrs']
mean_age_miss = title_to_mean_age['Miss']
mean_age_royalty = title_to_mean_age['Royalty']
mean_age_officer = title_to_mean_age['Officer']
mean_age_master = title_to_mean_age['Master']
import math
def process_age_3(data):
    """Return a one-column `Age` frame, filling missing ages by title group.

    A missing age is replaced by `title_to_mean_age[get_title(name)]` for
    that row's `Name`; known ages are kept as they are. The result has a
    fresh 0..n-1 index.

    Raises:
        KeyError: if a row with a missing age has a title that `get_title`
            or `title_to_mean_age` does not know.
    """
    ages = data['Age'].reset_index(drop = True)
    missing = ages.isna()
    if missing.any():
        # Only rows with a missing age need their title looked up.
        names = data['Name'].reset_index(drop = True)
        fill_values = names[missing].map(get_title).map(lambda title: title_to_mean_age[title])
        # fillna aligns on the index, so each missing row gets its own value.
        ages = ages.fillna(fill_values)
    return ages.to_frame(name = 'Age')
# Preview the title-based age imputation on the training data.
process_age_3(train_data).head()

In [None]:
def process_3(data):
    """Assemble the feature matrix that adds the title features.

    Columns, in order: `Age` imputed per title group, one-hot `Embarked`,
    one-hot `Sex`, one-hot title group, then the raw `Fare`, `Pclass`,
    `SibSp` and `Parch` columns.

    The pieces are concatenated column-wise; the derived pieces carry a fresh
    0..n-1 index, so `data` is expected to have the default integer index.
    """
    feature_parts = [
        process_age_3(data),
        process_embarked(data),
        process_sex(data),
        process_name(data),
        data[['Fare', 'Pclass', 'SibSp', 'Parch']],
    ]
    # No empty seed frame is needed: concat accepts the list of pieces directly.
    return pd.concat(feature_parts, axis = 1)
# Build the title-augmented features for both splits and cross-validate on the training set.
train_processed_3 = process_3(train_data)
test_processed_3 = process_3(test_data)
cross_val_score(LogisticRegression(), train_processed_3, train_labels, cv = 5).mean()    


In [None]:
# Same 80/20 split (same seed) on the title-augmented features; fit and score model_3.
# NOTE: this rebinds trainX / valX / trainY / valY from the baseline split above.
trainX, valX, trainY, valY = train_test_split(train_processed_3, train_labels, test_size = 0.2, random_state = 73, shuffle = True)
model_3 = LogisticRegression()
model_3.fit(trainX, trainY)
print('training score:', model_3.score(trainX, trainY))
print('validation score:', model_3.score(valX, valY))

from sklearn.metrics import classification_report
# Per-class report on the validation set, followed by the learned feature weights.
print(classification_report(valY, model_3.predict(valX)))
print(model_3.coef_)

In [None]:
# Column order of the features, to read the coefficients printed above against.
train_processed_3.columns.values

The `Fare` column has very little weight. We can check how the model performs when that feature is dropped.

In [None]:
# Drop `Fare` from the title-augmented features and re-evaluate.
train_processed_4 = train_processed_3.drop(['Fare'], axis = 1)
print(cross_val_score(LogisticRegression(), train_processed_4, train_labels, cv = 5).mean() )
model_4 = LogisticRegression()
model_4.fit(train_processed_4, train_labels)
# NOTE(review): model_4 is fitted on all training rows, which include the
# validation rows scored here, so this report is optimistic.
print(classification_report(valY, model_4.predict(valX.drop(['Fare'], axis = 1))))
# Print the weights of the model trained in this cell (this line used to
# print `model.coef_`, i.e. the baseline model's weights).
print(model_4.coef_)

We have improved k fold cross validation slightly. Let us try by dropping age feature as well.


In [None]:
# Drop both `Fare` and `Age` from the title-augmented features and re-evaluate.
train_processed_5 = train_processed_3.drop(['Fare', 'Age'], axis = 1)
print(cross_val_score(LogisticRegression(), train_processed_5, train_labels, cv = 5).mean() )
model_5 = LogisticRegression()
model_5.fit(train_processed_5, train_labels)
# NOTE(review): model_5 is fitted on all training rows, which include the
# validation rows scored here, so this report is optimistic.
print(classification_report(valY, model_5.predict(valX.drop(['Fare', 'Age'], axis = 1))))
print(model_5.coef_)

In [None]:
# Score the test split with model_4 (all features except `Fare`) and write the submission file.
test_processed_4 = test_processed_3.drop(columns = ['Fare'])
predictions_4 = pd.DataFrame({'Survived': model_4.predict(test_processed_4)})
results_4 = pd.concat([test_data[['PassengerId']], predictions_4], axis = 1)
results_4.to_csv('output_4.csv', index = False)

In [None]:
# Score the test split with model_5 (no `Age`, no `Fare`) and write the submission file.
test_processed_5 = test_processed_3.drop(columns = ['Age', 'Fare'])
predictions_5 = pd.DataFrame({'Survived': model_5.predict(test_processed_5)})
results_5 = pd.concat([test_data[['PassengerId']], predictions_5], axis = 1)
results_5.to_csv('output_5.csv', index = False)

In [None]:
# Raw columns again, to pick the next feature to engineer.
train_data.columns.values

In [None]:
# First letter of each cabin code, with 'U' standing in for a missing cabin.
cabins = train_data['Cabin'].fillna('U').str[0]
cabins.unique()

In [None]:
# Validation rows whose true label differs from model_5's prediction.
error_examples = valX[valY != model_5.predict(valX.drop(['Fare', 'Age'], axis = 1))]
# `error_examples.index` holds row labels, so the raw rows are looked up by
# label (`.loc`) rather than by position (`.iloc`).
train_data.loc[error_examples.index]

The examples above are the misclassified examples from our validation set.
Let us look into the data to find correlations between the features and survival.
In these examples, passengers of class 1 always survived, but we predicted otherwise.
Another interesting point is that the passengers who died in these examples are all female.
Most of these dead passengers paid a low ticket price.
While designing the model we dropped the `Fare` feature because its weight in our logistic regression model was very low, and we concluded that the model was trying too hard to incorporate this information.
It might be a better idea to categorize / discretize the fares.

In [None]:
# Quantile edges used to split the training fares into six classes.
quantiles = [0, 0.15, 0.3, 0.45, 0.6, 0.75, 1.0]
# Only the bin edges are needed; the binned training column itself is discarded.
fare_bins = pd.qcut(train_data['Fare'], q = quantiles, retbins = True)[1]
# Fill value for a missing fare: always the training mean, so every frame is
# imputed with the same value.
fare_fill_value = train_data['Fare'].mean()
# One integer label per bin: 1 .. number of bins.
fare_labels = list(range(1, len(fare_bins)))

def process_fare(data):
    """Discretise `Fare` into the classes defined by `fare_bins`.

    Missing fares are filled with the training mean (`fare_fill_value`).
    Fares outside the range seen in training are clipped to the outermost
    bin edges so that they land in the first / last class instead of
    becoming NaN.

    Returns a one-column `Fare_class` frame (categorical labels 1..6 for the
    six bins above) that keeps the index of `data`.
    """
    fares = data['Fare'].fillna(fare_fill_value)
    fares = fares.clip(lower = fare_bins[0], upper = fare_bins[-1])
    fare_class = pd.cut(fares, bins = fare_bins, labels = fare_labels, include_lowest = True)
    return fare_class.to_frame(name = 'Fare_class')

fare_classes = process_fare(train_data)
fare_classes.describe()

In [None]:
def process_4(data):
    """Assemble the feature matrix that uses the discretised fare and no age.

    Columns, in order: one-hot `Embarked`, one-hot `Sex`, one-hot title
    group, `Fare_class`, then the raw `Pclass`, `SibSp` and `Parch` columns.

    `Age` is deliberately left out, so it is not computed here.
    """
    feature_parts = [
        process_embarked(data),
        process_sex(data),
        process_name(data),
        process_fare(data),
        data[['Pclass', 'SibSp', 'Parch']],
    ]
    # No empty seed frame is needed: concat accepts the list of pieces directly.
    return pd.concat(feature_parts, axis = 1)
# NOTE(review): `train_processed_4` / `test_processed_4` are rebound here; earlier cells
# used the same names for the "title features without Fare" frames fed to model_4.
train_processed_4 = process_4(train_data)
test_processed_4 = process_4(test_data)
model_6 = LogisticRegression()
model_6.fit(train_processed_4, train_labels)
print(cross_val_score(model_6, train_processed_4, train_labels, cv = 5).mean())
print(model_6.coef_)
# Same 80/20 split (same seed) on the new features; this rebinds trainX / valX / trainY / valY.
trainX, valX, trainY, valY = train_test_split(train_processed_4, train_labels, test_size = 0.2, random_state = 73, shuffle = True)
# NOTE(review): model_6 was fitted on all training rows, which include these
# validation rows, so this report is optimistic.
print(classification_report(valY, model_6.predict(valX)))

In [None]:
# Score the test split with model_6 and write the submission file.
test_processed_6 = process_4(test_data)
# Compare the test-set summary with the training fare range.
print(test_data.describe())
print(train_data['Fare'].describe())
predictions_6 = pd.DataFrame(model_6.predict(test_processed_6), columns = ['Survived'])
results_6 = pd.concat([test_data[['PassengerId']], predictions_6], axis = 1)
results_6.to_csv('output_6.csv', index = False)