In [None]:
import pandas as pd

In [None]:
# Display floats with 2 decimal places. The fully qualified option key is used
# because the bare 'precision' pattern is rejected by newer pandas releases
# (it matches more than one registered option), while 'display.precision'
# is accepted by old and new versions alike.
pd.set_option('display.precision', 2)

In [None]:
# Locations of the raw Titanic CSV files, relative to the notebook.
TRAIN_CSV = './titanic_train.csv'
TEST_CSV = './titanic_test.csv'

# Load both splits into DataFrames.
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Show the column labels of the training set.
train_data.columns.values

In [None]:
# Summary statistics of the training set using describe()'s default column selection.
train_data.describe()

In [None]:
# Summary of the object-dtype columns only.
train_data.describe(include = 'object')

In [None]:
# Mean of the 'Age' column in the training set; used to fill missing ages.
mean_age = train_data['Age'].mean()


def process_age(data):
    """Return a one-column 'Age' DataFrame with missing values replaced by ``mean_age``.

    ``data`` must contain an 'Age' column. The module-level ``mean_age``
    (computed from the training set) is used for every input.
    """
    imputed_age = data['Age'].fillna(mean_age)
    return pd.DataFrame(imputed_age, columns=['Age'])


new_age = process_age(train_data)
new_age.describe()

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Most frequent 'Embarked' value in the training set; used to fill missing values.
mode_embarked = train_data['Embarked'].mode()[0]

# Fit the label encoder on the imputed 1-D Series. LabelEncoder expects 1-D
# input, so the Series is passed directly instead of a one-column DataFrame.
embarked_label_encoder = LabelEncoder()
embarked_integer_encoded = embarked_label_encoder.fit_transform(
    train_data['Embarked'].fillna(mode_embarked)
)
embarked_integer_encoded = embarked_integer_encoded.reshape(-1, 1)

# The encoder is left at its default (sparse) output and densified with
# .toarray() in process_embarked. This avoids the `sparse=` keyword, which was
# renamed in newer scikit-learn releases, so the cell runs on old and new versions.
embarked_one_hot_encoder = OneHotEncoder()
embarked_one_hot_encoder.fit(embarked_integer_encoded)


def process_embarked(data):
    """One-hot encode the 'Embarked' column of ``data``.

    Missing values are filled with ``mode_embarked`` and the encoders fitted
    on the training set are reused, so every input yields the same columns
    ('Embarked__0', 'Embarked__1', ...), one per class seen during fitting.

    Returns a DataFrame that keeps ``data``'s index so it lines up with the
    other feature frames when concatenated column-wise.
    """
    filled = data['Embarked'].fillna(mode_embarked)
    integer_encoded = embarked_label_encoder.transform(filled).reshape(-1, 1)
    labels = ['Embarked__' + str(i) for i in range(len(embarked_label_encoder.classes_))]
    one_hot = embarked_one_hot_encoder.transform(integer_encoded).toarray()
    return pd.DataFrame(one_hot, columns=labels, index=data.index)


new_embarked = process_embarked(train_data)
# Preview only the first rows rather than displaying the whole frame.
new_embarked.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Fit the label and one-hot encoders for 'Sex' on the training set.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(train_data['Sex'])
integer_encoded = integer_encoded.reshape(-1, 1)

# Default (sparse) encoder output is densified with .toarray(). This avoids the
# `sparse=` keyword, which was renamed in newer scikit-learn releases.
one_hot_encoder = OneHotEncoder()
onehot_encoded = one_hot_encoder.fit_transform(integer_encoded).toarray()


def process_sex(data):
    """One-hot encode the 'Sex' column of ``data``.

    Uses the encoders fitted on the training set, producing one column per
    class seen during fitting ('Sex__0', 'Sex__1', ...).

    Returns a DataFrame that keeps ``data``'s index so it lines up with the
    other feature frames when concatenated column-wise.
    """
    int_encoded = label_encoder.transform(data['Sex']).reshape(-1, 1)
    col_names = ['Sex__' + str(i) for i in range(len(label_encoder.classes_))]
    dense = one_hot_encoder.transform(int_encoded).toarray()
    return pd.DataFrame(dense, columns=col_names, index=data.index)


# Preview only the first rows rather than displaying the whole frame.
process_sex(train_data).head()

In [None]:
def process(data):
    """Assemble the baseline (unscaled) feature frame for ``data``.

    Columns, in order: imputed 'Age', one-hot 'Embarked', one-hot 'Sex', then
    'Pclass', 'SibSp', 'Parch' and 'Fare' copied through unchanged.
    """
    # Concatenate the feature frames directly; seeding the list with an empty
    # DataFrame adds nothing to the result.
    return pd.concat(
        [
            process_age(data),
            process_embarked(data),
            process_sex(data),
            data[['Pclass', 'SibSp', 'Parch', 'Fare']],
        ],
        axis=1,
    )


train_processed = process(train_data)
train_labels = train_data['Survived']
train_processed.describe()

In [None]:
# Show the column labels of the processed training features.
train_processed.columns.values

In [None]:
# Apply the same preprocessing to the test set and summarise the result.
test_processed = process(test_data)
test_processed.describe()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Baseline: mean 5-fold cross-validation score of a default LogisticRegression
# on the unscaled features.
cross_val_score(LogisticRegression(), train_processed, train_labels, cv = 5).mean()

To improve the performance of the model we need to scale the features, especially `Age` and `Fare`.
We have 3 options.
* ### standard scaling: assumes the data to be normally distributed
* ### min-max scaling: sensitive to outliers
* ### robust scaling: uses the interquartile range, less sensitive to outliers

Hence we need to find out whether the data is normally distributed and whether it contains outliers.

In [None]:
import seaborn as sb
import matplotlib.pyplot as plt
# Plot the distribution of the imputed 'Age' column.
# NOTE(review): `distplot` is deprecated in recent seaborn releases in favour of
# `histplot` / `displot` — confirm the installed version before upgrading.
sb.distplot(train_processed[['Age']])
plt.show()

`Age` does not seem to be normally distributed. It is worth noting that `Age` had missing values, which we imputed with the `mean`. The number of missing values was 891 - 714 = 177, which is a large proportion of the data, so blindly imputing with the mean might not be the best strategy. We will try to fix this later. Let us check whether it has outliers.

In [None]:
# Box plot of the imputed 'Age' column, used to look for outliers.
sb.boxplot(train_processed[['Age']])
plt.show()

It seems there are a lot of outliers. Hence we should use a robust scaler.

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the imputed training ages. Only the fitted state is needed
# here, so `fit` is used rather than `fit_transform` (whose result was discarded).
age_scaler = RobustScaler()
age_scaler.fit(train_processed[['Age']])


def process_age_2(data):
    """Return the 'Age' column of ``data`` scaled with the fitted ``age_scaler``.

    ``data`` must contain an 'Age' column. The result is a one-column
    DataFrame that keeps ``data``'s index so it lines up with the other
    feature frames when concatenated column-wise.
    """
    scaled = age_scaler.transform(data[['Age']])
    return pd.DataFrame(scaled, columns=['Age'], index=data.index)


process_age_2(train_processed).describe()

    

This preprocessing makes some age values negative and the mean age 0 (the robust scaler subtracts the median and divides by the interquartile range). This does not match real-world ages very well, but we will see whether the model is affected by it.

Let us turn to `Fare`

In [None]:
# Plot the distribution of 'Fare'. plt.show() is called, as in the 'Age'
# plotting cells, so the cell renders the figure without echoing the Axes repr.
# NOTE(review): `distplot` is deprecated in recent seaborn releases in favour of
# `histplot` / `displot` — confirm the installed version before upgrading.
sb.distplot(train_processed[['Fare']])
plt.show()

`Fare` seems to have a skewed distribution. Let's also look for the presence of outliers.

In [None]:
# Box plot of 'Fare', used to look for outliers. plt.show() is called, as in
# the 'Age' plotting cells, so the cell renders the figure without echoing the
# Axes repr.
sb.boxplot(train_processed[['Fare']])
plt.show()

Clearly there are outliers in this data. Let us use robust scaling again and compare model performance.

In [None]:
# Fit the scaler on the training fares. Only the fitted state is needed here,
# so `fit` is used rather than `fit_transform` (whose result was discarded).
fare_scaler = RobustScaler()
fare_scaler.fit(train_processed[['Fare']])


def process_fare_2(data):
    """Return the 'Fare' column of ``data`` scaled with the fitted ``fare_scaler``.

    ``data`` must contain a 'Fare' column. The result is a one-column
    DataFrame that keeps ``data``'s index so it lines up with the other
    feature frames when concatenated column-wise.
    """
    scaled = fare_scaler.transform(data[['Fare']])
    return pd.DataFrame(scaled, columns=['Fare'], index=data.index)


process_fare_2(train_processed).describe()

In [None]:
# `mean_age` and `process_age` are defined once, in the earlier age-imputation
# cell, and reused here; redefining them identically would only create a second
# copy to keep in sync.

def process_2(data):
    """Assemble the feature frame for ``data`` with robust-scaled 'Age' and 'Fare'.

    Columns, in order: imputed-then-scaled 'Age', scaled 'Fare', one-hot
    'Embarked', one-hot 'Sex', then 'Pclass', 'SibSp' and 'Parch' copied
    through unchanged.
    """
    # Impute missing ages first, then scale the imputed column.
    imputed_age = process_age(data)
    # Concatenate the feature frames directly; seeding the list with an empty
    # DataFrame adds nothing to the result.
    return pd.concat(
        [
            process_age_2(imputed_age),
            process_fare_2(data),
            process_embarked(data),
            process_sex(data),
            data[['Pclass', 'SibSp', 'Parch']],
        ],
        axis=1,
    )


train_processed_2 = process_2(train_data)
test_processed_2 = process_2(test_data)
# print() is used so the summary is shown in addition to the cell's final value.
print(train_processed_2.describe())
cross_val_score(LogisticRegression(), train_processed_2, train_labels, cv = 5).mean()

This processing has reduced the accuracy of our model. But remember that our imputation of the age may not have been right in the first place, and we have not removed outliers.