In [415]:
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, RobustScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Location of the Kaggle Titanic CSV files. Kept in one place so the notebook can
# be pointed at a different directory by editing a single line.
DATA_DIR = 'D:\\Projects\\kaggle_titanic\\data'

df_train = pd.read_csv(DATA_DIR + '\\train.csv')
df_test = pd.read_csv(DATA_DIR + '\\test.csv')
# Train first, test second: later cells address the frames as data[0] / data[1].
data = [df_train, df_test]

In [416]:
# Data Preprocessing
# Fill blanks for Age, Fare, Embarked.
# Every fill statistic is taken from the training set, and is computed once,
# before any frame is modified, so both frames are imputed with the same values.

median_fare = df_train['Fare'].median()
# Series.mode() returns a Series (ties are possible); take its first entry so
# fillna() receives a scalar. Passing the Series itself makes fillna align on
# the index, which leaves the actual missing rows unfilled.
mode_embarked = df_train['Embarked'].mode()[0]

age_mean = df_train['Age'].mean()
age_std = df_train['Age'].std()
# Integer bounds of the half-open interval [mean - std, mean + std) used to
# draw replacement ages.
age_low = int(age_mean - age_std)
age_high = int(age_mean + age_std)

# Fixed seed so the random age imputation is reproducible across runs.
rng = np.random.default_rng(42)

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(median_fare)
    dataset['Embarked'] = dataset['Embarked'].fillna(mode_embarked)

    # Replace each missing age with a random integer from the interval above.
    age_missing = dataset['Age'].isnull()
    age_filled = dataset['Age'].copy()
    age_filled[age_missing] = rng.integers(age_low, age_high, size=int(age_missing.sum()))
    # Cast this frame's own ages; the test frame must not receive training ages.
    dataset['Age'] = age_filled.astype(int)

In [417]:
# Feature Engineering
#   Power-transform Age and Fare with sklearn's PowerTransformer.
def _power_transform_column(col):
    """Fit a PowerTransformer on the training column and apply it to every frame.

    The transformer is fit on ``data[0][col]`` (the training frame) only and is
    then used to overwrite ``col`` in each frame of ``data``.

    Parameters
    ----------
    col : str
        Name of the numeric column to transform.

    Returns
    -------
    PowerTransformer
        The fitted transformer.
    """
    scaler = PowerTransformer()
    scaler.fit(data[0][[col]].values)
    for dataset in data:
        # transform() returns an (n, 1) array; flatten it before storing it as a column.
        dataset[col] = scaler.transform(dataset[[col]].values).ravel()
    return scaler


age_scaler = _power_transform_column('Age')
fare_scaler = _power_transform_column('Fare')

#   Family size and a Yes/No flag for passengers with no relatives aboard
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'travelled_alone'] = 'No'
    dataset.loc[dataset['relatives'] == 0, 'travelled_alone'] = 'Yes'

#   One-hot encode Sex, Pclass, Embarked, travelled_alone
cat_col = ['Sex', 'Pclass', 'Embarked', 'travelled_alone']
for col in cat_col:
    # The categories present in the training frame define the dummy columns for
    # both frames, so train and test always end up with the same feature columns.
    train_levels = pd.get_dummies(data[0][col]).columns
    for d in [0, 1]:
        # A category missing from this frame becomes an all-zero column; a
        # category never seen in training is dropped.
        temp_d = pd.get_dummies(data[d][col]).reindex(columns=train_levels, fill_value=0)
        data[d] = pd.merge(left=data[d], right=temp_d, left_index=True, right_index=True)
        data[d] = data[d].drop(columns=col)



In [418]:
# Model: logistic regression on Age plus the one-hot encoded columns.
use_cols = ['Yes', 'No', 'Age', 'female', 'male', 1, 2, 3, 'Q', 'C', 'S']

train_frame, test_frame = data
x_train, x_test = train_frame[use_cols], test_frame[use_cols]
y_train = train_frame['Survived']

model = LogisticRegression()
model.fit(x_train.values, y_train.values)

# Accuracy measured on the same rows the model was fitted on.
train_accuracy = model.score(x_train.values, y_train.values)
print(train_accuracy)

y_test_results = model.predict(x_test.values)

0.7968574635241302


In [419]:
# Submit: write PassengerId and the predicted Survived label to a CSV file.
y_test_results = pd.DataFrame(y_test_results, columns=['Survived'])

# Attach the predictions to the test frame by row index, then keep only the
# two columns that are written out.
subm = pd.merge(
    left=data[1],
    right=y_test_results,
    left_index=True,
    right_index=True,
)[['PassengerId', 'Survived']]

# index=None keeps the row index out of the file.
subm.to_csv('d:\\Projects\\kaggle_titanic\\submission.csv', index=None)