In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()
train_df.dropna(subset=['Embarked'], inplace=True)
test_df.dropna(subset=['Embarked'], inplace=True)

In [5]:
train_df.loc[train_df['Cabin'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


Need to deal with NaN, here since not a large amount of data, it was fine to drop two lines of NaN in the embarked column, but with these other ones best to imputate in some way. Will try mean imputation and if results are poor will try something else like ML imputation. Will remove Cabin column for now as almost 90% of the column being NaN, later will use R to determine if the missingness is significant statistically.

In [6]:
train_df.drop('Cabin',axis=1,inplace=True)
test_df.drop('Cabin',axis=1,inplace=True)

Response Variable is binary, 

In [7]:
categorical_columns = ['Sex','Embarked', 'Pclass']
numerical_columns = ['Age','Fare','SibSp','Parch']
scaler = StandardScaler()
encoder = OneHotEncoder()

numerical_transformer = Pipeline(steps=[('mean_imputer',SimpleImputer(strategy='mean')),('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot_encoder',OneHotEncoder())])
preprocessor = ColumnTransformer(transformers=[('numerical',numerical_transformer,numerical_columns),('categorical',categorical_transformer,categorical_columns)])


print(preprocessor)

ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('mean_imputer',
                                                  SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['Age', 'Fare', 'SibSp', 'Parch']),
                                ('categorical',
                                 Pipeline(steps=[('onehot_encoder',
                                                  OneHotEncoder())]),
                                 ['Sex', 'Embarked', 'Pclass'])])


In [8]:
preprocessed_X_train = preprocessor.fit_transform(train_df)
column_names = preprocessor.get_feature_names_out()
preprocessed_X_train = pd.DataFrame(preprocessed_X_train, columns=column_names)
preprocessed_X_train.to_csv("../data/preprocessed_train_data.csv")
preprocessed_X_train

Unnamed: 0,numerical__Age,numerical__Fare,numerical__SibSp,numerical__Parch,categorical__Sex_female,categorical__Sex_male,categorical__Embarked_C,categorical__Embarked_Q,categorical__Embarked_S,categorical__Pclass_1,categorical__Pclass_2,categorical__Pclass_3
0,-5.896199e-01,-0.500240,0.431350,-0.474326,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,6.448480e-01,0.788947,0.431350,-0.474326,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-2.810029e-01,-0.486650,-0.475199,-0.474326,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,4.133853e-01,0.422861,0.431350,-0.474326,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,4.133853e-01,-0.484133,-0.475199,-0.474326,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
884,-2.038487e-01,-0.384475,-0.475199,-0.474326,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
885,-8.210826e-01,-0.042213,-0.475199,-0.474326,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
886,-5.482138e-16,-0.174084,0.431350,2.006119,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
887,-2.810029e-01,-0.042213,-0.475199,-0.474326,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
