In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [122]:
# Read only the feature columns and the 'Survived' target from the CSV.
columns_to_load = ['Age', 'Sex', 'Pclass', 'Fare', 'Embarked', 'Survived']
df = pd.read_csv('tested.csv', usecols=columns_to_load)

In [124]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,34.5,7.8292,Q
1,1,3,female,47.0,7.0,S
2,0,2,male,62.0,9.6875,Q
3,0,3,male,27.0,8.6625,S
4,1,3,female,22.0,12.2875,S


In [126]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age         20.574163
Fare         0.239234
Embarked     0.000000
dtype: float64

In [128]:
# Separate the target from the features, then hold out 20% of the rows
# for testing (fixed seed so the split is repeatable).
target_column = 'Survived'
y = df[target_column]
x = df.drop(columns=[target_column])
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=17,
)

In [130]:
# Look at five randomly chosen training rows (unseeded, so they vary per run).
x_train.sample(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
177,1,male,54.0,55.4417,C
145,3,male,31.0,18.0,S
217,1,male,57.0,164.8667,S
259,3,male,21.0,7.775,S
342,3,male,,69.55,S


In [132]:
# Numeric branch: fill missing Age/Fare values with the column mean,
# then standardise with StandardScaler.
numerical_features = ['Age', 'Fare']
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical branch: fill any missing Sex/Embarked value with the most
# frequent category, then one-hot encode. Categories unseen during fit are
# ignored at transform time instead of raising.
categorical_features = ['Sex', 'Embarked']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [134]:
# Route every feature column through a preprocessing branch.
# ColumnTransformer drops any column that is not listed (remainder='drop' is
# the default), so Pclass has to be named explicitly; otherwise it is loaded
# and split but never reaches the classifier.
preprocessor = ColumnTransformer(transformers=[
    # numerical_features: impute, then standardise
    ('num', numerical_transformer, numerical_features),
    # categorical_features: impute, then one-hot encode
    ('cat', categorical_transformer, categorical_features),
    # Pclass: treated as a categorical code and one-hot encoded directly
    # (the missing-value summary shows 0% missing, so no imputer is used)
    ('pclass', OneHotEncoder(handle_unknown='ignore'), ['Pclass'])
])

In [95]:
# Full model: preprocessing followed by a logistic-regression classifier.
# The step names are referenced by the grid-search parameter keys.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
clf = Pipeline(steps=model_steps)

In [97]:
# Switch scikit-learn's estimator display to 'diagram' mode and show the
# pipeline so the overall flow of the work is visible.
from sklearn import set_config

set_config(display='diagram')
clf

In [136]:
# Search space: imputation strategy for each column group plus the
# classifier's C value. Keys follow the <step>__<sub-step>__<param> naming
# of the nested pipeline.
numeric_impute_options = ['mean', 'median']
categorical_impute_options = ['most_frequent', 'constant']
c_options = [.1, 1, 10, 100]

param_grid = {
    'preprocessor__num__imputer__strategy': numeric_impute_options,
    'preprocessor__cat__imputer__strategy': categorical_impute_options,
    'classifier__C': c_options,
}

# 10-fold cross-validated exhaustive search over the grid above.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)

In [138]:
# Run the search on the training split and report the winning combination.
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_
print('best parameters: ')
print(best_params)

best parameters: 
{'classifier__C': 0.1, 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}


In [140]:
# Evaluate the tuned model rather than the default one: copy the winning
# grid-search settings onto the pipeline before fitting on the training split.
# Without this step the search result is ignored and the classifier keeps
# its default hyperparameters.
clf.set_params(**grid_search.best_params_)
clf.fit(x_train, y_train)

# Predictions on the held-out test split.
y_pred = clf.predict(x_test)

In [142]:
# Accuracy of the predictions on the held-out test split.
# NOTE(review): a perfect score is unusual; the preview rows suggest Survived
# may be fully determined by Sex in this file — confirm before trusting it.
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
test_accuracy

1.0

In [144]:
# Cross-check the hold-out accuracy with 10-fold cross-validation over the
# full feature matrix and target.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=clf, X=x, y=y, cv=10)
mean_cv_accuracy = scores.mean()
print("Cross-validated accuracy:", mean_cv_accuracy)

Cross-validated accuracy: 1.0
