In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier 
import seaborn as sns
from sklearn.metrics import accuracy_score
import pickle
from matplotlib import pyplot as plt
sns.set_theme(context = 'talk', style = 'darkgrid')

In [2]:
train = pd.read_csv(r'C:\Users\akobe\OneDrive\Desktop\Lighthouse\After\Kaggle-Titanic-Machine-Learning-from-Disaster\Data\train_transformed.csv',  index_col = [0])
test = pd.read_csv(r'C:\Users\akobe\OneDrive\Desktop\Lighthouse\After\Kaggle-Titanic-Machine-Learning-from-Disaster\Data\test_transformed.csv',  index_col = [0])

In [3]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family_size,Age_class,Fare_per_person,Age_group
0,1,0,3,0,22.0,1,0,7.25,S,Mr,1,66.0,3.62,Y
1,2,1,1,1,38.0,1,0,71.28,C,Mrs,1,38.0,35.64,A


In [4]:
train['Age_group'].unique()

array(['Y', 'A', 'C', 'E'], dtype=object)

In [5]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family_size,Age_class,Fare_per_person,Age_group
0,892,3,0,34.5,0,0,7.83,Q,Mr,0,103.5,7.83,A
1,893,3,1,47.0,1,0,7.0,S,Mrs,1,141.0,3.5,A


In [6]:
#fare and embarked removed as they are related to class 
# 56% of passengers that embarked at Cherbourg were 1st class 
train = train.drop(columns = ['PassengerId', 'Fare', 'Embarked'])
test = test.drop(columns = ['PassengerId', 'Fare', 'Embarked'])

In [7]:
# split data into target variable and features 

#train set 
X_train = train.drop(columns=['Survived']) #features 
y_train = train['Survived'] #target variable 

#test set 
#X_test = test.drop(columns=['Survived']) #features 
#y_test = test['Survived'] #target variable

In [9]:
#creating a pipeline - to determine the best hyperparameters 

#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerical/catergorical columns 
preprocessing_tips = ColumnTransformer([('numeric', numeric_transform, ['Pclass', 'Age', 'SibSp', 'Parch', 'Family_size',
                                                                           'Age_class', 'Fare_per_person']),
                                         ('categorical', categorical_transform, ['Sex', 'Title', 'Age_group'])])

#pipeline 
pipeline = Pipeline(steps = [('preprocesing', preprocessing_tips),
                             ('select_best', SelectKBest()),
                             ('classifier', LogisticRegression())])

#parameters grid 
param_grid = {'classifier': [LogisticRegression(), SVC(), RandomForestClassifier(), RidgeClassifier()],
              'select_best__k': [3,4,5]}

grid = GridSearchCV(pipeline, param_grid = param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
#best_acc = grid.score(X_, y_test)

print(f'Best test set is achieved with hyperparameters: {best_hyperparams}')
#print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')

Best test set is achieved with hyperparameters: {'classifier': RandomForestClassifier(), 'select_best__k': 5}


In [11]:
#creating a pipeline 
#model with best hyperparameters 

#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerical/catergorical columns 
preprocessing_tips = ColumnTransformer([('numeric', numeric_transform, ['Pclass', 'Age', 'SibSp', 'Parch', 'Family_size',
                                                                           'Age_class', 'Fare_per_person']),
                                         ('categorical', categorical_transform, ['Sex', 'Title', 'Age_group'])])

#pipeline 
pipeline = Pipeline(steps = [('preprocesing', preprocessing_tips),
                             ('select_best', SelectKBest(k = 5)),
                             ('classifier', RandomForestClassifier())])

pipeline.fit(X_train, y_train)

best_features = pipeline.named_steps['select_best'].get_support()

print(f'The best features are: {best_features}')

The best features are: [ True False False False False  True False  True  True False False False
  True False False False False False]


In [12]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Title,Family_size,Age_class,Fare_per_person,Age_group
0,3,0,22.0,1,0,Mr,1,66.0,3.62,Y
1,1,1,38.0,1,0,Mrs,1,38.0,35.64,A
2,3,1,26.0,0,0,Miss,0,78.0,7.92,A
3,1,1,35.0,1,0,Mrs,1,35.0,26.55,A
4,3,0,35.0,0,0,Mr,0,105.0,8.05,A


In [15]:
# Save the model
with open('model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [16]:
# saving the columns
model_columns = list(X_train.columns)
with open('model_columns.pkl','wb') as file:
    pickle.dump(model_columns, file)