In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier 
import seaborn as sns
from sklearn.metrics import accuracy_score
import pickle
from matplotlib import pyplot as plt
# Set seaborn's global plotting theme: dark grid background, 'talk' element scaling.
sns.set_theme(style='darkgrid', context='talk')

In [7]:
# Read the training CSV; the file's first column is used as the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path -- the notebook will not
# run on another machine until it is pointed at a local copy of the file.
train_csv_path = r'C:\Users\akobe\OneDrive\Desktop\Lighthouse\After\Kaggle-Titanic-Machine-Learning-from-Disaster\Data\train_transformed.csv'
data = pd.read_csv(train_csv_path, index_col=[0])

In [15]:
# Preview the first two rows to sanity-check the loaded columns.
data.head(n=2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family_size,Age_class,Fare_per_person,Age_group
0,0,3,0,22.0,1,0,7.25,S,Mr,1,66.0,3.62,Y
1,1,1,1,38.0,1,0,71.28,C,Mrs,1,38.0,35.64,A


In [12]:
# Remove the passenger identifier so it is not used as a feature.
# errors='ignore' keeps this cell idempotent: because `data` is rebound to the
# result, a second run would otherwise raise KeyError on the already-removed column.
data = data.drop(columns=['PassengerId'], errors='ignore')

In [16]:
# Separate the label column from the predictors.
target_column = 'Survived'

y_train = data[target_column]                # target variable
X_train = data.drop(columns=target_column)   # every remaining column is a feature

In [None]:
# Model selection: grid-search over the classifier type and the number of
# features kept by SelectKBest, using 5-fold cross-validation on the training data.

# How numerical and categorical variables are handled:
# numeric columns are standardised, categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore'))])

# Which columns go through which transformer.
numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Family_size',
                   'Age_class', 'Fare_per_person']
categorical_columns = ['Sex', 'Embarked', 'Title', 'Age_group']

# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# This replaces OneHotEncoder(sparse=False), whose `sparse` argument was
# deprecated in scikit-learn 1.2 and removed in 1.4, without depending on the
# newer `sparse_output` name that older releases do not accept.
preprocessing_tips = ColumnTransformer(
    [('numeric', numeric_transform, numeric_columns),
     ('categorical', categorical_transform, categorical_columns)],
    sparse_threshold=0,
)

# Pipeline: preprocess -> keep the k best features -> classify.
# The LogisticRegression here is only a placeholder; the grid below swaps in
# each candidate classifier for the 'classifier' step.
pipeline = Pipeline(steps = [('preprocesing', preprocessing_tips),
                             ('select_best', SelectKBest()),
                             ('classifier', LogisticRegression())])

# Parameter grid. The random forest is seeded so repeated runs of the search
# give the same ranking.
param_grid = {'classifier': [LogisticRegression(), SVC(),
                             RandomForestClassifier(random_state=42), RidgeClassifier()],
              'select_best__k': [3, 4, 5, 6]}

grid = GridSearchCV(pipeline, param_grid = param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
# No labelled hold-out set (X_test / y_test) is defined in this notebook, so
# report the mean cross-validated score of the best configuration rather than
# scoring on undefined names.
best_acc = grid.best_score_

print(f'Best cross-validated accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')