In [59]:
import sys
import os
sys.path.append(os.path.abspath("../src"))

from utils import evaluate_model

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib
import warnings
warnings.filterwarnings('ignore')

In [60]:
df = pd.read_csv('../train.csv')

In [61]:
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = df[features]
y = df["Survived"]

In [62]:
X["Age"].fillna(X["Age"].median(), inplace=True)
X["Embarked"].fillna(X["Embarked"].mode()[0], inplace=True)

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["Sex", "Embarked"]
numerical_cols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', LogisticRegression())
])

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [66]:
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['liblinear', 'lbfgs']
}

grid_lr = GridSearchCV(estimator=model, param_grid=param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)

grid_lr.fit(X_train, y_train)

print("Best parameters for Logistic Regression:", grid_lr.best_params_)
print("Best cross-validation accuracy:", grid_lr.best_score_)

Best parameters for Logistic Regression: {'classifier__C': 0.1, 'classifier__solver': 'lbfgs'}
Best cross-validation accuracy: 0.8006303555599331


In [67]:
best_log_reg = grid_lr.best_estimator_
y_pred_lr = best_log_reg.predict(X_test)

print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

Logistic Regression Test Accuracy: 0.8044692737430168

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
[[98 12]
 [23 46]]


In [68]:
rf_model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

svc_model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', SVC(kernel='rbf', probability=True, random_state=42))
])

knn_model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

In [69]:
models = {
    'Best_LogReg' : best_log_reg,
    'Random Forest': rf_model,
    'SVC': svc_model,
    'KNN': knn_model
}

results = {}

for name, m in models.items():
    m.fit(X_train, y_train)
    preds = m.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name} accuracy: {acc:.4f}")

results

Best_LogReg accuracy: 0.8045
Random Forest accuracy: 0.8268
SVC accuracy: 0.6201
KNN accuracy: 0.6704


{'Best_LogReg': 0.8044692737430168,
 'Random Forest': 0.8268156424581006,
 'SVC': 0.6201117318435754,
 'KNN': 0.6703910614525139}

In [70]:
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

print(f"Best model is: {best_model_name} with accuracy: {results[best_model_name]:.4f}")

joblib.dump(best_model, 'best_titanic_model.pkl')
print("Best model saved as 'best_titanic_model.pkl'")

Best model is: Random Forest with accuracy: 0.8268
Best model saved as 'best_titanic_model.pkl'
