<a href="https://www.kaggle.com/code/begumarici/titanic-model-comparison-and-tuning?scriptVersionId=190305649" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Loading the data and data overview

In [None]:
# Read the Kaggle Titanic training and test CSVs into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Preview the first rows of the test data
test.head()

In [None]:
# Number of missing values per column in the training data
train.isna().sum()

In [None]:
# Number of missing values per column in the test data
test.isna().sum()

# Handling Outliers

In [None]:
# Side-by-side boxplots of the two continuous features to eyeball outliers
fig, (age_ax, fare_ax) = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(x=train['Age'], ax=age_ax)
age_ax.set_title('Age Boxplot')

sns.boxplot(x=train['Fare'], ax=fare_ax)
fare_ax.set_title('Fare Boxplot')

In [None]:
def _iqr_bounds(values, k=1.5):
    """Return the (lower, upper) fences Q1 - k*IQR and Q3 + k*IQR of a numeric Series."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    return q1 - k * spread, q3 + k * spread


# Fare: keep only rows whose fare lies inside the IQR fences (inclusive)
fare_lower_bound, fare_upper_bound = _iqr_bounds(train['Fare'])
train = train[train['Fare'].between(fare_lower_bound, fare_upper_bound)]

# Age: the fences are computed on the fare-filtered data.
# Comparisons against NaN are always False, so a plain range filter would
# also discard every passenger whose Age is missing. Those rows are not
# outliers; keep them so the missing-value step can impute them.
age_lower_bound, age_upper_bound = _iqr_bounds(train['Age'])
age_in_range = train['Age'].between(age_lower_bound, age_upper_bound)
# .copy() gives later cells an independent frame to assign columns on
train = train[train['Age'].isna() | age_in_range].copy()

In [None]:
# Cap Fare and Age at their IQR fences. Rows outside these fences were already
# filtered out in the preceding cell, so this should not alter any value there;
# missing values are left untouched.
train = train.assign(
    Fare=train['Fare'].clip(lower=fare_lower_bound, upper=fare_upper_bound),
    Age=train['Age'].clip(lower=age_lower_bound, upper=age_upper_bound),
)

# Filling missing values

In [None]:
# Impute by assigning the result back to the column. Calling
# fillna(inplace=True) on a column selection is chained assignment: it can act
# on a temporary object and leave the DataFrame unchanged (pandas copy-on-write).

# Age: median of the respective frame
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())

# Embarked: most frequent port of the respective frame
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])

# Fare: only the test frame is imputed here, with its own median
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Converting categorical variables

In [None]:
# Encode Sex as an integer (male -> 1, female -> 0) in both frames
sex_codes = {'male': 1, 'female': 0}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [None]:
# Encode the port of embarkation as an integer (C -> 0, Q -> 1, S -> 2) in both frames
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)


In [None]:
# Re-check missing values per column in the training data after imputation and encoding
train.isna().sum()

In [None]:
# Re-check missing values per column in the test data after imputation and encoding
test.isna().sum()

# Dropping unnecessary columns

In [None]:
# Remove the columns that are not used as model inputs from both frames
unused_columns = ['Cabin', 'Name', 'Ticket']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# Splitting features and target variable

In [None]:
# Target is the Survived column; every other remaining column is a feature.
# NOTE(review): PassengerId is not dropped anywhere above, so it appears to be
# used as a model feature — confirm this is intended.
y = train["Survived"]
X = train.drop(columns="Survived")

In [None]:
# Display the feature matrix
X

In [None]:
# Display the target vector
y

# Splitting the data into training and test sets

In [None]:
# Hold out 10% of the labelled data for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=101,
)

# Standardizing the Data

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Model Training and Evaluation

In [None]:
# Candidate classifiers keyed by the display name used in the evaluation loop.
# Every estimator that accepts a seed for its internal randomness gets the same
# fixed random_state, so the comparison is repeatable from run to run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=101),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=101),
    "Gradient Boosting": GradientBoostingClassifier(random_state=101),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC()
}

In [None]:
# Models fitted on the standardized features; the others use the raw features
scaled_feature_models = {"Logistic Regression", "Support Vector Machine", "K-Nearest Neighbors"}

for name, model in models.items():
    if name in scaled_feature_models:
        fit_features, eval_features = scaled_X_train, scaled_X_test
    else:
        fit_features, eval_features = X_train, X_test

    model.fit(fit_features, y_train)
    preds = model.predict(eval_features)

    # Report hold-out performance for this model
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))
    print("\n" + "="*60 + "\n")

# Hyperparameter Tuning for Logistic Regression

In [None]:
# Search space for LogisticRegression.
# Only penalties that both listed solvers can fit with these parameters are
# included: 'elasticnet' additionally needs an l1_ratio (and is not supported
# by liblinear), and the string 'none' is rejected by recent scikit-learn
# releases. Those combinations only produced failed fits in the search.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300]
}

In [None]:
# 5-fold cross-validated grid search for logistic regression on the
# standardized training features, scored by accuracy
log_reg = LogisticRegression(random_state=101)
CV_log_reg = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
CV_log_reg.fit(scaled_X_train, y_train)
print("Best Parameters for Logistic Regression:", CV_log_reg.best_params_)

In [None]:
# Build a fresh logistic regression from the four tuned hyperparameters
tuned_log_reg_params = {
    key: CV_log_reg.best_params_[key]
    for key in ('C', 'penalty', 'solver', 'max_iter')
}
best_log_reg = LogisticRegression(random_state=101, **tuned_log_reg_params)

In [None]:
# Fit the tuned logistic regression and score it on the standardized hold-out split
log_reg_preds = best_log_reg.fit(scaled_X_train, y_train).predict(scaled_X_test)

log_reg_accuracy = accuracy_score(y_test, log_reg_preds)
print("Accuracy:", log_reg_accuracy)
print(confusion_matrix(y_test, log_reg_preds))
print(classification_report(y_test, log_reg_preds))

# Hyperparameter Tuning for Random Forest

In [None]:
# Search space for RandomForestClassifier.
# 'auto' is omitted from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and it is no longer accepted by recent
# scikit-learn releases.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}


In [None]:
# 5-fold cross-validated grid search for the random forest on the unscaled training features
rfc = RandomForestClassifier(random_state=101)
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,
)
CV_rfc.fit(X_train, y_train)
print("Best Parameters for Random Forest:", CV_rfc.best_params_)

# Training and Evaluating the Best Model

In [None]:
# Build a fresh random forest from the four tuned hyperparameters
tuned_rfc_params = {
    key: CV_rfc.best_params_[key]
    for key in ('criterion', 'max_depth', 'max_features', 'n_estimators')
}
best_rfc = RandomForestClassifier(random_state=101, **tuned_rfc_params)

In [None]:
# Fit the tuned forest on the training split and predict the hold-out split
best_preds = best_rfc.fit(X_train, y_train).predict(X_test)

In [None]:
# Hold-out metrics for the tuned random forest
rfc_accuracy = accuracy_score(y_test, best_preds)
rfc_confusion = confusion_matrix(y_test, best_preds)
rfc_report = classification_report(y_test, best_preds)

print("Accuracy:", rfc_accuracy)
print(rfc_confusion)
print(rfc_report)

In [None]:
# Predict survival for the competition test set. Columns are selected by the
# training feature list so the model receives exactly the features it was
# fitted on, in the same order, regardless of the test frame's column layout.
final_preds = best_rfc.predict(test[X_train.columns])

# Saving the submission file

In [None]:
# Two-column submission frame: passenger id plus the predicted Survived label
submission = test[["PassengerId"]].assign(Survived=final_preds)

In [None]:
# Write the submission to CSV without the DataFrame index column
submission.to_csv('titanic_submission.csv', index=False)