import libraries and modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix,classification_report,ConfusionMatrixDisplay

Loading the datasets, both test and train

In [None]:
# Read the training set into a DataFrame. The absolute '/content' path is
# hard-coded, so the CSV must exist at exactly that location.
train_df=pd.read_csv('/content/train.csv')

In [None]:
# Read the test set from its hard-coded '/content' path.
test_df=pd.read_csv('/content/test.csv')
# Keep an untouched copy: 'PassengerId' is dropped from test_df during
# preprocessing but is read from this copy when the submission is built.
submission_df0=test_df.copy()

Let's have a look at the data

In [None]:
# A notebook cell only echoes its final expression, so the shape has to be
# printed explicitly; as a bare expression on the first line it was discarded.
print(train_df.shape)
# Preview the first rows of the test set (echoed as the cell's last expression).
test_df.head()

dropping irrelevant columns

In [None]:
# Remove, in place, the columns that are not among the features selected for
# the model later in this notebook.
train_df.drop(columns=['Cabin','PassengerId','Name','Ticket'], inplace=True)

In [None]:
# Summary statistics of the remaining training columns (display only).
train_df.describe()

Imputing missing values

In [None]:
# Count missing values per column to decide what needs imputing (display only).
train_df.isnull().sum()

In [None]:
# Impute missing values: the median for 'Age' and the most frequent value
# (first mode) for 'Embarked'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated and leaves train_df unchanged when pandas Copy-on-Write is active,
# which would let NaNs reach the estimators.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

In [None]:
# Frequency of each 'Embarked' value after imputation (display only).
train_df['Embarked'].value_counts()

In [None]:
# One-hot encode 'Embarked'. drop_first=True omits the first category level,
# leaving the 'Embarked_Q' and 'Embarked_S' indicator columns used below.
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)


In [None]:
# Store the 'Embarked_Q' indicator as integers.
train_df['Embarked_Q'] = train_df['Embarked_Q'].astype(int)


In [None]:
# Store the 'Embarked_S' indicator as integers.
train_df['Embarked_S'] = train_df['Embarked_S'].astype(int)


In [None]:
# Engineered features:
#   FamilySize - element-wise sum of the 'SibSp' and 'Parch' columns
#   IsAlone    - 1 where FamilySize is 0, otherwise 0
train_df['FamilySize'] = train_df['SibSp'].add(train_df['Parch'])
train_df['IsAlone'] = train_df['FamilySize'].eq(0).astype(int)
# Echo the frame so the new columns can be inspected.
train_df

In [None]:
# 'SibSp' and 'Parch' have been folded into 'FamilySize'; drop them in place.
train_df.drop(columns=['SibSp','Parch'],inplace=True)

In [None]:
# One-hot encode 'Sex'; with drop_first=True a single indicator column remains
# (referenced below as 'Sex_male').
# NOTE(review): .astype(int) is applied to the whole frame returned by
# get_dummies, not only to the new indicator, so every column is cast to int
# and any fractional values (e.g. in 'Age' or 'Fare') are truncated. The test
# set is processed the same way, so change both together if this is unintended.
train_df=pd.get_dummies(train_df, columns=['Sex'], drop_first=True).astype(int)


In [None]:
# Model inputs, in the column order the estimators will be fitted on.
feature_columns = [
    'Pclass',
    'Age',
    'Fare',
    'Embarked_Q',
    'Embarked_S',
    'FamilySize',
    'IsAlone',
    'Sex_male',
]
X = train_df.loc[:, feature_columns]
# Target column.
y = train_df.loc[:, 'Survived']

Split data

In [None]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions of the target in both parts; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Candidate estimators, keyed by a display name that is also the key into
# param_grids and results.
classifiers = {
    'Naive Bayes': GaussianNB(),
    # Seed the randomised estimators so that repeated runs select the same
    # hyper-parameters and report the same scores.
    'Random Forest': RandomForestClassifier(random_state=42),
    # The grid below searches both 'l1' and 'l2' penalties. The default lbfgs
    # solver rejects 'l1', which made half of the logistic-regression fits
    # fail; liblinear supports both penalties.
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear'),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Hyper-parameter grid per estimator. An empty grid means the estimator is
# only cross-validated with its default settings.
param_grids = {
    'Naive Bayes': {},
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    },
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
    },
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': [3, 5, 7],
    },
}

# results maps each display name to (best fitted estimator, best parameters).
results = {}
for clf_name, clf in classifiers.items():
    # 5-fold cross-validated search on the training split, scored by accuracy.
    grid_search = GridSearchCV(
        clf,
        param_grids[clf_name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    results[clf_name] = (grid_search.best_estimator_, grid_search.best_params_)

# Score every selected model on the held-out split.
for clf_name, (best_model, best_params) in results.items():
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} - Best Model Accuracy: {accuracy}, Best Parameters: {best_params}")

In [None]:
# Reuse the decision tree that the grid search already refit on X_train with
# its best hyper-parameters. Building and fitting a second, unseeded tree from
# best_params_dt was redundant and could produce a model that differs from the
# one whose score was reported by the search.
best_model_dt, best_params_dt = results['Decision Tree']
y_pred_dt = best_model_dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree - Test Accuracy:", accuracy_dt)

In [None]:
# Confusion matrix of the decision tree on the held-out split.
conf_matrix = confusion_matrix(y_test, y_pred_dt)
# Precision, recall and F1 computed with average='binary'; the fourth
# returned value (support) is not needed and is discarded.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test,
    y_pred_dt,
    average='binary',
)

In [None]:
# Print the headline metrics, one per line.
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_value)

# Per-class breakdown.
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Render the confusion matrix with integer cell values.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=[0, 1],
)
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix')
plt.show()

Unseen data or test data

In [None]:
# Preview the raw test set before applying the same preprocessing as the
# training set (display only).
test_df.head()

In [None]:
# Drop the same columns that were removed from the training set.
# 'PassengerId' remains available through submission_df0.
test_df.drop(columns=['Cabin','PassengerId','Name','Ticket'], inplace=True)

In [None]:
# Count missing values per column in the test set (display only).
test_df.isnull().sum()

In [None]:
# Fill gaps in the test set with statistics taken from train_df (median 'Age',
# mean 'Fare') rather than from the test set itself.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated and leaves test_df unchanged when pandas Copy-on-Write is active.
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].mean())

In [None]:
# One-hot encode 'Embarked' exactly as for the training set (first category
# level dropped). The following cells expect 'Embarked_Q' and 'Embarked_S'.
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

In [None]:
# Store the 'Embarked_Q' indicator as integers.
test_df['Embarked_Q'] = test_df['Embarked_Q'].astype(int)

In [None]:
# Store the 'Embarked_S' indicator as integers.
test_df['Embarked_S'] = test_df['Embarked_S'].astype(int)

In [None]:
# Same engineered features as on the training set:
#   FamilySize - element-wise sum of the 'SibSp' and 'Parch' columns
#   IsAlone    - 1 where FamilySize is 0, otherwise 0
test_df['FamilySize'] = test_df['SibSp'].add(test_df['Parch'])
test_df['IsAlone'] = test_df['FamilySize'].eq(0).astype(int)
# Echo the frame so the new columns can be inspected.
test_df

In [None]:
# One-hot encode 'Sex' as for the training set (single indicator column kept).
# NOTE(review): .astype(int) casts every column of the returned frame to int,
# truncating any fractional values (e.g. in 'Age' or 'Fare'). This mirrors the
# training-set preprocessing, so change both together if this is unintended.
test_df=pd.get_dummies(test_df, columns=['Sex'], drop_first=True).astype(int)


In [None]:
# 'SibSp' and 'Parch' have been folded into 'FamilySize'; drop them in place.
test_df.drop(columns=['SibSp','Parch'],inplace=True)

In [None]:
# Hand the model exactly the training features, in training order (the columns
# of X). Previously the whole of test_df was passed, which only worked because
# the preprocessing steps happened to leave its columns in the same order. A
# missing feature now fails here with a KeyError that names the column.
test_features = test_df[X.columns]

# Probability of the positive class (second column of predict_proba).
survival_probabilities = best_model_dt.predict_proba(test_features)[:, 1]
# Probabilities strictly above the threshold are labelled 1, the rest 0.
threshold = 0.5
predictions = (survival_probabilities > threshold).astype(int)

# Pair each prediction with the PassengerId saved before that column was
# dropped from test_df, then write the submission file without the index.
submission_df = pd.DataFrame({
    'PassengerId': submission_df0['PassengerId'],
    'Survived': predictions
})
submission_df.to_csv('submission.csv', index=False)