In [None]:
#Load libraries into code environment

import numpy as np
import pandas as pd

In [None]:
# Read the Titanic train and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Print dtypes and non-null counts for each frame (test first, then train)
# to see which columns contain missing values.
for frame in (test, train):
    frame.info()


In [None]:
# Clean the train dataset: replace missing Age values with the column mean.
# The fitted `imputer` stays in scope so later cells can use it again.

from sklearn.impute import SimpleImputer

# Double brackets keep Age as a one-column DataFrame (2-D input for the imputer)
age_column_train = train[['Age']]

# fit() returns the imputer itself, so construction and fitting are chained
imputer = SimpleImputer(strategy='mean').fit(age_column_train)

train['Age'] = imputer.transform(age_column_train)




In [None]:
# Remove the Name, Cabin and Ticket columns, which this notebook treats as
# less significant. drop() returns a new frame, so `train` itself is unchanged.

train_rev = train.drop(columns=['Name', 'Cabin', 'Ticket'])




In [None]:
# Encode the ordinal Pclass column as integer codes with a LabelEncoder.
# The fitted encoder `le` stays in scope so later cells can use it again.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(train_rev['Pclass'])
train_rev['Pclass'] = le.transform(train_rev['Pclass'])


# One-hot encode the categorical Sex and Embarked columns with pandas get_dummies
train_rev = pd.get_dummies(data=train_rev, columns=['Sex', 'Embarked'])


# Confirm the resulting columns and dtypes
train_rev.info()


In [None]:
# Check the relationship between the features and the target variable using correlation analysis

import seaborn as sns
import matplotlib.pyplot as plt

target_variable = 'Survived'
correlation_matrix = train_rev.corr()
# Column of the matrix holding every feature's correlation with the target
target_correlations = correlation_matrix[target_variable]


# Draw the annotated heatmap on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='cividis', fmt=".2f", ax=ax)
ax.set_title(f'Correlation Heatmap (Target Variable: {target_variable})')
plt.show()


In [None]:
# Remove Parch and the Embarked dummy columns: the features this notebook
# identified as having the lowest correlation to the target variable.

low_correlation_features = ['Parch', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
train_rev = train_rev.drop(columns=low_correlation_features)

# Preview the remaining columns
train_rev.head()

In [None]:
#Import the necessary sklearn libraries
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Feature matrix: every remaining column except the target.
# NOTE(review): PassengerId has not been dropped and is therefore still used
# as a feature — confirm that this is intended.
X = train_rev.drop(columns="Survived")

# Target as a 1-D Series rather than a single-column DataFrame: scikit-learn
# classifiers expect y of shape (n_samples,) and emit a DataConversionWarning
# when they are handed a column vector.
y = train_rev['Survived']

#Split the train dataset: 70% for fitting, 30% held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#Initializing models
# - The tree-based estimators are seeded so their metrics are reproducible
#   from one run to the next.
# - max_iter is raised from the default of 100 because the features are not
#   scaled, and the solver may otherwise stop before it has converged.
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
]

In [None]:
#Train the models and check metrics score

#Metric 1 - Accuracy

# Fit every model on the training split and report accuracy on the held-out split.
for model_name, model in models:
    # np.ravel flattens the target to shape (n_samples,). It accepts a
    # DataFrame, Series or ndarray, and avoids the DataConversionWarning
    # scikit-learn raises when y is passed as a column vector.
    model.fit(X_train, np.ravel(y_train))
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy}")


In [None]:
# Metric 2 - Precision

# The models were already fitted on the training split in the accuracy cell
# above, so they are reused here instead of being fitted again. This avoids
# redundant training, and it guarantees that precision is measured on the same
# fitted models as accuracy (refitting an unseeded randomized estimator such as
# a random forest can produce a different model each time).
for model_name, model in models:
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)

    print(f"{model_name} Precision: {precision}")

In [None]:
#Metric 3 - Recall

# The models were already fitted on the training split in the accuracy cell
# above, so they are reused here instead of being fitted again. This avoids
# redundant training, and it guarantees that recall is measured on the same
# fitted models as accuracy (refitting an unseeded randomized estimator such as
# a random forest can produce a different model each time).
for model_name, model in models:
    y_pred = model.predict(X_test)
    recall = recall_score(y_test, y_pred)

    print(f"{model_name} Recall: {recall}")

In [None]:
# Plot one confusion matrix per model for the held-out split.
# The models were already fitted in the accuracy cell above; they are reused
# rather than refitted so each matrix describes the same fitted model whose
# scores were printed earlier.
for model_name, model in models:
    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    # One figure per model, drawn on an explicit Axes
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix for {model_name}")
    plt.show()

In [None]:
# Metric names handed to cross_validate; the scores for each one are read
# back from its result under the key "test_<name>".
scoring = [
    'accuracy',
    'precision',
    'recall',
]

In [None]:
# Perform cross-validation and evaluate models
#
# The previous version of this cell also refitted each model and computed
# hold-out accuracy/precision/recall, but never used or printed those values.
# That dead work is removed; only the cross-validation scores are reported.
for model_name, model in models:
    # 5-fold cross-validation on the training split. The target is flattened
    # to 1-D with np.ravel so scikit-learn does not warn about a column vector.
    cv_results = cross_validate(model, X_train, np.ravel(y_train), cv=5, scoring=scoring)

    for metric in scoring:
        # Per-fold scores for this metric
        scores = cv_results[f"test_{metric}"]
        print(f"{model_name} - {metric.capitalize()} Scores: {scores}")
        # Summary line so models can be compared at a glance
        print(f"{model_name} - {metric.capitalize()} Mean: {scores.mean():.3f} (std {scores.std():.3f})")

In [None]:
#Check the test dataset

# The test set is preprocessed with the transformers that were fitted on the
# TRAIN data (transform only, no refit), so the test rows get exactly the
# same treatment as the rows the models were trained on.

#Fill missing data in Age column with the mean learned from the train dataset
age_column_test = test[['Age']]
test['Age'] = imputer.transform(age_column_test)

#Drop columns that were dropped in train dataset
test = test.drop(columns=['Name', 'Cabin', 'Ticket', 'Embarked', 'Parch'])

#Transform ordinal data to numeric data with the encoder fitted on train.
#transform() raises a ValueError if the test set contains an unseen class.
test['Pclass'] = le.transform(test['Pclass'])
#Convert the categorical Sex column to indicator columns using get_dummies
test = pd.get_dummies(test, columns=['Sex'])


#Fill the missing data in the fare column with the most frequent fare,
#learned from the train dataset

imputer2 = SimpleImputer(strategy='most_frequent')
imputer2.fit(train[['Fare']])
fare_column_test = test[['Fare']]
test['Fare'] = imputer2.transform(fare_column_test)

test.info()

In [None]:
# Look the Random Forest up by name. A missing entry raises a KeyError here
# instead of silently leaving `predictions` undefined.
random_forest_model = dict(models)["Random Forest"]

# np.ravel accepts a DataFrame, Series or ndarray, so (unlike
# `.values.ravel()`) this cell still works when it is re-run after y_train
# has already been flattened.
y_train = np.ravel(y_train)

random_forest_model.fit(X_train, y_train)

# Select the test columns by the training feature names so the columns are
# guaranteed to be in the order the model was fitted with; a missing feature
# raises a KeyError.
predictions = random_forest_model.predict(test[X_train.columns])
        

In [None]:
# Build the submission frame under its own name. Rebinding `predictions` to a
# DataFrame would make this cell fail or misbehave when re-run, because the
# 'Survived' column would then be built from the previous DataFrame rather
# than from the model's prediction array.
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})

submission.to_csv('submissions.csv', index=False)

# Last expression of the cell, so the notebook renders a preview of the file contents
submission.head()

In [None]:

# To download the file in Jupyter Notebook

from IPython.display import FileLink
# FileLink is the cell's last expression, so the notebook displays the
# returned object: a link pointing at the 'submissions.csv' path.
FileLink('submissions.csv')

