<a href="https://colab.research.google.com/github/aakashv-git/Projects/blob/main/Titanic_Disaster/code/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [2]:
# URLs for the train and test data
train_url = 'https://raw.githubusercontent.com/aakashv-git/Projects/main/Titanic_Disaster/data/train.csv'
test_url = 'https://raw.githubusercontent.com/aakashv-git/Projects/main/Titanic_Disaster/data/test.csv'

# Load the data
train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

In [3]:
# Explore the data
print(train_data.head())
print(test_data.head())
print(train_data.describe())
print(train_data.shape)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [4]:
# Handling Missing Values and feature engineering

print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
# Handling Missing Values

# Fill the numerical columns with median
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)

# Fill the categorical columns with mode
train_data['Embarked'].fillna(train_data['Embarked'].mode(), inplace=True)

# For the 'Cabin' column, since it has many missing values, we'll create a new feature indicating whether a cabin was assigned
train_data['HasCain'] = train_data['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)

# drop cabin column as it has too many missing values
train_data.drop('Cabin', axis=1, inplace=True)

# Verify the missing values
print(train_data.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
HasCain        0
dtype: int64


In [6]:
# Feature Engineering

# Create new feature 'FamilySize'
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1

# Create new Feature 'IsAlone'
train_data['IsAlone'] = (train_data['FamilySize'] == 1).astype(int)
train_data['IsAlone'].loc[train_data['FamilySize'] > 1] = 0  # If family size is greater than 1, set to 0 (False)
train_data['Title'] = train_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
train_data['FareBin'] = pd.qcut(train_data['Fare'], 4)
train_data['AgeBin'] = pd.cut(train_data['Age'].astype(int), 5)

# Encoding categorical variables
label = LabelEncoder()
train_data['Sex_Code'] = label.fit_transform(train_data['Sex'])
train_data['Embarked_Code'] = label.fit_transform(train_data['Embarked'])
train_data['Title_Code'] = label.fit_transform(train_data['Title'])
train_data['FareBin_Code'] = label.fit_transform(train_data['FareBin'])
train_data['AgeBin_Code'] = label.fit_transform(train_data['AgeBin'])

# Drop unnecessary columns
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Sex', 'Embarked', 'Title', 'FareBin', 'AgeBin']
train_data.drop(columns=drop_cols, inplace=True)

# Verifying the changes
print(train_data.head())
print(train_data.info())


   Survived  Pclass   Age  SibSp  Parch     Fare  HasCain  FamilySize  \
0         0       3  22.0      1      0   7.2500        0           2   
1         1       1  38.0      1      0  71.2833        1           2   
2         1       3  26.0      0      0   7.9250        0           1   
3         1       1  35.0      1      0  53.1000        1           2   
4         0       3  35.0      0      0   8.0500        0           1   

   IsAlone  Sex_Code  Embarked_Code  Title_Code  FareBin_Code  AgeBin_Code  
0        0         1              2          11             0            1  
1        0         0              0          12             3            2  
2        1         0              2           8             1            1  
3        0         0              2          12             3            2  
4        1         1              2          11             1            2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 colu

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['IsAlone'].loc[train_data['FamilySize'] > 1] = 0  # If family size is greater than 1, set to 0 (False)


In [7]:
# Split the data

#Select features and Target
features = train_data.drop('Survived', axis=1)
target = train_data['Survived']

# Split the data into training and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Verify the shapes of the splits
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)


(712, 13) (179, 13) (712,) (179,)


In [8]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,HasCain,FamilySize,IsAlone,Sex_Code,Embarked_Code,Title_Code,FareBin_Code,AgeBin_Code
331,1,45.5,0,0,28.5,1,1,1,1,2,11,2,2
733,2,23.0,0,0,13.0,0,1,1,1,2,11,1,1
382,3,32.0,0,0,7.925,0,1,1,1,2,11,1,1
704,3,26.0,1,0,7.8542,0,2,0,1,2,11,0,1
813,3,6.0,4,2,31.275,0,7,0,0,2,8,3,0


In [9]:
# Train a RandomForest model
rf_model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)

# Validate the model
y_pred = best_rf_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Validation Accuracy: 0.8379888268156425


In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Define models
svm_model = SVC(probability=True, random_state=42)
xgb_model = XGBClassifier(random_state=42)

# Combine models into an ensemble
ensemble_model = VotingClassifier(estimators=[
    ('rf', best_rf_model),
    ('svm', svm_model),
    ('xgb', xgb_model)
], voting='soft')

# Fit the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = ensemble_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy with Ensemble: {accuracy}')


Validation Accuracy with Ensemble: 0.8268156424581006


In [11]:
# # Train a logistic regrssion model
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)

# # Make predictions on the validation set
# y_pred = model.predict(X_val)

# # Evaluate the model
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Validation Accuracy: {accuracy}')

In [12]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(ensemble_model, X_train, y_train, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean()}')


Cross-Validation Scores: [0.81118881 0.83916084 0.80985915 0.83098592 0.83098592]
Mean CV Score: 0.8244361272530287


In [13]:
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(score_func=chi2, k=10)
X_new = selector.fit_transform(X_train, y_train)
X_val_new = selector.transform(X_val)


In [14]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ('rf', best_rf_model),
    ('svm', svm_model),
    ('xgb', xgb_model)
]
stack_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stack_model.fit(X_train, y_train)
y_pred_stack = stack_model.predict(X_val)
stack_accuracy = accuracy_score(y_val, y_pred_stack)
print(f'Validation Accuracy with Stacking: {stack_accuracy}')


Validation Accuracy with Stacking: 0.8324022346368715


In [15]:
# # Create a DataFrame with the results
# submission = pd.DataFrame({
#     'PassengerId': pd.read_csv(test_url)['PassengerId'],
#     'Survived': test_predictions
# })

# # Save the submission to a CSV file
# submission.to_csv('titanic_submission.csv', index=False)

# submission.head()

In [16]:
# # Hyperparameter Tuning

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Instantiate the RandomForestClassifier
# rf = RandomForestClassifier(random_state=42)

# # Instantiate the GridSearchCV object
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Print the best parameters
# print(f'Best parameters: {grid_search.best_params_}')

# #predict on the validation set
# y_pred = grid_search.best_estimator_.predict(X_val)

# # Evaluate the model
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Validation Accuracy with RandomForest: {accuracy}')


In [17]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [27]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,HasCain,FamilySize,IsAlone,Title,FareBin,AgeBin
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,0,1,1,Mr,"(-0.001, 7.896]","(30.4, 45.6]"
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,0,2,0,Mrs,"(-0.001, 7.896]","(45.6, 60.8]"
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,0,1,1,Mr,"(7.896, 14.454]","(60.8, 76.0]"
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,0,1,1,Mr,"(7.896, 14.454]","(15.2, 30.4]"
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,0,3,0,Mrs,"(7.896, 14.454]","(15.2, 30.4]"


In [49]:

test_data = pd.read_csv(test_url)
# Handle missing values
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

# For the 'Cabin' column, we'll create a new feature indicating whether a cabin was assigned
test_data['HasCain'] = test_data['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)
test_data.drop('Cabin', axis=1, inplace=True)

# Feature engineering
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['IsAlone'] = (test_data['FamilySize'] == 1).astype(int)
test_data['Title'] = test_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
test_data['FareBin'] = pd.qcut(test_data['Fare'], 4)
test_data['AgeBin'] = pd.cut(test_data['Age'].astype(int), 5)



In [50]:
label = LabelEncoder()
# Encoding categorical variables
test_data['Sex_Code'] = label.fit_transform(test_data['Sex'])
test_data['Embarked_Code'] = label.fit_transform(test_data['Embarked'])
test_data['Title_Code'] = label.fit_transform(test_data['Title'])
test_data['FareBin_Code'] = label.fit_transform(test_data['FareBin'])
test_data['AgeBin_Code'] = label.fit_transform(test_data['AgeBin'])

# Drop unnecessary columns
test_data.drop(columns=drop_cols, inplace=True)

# Verify the changes
print(test_data.head())
print(test_data.info())


# # Ensure the features in test_data match those used in training
#test_features = test_data[num_features + cat_features]

# Use the ensemble model to predict the outcomes for the test data
test_predictions = ensemble_model.predict(test_data)

   Pclass   Age  SibSp  Parch     Fare  HasCain  FamilySize  IsAlone  \
0       3  34.5      0      0   7.8292        0           1        1   
1       3  47.0      1      0   7.0000        0           2        0   
2       2  62.0      0      0   9.6875        0           1        1   
3       3  27.0      0      0   8.6625        0           1        1   
4       3  22.0      1      1  12.2875        0           3        0   

   Sex_Code  Embarked_Code  Title_Code  FareBin_Code  AgeBin_Code  
0         1              1           5             0            2  
1         0              2           6             0            3  
2         1              1           5             1            4  
3         1              2           5             1            1  
4         0              2           6             1            1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  -----

In [54]:
# Create a DataFrame for submission
test_df = pd.read_csv(test_url)
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully.")


Submission file created successfully.


## 2nd Attempt