<a href="https://colab.research.google.com/github/aakashv-git/Projects/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# URLs for the train and test data
train_url = 'https://raw.githubusercontent.com/aakashv-git/Projects/main/Titanic_Disaster/data/train.csv'
test_url = 'https://raw.githubusercontent.com/aakashv-git/Projects/main/Titanic_Disaster/data/test.csv'

# Load the data
train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

In [3]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Explore the data
print(train_data.head())
print(test_data.head())
print(train_data.describe())
print(train_data.shape)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [5]:
# Handling Missing Values and feature engineering

print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [6]:
# Handling Missing Values

# Fill the numerical columns with median
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)

# Fill the categorical columns with mode
train_data['Embarked'].fillna(train_data['Embarked'].mode(), inplace=True)

# For the 'Cabin' column, since it has many missing values, we'll create a new feature indicating whether a cabin was assigned
train_data['HasCain'] = train_data['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)

# drop cabin column as it has too many missing values
train_data.drop('Cabin', axis=1, inplace=True)

# Verify the missing values
print(train_data.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
HasCain        0
dtype: int64


In [7]:
# Feature Engineering

# Create new feature 'FamilySize'
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1

# Create new Feature 'IsAlone'
train_data['IsAlone'] = (train_data['FamilySize'] == 1).astype(int)

# Convert categorical variables into numeric values using one-hot encoding
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'])

# Drop columns that are not useful for the model
train_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

# Verifying the changes
print(train_data.head())
print(train_data.info())


   Survived  Pclass   Age  SibSp  Parch     Fare  HasCain  FamilySize  \
0         0       3  22.0      1      0   7.2500        0           2   
1         1       1  38.0      1      0  71.2833        1           2   
2         1       3  26.0      0      0   7.9250        0           1   
3         1       1  35.0      1      0  53.1000        1           2   
4         0       3  35.0      0      0   8.0500        0           1   

   IsAlone  Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  
0        0       False      True       False       False        True  
1        0        True     False        True       False       False  
2        1        True     False       False       False        True  
3        0        True     False       False       False        True  
4        1       False      True       False       False        True  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column      Non-Null Coun

In [10]:
# Split the data

#Select features and Target
features = train_data.drop('Survived', axis=1)
target = train_data['Survived']

# Split the data into training and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Verify the shapes of the splits
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)


(712, 13) (179, 13) (712,) (179,)


In [11]:
# Train a logistic regrssion model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.8044692737430168


In [12]:
# Hyperparameter Tuning

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters
print(f'Best parameters: {grid_search.best_params_}')

#predict on the validation set
y_pred = grid_search.best_estimator_.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy with RandomForest: {accuracy}')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Validation Accuracy with RandomForest: 0.8100558659217877


In [None]:
# Ensure the test data has the same columns as the training data
missing_cols = set(features.columns) - set(test_data.columns)
for c in missing_cols:
    test_data[c] = 0
test_data = test_data[features.columns]

# Make predictions on the test dataset
test_predictions = grid_search.best_estimator_.predict(test_data)

In [13]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [14]:
# Handle missing values in the test dataset
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

# Create 'HasCabin' feature
test_data['HasCabin'] = test_data['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)
test_data.drop('Cabin', axis=1, inplace=True)

# Create 'FamilySize' and 'IsAlone' features
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['IsAlone'] = (test_data['FamilySize'] == 1).astype(int)

# Convert categorical variables into numeric values using one-hot encoding
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])

# Drop unnecessary columns
test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

# Ensure the test data has the same columns as the training data
missing_cols = set(features.columns) - set(test_data.columns)
for c in missing_cols:
    test_data[c] = 0
test_data = test_data[features.columns]

# Make predictions on the test dataset
test_predictions = grid_search.best_estimator_.predict(test_data)

# Create a DataFrame with the results
submission = pd.DataFrame({
    'PassengerId': pd.read_csv(test_url)['PassengerId'],
    'Survived': test_predictions
})

# Save the submission to a CSV file
submission.to_csv('titanic_submission.csv', index=False)

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
