In [63]:
# Titanic Survival Predictor

# 1. Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pickle

In [64]:
# 2. Load the dataset
# Read the passenger records from the CSV file; the relative path is resolved
# against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [65]:
# 3. Basic EDA
# First rows, to eyeball the raw columns.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
# Summary statistics of the numeric columns.
print(df.describe())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

In [66]:
# 4. Handle missing values
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in recent pandas
# (it emits a FutureWarning) and does not modify df under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Cabin is dropped rather than imputed. errors='ignore' keeps the cell
# re-runnable after the column has already been removed.
df = df.drop(columns='Cabin', errors='ignore')

In [67]:
# 5. Feature Engineering
# FamilySize: siblings/spouses plus parents/children travelling with the passenger.
df['FamilySize'] = df['SibSp'].add(df['Parch'])

# Title: the run of letters that follows a space and ends with a period in Name
# (e.g. "Mr" in "Braund, Mr. Owen Harris").
title_pattern = ' ([A-Za-z]+)\\.'
df['Title'] = df['Name'].str.extract(title_pattern, expand=False)

In [68]:
# Simplify titles
# Build one lookup table: uncommon titles collapse into 'Rare', and the
# variant spellings are folded into their standard equivalents.
rare_titles = ['Lady', 'Countess','Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {title: 'Rare' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Titles that are not keys of the mapping are left unchanged.
df['Title'] = df['Title'].replace(title_map)


In [69]:
# Drop unused columns
# These columns are removed so they are not part of the feature matrix.
unused_columns = ['Name', 'Ticket', 'PassengerId']
df.drop(columns=unused_columns, inplace=True)
# Display the resulting frame as the cell output.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,0,3,male,22.0,1,0,7.2500,S,1,Mr
1,1,1,female,38.0,1,0,71.2833,C,1,Mrs
2,1,3,female,26.0,0,0,7.9250,S,0,Miss
3,1,1,female,35.0,1,0,53.1000,S,1,Mrs
4,0,3,male,35.0,0,0,8.0500,S,0,Mr
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,0,Rare
887,1,1,female,19.0,0,0,30.0000,S,0,Miss
888,0,3,female,28.0,1,2,23.4500,S,3,Miss
889,1,1,male,26.0,0,0,30.0000,C,0,Mr


In [70]:
# Encode categorical columns
# Fit a separate LabelEncoder per column and keep each one. A single shared
# encoder is re-fitted on every iteration, so only the mapping of the last
# column would survive and the others could not be reapplied to new data or
# inverted later.
label_encoders = {}
for col in ['Sex', 'Embarked', 'Title']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [71]:
# 6. Train/Test Split
# Target is the Survived column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [72]:
# 7. Train Multiple Models

# Decision Tree
# Fix the seed so that repeated runs build the same tree and report the same
# accuracy; without random_state the result can vary between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
# Evaluate on the held-out split.
dt_pred = dt.predict(x_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))

Decision Tree Accuracy: 0.7877094972067039


In [73]:
# Random Forest
# Fix the seed so the forest is reproducible across runs. This estimator is
# also the one that is cross-validated and pickled in later cells, so an
# unseeded forest would make those results irreproducible too.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
# Evaluate on the held-out split.
rf_pred = rf.predict(x_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

Random Forest Accuracy: 0.8156424581005587


In [74]:
# Logistic Regression
# max_iter is set to 1000 iterations for the solver; fit() returns the
# estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Score the predictions for the held-out split.
lr_pred = lr.predict(x_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)


Logistic Regression Accuracy: 0.8100558659217877


In [75]:
# 8. Cross Validation
# 5-fold cross-validation of the random forest over the full feature matrix;
# the reported figure is the mean of the per-fold scores.
rf_scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)
rf_cv_mean = rf_scores.mean()
print("Random Forest Cross-Validation Score:", rf_cv_mean)

Random Forest Cross-Validation Score: 0.8148264390182662


In [76]:
# 9. Confusion Matrix and Classification Report
# Compute both summaries of the random forest's held-out predictions first,
# then print each under its heading.
rf_confusion = confusion_matrix(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)

print("Confusion Matrix (Random Forest):")
print(rf_confusion)
print("Classification Report (Random Forest):")
print(rf_report)

Confusion Matrix (Random Forest):
[[88 17]
 [16 58]]
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       105
           1       0.77      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [79]:
# 10. Save the Best Model
# Serialize the fitted random forest to disk with pickle.
# NOTE: only load this file back from a trusted location; unpickling
# untrusted data can execute arbitrary code.
with open('titanic_model.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)