In [23]:
import numpy as np
import pandas as pd
import seaborn as sns

Loading the Dataset


In [24]:
df = pd.read_csv('Titanic-Dataset.csv')
df.head()
df.shape
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Dropping irrelevant features

In [25]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


Looking for Null Values if Present in the dataset

In [26]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

Only Age has Continious value , Replacing NULL with the Mean

In [27]:
df.Age = df.Age.fillna(df.Age.mean())
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [28]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

In [29]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,32.204208
std,0.486592,0.836071,13.002015,49.693429
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,22.0,7.9104
50%,0.0,3.0,29.699118,14.4542
75%,1.0,3.0,35.0,31.0
max,1.0,3.0,80.0,512.3292


Converting the Categorical Data of SeX into Integer

In [30]:
sex=pd.get_dummies(df.Sex)
sex.head()

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [31]:
df=pd.concat([df,sex],axis="columns")
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,female,male
0,0,3,male,22.0,7.25,False,True
1,1,1,female,38.0,71.2833,True,False
2,1,3,female,26.0,7.925,True,False
3,1,1,female,35.0,53.1,True,False
4,0,3,male,35.0,8.05,False,True


In [32]:
df.drop(columns = ["Sex"] , axis = 'columns' , inplace = True)

In [33]:
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,female,male
0,0,3,22.0,7.25,False,True
1,1,1,38.0,71.2833,True,False
2,1,3,26.0,7.925,True,False
3,1,1,35.0,53.1,True,False
4,0,3,35.0,8.05,False,True


Dividing the Dataset into Dependent and Independent Variables

In [34]:
target = df['Survived']
inputs = df.drop(columns=['Survived'])

In [35]:
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.925,True,False
3,1,35.0,53.1,True,False
4,3,35.0,8.05,False,True


In [36]:
target.head()


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

Dividing the Dataset into Trainig and Testing

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, Y_train, Y_test=train_test_split(inputs,target,test_size=0.2)

Scaling the data

In [39]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix


# Define a list of classifiers
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    XGBClassifier()
]

# Create a dictionary to store the results
results = {}


# Train and evaluate each classifier
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(X_train_scaled, Y_train)
    Y_pred = clf.predict(X_test_scaled)

    # Calculate accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    print(f"{clf_name} Accuracy: {accuracy}")

    # Classification report
    print(f"Classification Report for {clf_name}:")
    print(classification_report(Y_test, Y_pred))

    # Confusion matrix
    print(f"Confusion Matrix for {clf_name}:")
    print(confusion_matrix(Y_test, Y_pred))
    print("="*50)

RandomForestClassifier Accuracy: 0.8435754189944135
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       105
           1       0.84      0.77      0.80        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.84      0.84      0.84       179

Confusion Matrix for RandomForestClassifier:
[[94 11]
 [17 57]]
AdaBoostClassifier Accuracy: 0.8100558659217877
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       105
           1       0.79      0.73      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix for AdaBoostClassifier:
[[91 14]
 [20 54]]
GradientBoostingClassifier

As , RandomForestCassifier has the best Accuracy , we'll go on with RandomForestClassifier

In [44]:
model_rf = RandomForestClassifier()

model_rf.fit(X_train_scaled, Y_train)


y_pred_rf = model_rf.predict(X_test_scaled)

accuracy_rf = accuracy_score(Y_test, y_pred_rf)
print("Random Forest Classifier Accuracy:", accuracy_rf)

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(Y_test, y_pred_rf))

Random Forest Classifier Accuracy: 0.8324022346368715
Confusion Matrix for Random Forest Classifier:
[[92 13]
 [17 57]]


In [48]:
model_rf.score(X_test_scaled , Y_test)

0.8324022346368715

Saving the Model Using the Pickle

In [46]:
import pickle
pickle.dump(model_rf,open("model_rf.pkl",'wb'))