In [36]:
import pandas as pd
import numpy as np
import pickle 
from sklearn.model_selection import train_test_split


In [37]:
# Read the passenger survey CSV from the working directory and preview it.
df = pd.read_csv('passenger_survival_dataset.csv')
df.head(5)

Unnamed: 0,Passenger_ID,Name,Age,Gender,Class,Seat_Type,Fare_Paid,Survival_Status
0,1,Passenger_1,52,Male,Economy,Window,822.34,0
1,2,Passenger_2,15,Female,Economy,Middle,732.22,0
2,3,Passenger_3,72,Male,First,Aisle,957.75,1
3,4,Passenger_4,61,Female,Business,Middle,67.32,0
4,5,Passenger_5,21,Male,Economy,Aisle,235.99,0


In [38]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Passenger_ID     500 non-null    int64  
 1   Name             500 non-null    object 
 2   Age              500 non-null    int64  
 3   Gender           500 non-null    object 
 4   Class            500 non-null    object 
 5   Seat_Type        500 non-null    object 
 6   Fare_Paid        500 non-null    float64
 7   Survival_Status  500 non-null    int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 31.4+ KB


In [39]:
# Drop the per-passenger identifier columns (Name and Passenger_ID): the
# preview above shows one distinct value per row for each.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
df=df.drop(columns=['Name','Passenger_ID'])
df.head()

Unnamed: 0,Age,Gender,Class,Seat_Type,Fare_Paid,Survival_Status
0,52,Male,Economy,Window,822.34,0
1,15,Female,Economy,Middle,732.22,0
2,72,Male,First,Aisle,957.75,1
3,61,Female,Business,Middle,67.32,0
4,21,Male,Economy,Aisle,235.99,0


In [40]:
# Number of missing entries in each column.
df.isnull().sum()

Age                0
Gender             0
Class              0
Seat_Type          0
Fare_Paid          0
Survival_Status    0
dtype: int64

In [41]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

np.int64(0)

In [42]:
# Re-check the schema after the identifier columns were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              500 non-null    int64  
 1   Gender           500 non-null    object 
 2   Class            500 non-null    object 
 3   Seat_Type        500 non-null    object 
 4   Fare_Paid        500 non-null    float64
 5   Survival_Status  500 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 23.6+ KB


In [43]:
# Frequency of each label in the three categorical columns, shown as a tuple of Series.
tuple(df[col].value_counts() for col in ['Gender','Class','Seat_Type'])

(Gender
 Male      264
 Female    236
 Name: count, dtype: int64,
 Class
 Economy     300
 Business    130
 First        70
 Name: count, dtype: int64,
 Seat_Type
 Aisle     177
 Window    168
 Middle    155
 Name: count, dtype: int64)

In [44]:
# One-hot encode Gender as integer indicator columns; drop_first removes the
# first level so only one indicator column remains for this two-level feature.
df = pd.get_dummies(
    df,
    columns=['Gender'],
    drop_first=True,
    dtype='int',
)

In [45]:
# Encode the travel class as integer codes (Economy=0, Business=1, First=2).
classes={
    'Economy':0,
    'Business':1,
    'First':2
}
class_codes=df['Class'].map(classes)
# Series.map turns every value that is not a key of the dict into NaN.
# Re-running this cell (the column then already holds 0/1/2) or meeting an
# unexpected label would silently wipe the column, so fail loudly instead.
if class_codes.isna().any():
    unknown=sorted(df.loc[class_codes.isna(),'Class'].astype(str).unique())
    raise ValueError(f"Unmapped values in 'Class': {unknown}")
df['Class']=class_codes

In [46]:
# Encode the seat type as integer codes (Aisle=0, Middle=1, Window=2).
# NOTE(review): these codes impose an order on the seat types; confirm an
# ordinal encoding is intended here rather than one-hot columns.
seats={
    'Aisle':0,
    'Middle':1,
    'Window':2
}
seat_codes=df['Seat_Type'].map(seats)
# Series.map turns every value that is not a key of the dict into NaN.
# Re-running this cell (the column then already holds 0/1/2) or meeting an
# unexpected label would silently wipe the column, so fail loudly instead.
if seat_codes.isna().any():
    unknown=sorted(df.loc[seat_codes.isna(),'Seat_Type'].astype(str).unique())
    raise ValueError(f"Unmapped values in 'Seat_Type': {unknown}")
df['Seat_Type']=seat_codes


In [48]:
# Preview the fully numeric frame after encoding.
df.head(5)

Unnamed: 0,Age,Class,Seat_Type,Fare_Paid,Survival_Status,Gender_Male
0,52,0,2,822.34,0,1
1,15,0,1,732.22,0,0
2,72,2,0,957.75,1,1
3,61,1,1,67.32,0,0
4,21,0,0,235.99,0,1


In [49]:
# Features are every column except the label; the label is Survival_Status.
x = df.drop(columns=['Survival_Status'])
y = df['Survival_Status']

In [53]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit a decision tree on the training split.
# random_state is fixed because the tree builder permutes features at each
# split; without a seed, equally good splits can be resolved differently from
# run to run, so the fitted tree and its score would not be reproducible.
dt_clf=DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train,y_train)
pred=dt_clf.predict(x_test)

# accuracy_score expects (y_true, y_pred); pass the arguments in that order.
acc=accuracy_score(y_test,pred)
print(acc)

0.55


In [58]:
# Serialize the fitted classifier to model.pkl in the working directory.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
with open("model.pkl", mode='wb') as file:
    pickle.dump(dt_clf, file)