In [167]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

In [168]:
# Load the preprocessed train/test splits. Both CSVs carry an "Unnamed: 0"
# column (presumably a row index written out by an earlier export -- confirm),
# which is discarded as part of the load instead of by mutating in place.
train_df = pd.read_csv("Dataset/preprocessed_train.csv").drop(columns="Unnamed: 0")
test_df = pd.read_csv("Dataset/preprocessed_test.csv").drop(columns="Unnamed: 0")

In [169]:
# Bare expression: renders the loaded training frame as the cell output.
train_df

Unnamed: 0,Passenger ID,Ticket Class 1,Ticket Class 2,Gender,Age,No of siblings/spouses aboard,No of parents/children aboard,Survived or not?
0,1,0,0,1,22,1,0,0
1,2,1,0,0,38,1,0,1
2,3,0,0,0,26,0,0,1
3,4,1,0,0,35,1,0,1
4,5,0,0,1,35,0,0,0
...,...,...,...,...,...,...,...,...
886,887,0,1,1,27,0,0,0
887,888,1,0,0,19,0,0,1
888,889,0,0,0,30,1,2,0
889,890,1,0,1,26,0,0,1


In [170]:
# Bare expression: renders the loaded test frame (it has no label column).
test_df

Unnamed: 0,Passenger ID,Ticket Class 1,Ticket Class 2,Gender,Age,No of siblings/spouses aboard,No of parents/children aboard
0,892,0,0,1,34,0,0
1,893,0,0,0,47,1,0
2,894,0,1,1,62,0,0
3,895,0,0,1,27,0,0
4,896,0,0,0,22,1,1
...,...,...,...,...,...,...,...
413,1305,0,0,1,30,0,0
414,1306,1,0,0,39,0,0
415,1307,0,0,1,38,0,0
416,1308,0,0,1,30,0,0


In [171]:
# Labels: the survival column. Features: every remaining column except the
# passenger identifier, which is dropped from the model inputs.
Y_train = train_df["Survived or not?"]
X_train = train_df.drop(columns=["Passenger ID", "Survived or not?"])

In [172]:
# Test features: same treatment as the training features -- the passenger
# identifier is removed; test_df itself is left unchanged.
X_test = test_df.drop(columns=["Passenger ID"])

In [173]:
# Bare expression: renders the training feature matrix passed to the model.
X_train

Unnamed: 0,Ticket Class 1,Ticket Class 2,Gender,Age,No of siblings/spouses aboard,No of parents/children aboard
0,0,0,1,22,1,0
1,1,0,0,38,1,0
2,0,0,0,26,0,0
3,1,0,0,35,1,0
4,0,0,1,35,0,0
...,...,...,...,...,...,...
886,0,1,1,27,0,0
887,1,0,0,19,0,0
888,0,0,0,30,1,2
889,1,0,1,26,0,0


In [174]:
# Bare expression: renders the training label Series ("Survived or not?").
Y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived or not?, Length: 891, dtype: int64

In [175]:
# Bare expression: renders the test feature matrix used for prediction.
X_test

Unnamed: 0,Ticket Class 1,Ticket Class 2,Gender,Age,No of siblings/spouses aboard,No of parents/children aboard
0,0,0,1,34,0,0
1,0,0,0,47,1,0
2,0,1,1,62,0,0
3,0,0,1,27,0,0
4,0,0,0,22,1,1
...,...,...,...,...,...,...
413,0,0,1,30,0,0
414,1,0,0,39,0,0
415,0,0,1,38,0,0
416,0,0,1,30,0,0


In [176]:
# Fit a random forest classifier on the training features and labels.
#
# random_state is pinned so the fitted forest -- and every prediction,
# importance and metric derived from it in later cells -- is identical after
# Restart Kernel -> Run All. Without a seed each run draws different bootstrap
# samples and feature subsets, so the displayed numbers cannot be reproduced.
#
# The estimator keeps its original name `regr` for compatibility, although it
# is a classifier, not a regressor.
regr = RandomForestClassifier(random_state=42)
# fit() returns the estimator itself, so `model` and `regr` are the same object.
model = regr.fit(X_train, Y_train)
model

In [177]:
# Predict the label for every test row and wrap the result in a one-column
# frame.
#
# The frame is given X_test's index explicitly: later cells copy this column
# into frames derived from X_test / test_df, and pandas aligns such
# assignments by index label, not by position. Reusing the index guarantees
# each prediction stays attached to the row it was computed for even if the
# test frame ever stops having a default 0..n-1 index (e.g. after filtering).
Y_predict = pd.DataFrame(
    model.predict(X_test),
    index=X_test.index,
    columns=["Survived or not?"],
)
Y_predict

Unnamed: 0,Survived or not?
0,0
1,0
2,1
3,1
4,0
...,...
413,0
414,1
415,0
416,0


In [178]:
# Test features plus the predicted label, built as a new frame so X_test is
# not modified. The column is matched to rows by index label.
test_data = X_test.assign(**{"Survived or not?": Y_predict["Survived or not?"]})
test_data

Unnamed: 0,Ticket Class 1,Ticket Class 2,Gender,Age,No of siblings/spouses aboard,No of parents/children aboard,Survived or not?
0,0,0,1,34,0,0,0
1,0,0,0,47,1,0,0
2,0,1,1,62,0,0,1
3,0,0,1,27,0,0,1
4,0,0,0,22,1,1,0
...,...,...,...,...,...,...,...
413,0,0,1,30,0,0,0
414,1,0,0,39,0,0,1
415,0,0,1,38,0,0,0
416,0,0,1,30,0,0,0


In [179]:
# Share (in percent) of all *predicted* survivors that fall in each Gender
# code: first the rows with Gender == 0, then the rows with Gender == 1.
for gender_code in (0, 1):
    print(
        test_data.loc[test_data["Gender"] == gender_code, "Survived or not?"].sum()
        * 100
        / test_data["Survived or not?"].sum()
    )

71.8562874251497
28.143712574850298


In [180]:
# Share (in percent) of all survivors in the training labels that fall in
# each Gender code: first the rows with Gender == 0, then Gender == 1.
for gender_code in (0, 1):
    print(
        train_df.loc[train_df["Gender"] == gender_code, "Survived or not?"].sum()
        * 100
        / train_df["Survived or not?"].sum()
    )

68.12865497076024
31.871345029239766


In [181]:
# Per-feature importances of the fitted forest, one value per input column.
# The array is positional; the matching column names are available from
# model.feature_names_in_.
pp = model.feature_importances_
pp

array([0.09185569, 0.04113552, 0.34500338, 0.39662594, 0.07331059,
       0.05206888])

In [182]:
# Column names the model saw during fit, in the same order as the values in
# model.feature_importances_.
model.feature_names_in_

array(['Ticket Class 1', 'Ticket Class 2', 'Gender', 'Age',
       'No of siblings/spouses aboard', 'No of parents/children aboard'],
      dtype=object)

In [183]:
# Predictions for the training rows themselves, kept as a one-column frame.
# Note that this accuracy is measured on the same rows the model was fitted
# on, so it is an in-sample score rather than an estimate of performance on
# unseen passengers.
Y_train_predict = pd.DataFrame({"Survived or not?": model.predict(X_train)})
accuracy_score(Y_train, Y_train_predict)

0.9158249158249159

In [184]:
# Precision of the predictions in Y_train_predict against the training
# labels. Y_train_predict holds predictions for the training rows, so this is
# an in-sample figure.
precision_score(Y_train,Y_train_predict)

0.9057750759878419

In [185]:
# F1 score of the predictions in Y_train_predict against the training labels.
# Y_train_predict holds predictions for the training rows, so this is an
# in-sample figure.
f1_score(Y_train,Y_train_predict)

0.8882265275707899

In [186]:
# Two-column submission table: the passenger identifier from the test frame
# next to the predicted label, under the names "PassengerId" and "Survived".
# Both columns are matched row-to-row by index label.
test_total2 = pd.DataFrame(
    {
        "PassengerId": test_df["Passenger ID"],
        "Survived": Y_predict["Survived or not?"],
    }
)
test_total2

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [187]:
# Write the submission table to disk without the row index, so the file holds
# only the PassengerId and Survived columns. Any existing file at this path is
# replaced.
test_total2.to_csv("Dataset/gender_submission.csv",index=False)