In [None]:
from warnings import filterwarnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' may also hide pandas warnings raised by the
# assignments in later cells — consider narrowing it to specific categories.
filterwarnings('ignore')


In [None]:
# Load the training split from the working directory.
train = pd.read_csv("train.csv")
# Preview the first rows only (same preview size as the test cell) instead of
# rendering the whole frame.
train.head(10)

In [None]:
# Load the test split from the working directory and preview its first 10 rows.
test = pd.read_csv("test.csv")
test.head(10)

### The test set does not have a `Survived` column

In [None]:
# Stack train and test row-wise so the preprocessing below is applied to both.
# NOTE: the name `all` shadows Python's builtin all() for the rest of the notebook.
frames = [train, test]
all = pd.concat(frames, ignore_index=True, sort=False)
all.info()

In [None]:
# Number of missing values per column of the combined frame.
all.isna().sum()

In [None]:
# Impute missing fares with the median fare of the combined (train + test) frame.
fare_median = all['Fare'].median()
all['Fare'] = all['Fare'].fillna(fare_median)

In [None]:
# Cap fares at 100.
# A single .loc assignment is used instead of the chained form
# all["Fare"][mask] = 100, which writes through an intermediate Series and is
# not guaranteed to update the frame (it is a no-op under pandas copy-on-write).
all.loc[all['Fare'] > 100, 'Fare'] = 100

In [None]:
# Re-inspect dtypes and non-null counts after the Fare clean-up.
all.info()

In [None]:
# Frequency of each Embarked value (missing values are not counted by default).
all["Embarked"].value_counts()

In [None]:
# Fill missing Embarked values with the hard-coded category 'S'.
# NOTE(review): presumably the most frequent value according to the
# value_counts() cell before this one — confirm against its output.
all['Embarked'] = all['Embarked'].fillna('S')

In [None]:
# Distribution of Fare with a kernel density estimate overlaid.
fare_values = all["Fare"]
sns.histplot(data=fare_values, color="teal", kde=True)
plt.show()

In [None]:
import plotly.express as px

# Row count for every (Pclass, Age, Embarked) combination.
group_sizes = all.groupby(['Pclass', 'Age', 'Embarked'], as_index=False).size()

# One facet per Embarked value; bars for the same Pclass are stacked.
fig = px.bar(
    group_sizes,
    x='Pclass',
    y="Age",
    facet_col="Embarked",
    barmode='relative',
)
fig.show()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains string columns at this point, and DataFrame.corr()
# raises on them in pandas >= 2, so the numeric columns are selected explicitly.
numeric_columns = all.select_dtypes(include="number")
sns.heatmap(numeric_columns.corr(), annot=True, linewidths=2);

In [None]:
# Histogram of each numeric column; the trailing semicolon hides the text repr
# of the returned axes.
all.hist();

In [None]:
# Remaining missing values per column after the Fare and Embarked imputation.
all.isna().sum()
## Fill the missing Age values by Pclass, which has the strongest correlation with Age in the heatmap

In [None]:
# Impute each missing Age with the mean Age of the passenger's Pclass.
# One vectorised fillna replaces the per-row loop, which relied on chained
# assignment (all["Age"][i] = ...) and on np.mean over the whole mixed-dtype
# frame; neither is reliable on current pandas.
# Rows whose Pclass is missing get no group mean and therefore stay NaN.
class_mean_age = all.groupby("Pclass")["Age"].transform("mean")
all["Age"] = all["Age"].fillna(class_mean_age)

In [None]:
# Feature Engineering
# IsAlone is 0 when SibSp + Parch is greater than 0, and 1 otherwise.
relatives = all['SibSp'] + all['Parch']
all['IsAlone'] = (~(relatives > 0)).astype('int64')

# Survived value counts within each IsAlone group, computed once and reused.
survival_by_alone = all.groupby('IsAlone')['Survived'].value_counts()

survival_by_alone.loc[1].plot(kind='bar')
plt.title('Alone')
plt.show()

survival_by_alone.loc[0].plot(kind='bar')
plt.title('With Family');

In [None]:
#Age
# Replace Age with an ordinal band code 0-4 (band edges at 16, 32, 48 and 64,
# upper edge inclusive). Each band is tested against a snapshot of the original
# ages, so the result does not depend on the order of the assignments.
age_snapshot = all['Age'].copy()
inner_bands = [(16, 32), (32, 48), (48, 64)]

all.loc[age_snapshot <= 16, 'Age'] = 0
for band_code, (lower, upper) in enumerate(inner_bands, start=1):
    in_band = (age_snapshot > lower) & (age_snapshot <= upper)
    all.loc[in_band, 'Age'] = band_code
all.loc[age_snapshot > 64, 'Age'] = 4

In [None]:
# Extract the first run of letters that is immediately followed by a period
# from each Name and store it as Title.
# The pattern is a raw string: "\." in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
all["Title"] = all["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)
all["Title"].value_counts()

In [None]:
# Collapse rare or variant titles into broader groups; titles that are not
# listed here are left unchanged.
title_groups = {
    'Officer': ['Capt', 'Dr', 'Major', 'Rev'],
    'Royal': ['Lady', 'Countess', 'Don', 'Sir', 'Jonkheer', 'Dona'],
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
}
# Invert to a flat {raw title: group} lookup for a single replace call.
title_lookup = {
    raw_title: group
    for group, raw_titles in title_groups.items()
    for raw_title in raw_titles
}
all['Title'] = all['Title'].replace(title_lookup)
all['Title'].value_counts()

In [None]:
#Cabin
# Reduce each cabin value to its first character; missing cabins are filled
# with 'Missing' first and therefore end up as 'M'.
all['Cabin'] = all['Cabin'].fillna('Missing').str[0]
all['Cabin'].value_counts()

In [None]:
# Family_Size = SibSp + Parch + 1 (the extra 1 presumably counts the passenger).
relatives_count = all['SibSp'] + all['Parch']
all['Family_Size'] = relatives_count + 1

In [None]:
# Name and Ticket are dropped before encoding (Title was already derived from Name).
all = all.drop(columns=["Name", "Ticket"])

In [None]:
# Cast IsAlone to a categorical dtype so that get_dummies one-hot encodes it
# instead of leaving it as a numeric column.
all["IsAlone"] = all["IsAlone"].astype("category")

In [None]:
# One-hot encode the object/category columns; numeric columns pass through unchanged.
all_dummies = pd.get_dummies(all)
all_dummies.info()

In [None]:
# Split the encoded frame back apart: rows with a Survived value form the
# training data, rows without one form the data to predict.
has_label = all_dummies['Survived'].notna()
all_train = all_dummies[has_label]
all_test = all_dummies[~has_label]

all_train.info()
all_test.info()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows for evaluation; PassengerId and the target
# are excluded from the feature matrix.
features = all_train.drop(columns=['PassengerId', 'Survived'])
target = all_train['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=101
)

## Build Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over forest size, tree depth and split threshold.
parameters = {'n_estimators' : (10,30,50)
              , 'max_depth' : (3,5,7,9,10)
              , 'min_samples_split' : (4,6,8)
             }
# random_state pins the forest's internal sampling so that repeated runs select
# the same best estimator; 101 matches the seed used for the train/test split.
RF_grid  = GridSearchCV(RandomForestClassifier(n_jobs = -1, oob_score= False, random_state = 101),
                        param_grid = parameters, cv = 5, verbose = True)
RF_grid_model = RF_grid.fit(X_train, y_train)
RF_grid_model.best_estimator_

In [None]:
# Final model with hand-picked hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# grid-search run — confirm they still match RF_grid_model.best_estimator_.
# random_state makes the fitted forest, its scores and the submission repeatable.
RF_Model = RandomForestClassifier(max_depth=7, max_samples=None,
                       min_samples_split=6, n_estimators=10, n_jobs=-1,
                       random_state=101)

In [None]:
# Fit the final forest on the training portion of the labelled data.
RF_Model.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out portion of the labelled data.
predictions = RF_Model.predict(X_test)
predictions

In [None]:
# Mean accuracy on the held-out and training portions; a large gap between the
# two indicates over-fitting.
test_accuracy = RF_Model.score(X_test, y_test)
train_accuracy = RF_Model.score(X_train, y_train)
print(f'Test : {test_accuracy:.3f}')
print(f'Train : {train_accuracy:.3f}')

In [None]:
# Feature matrix for the unlabelled rows: same column removal as for training.
TestForPred = all_test.drop(columns=['PassengerId', 'Survived'])

In [None]:
# Predict the unlabelled rows and cast the predicted labels to int for the
# submission file.
t_pred = RF_Model.predict(TestForPred).astype(int)

In [None]:
# Identifiers of the unlabelled rows, in the same row order as t_pred.
PassengerId = all_test['PassengerId']

In [None]:
# Assemble the two-column submission table: identifier plus predicted label.
submission_columns = {'PassengerId': PassengerId, 'Survived': t_pred}
RF_Sub = pd.DataFrame(data=submission_columns)
RF_Sub.head()

In [None]:
# Write the submission to the working directory without the index column.
RF_Sub.to_csv("RF_Class_Submission.csv", index=False)