In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Load the training and test splits from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# First look at the training frame: schema, summary statistics, first rows.
train_df.info()
print(train_df.describe())
print(train_df.head(10))

# Missing-value report per column: absolute count and percentage of rows,
# both sorted so the columns with the most gaps come first.
null_mask = train_df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
print(missing_data.head(12))

In [None]:
# Remove the columns that are not used as features.
# PassengerId is dropped from the training frame only; the test frame keeps it.
train_df = train_df.drop(columns=['PassengerId', 'Cabin', 'Ticket'])
test_df = test_df.drop(columns=['Cabin', 'Ticket'])

In [None]:
# Fill the missing Age values with random whole numbers drawn around the mean age.
data = [train_df, test_df]

# Both statistics come from the training frame so that train and test are
# imputed from the same range; they are loop-invariant, so compute them once.
mean = train_df["Age"].mean()
std = train_df["Age"].std()

# Seeded generator: a fresh "Restart & Run All" reproduces the same imputed ages.
rng = np.random.default_rng(42)

for dataset in data:
    missing_age = dataset["Age"].isnull()
    is_null = int(missing_age.sum())
    # One draw per missing entry from the half-open range [mean - std, mean + std);
    # the bounds are truncated to whole numbers explicitly.
    rand_age = rng.integers(int(mean - std), int(mean + std), size=is_null)
    age_slice = dataset["Age"].copy()
    age_slice[missing_age] = rand_age
    # Cast this frame's own imputed ages. Assigning train_df["Age"] here would
    # overwrite the test ages with index-aligned training ages.
    dataset["Age"] = age_slice.astype(int)

In [None]:
# Fill the missing Embarked values with 'S' in every frame.
for frame in data:
    embarked = frame['Embarked']
    frame['Embarked'] = embarked.fillna('S')

In [None]:
# Fare: replace missing values with 0, then cast the column to integers.
# The cast drops the fractional part of every fare.
for frame in data:
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

In [None]:
# Convert the Name column into an integer title feature (P_Name).
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Uncommon titles are collapsed into 'Rare'; spelling variants are folded
# into the common title they stand for.
title_aliases = {
    'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
    'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
    'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
}

# Rebuild the list from the current frames so this cell does not depend on
# a `data` variable left over from an earlier cell.
data = [train_df, test_df]

for dataset in data:
    # The title is the run of letters between a space and a full stop.
    # Raw string: '\.' inside a normal literal is an invalid escape sequence.
    extracted = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Normalise the title, convert it to its number, and use 0 for anything
    # the mapping does not know. The final cast keeps the column integer even
    # when unmapped titles made the intermediate result float.
    dataset['P_Name'] = (
        extracted
        .replace(title_aliases)
        .map(titles)
        .fillna(0)
        .astype(int)
    )

In [None]:
# Remove the raw Name column from both frames.
train_df, test_df = [frame.drop(columns=['Name']) for frame in (train_df, test_df)]

In [None]:
# Encode Sex as an integer: male -> 0, female -> 1.
genders = dict(male=0, female=1)
data = [train_df, test_df]
for frame in data:
    sex_codes = frame['Sex'].map(genders)
    frame['Sex'] = sex_codes

In [None]:
# Encode Embarked as an integer: S -> 0, C -> 1, Q -> 2.
values = dict(S=0, C=1, Q=2)
data = [train_df, test_df]
for frame in data:
    port_codes = frame['Embarked'].map(values)
    frame['Embarked'] = port_codes

In [None]:
# Bucket Age into ordinal groups 0-6.
# Upper edges are inclusive: <=11 -> 0, 12-18 -> 1, 19-22 -> 2, 23-27 -> 3,
# 28-33 -> 4, 34-40 -> 5, and everything above 40 -> 6.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]

for frame in data:
    whole_years = frame['Age'].astype(int)
    # labels=False yields the bin position, which is the group number.
    frame['Age'] = pd.cut(whole_years, bins=age_edges, labels=False).astype(int)

In [None]:
# Bucket Fare into ordinal groups 0-5.
# Upper edges are inclusive: <=7.91 -> 0, <=14.454 -> 1, <=31 -> 2,
# <=99 -> 3, <=250 -> 4, and everything above 250 -> 5.
data = [train_df, test_df]
fare_edges = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]

for frame in data:
    # labels=False yields the bin position, which is the group number.
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(int)

# Inspect the fully encoded training frame.
train_df.info()
print(train_df.head(10))

In [None]:
# Split the prepared frames into the target and the model inputs.
Y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
# PassengerId is still present on the test frame; drop it from the features.
X_test = test_df.drop(columns="PassengerId").copy()

# -------------- Random Forest Algorithm with Grid Search ------------

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' is left out of max_features: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, where every
# candidate using it fails to fit (and the failure is hidden by the global
# warnings filter).
param_grid = {'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [10, 15, 20], 'n_estimators': [100, 200, 300, 400, 500],
              'max_features': ['sqrt', 'log2'], 'max_depth' : [2, 3, 4, 5, 6, 7, 8]}

# Fixed seed so the search ranks the candidates the same way on every run.
# n_jobs=-1 spreads the 1260 candidates (each cross-validated) over all cores.
random_forest = RandomForestClassifier(random_state=1)
GS = GridSearchCV(random_forest, param_grid, verbose=1, n_jobs=-1)
GS.fit(X_train, Y_train)
print(GS.best_params_)

# Final model with hard-coded hyper-parameters.
# NOTE(review): presumably copied from an earlier grid-search run — confirm
# they still agree with GS.best_params_ printed above.
RF = RandomForestClassifier(criterion="gini",
                            min_samples_leaf=1,
                            min_samples_split=15,
                            n_estimators=100,
                            max_features='sqrt',
                            max_depth=5,
                            oob_score=True,
                            random_state=1,
                            n_jobs=1)
RF.fit(X_train, Y_train)

# acc_RF is the accuracy on the same rows the model was fitted on, so it is an
# optimistic figure; the out-of-bag estimate requested via oob_score=True and
# the cross-validated metrics below are measured on rows the trees did not see.
acc_RF = round(RF.score(X_train, Y_train) * 100, 2)
RF_OOB_score = round(RF.oob_score_ * 100, 2)

# Precision / recall / F1 come from 3-fold cross-validated predictions.
predictions = cross_val_predict(RF, X_train, Y_train, cv=3)
RF_Precision = round(precision_score(Y_train, predictions) * 100, 2)
RF_Recall = round(recall_score(Y_train, predictions) * 100, 2)
RF_F1_score = round(f1_score(Y_train, predictions) * 100, 2)

print('Accuracy: {}'.format(acc_RF))
print('OOB score: {}'.format(RF_OOB_score))
print('Precision: {}'.format(RF_Precision))
print('Recall: {}'.format(RF_Recall))
print('F1-Score: {}'.format(RF_F1_score))