# Titanic - Evaluate with Training Data
This will split the training data into two subsets and use them for training and evaluation of the model.

In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

X = pd.read_csv("new_train.csv")
test_df = pd.read_csv("new_test.csv")

# new_train.csv carries its saved row index, which pandas reads back as an
# "Unnamed: 0" column. It is only the row number, so remove it here; otherwise
# every later cell that builds features from X would train on it.
# The same filter is applied to test_df so both frames keep matching feature
# columns (it is a no-op if the test file has no such column).
X = X.loc[:, ~X.columns.str.startswith("Unnamed:")]
test_df = test_df.loc[:, ~test_df.columns.str.startswith("Unnamed:")]
X

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Relatives,FareRatio,Deck,AgeClass
0,0,3,0,2,1,0,7,0,0,1,3,8,6
1,1,1,1,5,1,0,71,1,1,1,35,2,5
2,1,3,1,3,0,0,7,0,2,0,7,8,9
3,1,1,1,5,1,0,53,0,1,1,26,2,5
4,0,3,0,5,0,0,8,0,0,0,8,8,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,0,0,13,0,5,0,13,8,6
887,1,1,1,2,0,0,30,0,2,0,30,1,2
888,0,3,1,5,1,2,23,0,2,3,5,8,15
889,1,1,0,3,0,0,30,1,0,0,30,2,3


In [26]:
# Randomly keep 60% of the rows for training and hold out the remaining 40%
# for evaluation. The fixed random_state makes the split repeatable.
train_df = X.sample(frac=0.6, random_state=200)
in_train = X.index.isin(train_df.index)
evaluate_df = X[~in_train]
print(f"Train has {len(train_df)} ({len(train_df)/len(X)*100}%) and Evaluate has {len(evaluate_df)} ({len(evaluate_df)/len(X)*100}%)")

Train has 535 (60.04489337822672%) and Evaluate has 356 (39.95510662177329%)


In [27]:
# Separate the "Survived" label column from the feature columns in both subsets.
X_train, Y_train = train_df.drop(columns="Survived"), train_df["Survived"]
X_eval, Y_eval = evaluate_df.drop(columns="Survived"), evaluate_df["Survived"]

In [39]:
from sklearn.ensemble import RandomForestClassifier

def rate_classifer_hyperparameters(estimators, depths):
    """Try every (n_estimators, max_depth) pair and report the best one.

    For each combination a ``RandomForestClassifier`` with ``random_state=1``
    is fitted on the notebook-level ``X_train`` / ``Y_train`` and scored on
    both the training data and the notebook-level ``X_eval`` / ``Y_eval``.
    One progress line is printed per combination, followed by "New best!"
    whenever the evaluation accuracy strictly exceeds the best seen so far
    (so on ties the earliest combination is kept).

    Note that the winning accuracy is measured on the same evaluation data
    that was used to pick the winner, so it is an optimistic estimate.

    Args:
        estimators: Iterable of ``n_estimators`` values to try.
        depths: Iterable of ``max_depth`` values to try. It is iterated once
            per estimator value, so pass a re-iterable sequence such as a list.

    Returns:
        Tuple ``(best_e, best_d, best_accuracy)`` where ``best_accuracy`` is
        the evaluation accuracy in percent, rounded to two decimals. If no
        combination scores above 0 the result is ``(0, 0, 0)``.
    """
    best_accuracy = 0
    best_e = 0
    best_d = 0
    run = 0  # 1-based counter of fitted combinations, used only for the log line
    for e in estimators:
        for d in depths:
            run += 1
            random_forest = RandomForestClassifier(n_estimators=e, max_depth=d, random_state=1)
            random_forest.fit(X_train, Y_train)

            # Score each data set exactly once; scoring means predicting on the
            # whole set, so a discarded extra call would only waste time.
            acc_train_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
            acc_eval_random_forest = round(random_forest.score(X_eval, Y_eval) * 100, 2)

            print(f"{run} - e={e} d={d}: Train = {acc_train_random_forest}% Eval = {acc_eval_random_forest}%")
            if acc_eval_random_forest > best_accuracy:
                print("New best!")
                best_accuracy = acc_eval_random_forest
                best_e = e
                best_d = d
    return best_e, best_d, best_accuracy

# Each n_estimators candidate is listed once: the forests are built with a
# fixed random_state, so a repeated value would only retrain identical models.
best_e, best_d, acc = rate_classifer_hyperparameters([80, 90, 100, 110, 123], [3, 4, 5, 6, 7, 8])
print(f"Our best e = {best_e} and d = {best_d} with a score of {acc}%")

1 - e=80 d=3: Train = 83.55% Eval = 81.74%
New best!
2 - e=80 d=4: Train = 86.17% Eval = 81.18%
3 - e=80 d=5: Train = 87.48% Eval = 81.74%
4 - e=80 d=6: Train = 89.35% Eval = 79.78%
5 - e=80 d=7: Train = 90.84% Eval = 80.34%
6 - e=80 d=8: Train = 91.96% Eval = 81.18%
7 - e=90 d=3: Train = 83.55% Eval = 81.46%
8 - e=90 d=4: Train = 86.17% Eval = 80.62%
9 - e=90 d=5: Train = 87.85% Eval = 81.74%
10 - e=90 d=6: Train = 89.72% Eval = 79.78%
11 - e=90 d=7: Train = 90.84% Eval = 80.34%
12 - e=90 d=8: Train = 92.34% Eval = 81.18%
13 - e=100 d=3: Train = 83.55% Eval = 81.46%
14 - e=100 d=4: Train = 85.98% Eval = 81.18%
15 - e=100 d=5: Train = 88.04% Eval = 82.02%
New best!
16 - e=100 d=6: Train = 88.97% Eval = 79.78%
17 - e=100 d=7: Train = 90.84% Eval = 80.9%
18 - e=100 d=8: Train = 91.78% Eval = 81.18%
19 - e=110 d=3: Train = 83.55% Eval = 81.46%
20 - e=110 d=4: Train = 85.98% Eval = 81.18%
21 - e=110 d=5: Train = 88.04% Eval = 82.02%
22 - e=110 d=6: Train = 89.16% Eval = 80.06%
23 - e=110 d

In [41]:
from sklearn.metrics import mean_absolute_error


# Refit a forest with the selected hyperparameters and report the mean absolute
# error of its predictions on both subsets (with 0/1 labels this is the share
# of misclassified rows).
random_forest = RandomForestClassifier(n_estimators=best_e, max_depth=best_d, random_state=1).fit(X_train, Y_train)

train_predictions = random_forest.predict(X_train)
eval_predictions = random_forest.predict(X_eval)

train_error = mean_absolute_error(Y_train, train_predictions)
eval_error = mean_absolute_error(Y_eval, eval_predictions)

print("Training mean error: {}".format(train_error))
print("Evaluation mean error: {}".format(eval_error))

Training mean error: 0.11962616822429907
Evaluation mean error: 0.1797752808988764
