In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [6]:
# Load the pre-encoded train and test tables; the first spreadsheet column becomes the index.
space_titanic_df_encoded = pd.read_excel('SpaceTitanic/train_1.xlsx', index_col=0)
space_titanic_df_encoded_test = pd.read_excel('SpaceTitanic/test_1.xlsx', index_col=0)

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
space_titanic_df_train, space_titanic_df_val = train_test_split(
    space_titanic_df_encoded,
    test_size=0.2,
    random_state=42,
)

# Features are every column except the 'Transported' target; both are stored as plain lists.
X_train = space_titanic_df_train.drop(columns='Transported').values.tolist()
y_train = space_titanic_df_train['Transported'].values.tolist()
X_val = space_titanic_df_val.drop(columns='Transported').values.tolist()
y_val = space_titanic_df_val['Transported'].values.tolist()

In [17]:
# Hyper-parameter grid explored by the exhaustive search below.
IMPURITY_MEASURES = ['gini', 'entropy']
N_TREES = np.arange(start=30, stop=101, step=10)
# np.arange excludes `stop`, so the largest value tried is (number of features - 1).
MAX_FEATURES = np.arange(start=3, stop=len(X_train[0]))

# Running position of the current model inside `best_models`.
idx = 0

# Column-wise store of the results; converted to a DataFrame once the search ends.
result = {'metrics': [], 'n_tree': [], 'features': [], 'acc_score_val': [], 'idx': []}
# Every fitted forest is kept so that the best one can later be fetched by its `idx`.
best_models = []
for metrics in IMPURITY_MEASURES:
    for n_tree in N_TREES:
        for features in MAX_FEATURES:
            random_forest = RandomForestClassifier(bootstrap=True,
                                                   class_weight=None,
                                                   criterion=metrics,
                                                   max_depth=None,
                                                   max_features=features,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=n_tree,
                                                   n_jobs=1,
                                                   oob_score=True,
                                                   random_state=42,
                                                   verbose=0,
                                                   warm_start=False)
            random_forest.fit(X_train, y_train)
            # Models are ranked by accuracy on the held-out validation split.
            acc_score_val = accuracy_score(y_val, random_forest.predict(X_val))

            # The second field is the number of trees (n_estimators); max_depth is
            # None for every model, so it must not be used as the label here.
            print('metrics:{} n_tree:{} Features:{} Accuracy:{:.5f}'.format(metrics, n_tree, features, acc_score_val))

            # Record this configuration together with its score and model position.
            result['metrics'].append(metrics)
            result['n_tree'].append(n_tree)
            result['features'].append(features)
            result['acc_score_val'].append(acc_score_val)
            result['idx'].append(idx)
            best_models.append(random_forest)
            idx += 1

result_df = pd.DataFrame(result)

metrics:gini max_depth:30 Features:3 Accuracy:0.78263
metrics:gini max_depth:30 Features:4 Accuracy:0.78033
metrics:gini max_depth:30 Features:5 Accuracy:0.78148
metrics:gini max_depth:30 Features:6 Accuracy:0.78378
metrics:gini max_depth:30 Features:7 Accuracy:0.77803
metrics:gini max_depth:30 Features:8 Accuracy:0.78436
metrics:gini max_depth:30 Features:9 Accuracy:0.78091
metrics:gini max_depth:30 Features:10 Accuracy:0.77631
metrics:gini max_depth:30 Features:11 Accuracy:0.78436
metrics:gini max_depth:30 Features:12 Accuracy:0.78033
metrics:gini max_depth:30 Features:13 Accuracy:0.77688
metrics:gini max_depth:30 Features:14 Accuracy:0.77918
metrics:gini max_depth:30 Features:15 Accuracy:0.78148
metrics:gini max_depth:30 Features:16 Accuracy:0.78033
metrics:gini max_depth:30 Features:17 Accuracy:0.77746
metrics:gini max_depth:30 Features:18 Accuracy:0.77918
metrics:gini max_depth:30 Features:19 Accuracy:0.77918
metrics:gini max_depth:30 Features:20 Accuracy:0.78148
metrics:gini max_

In [21]:
# Show the encoded test-set features (bare expression, rendered by the notebook as a table preview).
space_titanic_df_encoded_test

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Earth,Europa,...,P,S,A,B,C,D,E,F,G,T
0,1,27,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
1,0,19,0,0,9,0,2823,0,1,0,...,0,1,0,0,0,0,0,1,0,0
2,1,31,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
3,0,38,0,0,6652,0,181,585,0,1,...,0,1,0,0,1,0,0,0,0,0
4,0,20,0,10,0,635,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4273,0,42,0,0,847,17,10,144,1,0,...,0,1,0,0,0,0,0,1,0,0
4274,1,18,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4275,0,18,0,0,2680,0,0,523,0,1,...,1,0,0,0,0,1,0,0,0,0


In [35]:
# Best model: the grid-search row with the highest validation accuracy.
best_row = result_df.loc[result_df['acc_score_val'].idxmax(), :]
print(best_row)

# Predict with the model that was trained on the training split only.
best_rand_forest = best_models[best_row['idx']]
test_features = space_titanic_df_encoded_test.copy()
test_predictions = best_rand_forest.predict(test_features)

# Submission is a boolean Series named 'Transported', aligned with the test index.
submission_1 = pd.Series(test_predictions == 1, index=test_features.index, name='Transported')


metrics           entropy
n_tree                 40
features                3
acc_score_val    0.790684
idx                   189
Name: 189, dtype: object




In [34]:
# Refit the best configuration on ALL labelled data (training + validation rows).
# Fitting on X_train/y_train here would only retrain the configuration already
# used for submission_1, on the same 80% split.
best_idx = result_df['acc_score_val'].idxmax()
best_params = result_df.loc[best_idx]

# Same feature/target layout as the training cell, but without the hold-out split.
X_full = space_titanic_df_encoded.loc[:, space_titanic_df_encoded.columns != 'Transported'].values.tolist()
y_full = space_titanic_df_encoded['Transported'].values.tolist()

best_rand_forest = RandomForestClassifier(bootstrap=True,
                                          class_weight=None,
                                          criterion=best_params['metrics'],
                                          max_depth=None,
                                          # cast numpy integers from the results table to plain ints
                                          max_features=int(best_params['features']),
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=int(best_params['n_tree']),
                                          n_jobs=1,
                                          oob_score=True,
                                          random_state=42,
                                          verbose=0,
                                          warm_start=False)
best_rand_forest.fit(X_full, y_full)

# Predict on the test features and keep only the boolean 'Transported' column.
submission_2 = space_titanic_df_encoded_test.copy()
submission_2['Transported'] = best_rand_forest.predict(submission_2)
submission_2['Transported'] = submission_2['Transported'] == 1
submission_2 = submission_2['Transported']

