In [1]:
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# carica il dataset
ID = 2054551
np.random.seed(ID)
data=np.genfromtxt('dataset_3Body_NSC.dat',skip_header=1)
data_mod=np.delete(data,[0,7,14,15,16,20,21],1)



In [4]:
# divide il dataset in training set e test set
X=data_mod[:,:-2]
Y=data_mod[:,-2]
M=data_mod[:,-1]
# normalizza i dati
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
permutation = np.random.permutation(X.shape[0])

X = X[permutation]
Y = Y[permutation]

m_training=40000
m_test=60000

X_train, X_test = X[:m_training], X[m_training:m_training+m_test]
Y_train, Y_test = Y[:m_training], Y[m_training:m_training+m_test]


# definisci i parametri del classificatore
n_estimators = [100,400,700,1000]
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]


In [11]:
# crea il dizionario dei parametri
n_estimators = [1000]
min_samples_split = [10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True]
random_grid = {'n_estimators': n_estimators,
               #'max_features': max_features,
               'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'max_leaf_nodes':[200,350,450]}

# crea l'oggetto RandomizedSearchCV
rfc = RandomForestClassifier(oob_score=True)
rf_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, 
                               n_iter=10, cv=3, verbose=2, random_state=ID, 
                               return_train_score=True) #n_jobs=-1)

# addestra il modello sui dati di training
rf_random.fit(X_train, Y_train)




Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END bootstrap=True, max_leaf_nodes=200, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_leaf_nodes=200, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_leaf_nodes=200, min_samples_split=10, n_estimators=1000; total time= 1.0min
[CV] END bootstrap=True, max_leaf_nodes=350, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_leaf_nodes=350, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_leaf_nodes=350, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_leaf_nodes=450, min_samples_split=10, n_estimators=1000; total time= 1.2min
[CV] END bootstrap=True, max_leaf_nodes=450, min_samples_split=10, n_estimators=1000; total time= 1.2min
[CV] END bootstrap=True, max_leaf_nodes=450, min_samples_split=10, n_estimators=1000; total time= 1.2

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(oob_score=True),
                   param_distributions={'bootstrap': [True],
                                        'max_leaf_nodes': [200, 350, 450],
                                        'min_samples_split': [10],
                                        'n_estimators': [1000]},
                   random_state=2054551, return_train_score=True, verbose=2)

In [12]:
# valuta le prestazioni del modello sul set di test
Y_pred = rf_random.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)

# stampa i risultati
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification report:\n", report)

Accuracy: 66.57%
Classification report:
               precision    recall  f1-score   support

         0.0       0.72      0.84      0.78     32041
         1.0       0.54      0.57      0.56     12482
         2.0       0.44      0.05      0.08      5726
         3.0       0.65      0.57      0.60      9751

    accuracy                           0.67     60000
   macro avg       0.59      0.51      0.50     60000
weighted avg       0.64      0.67      0.64     60000



In [13]:
import pandas as pd
all_scores=pd.DataFrame(rf_random.cv_results_)
print(rf_random.best_params_)
print("\nAll scores on the grid:","\n",
      all_scores)

{'n_estimators': 1000, 'min_samples_split': 10, 'max_leaf_nodes': 450, 'bootstrap': True}

All scores on the grid: 
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      62.290666      1.073203         1.586902        0.018235   
1      66.131677      0.307079         1.729416        0.024359   
2      68.723566      0.573356         1.849472        0.116677   

  param_n_estimators param_min_samples_split param_max_leaf_nodes  \
0               1000                      10                  200   
1               1000                      10                  350   
2               1000                      10                  450   

  param_bootstrap                                             params  \
0            True  {'n_estimators': 1000, 'min_samples_split': 10...   
1            True  {'n_estimators': 1000, 'min_samples_split': 10...   
2            True  {'n_estimators': 1000, 'min_samples_split': 10...   

   split0_test_score  split1_test_score  split2_

In [14]:
print(rf_random.best_score_)

0.6682500739584637


In [16]:
rf_random.score(X_test,Y_test)

0.6657333333333333