In [1]:
import random, os, sys
import matplotlib.pyplot as plt
import numpy as np
from skimage.io import *

In [2]:
dataset = np.genfromtxt('dataset.csv', delimiter=',')

In [3]:
dataset_features = [h[0:len(h)-1] for h in dataset]
dataset_labels = [h[len(h)-1] for h in dataset]

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
import data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(dataset_features,
                                                    dataset_labels,
                                                    test_size=0.25,
                                                    random_state=42)

In [6]:
print("Train vs Test: " + str([len(X_train), len(X_test)]))

Train vs Test: [3737, 1246]


In [None]:
param_grid = {
    'n_estimators': [70, 71, 72],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}

rf = RandomForestClassifier(min_samples_split=10, min_samples_leaf=5, random_state=42)

grid_scores, best_score, best_params, test_score = data.validate_model(model=rf, params=param_grid,
        train_data=[X_train, y_train], test_data=[X_test, y_test])

In [None]:
print(grid_scores)
print('Random Forest best score: {}'.format(best_score))
print('Random Forest best params : {}'.format(best_params))
print('Random Forest test score : {}'.format(test_score))

In [7]:
rf = RandomForestClassifier(criterion='entropy',
                            max_depth=8,
                            max_features='auto',
                            n_estimators=71,
                            min_samples_split=10, min_samples_leaf=5, random_state=42)

kfold = KFold(n_splits=10, random_state=7)
cv_results = cross_val_score(rf, dataset_features, dataset_labels, cv=kfold, scoring="accuracy")

In [8]:
cv_results.mean()

0.6649177873819929

In [9]:
cv_results

array([0.56913828, 0.5490982 , 0.59118236, 0.71686747, 0.71485944,
       0.68473896, 0.71084337, 0.69879518, 0.70481928, 0.70883534])

In [10]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [14]:
print(classification_report(y_test, rf_pred))

             precision    recall  f1-score   support

        0.0       0.79      0.75      0.77       710
        1.0       0.69      0.74      0.72       536

avg / total       0.75      0.75      0.75      1246



In [11]:
mean_squared_error(y_test, rf_pred)

0.25441412520064205

In [12]:
np.sqrt(mean_squared_error(y_test, rf_pred))

0.5043948108383373