# Random Forest - Model Training

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [57]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

## Import Dataset

In [58]:
dataset = pd.read_csv('Dataset/CleanedExtractedFeatures.csv')

## Visualize Dataset

In [59]:
dataset.head()

Unnamed: 0,mean,std_dev,energy,entropy,num_peaks,lbp_0,lbp_1,ClassLabel
0,0.094581,0.058153,0.049309,1.09733,0.0,2,3,1
1,0.09397,0.051851,0.046076,1.17812,1.0,0,3,1
2,0.039831,0.020652,0.008052,1.217346,1.0,0,3,1
3,0.07755,0.050505,0.034259,1.115788,1.0,0,3,1
4,0.093627,0.051805,0.045799,1.178472,1.0,0,3,1


In [60]:
print(dataset.isnull().sum())

mean          0
std_dev       0
energy        0
entropy       0
num_peaks     0
lbp_0         0
lbp_1         0
ClassLabel    0
dtype: int64


## Seperate X-Y values, Train-Test values

In [61]:
X = dataset.drop(columns=['ClassLabel'])
Y = dataset['ClassLabel']

## Train the Model - Basics

### Train the Model

In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, Y_train)

### Predict test set from the model and check accuracy

In [62]:
predict = model.predict(X_test)

In [63]:
accuracy_score(Y_test, predict)

0.4482758620689655

In [64]:
confusion_matrix(Y_test, predict)

array([[ 2,  3,  4,  7,  1],
       [ 0, 43,  0, 10,  2],
       [ 2,  0, 16, 11,  6],
       [10,  5,  4, 13,  8],
       [ 1,  2, 12,  8,  4]], dtype=int64)

In [65]:
print(classification_report(Y_test, predict))

              precision    recall  f1-score   support

           1       0.13      0.12      0.12        17
           2       0.81      0.78      0.80        55
           3       0.44      0.46      0.45        35
           4       0.27      0.33      0.29        40
           5       0.19      0.15      0.17        27

    accuracy                           0.45       174
   macro avg       0.37      0.37      0.37       174
weighted avg       0.45      0.45      0.45       174



### Cross validation 

In [66]:
scores = cross_val_score(model, X, Y, cv=5) # 5-fold cross-validation

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())

Cross-validation scores: [0.39655172 0.39655172 0.43103448 0.49565217 0.46086957]
Mean cross-validation score: 0.43613193403298356


## Train the Model - Finding the BEST Case

In [None]:
n_estimators_range = range(10, 201, 10)  # n_estimators: DECISION TREES
cross_validation_scores = []

for n_estimator in n_estimators_range:
    modelRF = RandomForestClassifier(n_estimators=n_estimator, random_state=42)
    scores = cross_val_score(modelRF, X_train, Y_train, cv=10)  # 10-fold cross validation
    cross_validation_scores.append(scores.mean()) # Mean cross-validation score

# Plot the cross-validation accuracy against n_estimators values
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, cross_val_scores, marker='o')
plt.title('Cross-Validation Accuracy for Different n_estimators Values')
plt.xlabel('Number of Trees (n_estimators)')
plt.ylabel('Cross-Validation Accuracy')
plt.grid(True)
plt.show()

# Find and print the optimal n_estimators value
optimal_n_estimators = n_estimators_range[np.argmax(cross_val_scores)]
print(f"The optimal number of trees is {optimal_n_estimators} with cross-validation accuracy of {max(cross_val_scores):.4f}")
