In [1]:
import pandas as pd 
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
import pickle
from sklearn.metrics import f1_score , precision_score,recall_score
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from imblearn.over_sampling import SMOTE

In [2]:
# pip install -U scikit-learn

### Data preparation


In [19]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data/cleaned_data.csv")
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [4]:
def grouping_(x):
    """Map a numeric value onto one of three coarse class labels.

    Returns:
        0   when x <= 30,
        50  when 30 < x < 300,
        100 otherwise (x >= 300, or a value such as NaN for which both
            comparisons are false).
    """
    if x <= 30:
        return 0
    if x < 300:
        return 50
    return 100
    
# Bucket the raw rotation values into the three class labels (0 / 50 / 100).
# NOTE(review): the column is overwritten in place, so re-running this cell
# re-buckets already-bucketed labels (100 would become 50) - reload `data` first.
data['Degree_rotation'] = data['Degree_rotation'].apply(grouping_)

In [5]:
# Features: every column except the last; target: the last column.
# NOTE(review): presumably the last column is the bucketed 'Degree_rotation' label - confirm.
feature_frame = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
x = feature_frame.values
y = target_column.values

### Data preprocessing: class balancing (SMOTE) and train/test split

In [20]:
# Balance the classes by synthesising extra samples with SMOTE.
# A fixed random_state makes the resampled data reproducible across runs.
# NOTE(review): x and y are overwritten, so re-running this cell resamples the
# already-resampled data. The oversampling also happens before the train/test
# split, so synthetic points interpolated from rows that later land in the test
# set can end up in the training set - consider resampling the training split only.
oversample = SMOTE(random_state=0)
x, y = oversample.fit_resample(x, y)

# Summarise the class distribution after resampling.
counter = Counter(y)
for k, v in counter.items():
    per = v / len(y) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

# Plot the class distribution.
plt.bar(counter.keys(), counter.values())
plt.show()

In [7]:
# Hold out 25% of the rows for testing. The split is shuffled, stratified on y so
# both parts keep the same class proportions, and seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
    shuffle=True,
    stratify=y,
)

### model training

In [21]:
# Train a small random forest (10 trees, entropy split criterion).
# random_state is fixed so the fitted model and its metrics are reproducible;
# without it every run builds a different forest.
model_R = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
model_R.fit(x_train, y_train)

### model evaluation

In [9]:
# Predict class labels for the held-out test features.
y_pred_R= model_R.predict(x_test)  
# Display the predictions (NumPy abbreviates long arrays in the repr).
y_pred_R

array([ 50,  50, 100, ...,   0, 100, 100])

In [10]:
# Confusion matrix of the test predictions; y_test is passed as the true labels
# and y_pred_R as the predicted labels.
cm= confusion_matrix(y_test, y_pred_R)  

In [22]:
# Overall accuracy on the test set; reused later in the heatmap title.
accuracy = accuracy_score(y_test, y_pred_R)
# print("Accuracy Score:", accuracy)

In [13]:
# Per-class precision, recall, F1-score and support on the test set.
print(classification_report(y_test, y_pred_R))

              precision    recall  f1-score   support

           0       0.87      0.95      0.90     15797
          50       0.92      0.79      0.85     15796
         100       0.94      0.99      0.96     15796

    accuracy                           0.91     47389
   macro avg       0.91      0.91      0.91     47389
weighted avg       0.91      0.91      0.91     47389



In [28]:
import numpy as np
from sklearn.metrics import confusion_matrix

def accuracy_for_each_element(confusion_matrix):
    """Row-normalise a confusion matrix.

    Every cell is divided by the total of its row, so each row shows how the
    samples of that row's class were distributed over the columns.

    Args:
        confusion_matrix: 2-D array-like of counts.

    Returns:
        A float ndarray of the same shape with each value rounded to two
        decimals. Rows whose total is zero are returned as 0.0 rather than NaN.
    """
    # Use the argument itself, not a notebook-level global, so the result
    # always matches the matrix that was passed in.
    counts = np.asarray(confusion_matrix, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # `where` skips the division for empty rows; `out` supplies their 0.0 value.
    fractions = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals != 0,
    )
    return np.round(fractions, 2)

# Recompute the confusion matrix and its row-normalised counterpart.
cm = confusion_matrix(y_test, y_pred_R)
accuracy_matrix = accuracy_for_each_element(cm)

# Flatten the normalised values and shape them back into the 3x3 annotation grid.
group_counts = list(accuracy_matrix.flatten())
labels = np.asarray(group_counts).reshape(3, 3)

# Cell colour follows the raw counts; the printed annotation is the row-normalised value.
class_names = ['0', '50', '100']
ht = sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
ht.set_xticklabels(class_names)
ht.set_yticklabels(class_names)

ht.set_title(f'Random Forest Model Size - 29MB\n{round(accuracy*100, 2)} accuracy')
ht.set_xlabel("Predicted")
ht.set_ylabel("Actual")
plt.show()

# save model

In [15]:
# Serialise the trained model to disk. The context manager closes the file
# handle even if pickling fails, so the file is not left open.
# NOTE: the 'model_zoo' directory must already exist.
f = 'model_zoo/random_forest.pkl'
with open(f, 'wb') as model_file:
    pickle.dump(model_R, model_file)

# Loading & Testing model

In [16]:
# Reload the saved model and run it on one hand-written sample as a smoke test.
# NOTE: pickle.load can execute arbitrary code from the file - only load pickles you trust.
with open('model_zoo/random_forest.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# One sample with six feature values; wrapped in a list so predict() receives a 2-D input.
input_data = np.array([-722, -1918, -194, -15, -8, -1])
result = model.predict([input_data])

In [17]:
# Predicted class label for the single input sample.
result[0]

100