In [None]:
import numpy as np
import json
from tabulate import tabulate
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Print NumPy arrays in full: with threshold=np.inf an array is never
# summarized with "..." regardless of how many elements it has.
np.set_printoptions(threshold=np.inf)

In [None]:
# The 26 target labels, in alphabetical order.
CLASSES = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
]

# Directory holding the recordings, relative to the notebook.
DATA_DIR = "Dataset/"

# One JSON file per user: User1.json ... User15.json.
FILES = ["User" + str(user_id) + ".json" for user_id in range(1, 16)]

print(FILES)

In [None]:
def load_data():
    """Load every recorded letter from the JSON files listed in FILES.

    Each file name in FILES is opened under DATA_DIR and parsed as JSON. The
    parsed value is iterated, and every entry is read through its "letter"
    and "fingerprint" keys. Entries whose fingerprint is empty are skipped.
    A "<file> done." line is printed after each file.

    Returns:
        tuple: (dataset, for_print) where
            dataset   -- list of [letter, number_of_points, points] lists;
                         points is the entry's fingerprint list itself
            for_print -- list of [letter, number_of_points] lists (the same
                         rows without the points, handy for tabular output)
    """
    dataset = []
    for_print = []

    for file in FILES:
        # JSON text is UTF-8 by specification, so do not depend on the
        # platform's default encoding when reading it.
        with open(DATA_DIR + file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # The file is fully parsed at this point, so it no longer needs to
        # stay open while the entries are processed.
        for letter in data:
            points = letter["fingerprint"]
            num = len(points)

            # Recordings without any points carry no information.
            if num == 0:
                continue

            l = letter["letter"]
            dataset.append([l, num, points])
            for_print.append([l, num])

        print(str(file) + " done.")

    return dataset, for_print



# Read all user files and report how many letter samples were collected.
dataset, for_print = load_data()
print("\nDataset length: %d" % len(dataset))

In [None]:
# Load the samples from disk (this also restores the untouched point lists
# if the resampling cell below has already been run).
dataset, for_print = load_data()

print(f"\nDataset length: {len(dataset)}")
#print(tabulate(for_print, headers=["Letter", "Number of points"]))

In [None]:
# Bring every sample to the same number of points (the dataset average):
# samples with too many points lose randomly chosen ones, samples with too
# few get midpoints inserted between randomly chosen neighbouring points.

# A dedicated, seeded generator keeps the resampling reproducible across
# kernel restarts without touching the global `random` state.
RESAMPLE_SEED = 42
resample_rng = random.Random(RESAMPLE_SEED)

if not dataset:
    raise ValueError("Cannot resample: dataset is empty.")

# Count the points that are really there instead of trusting the stored
# counter in d[1].
all_points = sum(len(d[2]) for d in dataset)

average_points = all_points // len(dataset)
#average_points = 350
print("Average number of points: " + str(average_points))
print("--------------------------------")

for d in dataset:
    points = d[2]

    if not points:
        raise ValueError("Cannot resample a sample without points.")

    # Too many points: remove random ones until the target is reached.
    while len(points) > average_points:
        points.pop(resample_rng.randrange(len(points)))

    # Too few points: insert the midpoint of a random neighbouring pair.
    while len(points) < average_points:
        if len(points) < 2:
            # A single point has no neighbour to interpolate with (the old
            # randint(0, -1) call raised here), so duplicate it instead.
            points.append(list(points[0]))
            continue

        before_index = resample_rng.randrange(len(points) - 1)
        after_index = before_index + 1
        before, after = points[before_index], points[after_index]

        # Points are indexed 0..2 here, i.e. treated as [x, y, z] triples.
        new_point = [(before[k] + after[k]) / 2 for k in range(3)]
        points.insert(after_index, new_point)

    # Keep the stored counter in sync with the list.
    d[1] = len(points)

In [None]:
# Split every [label, count, points] record into the feature part (points)
# and the target part (label); the count is not needed any more.
X = [points for _label, _num, points in dataset]
y = [label for label, _num, _points in dataset]

#print(X[0])

In [None]:
# Flatten each sample (a list of points) into one 1-D feature vector.
# The vector length is taken from the data itself; the previous hard-coded
# 198*3 only worked when the computed average was exactly 198 points.
X_reshaped = [np.asarray(sample).reshape(-1) for sample in X]

# A classifier needs one fixed feature length, so fail early and clearly if
# the samples were not resampled to a common number of points.
feature_lengths = {vector.shape[0] for vector in X_reshaped}
if len(feature_lengths) > 1:
    raise ValueError(
        "Samples have different feature lengths: " + str(sorted(feature_lengths))
    )

In [None]:
# Hold out 15% of the samples for testing, keeping the per-letter proportions
# (stratify=y). The fixed random_state makes the split, and with it every
# score reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped, y, test_size=0.15, stratify=y, random_state=42
)

print(f"X_train length: {len(X_train)}  y_train length: {len(y_train)}")
print(f"X_test length: {len(X_test)}  y_test length: {len(y_test)}")

In [None]:
import matplotlib.pyplot as plt

# Count how many samples of each letter the TRAIN set contains
train_label_count = [0 for _ in range(26)]

for letter in y_train:
    position = CLASSES.index(letter)
    train_label_count[position] += 1

print(train_label_count)

# One bar per letter
figure, axis = plt.subplots()
axis.bar(
    CLASSES,
    train_label_count,
    width=1,
    edgecolor="white",
    linewidth=0.7,
)
plt.show()

In [None]:
# Count how many samples of each letter the TEST set contains

test_label_count = [0 for _ in range(26)]

for letter in y_test:
    position = CLASSES.index(letter)
    test_label_count[position] += 1

print(test_label_count)

# One bar per letter
figure, axis = plt.subplots()
axis.bar(
    CLASSES,
    test_label_count,
    width=1,
    edgecolor="white",
    linewidth=0.7,
)
plt.show()

In [None]:
# Count how many samples of each letter the WHOLE dataset contains

dataset_label_count = [0 for _ in range(26)]

for letter in y:
    position = CLASSES.index(letter)
    dataset_label_count[position] += 1

print(dataset_label_count)

# One bar per letter
figure, axis = plt.subplots()
axis.bar(
    CLASSES,
    dataset_label_count,
    width=1,
    edgecolor="white",
    linewidth=0.7,
)
plt.show()

In [None]:
# COMPARISON: per-letter sample counts of the full dataset, the train split
# and the test split drawn as three bars side by side.
plt.rcParams['figure.figsize'] = [15, 5]
figure, axis = plt.subplots()

x_axis = np.arange(len(CLASSES))
width = 0.25

# (bar offset, counts, colour) for each of the three series, left to right
bar_style = dict(edgecolor="white", linewidth=0.7)
axis.bar(x_axis, dataset_label_count, width, color='darkolivegreen', **bar_style)
axis.bar(x_axis + width, train_label_count, width, color='deepskyblue', **bar_style)
axis.bar(x_axis + width * 2, test_label_count, width, color='salmon', **bar_style)

axis.set_xlabel("Letter")
axis.set_ylabel("Number of")

# Centre each letter under the middle bar of its group
axis.set_xticks(x_axis + width)
axis.set_xticklabels(CLASSES)
axis.legend(['DATASET', 'TRAIN', 'TEST'])

plt.show()

In [None]:
# Search space for the randomized hyper-parameter search of the random forest.

# Ten tree counts spread evenly between 20 and 150.
n_estimators = [int(x) for x in np.linspace(start=20, stop=150, num=10)]

# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt', and recent scikit-learn releases reject it, which makes every
# candidate that draws it fail to fit.
max_features = ['sqrt', 'log2', None]

# Depths 10, 20, ..., 60 plus None (no depth limit).
max_depth = [int(x) for x in np.linspace(10, 60, num=6)]
max_depth.append(None)

# np.linspace(..., num=1) returns just its start value, so these two
# parameters were always single-valued; spell that out explicitly.
min_samples_split = [2]
min_samples_leaf = [1]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# NOTE(review): kf is created but never handed to the search, which runs
# with cv=5 below.
kf = KFold(n_splits=10)
model = RandomForestClassifier()

# Try 10 random combinations from random_grid, each scored with 5-fold
# cross-validation, using all available CPU cores.
rf_random = RandomizedSearchCV(
    model,
    random_grid,
    n_iter=10,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

rf_random.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
# Evaluate the best model found by the randomized search on the held-out test set.
best_random = rf_random.best_estimator_
y_pred = best_random.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# report shows precision and recall exchanged and the support column counts
# predictions instead of true samples. The accuracy is printed as well,
# because a bare expression in the middle of a cell is not displayed.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
### RANDOM SEARCH TOP-4 ###

# Refit a forest with fixed hyper-parameters and measure how often the true
# letter is among its four most probable guesses.
best_random = RandomForestClassifier(n_estimators=105, min_samples_split=2, min_samples_leaf=1, max_features='log2',
                                     max_depth=40, bootstrap=False)
best_random.fit(X_train, y_train)
y_pred = best_random.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred); swapping them exchanges
# precision and recall in the report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# For every test sample: the 4 classes with the highest predicted
# probability, most probable first (argsort is ascending, hence the reversed
# slice over the last four columns).
predictions = best_random.classes_[np.argsort(best_random.predict_proba(X_test))[:, :-4 - 1:-1]]

print(len(y_pred))
test_number = len(y_test)

# hits_by_rank[r] counts the samples whose true label is the (r+1)-th guess.
hits_by_rank = [0, 0, 0, 0]
for true_label, ranked_labels in zip(y_test, predictions):
    for rank, candidate in enumerate(ranked_labels):
        if candidate == true_label:
            hits_by_rank[rank] += 1
            break

top1, top2, top3, top4 = hits_by_rank
counter = sum(hits_by_rank)

counter_per = counter / test_number
top1_per = top1 / test_number
top2_per = top2 / test_number
top3_per = top3 / test_number
top4_per = top4 / test_number

import math
#counter_per = math.floor(counter_per * 10 ** 4) / 10 ** 4
# Format as percentages directly: round(x, 2) * 100 produced float artifacts
# such as 56.99999999999999, and the first line used to show a fraction
# while the others showed percent.
print(f"Predicted in top 4: {counter} ---> {counter_per:.2%}")
print(f"Top1: {top1} ---> {top1_per:.2%}")
print(f"Top2: {top2} ---> {top2_per:.2%}")
print(f"Top3: {top3} ---> {top3_per:.2%}")
print(f"Top4: {top4} ---> {top4_per:.2%}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Narrow grid for the exhaustive search: only the tree count, the depth and
# the seed vary, every other parameter has a single candidate value.
param_grid = dict(
    n_estimators=[198, 200, 202],
    max_features=['log2'],
    max_depth=[33, 34, 35],
    min_samples_split=[2],
    min_samples_leaf=[1],
    bootstrap=[False],
    random_state=[0, 13, 42],
)

# Every combination is scored with 10-fold cross-validation.
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=10, verbose=2)

In [None]:
# Run the exhaustive search over param_grid on the training split.
grid_search.fit(X_train, y_train)

# Best cross-validated score and the parameter combination that achieved it.
print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# Evaluate the best model found by the grid search on the held-out test set.
best_grid = grid_search.best_estimator_

y_pred = best_grid.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# report shows precision and recall exchanged and the support column counts
# predictions instead of true samples.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
### GRID SEARCH TOP-4 ###

# Refit a forest with fixed hyper-parameters and measure how often the true
# letter is among its four most probable guesses.
best_grid = RandomForestClassifier(n_estimators=202, min_samples_split=2, min_samples_leaf=1, max_features='log2',
                                     max_depth=33, bootstrap=False, random_state=13)
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred); swapping them exchanges
# precision and recall in the report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# For every test sample: the 4 classes with the highest predicted
# probability, most probable first (argsort is ascending, hence the reversed
# slice over the last four columns).
predictions = best_grid.classes_[np.argsort(best_grid.predict_proba(X_test))[:, :-4 - 1:-1]]

print(len(y_pred))
test_number = len(y_test)

# hits_by_rank[r] counts the samples whose true label is the (r+1)-th guess.
hits_by_rank = [0, 0, 0, 0]
for true_label, ranked_labels in zip(y_test, predictions):
    for rank, candidate in enumerate(ranked_labels):
        if candidate == true_label:
            hits_by_rank[rank] += 1
            break

top1, top2, top3, top4 = hits_by_rank
counter = sum(hits_by_rank)

counter_per = counter / test_number
top1_per = top1 / test_number
top2_per = top2 / test_number
top3_per = top3 / test_number
top4_per = top4 / test_number

import math
#counter_per = math.floor(counter_per * 10 ** 4) / 10 ** 4
# Format as percentages directly: round(x, 2) * 100 produced float artifacts
# such as 56.99999999999999, and the first line used to show a fraction
# while the others showed percent.
print(f"Predicted in top 4: {counter} ---> {counter_per:.2%}")
print(f"Top1: {top1} ---> {top1_per:.2%}")
print(f"Top2: {top2} ---> {top2_per:.2%}")
print(f"Top3: {top3} ---> {top3_per:.2%}")
print(f"Top4: {top4} ---> {top4_per:.2%}")

In [None]:
# 10-fold cross-validation of best_grid on the training split; report the
# score of every fold and their mean.
scores = cross_val_score(best_grid, X_train, y_train, cv=10)
cv_result = scores.mean()

print("Rezultati:", scores)
print("Prosječni rezultat je: ", cv_result)

In [None]:
### RANDOM FOREST CONFUSION MATRIX ###

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [30, 15]

# Index and label the matrix with the classes the model was actually fitted
# on. A hard-coded CLASSES list only lines up with the rows/columns when all
# 26 letters are present, so a missing letter would mislabel the axes.
disp = ConfusionMatrixDisplay.from_estimator(
    best_grid,
    X_test,
    y_test,
    labels=best_grid.classes_,
    display_labels=best_grid.classes_,
    cmap=plt.cm.Blues
)

disp.ax_.set_title("RANDOM FOREST")

# Save before show(): the figure is written to disk while it is still current.
plt.savefig('RandomForestNew.png', bbox_inches='tight')
plt.show()

In [None]:
### SAVE RANDOM FOREST MODEL ###

import pickle

# File the fitted model is written to (and read back from in the next cell).
model_file_name = "81_95_78.pickle"

# Serialize best_grid with the newest pickle protocol this Python supports.
with open(model_file_name, mode='wb') as model_file:
    pickle.dump(best_grid, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
### LOAD RANDOM FOREST MODEL ###

# NOTE: unpickling executes code stored in the file, so only load pickles
# that you created yourself (here: the file written by the previous cell).
# The with-block closes the file handle, which pickle.load(open(...)) did not.
with open(model_file_name, 'rb') as f:
    loaded_model = pickle.load(f)

# Sanity check: score the restored model on the test split.
result = loaded_model.score(X_test, y_test)
print(result)