# Random Forest Model (Students Dataset)

In [30]:
import numpy as np
import pandas as pd
import csv 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree as _tree
import json

In [45]:
# Load the students dataset from the CSV file.
df = pd.read_csv('my_full_tcc.csv')

# Dataset dimensions as (rows, columns).
# NOTE(review): the original comment claimed 135 rows and 1909 columns, but the
# recorded cell output reads (107, 1423) — confirm which file version is expected.
df.shape

(107, 1423)

In [46]:
# Every column except the target is used as a feature.
feature_names = df.columns.tolist()

# The target is the first column whose name ends with "s1".
pred = df.filter(regex='s1$', axis=1).columns[0]
feature_names.remove(pred)

X = df[feature_names]

# Binarize the target: 0 when the value is below 10, 1 otherwise.
# NOTE(review): a missing value (NaN) fails the `< 10` comparison and is
# therefore labelled 1 — confirm that this is intended.
y = df[pred].apply(lambda x: 0 if x < 10 else 1)

# Split the dataset (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Train a random forest classifier.
clf = RandomForestClassifier(n_estimators=128, random_state=0)
clf.fit(X_train, y_train)

# Class labels in the order the fitted model uses internally. The per-node
# `tree_.value` arrays are indexed in `clf.classes_` order, so labels looked
# up with an argmax over `value` must come from this array. `y.unique()`
# returns labels in order of first appearance, which swaps the labels
# whenever the first row of the dataset belongs to class 1.
class_names = clf.classes_

# Keep the fitted estimator as the cell's displayed value.
clf


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=128,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [47]:
def tree_to_json(tree, feature_names, class_names):
    """Convert a fitted decision tree into a nested, JSON-friendly dictionary.

    Args:
        tree: Object exposing a ``tree_`` attribute with the parallel arrays
            ``feature``, ``children_left``, ``children_right`` and ``value``.
        feature_names: Sequence mapping a feature index to its display name.
        class_names: Sequence mapping a class index to its label.

    Returns:
        The dictionary of the root node. Internal nodes carry ``"name"`` (the
        split feature) and ``"children"`` (left subtree, then right subtree);
        leaves carry ``"name"`` (the label whose entry in the node's value
        array is largest) and ``"value"`` (that array as nested lists).
    """
    structure = tree.tree_

    # Resolve the split-feature label of every node up front; -2 is the marker
    # stored for nodes that do not split on any feature.
    labels = []
    for feature_index in structure.feature:
        if feature_index == -2:
            labels.append("undefined!")
        else:
            labels.append(feature_names[feature_index])

    def build(node_id):
        left_id = structure.children_left[node_id]
        right_id = structure.children_right[node_id]

        # A leaf has both child pointers set to -1.
        if left_id == -1 and right_id == -1:
            node_value = structure.value[node_id]
            winner = np.argmax(node_value)
            return {"name": class_names[winner], "value": node_value.tolist()}

        return {
            "name": labels[node_id],
            "children": [build(left_id), build(right_id)],
        }

    return build(0)


In [48]:

# Export every tree of the forest together with its accuracy on the test split.
forest_json = []
for i, tree in enumerate(clf.estimators_):
    forest_json.append({
        "id": i,
        "name": f"Arbre {i}",
        "accuracy": tree.score(X_test, y_test),
        "children": [tree_to_json(tree, feature_names, class_names)],
    })

feature_importances = clf.feature_importances_

# Show the ten largest importances, in descending order.
sorted_importances = np.sort(feature_importances)[::-1]
print(sorted_importances[:10])

# Feature positions ranked from most to least important.
indices = np.argsort(feature_importances)[::-1].tolist()

feature_names_ordered = [feature_names[position] for position in indices]

# Feature name -> importance, inserted from most to least important.
importance_dict = dict(zip(feature_names_ordered, feature_importances[indices]))


[0.01388752 0.01172319 0.01096809 0.00916356 0.00888429 0.00884636
 0.00823901 0.00784566 0.00729171 0.00674939]


La classe NumpyEncoder étend json.JSONEncoder et remplace la méthode default pour gérer les types de données NumPy. Lors de la sérialisation JSON, si le code rencontre un type de données NumPy, il le convertit en un type de données Python équivalent qui peut être sérialisé. En particulier, il convertit np.integer en int, np.floating en float, et np.ndarray en liste.


In [49]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy integers, floats and arrays."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float`` and
        arrays become (nested) lists. Anything else is deferred to the base
        class ``default``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

In [99]:
# Export the forest as a JavaScript module (hence the .js extension): the JSON
# payload is wrapped in an `export let` statement.
# NOTE(review): the "gea" suffix differs from the "tcc" dataset loaded above —
# confirm the intended output name.
with open('full_gea.js', 'w') as f:
    f.writelines((
        "export let arbres = ",
        json.dumps(forest_json, cls=NumpyEncoder),
        ";",
    ))

In [7]:
# Report how many individual trees the fitted forest holds.
print("Nombre d'arbres dans la forêt aléatoire :", len(clf.estimators_))

Nombre d'arbres dans la forêt aléatoire : 128


In [8]:
# Score every tree on the test split and report the weakest one.
accuracies = [
    estimator.score(X_test, y_test)
    for estimator in clf.estimators_
]
print("L'accuracy minimale parmi tous les arbres :", min(accuracies))

L'accuracy minimale parmi tous les arbres : 0.4411764705882353


### Matrice de prédiction de chaque élève  

In [9]:

# Prediction matrix of shape (number of test students, number of trees):
# column j holds the predictions of tree j over the whole test split.
# The float cast keeps the dtype of the zero-initialized matrix it replaces.
prediction_matrix = np.column_stack(
    [estimator.predict(X_test) for estimator in clf.estimators_]
).astype(float)


In [103]:
# Display the prediction matrix (rows: test students, columns: trees).
print(prediction_matrix)

[[1. 1. 1. ... 1. 0. 1.]
 [0. 1. 1. ... 1. 0. 1.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [0. 0. 1. ... 1. 0. 0.]]


In [16]:
# Cast the predictions to integers and turn the matrix into a list of lists.
predictions_list = prediction_matrix.astype(int).tolist()

# Serialize the nested list to JSON.
predictions_json = json.dumps(predictions_list)

# Write the JSON into a .js file as an exported JavaScript variable.
with open('predictions_matrix_gea.js', 'w') as f:
    f.writelines(("export let predictions = ", predictions_json, ";"))

In [105]:
# Dimensions of the prediction matrix: (test students, trees).
prediction_matrix.shape

(34, 128)

### Matrice d'exactitude (accuracy) des prédictions pour chaque élève

In [10]:
# Correctness matrix with the same layout as the prediction matrix: entry
# (s, j) is 1 when tree j predicts the true label of test student s, else 0.
# The float cast keeps the dtype of the zero-initialized matrix it replaces.
accuracy_matrix = np.column_stack(
    [
        (estimator.predict(X_test) == y_test).astype(int)
        for estimator in clf.estimators_
    ]
).astype(float)



In [39]:
# Display the correctness matrix (1 = the tree predicted the student's true label).
print(accuracy_matrix)

[[1. 1. 1. ... 1. 0. 1.]
 [0. 1. 1. ... 1. 0. 1.]
 [1. 1. 0. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 0. 0.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 0. ... 0. 1. 1.]]


In [40]:
# Cast the correctness flags to integers and turn the matrix into a list of lists.
accuracy_list = accuracy_matrix.astype(int).tolist()

# Serialize the nested list to JSON.
accuracy_json = json.dumps(accuracy_list)

# Write the JSON into a .js file as an exported JavaScript variable.
with open('accuracy_matrix_gea.js', 'w') as f:
    f.writelines(("export let accuracy = ", accuracy_json, ";"))

### Matrice des valeurs réelles de test 

In [11]:
# True labels of the test split, taken once: they do not depend on the tree.
true_values = y_test.values

# Ground-truth matrix with the same shape as the prediction matrix. Every
# column is the same vector of true labels, so it is broadcast across the tree
# axis in a single assignment instead of being copied column by column inside
# a loop over estimators that were never used.
true_values_matrix = np.zeros(prediction_matrix.shape)
true_values_matrix[:] = true_values[:, np.newaxis]


In [19]:
# Turn the ground-truth matrix into a list of lists. The labels are cast to
# integers first, as the prediction and accuracy exports do, so that the three
# generated files share the same 0/1 number format (previously this file
# contained 0.0/1.0).
true_values_list = true_values_matrix.astype(int).tolist()

# Serialize the nested list to JSON.
true_values_json = json.dumps(true_values_list)

# Write the JSON into a .js file as an exported JavaScript variable.
# NOTE(review): the "tcn" suffix differs from the "gea" suffix of the sibling
# exports and from the "tcc" dataset — confirm the intended output name.
with open('true_values_matrix_tcn.js', 'w') as f:
    f.write("export let trueValues = ")
    f.write(true_values_json)
    f.write(";")


### Informations de chaque nœud des arbres

In [50]:
from sklearn import tree as sktree  # import the sklearn tree module
import math

def tree_to_info(tree, tree_id, accuracy):
    """Describe the split nodes of one fitted tree as nested dictionaries.

    Split-feature labels and class labels are looked up in the notebook-level
    ``feature_names`` and ``class_names``.

    Args:
        tree: Fitted estimator exposing the low-level ``tree_`` structure.
        tree_id: Identifier stored under ``"id"`` in the result.
        accuracy: Score stored under ``"accuracy"`` in the result.

    Returns:
        A dict with the keys ``"id"``, ``"accuracy"`` and ``"info"``.
        ``"info"`` is the description of the root node, or ``None`` when the
        root has no split feature. Each described node has ``name``,
        ``threshold``, ``samples``, ``value``, ``gini``, ``classes`` and
        ``children``; a child without a split feature appears as an empty dict.
    """
    undefined = sktree._tree.TREE_UNDEFINED
    structure = tree.tree_

    # Split-feature label of every node, resolved once.
    labels = [
        "undefined!" if index == undefined else feature_names[index]
        for index in structure.feature
    ]

    def describe(node_id):
        # Nodes without a split feature are not described.
        if structure.feature[node_id] == undefined:
            return None

        node_value = structure.value[node_id].tolist()
        left = describe(structure.children_left[node_id])
        right = describe(structure.children_right[node_id])
        return {
            "name": labels[node_id],
            "threshold": structure.threshold[node_id],
            "samples": structure.n_node_samples[node_id],
            "value": node_value,
            "gini": structure.impurity[node_id],
            "classes": class_names[np.argmax(node_value)],
            # Undescribed children are exported as empty objects.
            "children": [left or {}, right or {}],
        }

    return {
        "id": tree_id,
        "accuracy": accuracy,
        "info": describe(0),
    }



In [51]:
# Describe every tree of the forest. The test accuracy is rounded UP to two
# decimals before being stored.
# NOTE(review): math.ceil always rounds upward (0.441 becomes 0.45) — confirm
# this is preferred over round(..., 2).
forest_info_json = [
    tree_to_info(
        estimator,
        index,
        math.ceil(estimator.score(X_test, y_test) * 100) / 100,
    )
    for index, estimator in enumerate(clf.estimators_)
]

# Write the JSON into a .js file as an exported JavaScript variable.
with open('tree_info_tcc.js', 'w') as f:
    f.writelines((
        "export let treeInfo = ",
        json.dumps(forest_info_json, cls=NumpyEncoder),
        ";",
    ))
