In [1]:
# importação das bibliotecas
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_edu = pd.read_csv('/home/amador/dados/xAPI-Edu-Data.csv')

In [3]:
df_edu.head(3)

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L


In [4]:
# verificando classes
df_edu['Class'].value_counts()

M    211
H    142
L    127
Name: Class, dtype: int64

In [5]:
# verificando registros nulos
df_edu.isnull().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

In [6]:
# codificando os atributos numéricos
Features = df_edu
Cat_Colums = Features.dtypes.pipe(lambda Features: Features[Features=='object']).index
for col in Cat_Colums:
    label = LabelEncoder()
    Features[col] = label.fit_transform(Features[col])

In [7]:
# separando dados
dataset = df_edu.drop('Class',axis=1)
classes = df_edu['Class']

## Random Forest

In [8]:
# classificador
# random_state=1 para garantir o mesmo resultado em cada algoritmo (na separação dos dados)
# n_estimator=100
random_clf = RandomForestClassifier(random_state=1,n_estimators=100)

In [9]:
# cross validation
resultados_random = cross_val_predict(random_clf, dataset, classes, cv=5)
print(classification_report(classes,resultados_random))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65       142
           1       0.77      0.78      0.77       127
           2       0.63      0.63      0.63       211

    accuracy                           0.67       480
   macro avg       0.68      0.68      0.68       480
weighted avg       0.67      0.67      0.67       480



## Decision Tree

In [10]:
tree_clf = DecisionTreeClassifier(random_state=1) # random_state=1 para garantir a mesma semente
resultados_tree = cross_val_predict(tree_clf,dataset,classes,cv=5)
print(classification_report(classes,resultados_tree))

              precision    recall  f1-score   support

           0       0.50      0.61      0.55       142
           1       0.74      0.68      0.70       127
           2       0.54      0.49      0.52       211

    accuracy                           0.57       480
   macro avg       0.59      0.59      0.59       480
weighted avg       0.58      0.57      0.58       480



## Verificando Overfitting

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_edu.drop('Class',axis=1),df_edu['Class'],test_size=0.3,random_state=1)

In [12]:
def compara_modelos_random_forest(maxdepth):
    if maxdepth == 0:
        rf = RandomForestClassifier(n_estimators=100,random_state=1)
    else: 
        rf = RandomForestClassifier(n_estimators=100,random_state=1, max_depth=maxdepth)
    rf.fit(X_train, y_train)
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_test, y_test)
    return train_score,test_score

In [13]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_random_forest(2))))
print('{:1}         {} '.format(3,str(compara_modelos_random_forest(3))))
print('{:1}         {} '.format(4,str(compara_modelos_random_forest(4))))
print('{:1}         {} '.format(10,str(compara_modelos_random_forest(10))))
print('{:1}         {} '.format(15,str(compara_modelos_random_forest(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_random_forest(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.75, 0.6180555555555556) 
3         (0.8244047619047619, 0.6805555555555556) 
4         (0.8720238095238095, 0.7152777777777778) 
10         (1.0, 0.7569444444444444) 
15         (1.0, 0.7986111111111112) 
Full         (1.0, 0.7986111111111112) 


In [14]:
def compara_modelos_decision_tree(maxdepth):
    if maxdepth == 0:
        df = DecisionTreeClassifier(random_state=1)
    else: 
        df = DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    df.fit(X_train, y_train)
    train_score = df.score(X_train, y_train)
    test_score = df.score(X_test, y_test)
    return train_score,test_score

In [15]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_decision_tree(2))))
print('{:1}         {} '.format(3,str(compara_modelos_decision_tree(3))))
print('{:1}         {} '.format(4,str(compara_modelos_decision_tree(4))))
print('{:1}         {} '.format(10,str(compara_modelos_decision_tree(10))))
print('{:1}         {} '.format(15,str(compara_modelos_decision_tree(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_decision_tree(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.6398809523809523, 0.6805555555555556) 
3         (0.7321428571428571, 0.7013888888888888) 
4         (0.7916666666666666, 0.7430555555555556) 
10         (0.9910714285714286, 0.6875) 
15         (1.0, 0.6944444444444444) 
Full         (1.0, 0.6944444444444444) 


## Tunning do Modelo

In [16]:
# GridSearchCV para testes de Hyperparametros
from sklearn.model_selection import GridSearchCV

In [17]:
# Lista de possíveis valores de estimators ou quantidade de árvores da floresta.
valores_estimators = [10, 20, 50, 100, 150]
# Lista de possíveis valores para o critério de divisão.
valores_criterion = ['gini','entropy']
# Lista de possíveis valores para a profundidade máxima de cada árvore
valores_max_depth = [10, 20, 50, 100]
# Lista de possíveis valores para os parametros min_samples_split e min_samples_leaf.
valores_min_samples_split = [2, 5, 10,15]
valores_min_samples_leaf = [1, 5, 10,15]

In [18]:
# Define um dicionário que recebe as listas de parâmetros e valores
parametros_grid = dict(n_estimators=valores_estimators,
                       criterion=valores_criterion,
                       max_depth=valores_max_depth,
                       min_samples_split=valores_min_samples_split,
                       min_samples_leaf=valores_min_samples_leaf 
                      )

In [19]:
parametros_grid

{'n_estimators': [10, 20, 50, 100, 150],
 'criterion': ['gini', 'entropy'],
 'max_depth': [10, 20, 50, 100],
 'min_samples_split': [2, 5, 10, 15],
 'min_samples_leaf': [1, 5, 10, 15]}

In [20]:
# Instanciando o GridSearch com o modelo a ser utilizado, parametros, número de folds e scoring.
rf = RandomForestClassifier()
grid = GridSearchCV(rf, parametros_grid, cv=5, scoring='accuracy')

In [21]:
# Aplicando o GridSearch passando as features e classes
grid.fit(df_edu.drop('Class',axis=1),df_edu['Class'])

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 50, 100],
                         'min_samples_leaf': [1, 5, 10, 15],
                         'min_samples_split': [2, 5, 10, 15],
                         'n_estimators': [10, 20, 50, 100, 150]},
             scoring='accuracy')

In [22]:
# resultados
grid.cv_results_

{'mean_fit_time': array([0.03159409, 0.03119602, 0.07708888, 0.15568519, 0.24426641,
        0.01646633, 0.0305727 , 0.07542157, 0.15619941, 0.23582888,
        0.0182878 , 0.03031859, 0.08171515, 0.15137868, 0.21616654,
        0.01640587, 0.02948718, 0.07355032, 0.13984861, 0.21497655,
        0.01619687, 0.03460774, 0.08933344, 0.18443656, 0.28990903,
        0.02818618, 0.04341903, 0.08684206, 0.15058932, 0.29002643,
        0.02734118, 0.03688173, 0.09055576, 0.15791831, 0.23108377,
        0.01947222, 0.03599772, 0.07406321, 0.15005879, 0.22596622,
        0.01592832, 0.02952318, 0.07888532, 0.14799604, 0.22615294,
        0.01961107, 0.03473449, 0.07433414, 0.13325486, 0.22686915,
        0.01804113, 0.03146229, 0.07872424, 0.15479074, 0.2107553 ,
        0.01583896, 0.03483825, 0.07691298, 0.14729743, 0.2223372 ,
        0.01775737, 0.03091912, 0.07821937, 0.15211878, 0.22358928,
        0.01721096, 0.0348124 , 0.08262434, 0.15054708, 0.20822854,
        0.02048306, 0.03260622,

In [23]:
# melhores parâmetros
grid.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 10,
 'min_samples_split': 5,
 'n_estimators': 20}

In [24]:
# melhores scores
grid.best_score_

0.7291666666666667