# Dathaton FIAP - PASSOS MÁGICOS

## Importing Libraries

In [2]:
from dotenv import load_dotenv

import os
from pathlib import Path

import pandas as pd

from pycaret.classification import setup, compare_models, predict_model, tune_model, finalize_model, plot_model, evaluate_model, save_model, load_model

import psycopg2 as ps

## Loading Data

In [3]:
load_dotenv()

DB_KEY = os.getenv("DB_KEY")
DB_NAME = os.getenv("DB_NAME")

In [4]:
conn = ps.connect(
    dbname='PEDE_PASSOS',
    user=DB_NAME,
    host='localhost',
    port='5432',
    password=DB_KEY)

In [5]:
data = pd.read_sql("SELECT * FROM pede_passos.students_avaliation", con=conn, )

In [6]:
# path = Path('../data/raw/PEDE_PASSOS_DATASET_FIAP.csv')

# data = pd.read_csv(path, sep=';')

## Data Wrangling

In [7]:
melt_data = data.melt(id_vars=['NOME'], var_name='feature', value_name='value')
melt_data.rename(columns={'NOME': 'name'}, inplace=True)
melt_data['feature'] = melt_data['feature'].astype(str)
melt_data[['feature', 'year']] = melt_data['feature'].str.rsplit('_', n=1, expand=True)

In [8]:
fase = melt_data

In [9]:
fase['feature'] = fase['feature'].str.replace('FASE_TURMA', 'FASE')
fase = fase[fase['feature']=='FASE']
fase.loc[fase['year'] == '2020', 'value'] = fase.loc[fase['year'] == '2020', 'value'].str[:-1]

In [10]:
fase.dropna(inplace=True, axis=0, how='any')
fase['value'] = pd.to_numeric(fase['value'], errors='coerce')
fase.dropna(subset=['value'], inplace=True)

scholars = fase[fase['value'] <= 7]
academics = fase[fase['value'] > 7]

In [11]:
fase[(fase['year'] != '2020') & fase['value'] < 7]

Unnamed: 0,name,feature,value,year
4047,ALUNO-1,FASE,2.0,2020
4049,ALUNO-3,FASE,3.0,2020
4050,ALUNO-4,FASE,1.0,2020
4051,ALUNO-5,FASE,2.0,2020
4054,ALUNO-8,FASE,4.0,2020
...,...,...,...,...
52603,ALUNO-1342,FASE,2.0,2022
52604,ALUNO-1343,FASE,2.0,2022
52607,ALUNO-1346,FASE,0.0,2022
52609,ALUNO-1348,FASE,4.0,2022


In [12]:
scholars

Unnamed: 0,name,feature,value,year
4047,ALUNO-1,FASE,2.0,2020
4049,ALUNO-3,FASE,3.0,2020
4050,ALUNO-4,FASE,1.0,2020
4051,ALUNO-5,FASE,2.0,2020
4054,ALUNO-8,FASE,4.0,2020
...,...,...,...,...
52603,ALUNO-1342,FASE,2.0,2022
52604,ALUNO-1343,FASE,2.0,2022
52607,ALUNO-1346,FASE,0.0,2022
52609,ALUNO-1348,FASE,4.0,2022


In [13]:
melt_data['id'] = melt_data['name'] + '_' + melt_data['year']
melt_data.set_index('id', inplace=True)

melt_data.drop(columns=['name', 'year'], inplace=True)

In [14]:
values_to_keep = ['IAA', 'IEG', 'IPS', 'IDA', 'IPP','IPV', 'IAN', 'PEDRA']

melt_data = melt_data[melt_data['feature'].isin(values_to_keep)]

melt_data = melt_data.pivot(columns='feature', values='value')
melt_data.columns.name = None

In [15]:
scholars_to_keep = scholars['name'] + '_' + scholars['year']
scholars_data = melt_data.loc[melt_data.index.isin(scholars_to_keep)]

academics_to_keep = academics['name'] + '_' + academics['year']
academics_data = melt_data.loc[melt_data.index.isin(academics_to_keep)]

In [16]:
scholars_data

Unnamed: 0_level_0,IAA,IAN,IDA,IEG,IPP,IPS,IPV,PEDRA
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ALUNO-1000_2022,9.50002,10.0,2.0,8.093434343434343,6.458333333333334,7.5,6.611106666666666,Ágata
ALUNO-1001_2021,9.5,10.0,6.9,10.0,8.4,7.5,10.0,Topázio
ALUNO-1001_2022,9.000020000000001,5.0,8.083333333333334,8.452380952380953,5.625,5.625,7.75,Ametista
ALUNO-1002_2020,9.16668,5,8.333333333333334,9.4,8.125,7.5,7.944446666666668,Ametista
ALUNO-1002_2021,8.8,5.0,7.5,8.3,7.7,7.5,8.1,Ametista
...,...,...,...,...,...,...,...,...
ALUNO-998_2022,9.50002,10.0,6.166666666666666,9.668209876543209,5.625,5.625,7.499995,Ametista
ALUNO-999_2020,8.33334,10,7.333333333333333,5.8,6.875,3.125,7.72222,Ágata
ALUNO-999_2021,7.9,10.0,6.4,6.1,6.9,4.4,7.2,Ágata
ALUNO-99_2022,8.75001,5.0,7.25,10.0,7.5,7.5,7.2916625,Ametista


In [17]:
scholars_data.dropna(axis=0, subset='PEDRA', inplace=True)

pedras = ['Ágata', 'Topázio', 'Ametista', 'Quartzo']

scholars_data = scholars_data[melt_data['PEDRA'].isin(pedras)]

In [18]:
for col in scholars_data.columns:
	if col != 'PEDRA':
		scholars_data[col] = pd.to_numeric(scholars_data[col], errors='coerce')
scholars_data = scholars_data.dropna()

In [19]:
target = {'Quartzo': 0, 'Ágata': 1, 'Ametista': 2, 'Topázio': 3}

scholars_data['PEDRA'] = scholars_data['PEDRA'].map(target)

## Data Modeling

In [20]:
# Configurar o ambiente de PyCaret
clf = setup(data=scholars_data, target='PEDRA', session_id=123, experiment_name='pedra_prediction')

# Comparar todos os modelos disponíveis e selecionar o melhor
best_model = compare_models()

# Afinar o melhor modelo
tuned_model = tune_model(best_model)

# Finalizar o modelo
final_model = finalize_model(tuned_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,PEDRA
2,Target type,Multiclass
3,Original data shape,"(2249, 8)"
4,Transformed data shape,"(2249, 8)"
5,Transformed train set shape,"(1574, 8)"
6,Transformed test set shape,"(675, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8856,0.0,0.8856,0.8871,0.8853,0.8355,0.8362,0.348
qda,Quadratic Discriminant Analysis,0.8627,0.0,0.8627,0.8643,0.8619,0.8017,0.8029,0.004
et,Extra Trees Classifier,0.8609,0.9765,0.8609,0.8638,0.8598,0.7982,0.7998,0.043
lda,Linear Discriminant Analysis,0.8602,0.0,0.8602,0.8652,0.8582,0.7963,0.7988,0.015
gbc,Gradient Boosting Classifier,0.8539,0.0,0.8539,0.8567,0.8527,0.7882,0.7897,0.156
lightgbm,Light Gradient Boosting Machine,0.8533,0.9708,0.8533,0.8566,0.8527,0.7885,0.7898,0.906
knn,K Neighbors Classifier,0.8526,0.9616,0.8526,0.8546,0.8516,0.787,0.7883,0.029
rf,Random Forest Classifier,0.8513,0.9717,0.8513,0.8556,0.8503,0.7841,0.786,0.042
nb,Naive Bayes,0.7922,0.9472,0.7922,0.7972,0.7891,0.6954,0.699,0.007
dt,Decision Tree Classifier,0.7795,0.8413,0.7795,0.7832,0.7794,0.6846,0.686,0.006


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8734,0.0,0.8734,0.8754,0.8737,0.8201,0.8205
1,0.9177,0.0,0.9177,0.9192,0.9171,0.8816,0.8824
2,0.9177,0.0,0.9177,0.918,0.9177,0.8828,0.8829
3,0.8987,0.0,0.8987,0.8988,0.8986,0.8547,0.8548
4,0.8662,0.0,0.8662,0.8654,0.8643,0.8059,0.8067
5,0.8726,0.0,0.8726,0.8769,0.8734,0.8179,0.819
6,0.8917,0.0,0.8917,0.8916,0.8912,0.8432,0.8436
7,0.8917,0.0,0.8917,0.8927,0.8914,0.8445,0.845
8,0.8599,0.0,0.8599,0.8635,0.8606,0.8011,0.8019
9,0.8662,0.0,0.8662,0.8706,0.8656,0.805,0.8067


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


## Evaluation

In [21]:
# Avaliar o modelo final
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [22]:
# Salvar o modelo
save_model(final_model, '../models/lr_pedra_classification')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['IAA', 'IAN', 'IDA', 'IEG', 'IPP',
                                              'IPS', 'IPV'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=[...
                                                               fill_value=None,
                                                 