# Datathon FIAP - PASSOS MÁGICOS

## Importing Libraries

In [41]:
from dotenv import load_dotenv

import os
from pathlib import Path

import pandas as pd

from pycaret.classification import setup, compare_models, predict_model, tune_model, finalize_model, plot_model, evaluate_model, save_model, load_model

import psycopg2 as ps

## Loading Data

In [42]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Database credentials read from the environment; each is None when the
# corresponding variable is not set.
DB_KEY = os.environ.get("DB_KEY")
DB_NAME = os.environ.get("DB_NAME")

In [43]:
# Open a psycopg2 connection to the local PEDE_PASSOS database.
# Note: DB_NAME is passed as the login user and DB_KEY as its password.
conn = ps.connect(
    host='localhost',
    port='5432',
    dbname='PEDE_PASSOS',
    user=DB_NAME,
    password=DB_KEY,
)

In [44]:
# Load the whole students_avaliation table into a DataFrame, then release the
# database connection: a psycopg2 connection stays open until close() is
# called, and nothing after this cell needs it.
# NOTE(review): pandas documents SQLAlchemy connectables (and sqlite3) for
# read_sql; a raw psycopg2 connection may emit a UserWarning — confirm this is
# acceptable for the installed pandas version.
try:
    data = pd.read_sql("SELECT * FROM pede_passos.students_avaliation", con=conn)
finally:
    conn.close()

In [45]:
# Alternative data source (disabled): read the raw CSV export instead of
# querying the database. Uncomment both lines below to use it.
# path = Path('../data/raw/PEDE_PASSOS_DATASET_FIAP.csv')

# data = pd.read_csv(path, sep=';')

## Data Wrangling

In [46]:
# Reshape the wide table to long form: one row per (student, original column),
# with the student column renamed from 'NOME' to 'name'.
melt_data = (
    data
    .melt(id_vars=['NOME'], var_name='feature', value_name='value')
    .rename(columns={'NOME': 'name'})
)

# Split each original column name at its last underscore: the left part stays
# in 'feature', the right part becomes 'year'.
melt_data[['feature', 'year']] = (
    melt_data['feature']
    .astype(str)
    .str.rsplit('_', n=1, expand=True)
)

In [47]:
# Work on an independent copy: a plain `fase = melt_data` only creates a second
# name for the same DataFrame, so column assignments made through `fase` would
# also change `melt_data`.
fase = melt_data.copy()

In [48]:
# Unify the phase feature name ('FASE_TURMA' -> 'FASE'). assign() returns a new
# frame, so the DataFrame that `fase` was bound to before this cell is not
# modified.
fase = fase.assign(feature=fase['feature'].str.replace('FASE_TURMA', 'FASE'))

# Keep only the phase rows, as an explicit copy so the assignment below does
# not write into a slice of another frame.
fase = fase[fase['feature'] == 'FASE'].copy()

# Strip the last character from the 2020 values only.
# Assigning the 2020 subset to the whole column (as the previous version did)
# index-aligns it and overwrites every other year's value with NaN; a masked
# .loc assignment leaves the other years untouched.
# NOTE(review): presumably the 2020 values carry a trailing class letter after
# the phase number — confirm against the source data.
is_2020 = fase['year'] == '2020'
fase.loc[is_2020, 'value'] = fase.loc[is_2020, 'value'].str[:-1]

In [49]:
# Drop rows that have a missing value in any column. Rebinding instead of
# inplace=True avoids mutating a frame that was produced by boolean indexing
# (the source of pandas' SettingWithCopyWarning).
fase = fase.dropna(axis=0, how='any')

# Convert the phase value to a number; anything that cannot be parsed becomes
# NaN (errors='coerce') and is then dropped.
fase = (
    fase
    .assign(value=pd.to_numeric(fase['value'], errors='coerce'))
    .dropna(subset=['value'])
)

# Split by phase: values up to and including 7 vs. values above 7.
scholars = fase[fase['value'] <= 7]
academics = fase[fase['value'] > 7]

In [27]:
# Build a '<name>_<year>' identifier, use it as the index, and drop the two
# columns it was built from.
melt_data = (
    melt_data
    .assign(id=melt_data['name'] + '_' + melt_data['year'])
    .set_index('id')
    .drop(columns=['name', 'year'])
)

In [28]:
# Features retained for modeling (seven indicators plus the PEDRA label).
values_to_keep = ['IAA', 'IEG', 'IPS', 'IDA', 'IPP','IPV', 'IAN', 'PEDRA']

# Keep only those features, then pivot back to wide form: one row per id, one
# column per feature. rename_axis(columns=None) clears the 'feature' label that
# pivot leaves on the column axis.
melt_data = (
    melt_data
    .loc[melt_data['feature'].isin(values_to_keep)]
    .pivot(columns='feature', values='value')
    .rename_axis(columns=None)
)

In [29]:
# The four PEDRA labels accepted as valid targets.
pedras = ['Ágata', 'Topázio', 'Ametista', 'Quartzo']

# Remove rows without a PEDRA value, then keep only rows whose PEDRA is one of
# the accepted labels.
melt_data = melt_data.dropna(axis=0, subset='PEDRA')
melt_data = melt_data[melt_data['PEDRA'].isin(pedras)]

In [30]:
# Convert every column except the PEDRA label to numeric; values that cannot be
# parsed become NaN (errors='coerce'), and rows containing any NaN are dropped.
# assign() builds a new frame in one step instead of writing column by column
# into a frame that came from boolean indexing, which avoids pandas'
# SettingWithCopyWarning. Column order is unchanged because assign() overwrites
# existing columns in place.
melt_data = melt_data.assign(**{
    name: pd.to_numeric(melt_data[name], errors='coerce')
    for name in melt_data.columns
    if name != 'PEDRA'
}).dropna()

In [31]:
# Integer code for each PEDRA label, given by its position in this list:
# Quartzo=0, Ágata=1, Ametista=2, Topázio=3.
target = {
    stone: code
    for code, stone in enumerate(['Quartzo', 'Ágata', 'Ametista', 'Topázio'])
}

# Replace the PEDRA labels with their integer codes.
melt_data = melt_data.assign(PEDRA=melt_data['PEDRA'].map(target))

## Data Modeling

In [None]:
# Set up the PyCaret classification experiment on the prepared frame, with
# 'PEDRA' as the target. session_id is fixed at 123 so reruns use the same
# session id.
clf = setup(data=melt_data, target='PEDRA', session_id=123, experiment_name='pedra_prediction')

# Compare all available models and select the best one
best_model = compare_models()

# Tune the best model
tuned_model = tune_model(best_model)

# Finalize the model
# NOTE(review): PyCaret documents finalize_model as refitting on the entire
# dataset, hold-out included — confirm for the installed version, since metrics
# computed on the hold-out split with final_model would then be optimistic.
final_model = finalize_model(tuned_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,PEDRA
2,Target type,Multiclass
3,Original data shape,"(2273, 8)"
4,Transformed data shape,"(2273, 8)"
5,Transformed train set shape,"(1591, 8)"
6,Transformed test set shape,"(682, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8711,0.0,0.8711,0.8726,0.8706,0.8147,0.8156,0.364
lightgbm,Light Gradient Boosting Machine,0.863,0.9713,0.863,0.865,0.8624,0.8027,0.8037,0.865
et,Extra Trees Classifier,0.8624,0.9754,0.8624,0.8643,0.8614,0.8004,0.8017,0.032
rf,Random Forest Classifier,0.8479,0.9715,0.8479,0.8503,0.847,0.7797,0.781,0.038
gbc,Gradient Boosting Classifier,0.8435,0.0,0.8435,0.8457,0.8425,0.7729,0.7743,0.125
lda,Linear Discriminant Analysis,0.8422,0.0,0.8422,0.8477,0.8396,0.7691,0.7727,0.004
qda,Quadratic Discriminant Analysis,0.8353,0.0,0.8353,0.8372,0.8347,0.763,0.7641,0.004
knn,K Neighbors Classifier,0.8328,0.9613,0.8328,0.8345,0.832,0.7583,0.7595,0.01
nb,Naive Bayes,0.775,0.9313,0.775,0.7789,0.772,0.67,0.6736,0.005
dt,Decision Tree Classifier,0.7687,0.8312,0.7687,0.7706,0.7678,0.6682,0.6694,0.006


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8875,0.0,0.8875,0.8883,0.8862,0.8369,0.8381
1,0.8679,0.0,0.8679,0.8733,0.8689,0.8124,0.8139
2,0.9057,0.0,0.9057,0.9074,0.9056,0.8645,0.8649
3,0.8491,0.0,0.8491,0.8487,0.8487,0.783,0.7831
4,0.8742,0.0,0.8742,0.8783,0.8746,0.8221,0.8234
5,0.8616,0.0,0.8616,0.8654,0.8617,0.8034,0.8049
6,0.8616,0.0,0.8616,0.8612,0.861,0.8019,0.8022
7,0.8491,0.0,0.8491,0.853,0.8492,0.7827,0.7837
8,0.8805,0.0,0.8805,0.8795,0.8794,0.8279,0.8284
9,0.8805,0.0,0.8805,0.882,0.8791,0.8271,0.8286


Fitting 10 folds for each of 10 candidates, totalling 100 fits


## Evaluation

In [33]:
# Evaluate the tuned model rather than the finalized one: finalize_model is
# documented to refit on the full dataset including the hold-out split, so
# hold-out diagnostics for final_model would be computed on data it has already
# been trained on.
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [34]:
# Save the model.
# Create the target directory first so the save does not fail when ../models
# does not exist yet (e.g. on a fresh checkout).
Path('../models').mkdir(parents=True, exist_ok=True)
save_model(final_model, '../models/lr_pedra_classification')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['IAA', 'IAN', 'IDA', 'IEG', 'IPP',
                                              'IPS', 'IPV'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=[...
                                                               fill_value=None,
                                                 