# Datathon FIAP - PASSOS MÁGICOS

## Importing Libraries

In [1]:
%%capture
# hide output, exclude the line above if you want to see the output of this installation
!pip install pycaret

In [2]:
from dotenv import load_dotenv

import os
from pathlib import Path

import pandas as pd
import numpy as np

# from pycaret.classification import setup, compare_models, predict_model, tune_model, finalize_model, plot_model, evaluate_model, save_model, load_model
import pycaret.classification as pc
# pc = pycaret.classification

import psycopg2

import plotly.express as px
import plotly.graph_objects as go

## Connecting to Database

In [3]:
# Load the .env file, then read the two database variables from the environment.
# Both values are None when the corresponding variable is not set.
load_dotenv()

DB_KEY, DB_NAME = (os.getenv(var_name) for var_name in ("DB_KEY", "DB_NAME"))

In [4]:
# Open the connection to the local PEDE_PASSOS database.
# DB_NAME is used as the user name and DB_KEY as the password.
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    dbname='PEDE_PASSOS',
    user=DB_NAME,
    password=DB_KEY,
)

## Loading Data

In [5]:
# Pull every row and column of pede_passos.students_grades into a DataFrame
data = pd.read_sql(
    sql="SELECT * FROM pede_passos.students_grades",
    con=conn,
)

## Preprocessing Data

In [6]:
# Only a subset of the database columns is used: some preprocessing and grouping were
# already done in KNIME, and selecting explicitly makes it easy to add columns later.
# .copy() gives an independent frame, so the assignments below never write into `data`.
treated_data = data[['NOME', 'ANO', 'FASE', 'IAA', 'IAN', 'IDA', 'IEG', 'IPP', 'IPS', 'IPV',
                     'INDE', 'PEDRA']].copy()

# 'NOME' follows inconsistent criteria between the previous and the new databases, so it is removed.
# The drop is applied to the selected frame (dropping from `data` would undo the selection above).
treated_data = treated_data.drop(columns=['NOME'])

# Adjusting 'FASE' column values: each year carries a trailing suffix of a different
# length, which is cut off here (year -> number of trailing characters to remove)
fase_suffix_length = {'2020': 1, '2021': 2, '2022': 2, '2024': 1}
for year, n_chars in fase_suffix_length.items():
    year_rows = treated_data['ANO'] == year
    treated_data.loc[year_rows, 'FASE'] = treated_data.loc[year_rows, 'FASE'].str[:-n_chars]

# Strip the literal 'FASE ' prefix and encode anything starting at 'ALF' as phase 0
treated_data['FASE'] = treated_data['FASE'].str.replace('FASE ', '', regex=False)
treated_data['FASE'] = treated_data['FASE'].str.replace(r'ALF.*', '0', regex=True)

# 'ANO' was only needed for the adjustment above; it is removed by the same logic as 'NOME'
treated_data = treated_data.drop(columns=['ANO'])

In [7]:
# Converting columns to numeric
# The indicator columns are text with ',' as separator; swap it for '.' before parsing
columns_to_adj = ['IAA', 'IAN', 'IDA', 'IEG', 'IPP', 'IPS', 'IPV', 'INDE']
for col in columns_to_adj:
    treated_data[col] = treated_data[col].str.replace(',', '.', regex=False)

# Every column except the target is parsed as a number (unparseable values become NaN).
# The target is excluded by name rather than by position, so 'PEDRA' is never coerced
# to NaN if the column order changes.
numeric_columns = [col for col in treated_data.columns if col != 'PEDRA']
for col in numeric_columns:
    treated_data[col] = pd.to_numeric(treated_data[col], errors='coerce')

# Adjusting 'PEDRA' column values
pedras = ['Quartzo', 'Ágata', 'Ametista', 'Topázio']

# Normalise the unaccented spelling, then blank out (NaN) anything that is not a known stone
treated_data['PEDRA'] = treated_data['PEDRA'].str.replace('Agata', 'Ágata', regex=False)
treated_data['PEDRA'] = treated_data['PEDRA'].where(treated_data['PEDRA'].isin(pedras))

In [8]:
# 'PEDRA' is defined by different criteria for scholars and for academic students, so the
# two groups are separated by phase: FASE below 8 is a scholar, anything else (including
# a missing FASE) counts as academic.
# The academic records are left out of this analysis for now: they only cover 2020 and
# amount to just 24 entries. 'FASE' is not kept once the filter has been applied.
treated_data = (
    treated_data
    .loc[lambda frame: frame['FASE'] < 8]
    .drop(columns=['FASE'])
)

In [9]:
# Removing invalid data values:
# rows without a valid 'PEDRA' go first, then rows with fewer than 6 non-null values;
# the index is renumbered from 0 afterwards
treated_data = (
    treated_data
    .dropna(subset=['PEDRA'])
    .dropna(thresh=6)
    .reset_index(drop=True)
)

In [10]:
# Encode 'PEDRA' as integers so it can be used as the model target.
# Each stone's code is its position in the list below (Quartzo=0 ... Topázio=3).
category_pedras = {
    pedra: code
    for code, pedra in enumerate(['Quartzo', 'Ágata', 'Ametista', 'Topázio'])
}

treated_data['PEDRA'] = treated_data['PEDRA'].map(category_pedras)

In [11]:
# INDE is left out of the features: a simulator would not have this indicator available
treated_data = treated_data.drop(columns=['INDE'])

## Data Modeling

In [12]:
# Train a classifier for the 'PEDRA' target using all remaining columns of treated_data.
# The statements are order-dependent: every pc.pull() sits directly after the pycaret
# step whose results table it is meant to capture (assumes pull() returns the most
# recent results grid — TODO confirm against the pycaret docs).
clf = pc.setup(data=treated_data, target='PEDRA', session_id=123, experiment_name='Previsão de Pedra Conceito', verbose=False)
description = pc.pull()

# Compare the candidate models and keep the one ranked first by F1
best_model = pc.compare_models(sort='F1', verbose=False)
metrics = pc.pull()

# Tune the selected model
tuned_model = pc.tune_model(best_model, verbose=False)
fold = pc.pull()

# Finalize the tuned model; this is the object that gets saved to disk in the next cell
final_model = pc.finalize_model(tuned_model)

In [13]:
# Make sure the target folder exists first, so a fresh checkout without a
# models/ directory does not fail on this cell.
Path('../models').mkdir(parents=True, exist_ok=True)

# NOTE(review): the 'lr_' prefix in the file name is hard-coded; it only matches the
# saved estimator while compare_models keeps selecting Logistic Regression.
pc.save_model(final_model, '../models/lr_pedra_classification', model_only=True)

Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['IAA', 'IAN', 'IDA', 'IEG', 'IPP',
                                              'IPS', 'IPV'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=[...
                                                               keep_empty_features=False,
                                       

In [14]:
# Score the tuned (not the finalized) model. No data is passed, so pycaret picks the
# evaluation set itself (presumably the hold-out split created by setup — TODO confirm).
predictions = pc.predict_model(estimator=tuned_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.904,0.9853,0.904,0.9046,0.9039,0.8652,0.8655


## Visualization

In [15]:
from sklearn.metrics import confusion_matrix

In [16]:
# Confusion matrix for the predictions returned by predict_model.
# The class codes are passed explicitly so the matrix always has one row/column per
# stone, in the same order as the axis names, even when a class never shows up in the
# evaluated rows (otherwise the matrix would shrink and no longer match the 4 names).
pedra_names = list(category_pedras.keys())
pedra_codes = list(category_pedras.values())

cm = confusion_matrix(
    predictions['PEDRA'],
    predictions['prediction_label'],
    labels=pedra_codes,
)

fig = px.imshow(
    cm,
    labels=dict(x="Predito", y="Real", color="Contagem"),
    x=pedra_names,
    y=pedra_names,
    text_auto=True,
    color_continuous_scale='gnbu',
)

fig.update_layout(width=600, height=600)
fig.show()

In [17]:
# Hex colour list; note that the first and the last entries are the same colour
discrete_colors = [
    '#ffa600', '#374c80', '#7a5195', '#bc5090',
    '#ef5675', '#ff764a', '#ffa600',
]

In [18]:
# Feature importance derived from the model coefficients.
# For a multiclass model coef_ holds one row of coefficients per class, so the absolute
# values are averaged over all classes; reading only coef_[0] would describe the first
# class alone. With a single row the result equals abs(coef_[0]).
# NOTE(review): this assumes the tuned model exposes coef_ (i.e. it is a linear model).
class_coefficients = np.atleast_2d(tuned_model.coef_)
feature_importance = np.abs(class_coefficients).mean(axis=0)

# Build a dataframe with the importances, most important feature first
df_importance = pd.DataFrame({
    'Feature': tuned_model.feature_names_in_,
    'Importance': feature_importance
}).sort_values(by="Importance", ascending=False)

# Interactive horizontal bar chart
fig = px.bar(
    df_importance,
    x='Importance',
    y='Feature',
    orientation='h',
    color='Importance',
    color_continuous_scale='aggrnyl',
    template='plotly_white',
    text_auto=True,
)

fig.update_layout(width=800, height=500)

# Reverse the colour scale and show the bar values with two decimals
fig.update_traces(
    marker=dict(reversescale=True),
    texttemplate='%{x:.2f}',
)

fig.show()

In [19]:
# Reduced dataset for the second model: four indicators plus the 'PEDRA' target
scholar_data = treated_data.loc[:, ['IAA', 'IAN', 'IDA', 'IEG', 'PEDRA']]

In [20]:
# Same training sequence as for the full feature set, this time on scholar_data.
# The statements are order-dependent: every pc.pull() sits directly after the pycaret
# step whose results table it is meant to capture (assumes pull() returns the most
# recent results grid — TODO confirm against the pycaret docs).
# NOTE(review): experiment_name is identical to the one used for the first model.
clf_scholar = pc.setup(data=scholar_data, target='PEDRA', session_id=123, experiment_name='Previsão de Pedra Conceito', verbose=False)
description_scholar = pc.pull()

# Compare the candidate models and keep the one ranked first by F1
best_model_scholar = pc.compare_models(sort='F1', verbose=False)
metrics_scholar = pc.pull()

# Tune the selected model
tuned_model_scholar = pc.tune_model(best_model_scholar, verbose=False)
fold_scholar = pc.pull()

# Finalize the tuned model; this is the object that gets saved to disk in the next cell
final_model_scholar = pc.finalize_model(tuned_model_scholar)

In [21]:
# Make sure the target folder exists first, so a fresh checkout without a
# models/ directory does not fail on this cell.
Path('../models').mkdir(parents=True, exist_ok=True)

# NOTE(review): the 'gbc_' prefix in the file name is hard-coded; it only matches the
# saved estimator while compare_models keeps selecting the same model type.
pc.save_model(final_model_scholar, '../models/gbc_pedra_classification_scholar', model_only=True)

Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['IAA', 'IAN', 'IDA', 'IEG'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=[],
                                     transformer=Simp...
                                             criterion='friedman_mse', init=None,
                                             learning_rate

In [24]:
# Pairwise correlation between all columns of treated_data, rounded to two decimals
correlation_matrix = treated_data.corr().round(2)
variable_names = correlation_matrix.columns

# Heatmap of the correlation matrix with the values printed in each cell
fig = px.imshow(
    correlation_matrix,
    x=variable_names,
    y=variable_names,
    labels={"x": "Variáveis", "y": "Variáveis", "color": "Correlação"},
    text_auto=True,
    color_continuous_scale='gnbu',
)

fig.update_layout(
    title="Matriz de Correlação das Variáveis",
    width=800,
    height=800,
)

fig.show()