In [1]:
from pathlib import Path
from functools import partial
import pandas as pd
from IPython.display import Markdown
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, cross_validate
from src.data.outlier import OutlierExtractor
from sklearn.metrics import precision_score, recall_score, accuracy_score, make_scorer

In [2]:
# Number of random train/test resamplings drawn by the cross-validation splitter.
n_splits = 30

# Input files, given relative to the notebook's working directory.
dict_path = Path('../data/external/dicionario.csv')  # data dictionary (per-variable metadata)
data_path = Path('../data/raw/data.csv')  # raw dataset

In [3]:
# Load the dataset and its data dictionary, then preview the first rows of each
# under a Markdown heading.
df = pd.read_csv(data_path)
df_dict = pd.read_csv(dict_path)

for heading, frame in (
    ("### Conjunto de dados", df),
    ("### Dicionário de dados", df_dict),
):
    display(Markdown(heading))
    display(frame.head())

### Conjunto de dados

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


### Dicionário de dados

Unnamed: 0,variavel,significado,valores,tipo
0,gender,Gênero do estudante,"[""Male"", ""Female""]",Nominal
1,NationalITy,Nacionalidade do estudante,"[""Kuwait"", ""Lebanon"", ""Egypt"", ""SaudiArabia"", ...",Nominal
2,PlaceofBirth,Local de nascimento do estudante,"[""Kuwait"", ""Lebanon"", ""Egypt"", ""SaudiArabia"", ...",Nominal
3,StageID,Nivel educacional do estudante,"[""lowerlevel"", ""MiddleSchool"", ""HighSchool""]",Ordinal
4,GradeID,Turma a qual o aluno pertence,"[""G-01"", ""G-02"", ""G-03"", ""G-04"", ""G-05"", ""G-06...",Nominal


In [4]:
target_column = 'Class'


def _columns_of_type(kind):
    """Return the `variavel` values of `df_dict` whose `tipo` equals `kind`.

    The target column is excluded so the result only names input features.
    """
    has_kind = df_dict['tipo'] == kind
    not_target = df_dict['variavel'] != target_column
    return df_dict.loc[has_kind & not_target, 'variavel'].to_list()


# Feature names grouped by the measurement type recorded in the data dictionary.
nominal_columns = _columns_of_type('Nominal')
ordinal_columns = _columns_of_type('Ordinal')
discrete_columns = _columns_of_type('Discreta')

In [5]:
# Nominal features: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fit are encoded as all zeros.
nominal_preprocessor = Pipeline(steps=[
    # TODO: outlier treatment
    ('missing', SimpleImputer(strategy='most_frequent')),  # missing-value imputation
    # The encoder's dense-output switch was renamed from `sparse` to
    # `sparse_output` (scikit-learn 1.2) and the old name was later removed, so
    # neither keyword works on every version. The encoder is therefore left at
    # its default output and the ColumnTransformer below is told to always
    # return a dense array instead.
    ('encoding', OneHotEncoder(handle_unknown='ignore')),  # variable encoding
    # TODO: feature selection
    # ('normalization', StandardScaler())  # normalization
])

# Ordinal features: fill missing values with the most frequent category, then
# map each category to an integer code.
ordinal_preprocessor = Pipeline(steps=[
    # TODO: outlier treatment
    ('missing', SimpleImputer(strategy='most_frequent')),  # missing-value imputation
    ('encoding', OrdinalEncoder()),  # variable encoding
    # TODO: feature selection
    # ('normalization', StandardScaler())  # normalization
])

# Discrete (count) features: fill missing values with the median, then scale
# with RobustScaler.
discrete_preprocessor = Pipeline(steps=[
    # TODO: outlier treatment
    ('missing', SimpleImputer(strategy='median')),  # missing-value imputation
    # TODO: feature selection
    ('normalization', RobustScaler())  # normalization
])

# Route each group of columns through its own pipeline. Columns not listed in
# any group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    [
        ('nominal', nominal_preprocessor, nominal_columns),
        ('ordinal', ordinal_preprocessor, ordinal_columns),
        ('discrete', discrete_preprocessor, discrete_columns)
    ],
    # 0 forces a dense stacked output even though the one-hot step emits a
    # sparse matrix, matching what `OneHotEncoder(sparse=False)` used to give.
    sparse_threshold=0,
)

In [6]:
# Candidate classifiers as (display name, estimator) pairs; the display name is
# used to label each model's rows in the results table.
logistic_regression = LogisticRegression()  # TODO: add a dictionary of hyperparameters
k_nearest_neighbors = KNeighborsClassifier(n_neighbors=5)

models = [
    ("logistic regression", logistic_regression),
    ("k nearest neighborhood", k_nearest_neighbors),
]

In [7]:
# Feature matrix: every column except the target (`columns=` already implies
# the column axis, so no `axis` argument is needed).
X = df.drop(columns=target_column)
# Target as a 1-D Series. Selecting with a list (`df[[...]]`) yields a
# single-column DataFrame, which scikit-learn classifiers accept only with a
# DataConversionWarning because they expect y of shape (n_samples,).
y = df[target_column]

In [14]:
# Every model is evaluated on the same repeated random 80/20 splits
# (fixed seed), so their scores are directly comparable.
cv = ShuffleSplit(n_splits=n_splits, test_size=.2, random_state=42)

# Metrics computed on each test split. The keys become the `test_<key>`
# columns of the results table.
scoring = {
    'acuracia': make_scorer(accuracy_score),
    'precisão': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

score_frames = []
for model_name, model_obj in models:

    # TODO: add a grid search (sklearn.model_selection.GridSearchCV)

    # Preprocessing lives inside the pipeline so it is re-fitted on the
    # training part of every split.
    approach = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model_obj)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        scoring=scoring,
        cv=cv
    )
    # One row per split, labelled with the model that produced it. `assign`
    # broadcasts the label, so it cannot get out of sync with the split count.
    score_frames.append(pd.DataFrame(scores).assign(model_name=model_name))

# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every iteration; concatenate once instead. `pd.concat` rejects an empty
# list, so fall back to an empty frame when no models are configured.
results = pd.concat(score_frames) if score_frames else pd.DataFrame()

In [17]:
# Mean and standard deviation of every timing/score column, per model.
results.groupby(by='model_name').agg(["mean", "std"])

Unnamed: 0_level_0,fit_time,fit_time,score_time,score_time,test_acuracia,test_acuracia,test_precisão,test_precisão,test_recall,test_recall
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
k nearest neighborhood,0.045437,0.012141,0.149756,0.091627,0.679167,0.044983,0.680606,0.043259,0.698642,0.042027
logistic regression,0.088558,0.035251,0.018144,0.004067,0.762153,0.038984,0.775554,0.0359,0.764031,0.040593


In [None]:
# TODO: create plots

In [None]:
# TODO: choose the best model

In [None]:
# TODO: train the best model on all the data
# TODO: save the trained model