In [9]:
from pathlib import Path
import pandas as pd
from IPython.display import Markdown
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

from src.data.outlier import OutlierExtractor

In [10]:
data_path = Path('../data/raw/data.csv')
dict_path = Path('../data/external/dicionario.csv')

In [11]:
df =  pd.read_csv(data_path)
display(Markdown("### Conjunto de dados"))
display(df.head())

df_dict =  pd.read_csv(dict_path)
display(Markdown("### Dicionário de dados"))
display(df_dict.head())

### Conjunto de dados

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


### Dicionário de dados

Unnamed: 0,variavel,significado,valores,tipo
0,gender,Gênero do estudante,"[""Male"", ""Female""]",Nominal
1,NationalITy,Nacionalidade do estudante,"[""Kuwait"", ""Lebanon"", ""Egypt"", ""SaudiArabia"", ...",Nominal
2,PlaceofBirth,Local de nascimento do estudante,"[""Kuwait"", ""Lebanon"", ""Egypt"", ""SaudiArabia"", ...",Nominal
3,StageID,Nivel educacional do estudante,"[""lowerlevel"", ""MiddleSchool"", ""HighSchool""]",Ordinal
4,GradeID,Turma a qual o aluno pertence,"[""G-01"", ""G-02"", ""G-03"", ""G-04"", ""G-05"", ""G-06...",Nominal


In [12]:
target_column = 'Class'

nominal_columns = (
    df_dict
    .query("tipo == 'Nominal' and variavel != @target_column")
    .variavel
    .to_list()
)
ordinal_columns = (
    df_dict
    .query("tipo == 'Ordinal' and variavel != @target_column")
    .variavel
    .to_list()
)
discrete_columns = (
    df_dict
    .query("tipo == 'Discreta' and variavel != @target_column")
    .variavel
    .to_list()
)

In [13]:
X = df.drop(columns=target_column, axis=1)
y = df[[target_column]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=.8)

In [14]:
nominal_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse=False, handle_unknown='ignore')), # Codificação de variáveis
    # Seleção de variáveis
    # ('normalization', StandardScaler())# Normalização
])

ordinal_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoding', OrdinalEncoder()), # Codificação de variáveis
    # Seleção de variáveis
    # ('normalization', StandardScaler())# Normalização
])

discrete_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing', SimpleImputer(strategy='median')), # Tratamento de dados faltantes
    # Seleção de variáveis
    ('normalization', RobustScaler())# Normalização
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('ordinal', ordinal_preprocessor, ordinal_columns),
    ('discrete', discrete_preprocessor, discrete_columns)
])

In [15]:
models = [
    ("logistic regression", LogisticRegression()),
    ("k nearest neighborhood", KNeighborsClassifier(n_neighbors=1)),
]

In [16]:
for model_name, model in models:
    outlier_extractor = OutlierExtractor(columns=discrete_columns, threshold=3)
    outlier_extractor.fit(X_train)

    X_train_without_ourliers, y_train_without_ourliers = outlier_extractor.transform(X_train, y_train)

    approach = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    approach.fit(X_train_without_ourliers, y_train_without_ourliers)
    y_test_hat = approach.predict(X_test)
    acc = accuracy_score(y_test, y_test_hat)
    
    print(f"{model_name}: {acc:.2f}")

(224, 16)
(384, 16)
logistic regression: 0.73
(224, 16)
(384, 16)
k nearest neighborhood: 0.65
