In [174]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
from IPython.display import Markdown
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import TransformerMixin
from scipy.spatial.distance import cdist

In [175]:
data_path = Path('../data/raw/data.xls')
dict_path = Path('../data/external/dicionario.csv')

In [176]:
df = (
    pd
    .read_excel(data_path, header=1)
    .rename(columns={
        'default payment next month': 'DEFAULT_PAY'
    })
)
display(Markdown('### Conjunto de dados'))
display(df.head())


df_dict = (
    pd
    .read_csv(dict_path)
    .replace({
        'default payment next month': 'DEFAULT_PAY'
    })
)
display(Markdown('### Dicionário de dados'))
display(df_dict.head())

### Conjunto de dados

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_PAY
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Dicionário de dados

Unnamed: 0,variavel,significado,tipo
0,ID,Identificador unico,continuo
1,LIMIT_BAL,Valor do crédito concedido (Novo Dolár Taiwanês ),continuo
2,SEX,Genero (1= masculino; 2= feminino),nominal
3,EDUCATION,Educação Nivel de Escolaridade (1 = pós-gradua...,nominal
4,MARRIAGE,Estado civil (1 = casado; 2 = solteiro; 3 = ou...,nominal


In [177]:
target_column = 'DEFAULT_PAY'
continuous_columns = (
    df_dict
    .query('tipo == "continuo"')
    .variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query('tipo == "nominal" and variavel != @target_column')
    .variavel
    .to_list()
)

In [178]:
 # continuous_columns

In [166]:
class OutlierExtractor(TransformerMixin):
    def __init__(self, threshold=3, columns=None):
        self.threshold = threshold
        self.columns = columns
        
        
    def fit(self, X, y=None):
        self.outliers_zscore = dict()
        if self.columns is None:
            self.columns = X.columns
            
        for column in self.columns:
            x = np.asarray(X[column]).reshape(-1,1)
            std = np.std(x)
            self.outliers_zscore[column] = (np.abs(cdist(x.mean(axis=0)[np.newaxis], x) / std)[0])
            
        return self
    
    
    def transform(self, X):
        pos_outliers = []
        pos_outliers = np.array(pos_outliers)
        for column in self.columns:
            pos_outliers = np.concatenate((np.where(self.outliers_zscore[column] > 3), pos_outliers), axis=None)
        return X.drop(set(pos_outliers), inplace=False)
    

In [187]:
# oe = OutlierExtractor(columns = continuous_columns)
# df_t = oe.fit_transform(df)

In [169]:
continuous_preprocessor = Pipeline(steps=[
    ('outlier_extractor', OutlierExtractor(columns = continuous_columns).fit_transform(df)), # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='mean')), # Tratamento de dados faltantes
    # Seleção de variáveis
    ('normalization', MinMaxScaler()) # Normalização
])

nominal_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing_data', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoder', OneHotEncoder(sparse=False)), # Codificação de variáveis
    # Seleção de variáveis
    # ('normalization', MinMaxScaler())# Normalização 
])

In [170]:
preprocessor = ColumnTransformer(transformers=[
    ('continuous', continuous_preprocessor, continuous_columns),
    ('nominal', nominal_preprocessor, nominal_columns)
])
model = LinearSVC()
approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])

In [7]:
X = df.drop(columns=[target_column])
y = df[[target_column]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [8]:
approach.fit(X_train, y_train)
y_test_hat = approach.predict(X_test)
accuracy = accuracy_score(y_test.DEFAULT_PAY, y_test_hat)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8008
