In [220]:
import os
from dotenv import load_dotenv
from math import nan

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow

In [221]:
load_dotenv()

True

In [230]:
dataset_path = os.getenv('DATASET_PATH')
print('loading file: {dataset_path}')
df = pd.read_csv(
    dataset_path,
    sep=';', 
    names=[
        'id', 
        'safra_abertura', 
        'cidade', 
        'estado', 
        'idade', 
        'sexo', 
        'limite_total', 
        'limite_disp', 
        'data', 
        'valor', 
        'grupo_estabelecimento', 
        'cidade_estabelecimento', 
        'pais_estabelecimento'],
    header=0,
    decimal=',',
    encoding='unicode_escape'
)

## Cleaning

In [231]:
df

Unnamed: 0,id,safra_abertura,cidade,estado,idade,sexo,limite_total,limite_disp,data,valor,grupo_estabelecimento,cidade_estabelecimento,pais_estabelecimento
0,4.530000e+11,201405,CAMPO LIMPO PAULISTA,SP,37,F,4700,5605,4.12.2019,31,SERVIO,SAO PAULO,BR
1,4.530000e+11,201405,CAMPO LIMPO PAULISTA,SP,37,F,4700,5343,9.11.2019,15001,FARMACIAS,SANTOS,BR
2,4.530000e+11,201405,CAMPO LIMPO PAULISTA,SP,37,F,4700,2829,6.05.2019,50,SERVIO,SAO PAULO,BR
3,4.530000e+11,201405,CAMPO LIMPO PAULISTA,SP,37,F,4700,2547,1.06.2019,544,M.O.T.O.,OSASCO,BR
4,4.530000e+11,201405,CAMPO LIMPO PAULISTA,SP,37,F,4700,2515,1.06.2019,3279,M.O.T.O.,OSASCO,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4950,9.190000e+11,201910,SAO PAULO,SP,37,M,8500,5305,23.01.2020,255,RESTAURANTE,SAO PAULO,BR
4951,9.190000e+11,201910,SAO PAULO,SP,37,M,8500,5266,23.01.2020,395,SUPERMERCADOS,SAO PAULO,BR
4952,9.190000e+11,201910,SAO PAULO,SP,37,M,8500,5445,19.02.2020,7556,RESTAURANTE,SAO PAULO,BR
4953,9.190000e+11,201910,SAO PAULO,SP,37,M,8500,5002,30.03.2020,59,SERVIO,SAO PAULO,BR


In [226]:
df['grupo_estabelecimento'] = df['grupo_estabelecimento'].str.replace('�', 'C')

In [205]:
y = df['grupo_estabelecimento']
df = df.drop('grupo_estabelecimento', axis=1)

In [206]:
le = LabelEncoder()
y = le.fit_transform(y)

In [207]:
class ConvertStrToFloat(TransformerMixin):
    def __init__(self, columns):
        if (not hasattr(columns, '__iter__')) or isinstance(columns, str):
            columns = [columns]
        self.columns = columns

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        df = x.copy()
        for c in self.columns:
            df[c] = df[c].apply(self.str_to_float)
        return df
    
    def str_to_float(self, value):
        try:
            return float(value.replace('.', '').replace(',', '.'))
        except:
            return np.nan

In [208]:
X_train, X_test, y_train, y_test = train_test_split(df, y)

In [209]:
test_file = os.path.join(os.path.dirname(dataset_path), 'test_data', f'test_{os.path.basename(dataset_path)}')
print('saving test file at: {test_file}')
X_test.to_csv(test_file, index=False)

Unnamed: 0,id,safra_abertura,cidade,estado,idade,sexo,limite_total,limite_disp,data,valor,cidade_estabelecimento,pais_estabelecimento
4827,9.190000e+11,201910,SAO PAULO,SP,37,M,8500,7861,22.11.2019,815,OSASCO,BR
3783,6.510000e+11,201705,SAO PAULO,SP,27,M,3650,1739,3.02.2020,2088,SAO PAULO,BR
1440,5.020000e+11,201901,SAO PAULO,SP,34,F,1358,232,3.03.2020,169,SAO PAULO,BR
3503,2.210000e+11,201311,RIO DE JANEIRO,RJ,45,M,35200,38450,8.09.2019,47,RIO DE JANEIR,BR
1500,5.020000e+11,201901,SAO PAULO,SP,34,F,5200,151,28.05.2019,704,SAOPAULO,BR
...,...,...,...,...,...,...,...,...,...,...,...,...
4350,9.540000e+11,201312,SAO PAULO,SP,28,F,6500,7933,28.07.2019,35,SAO PAULO,BR
1872,3.310000e+11,201903,SAO PAULO,SP,33,M,12000,10080,7.01.2020,1654,RIO DE JANEIR,BR
3015,9.813602e+09,201901,EMBU DAS ARTES,SP,29,F,4700,2117,21.02.2020,25,"5,51E+11",BR
4674,5.380000e+11,201905,SANTO ANDRE,SP,28,F,4200,1161,21.09.2019,"1.129,86",BELO HORIZONT,BR


In [193]:
ohe = OneHotEncoder(drop='if_binary')

ct = make_column_transformer(
    (ohe, [0]),
    remainder='passthrough'
)

select_features = ColumnTransformer(
    [('select', 'passthrough', ['sexo', 'idade', 'valor', 'limite_total', 'limite_disp'])],
    remainder='drop'
)

pipeline  = Pipeline(
    steps = [
        ('convert_to_decimal', ConvertStrToFloat('valor')),
        ('select_features', select_features),
        ('preprocessing', ct),
        ('clf', DecisionTreeClassifier())
    ]
)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)


In [196]:

precisions = precision_score(y_test, y_pred, average=None)
recalls = recall_score(y_test, y_pred, average=None)
f1_scores = f1_score(y_test, y_pred, average=None)

model_name = os.getenv('MODEL_NAME', 'example-use-case-decision-tree')
with mlflow.start_run() as active_run:
    print('loging model to MLFlow')
    print(f'Active run_id: {active_run.info.run_id}')
    mlflow.sklearn.log_model(pipeline, artifact_path='model', registered_model_name=model_name)
    mlflow.log_metric('accuracy', accuracy_score(y_test, y_pred))

    for precision, recall, f1, label in zip(precisions, recalls, f1_scores, pipeline.classes_):
        mlflow.log_metric(f'label_{label}-precision', precision)
        mlflow.log_metric(f'label_{label}-recall', recall)
        mlflow.log_metric(f'label_{label}-fmeasure', f1)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
