## Montar gDrive

In [None]:
# Mount Google Drive inside the Colab runtime and move into the project folder.
from google.colab import drive
 
# The drive is mounted under ./gdrive, relative to the current working directory.
drive.mount('./gdrive', force_remount=True)
# IPython magic: change the working directory so that relative paths used later
# in the notebook (e.g. './data/base4.csv') resolve inside the project folder.
%cd './gdrive/My Drive/Colab Notebooks/dso'

In [None]:
# Shell escape: quietly (-q) install/upgrade scikit-learn, imbalanced-learn and xgboost.
# NOTE(review): imbalanced-learn and xgboost are not imported in the cells visible here.
!pip install -q scikit-learn imbalanced-learn xgboost --upgrade

## Importações e Funções

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.base import clone


def plot_confusion_matrix(gt, predict, cmap='Blues', title='Matriz de Confusão'):
    """Print a classification report and draw the confusion matrix as a heatmap.

    Args:
        gt: Ground-truth labels.
        predict: Predicted labels, aligned with ``gt``.
        cmap: Colormap name forwarded to ``seaborn.heatmap``.
        title: Title set on the heatmap axes.
    """
    report = classification_report(gt, predict, digits=4, zero_division=True)
    print(report)

    # One square 8x8 figure holding the annotated (integer-formatted) matrix.
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    matrix = confusion_matrix(gt, predict)
    heatmap = sns.heatmap(
        matrix,
        fmt='d',
        square=True,
        annot=True,
        cmap=cmap,
        ax=axis,
    )
    heatmap.set_title(title)
 

def cross_validation(clf, x, y, n_splits=10, n_repeats=3, run_only_once=False, random_state=42):
    """Evaluate ``clf`` with repeated stratified k-fold cross-validation.

    A fresh clone of ``clf`` is fitted on each training fold and used to predict
    the matching held-out fold; labels and predictions of all folds are pooled.

    Args:
        clf: Scikit-learn estimator. It is only cloned, never fitted itself.
        x: Feature matrix, one row per sample. Pandas objects and plain
            array-likes are both accepted.
        y: Target labels aligned with ``x`` (pandas object or array-like).
        n_splits: Number of folds per repetition.
        n_repeats: Number of times the k-fold partitioning is repeated.
        run_only_once: When true, stop after the first fold.
        random_state: Seed passed to the fold splitter.

    Returns:
        A tuple ``(gt, predict, clf_c)`` with the pooled ground-truth labels,
        the pooled predictions (both lists) and the estimator fitted in the
        last executed fold. That estimator was trained on that fold's training
        part only, not on the whole of ``x``.
    """
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # Positional indexing below needs arrays; accept non-pandas inputs as well.
    x = x.to_numpy() if hasattr(x, 'to_numpy') else np.asarray(x)
    y = y.to_numpy() if hasattr(y, 'to_numpy') else np.asarray(y)

    # Loop-invariant; report a single run when only the first fold is executed.
    total_runs = 1 if run_only_once else rskf.get_n_splits()

    gt, predict = [], []

    for i, (train, test) in enumerate(rskf.split(x, y)):
        if i == 0:
            print(f'Train size: {len(train)}, Test size: {len(test)}')
        print(f'Running: {i + 1} / {total_runs}')

        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]

        # Clone so every fold starts from an unfitted estimator.
        clf_c = clone(clf)
        # squeeze() drops a trailing singleton axis when y was a one-column frame.
        clf_c.fit(x_train, np.squeeze(y_train))

        gt.extend(y_test)
        predict.extend(clf_c.predict(x_test))

        if run_only_once:
            break

    return gt, predict, clf_c

## Dataset

In [None]:
# Load the dataset, parsing 'DueDate' as a datetime column.
df = pd.read_csv('./data/base4.csv', parse_dates=['DueDate'], low_memory=False)

# Temporal split: everything due before 2021-02-01 is training data, everything
# due during 2021-02 is test data. 'DueDate' itself is dropped from both.
due_date = df['DueDate']
is_train_row = due_date < '2021-02-01'
is_test_row = (due_date >= '2021-02-01') & (due_date < '2021-03-01')

train = df.loc[is_train_row].drop(columns=['DueDate'])
test = df.loc[is_test_row].drop(columns=['DueDate'])


# Side-by-side bar charts with the number of rows per 'PaymentCategory'.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))

train_totals = train.groupby(['PaymentCategory']).size().reset_index(name='Total')
train_totals.plot(title='Partição de Treino', x='PaymentCategory', kind='bar',
                  fontsize=12, rot=0, ax=axes[0])

test_totals = test.groupby(['PaymentCategory']).size().reset_index(name='Total')
test_totals.plot(title='Partição de Teste', x='PaymentCategory', kind='bar',
                 fontsize=12, rot=0, ax=axes[1])

## Atributos

In [None]:
# Label/outcome columns selected into the y_* frames. In the cells visible here
# only 'PaidLate' and 'PaidLateAM' are used as classification targets, and
# 'PaymentCategory' is used for the class-distribution plots.
y_columns = [
    'PaidLate',
    'PaidLateAM',
    'DaysLate',
    'DaysLateAM',
    'PaymentCategory',
]
 
# Feature columns selected into the x_* frames, scaled with RobustScaler and fed
# to the classifiers.
# NOTE(review): the group comments below are inferred from the column names only
# (e.g. 'R_' presumably a ratio, 'OS' presumably outstanding, 'MAD'/'MED'
# presumably deviation/median statistics) — confirm against the dataset docs.
x_columns = [
    # Invoice counts.
    'InvoiceCount',
    'OSInvoiceCount',
    'R_OSInvoiceCount',

    # Invoice amounts.
    'InvoiceAmount',
    'OSInvoiceAmount',
    'R_OSInvoiceAmount',

    # Calendar distances.
    'DaysToDueDate',
    'DaysToEndMonth',

    'WeekdayEndMonth',
    'PartnerCustomer',

    # Days-late statistics.
    'MAD_DaysLate',
    'MED_DaysLate',

    'MAD_DaysLateAM',
    'MED_DaysLateAM',

    'MAD_OSDaysLate',
    'MED_OSDaysLate',

    'MAD_OSDaysLateAM',
    'MED_OSDaysLateAM',

    # Paid-invoice counts.
    'PaidCount',
    'PaidLateCount',
    'PaidLateAMCount',
    'R_PaidLateCount',
    'R_PaidLateAMCount',

    # Paid-invoice amounts.
    'PaidAmount',
    'PaidLateAmount',
    'PaidLateAMAmount',
    'R_PaidLateAmount',
    'R_PaidLateAMAmount',

    # 'OS' invoice counts.
    'OSCount',
    'OSLateCount',
    'OSLateAMCount',
    'R_OSLateCount',
    'R_OSLateAMCount',

    # 'OS' invoice amounts.
    'OSAmount',
    'OSLateAmount',
    'OSLateAMAmount',
    'R_OSLateAmount',
    'R_OSLateAMAmount',
]

## Variação de Parâmetros

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.ensemble import RandomForestClassifier


# Hyper-parameter search for the first-stage classifier (target: 'PaidLate').
y_train_paidlate = train[y_columns]

# Fit the scaler on the training features and build a NEW float DataFrame from
# the result. Assigning through `x.iloc[:][:] = ...` is a chained assignment on
# a slice of `train`; depending on the pandas version it can write into a
# temporary object and leave the features unscaled.
qt_train_paidlate = RobustScaler(quantile_range=(25.0, 75.0))
x_train_paidlate = pd.DataFrame(
    qt_train_paidlate.fit_transform(train[x_columns].to_numpy()),
    columns=x_columns,
    index=train.index,
)


# Grid explored by GridSearchCV: 4 * 2 * 4 = 32 candidate settings.
parameters = {
    'n_estimators': [15, 150, 250, 500],
    'criterion': ['entropy', 'gini'],
    'min_samples_split': [2, 4, 8, 10],
}


clf = RandomForestClassifier(class_weight='balanced', random_state=42)
# n_splits=1: each candidate is scored on a single stratified shuffle split.
cv = StratifiedShuffleSplit(n_splits=1, random_state=42)

hyper = GridSearchCV(clf, parameters, cv=cv)
hyper.fit(x_train_paidlate.values, y_train_paidlate['PaidLate'].values)

# Last expression of the cell: the notebook displays the best estimator found.
hyper.best_estimator_

## Train: Pagamento no prazo x atrasado

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Stage 1 (training): on-time vs. late payment, target column 'PaidLate'.
y_train_paidlate = train[y_columns]

# Fit the scaler on the training features and build a NEW float DataFrame from
# the result. Assigning through `x.iloc[:][:] = ...` is a chained assignment on
# a slice of `train`; depending on the pandas version it can write into a
# temporary object and leave the features unscaled.
qt_train_paidlate = RobustScaler(quantile_range=(25.0, 75.0))
x_train_paidlate = pd.DataFrame(
    qt_train_paidlate.fit_transform(train[x_columns].to_numpy()),
    columns=x_columns,
    index=train.index,
)


clf = RandomForestClassifier(n_estimators=100,
                             criterion='entropy',
                             min_samples_split=8,
                             class_weight='balanced',
                             random_state=42)

# `clf` is rebound to the estimator returned by cross_validation, i.e. the model
# fitted during the last cross-validation fold.
train_gt_paidlate, train_pred_paidlate, clf = cross_validation(clf, x_train_paidlate, y_train_paidlate['PaidLate'], n_splits=10, n_repeats=3)

plot_confusion_matrix(train_gt_paidlate, train_pred_paidlate, cmap='Blues')

## Test: Pagamento no prazo x atrasado

In [None]:
# Stage 1 (hold-out test): on-time vs. late payment, target column 'PaidLate'.
# NOTE(review): expects `clf` and `qt_train_paidlate` to still be the stage-1
# objects, i.e. this cell must run before the stage-2 training cell rebinds `clf`.
y_test_paidlate = test[y_columns]

# Apply the scaler that was fitted on the TRAINING partition. Fitting a second
# scaler on the test rows would centre/scale them with different statistics than
# the ones the model was trained on, so its learned split thresholds would not
# line up with the test features.
qt_test_paidlate = qt_train_paidlate  # name kept for any later cell that reads it
x_test_paidlate = pd.DataFrame(
    qt_test_paidlate.transform(test[x_columns].to_numpy()),
    columns=x_columns,
    index=test.index,
)

# Predict on a plain array, matching how the model was fitted inside cross_validation.
test_pred_paidlate = clf.predict(x_test_paidlate.to_numpy())

plot_confusion_matrix(y_test_paidlate['PaidLate'], test_pred_paidlate, cmap='YlOrBr')

## Train: Pagamento atrasado no mês de vencimento x atrasado além do mês de vencimento

In [None]:
# Stage 2 (training): restricted to rows with PaidLate == 1, target 'PaidLateAM'.
late_train = train[train['PaidLate'] == 1]  # filter once, reuse for x and y

y_train_paidlate_am = late_train[y_columns]

# Fit the scaler on the late-payment training rows and build a NEW float
# DataFrame from the result. Assigning through `x.iloc[:][:] = ...` is a chained
# assignment on a slice of `train`; depending on the pandas version it can write
# into a temporary object and leave the features unscaled.
qt_train_paidlate_am = RobustScaler(quantile_range=(25.0, 75.0))
x_train_paidlate_am = pd.DataFrame(
    qt_train_paidlate_am.fit_transform(late_train[x_columns].to_numpy()),
    columns=x_columns,
    index=late_train.index,
)


clf = RandomForestClassifier(n_estimators=100,
                             criterion='entropy',
                             min_samples_split=8,
                             class_weight='balanced',
                             random_state=42)


# `clf` is rebound to the estimator returned by cross_validation, i.e. the model
# fitted during the last cross-validation fold.
train_gt_paidlate_am, train_pred_paidlate_am, clf = cross_validation(clf, x_train_paidlate_am, y_train_paidlate_am['PaidLateAM'], n_splits=10, n_repeats=3)

plot_confusion_matrix(train_gt_paidlate_am, train_pred_paidlate_am, cmap='Blues')

## Test: Pagamento atrasado no mês de vencimento x atrasado além do mês de vencimento

In [None]:
# Stage 2 (hold-out test): only the test rows that stage 1 PREDICTED as late
# (test_pred_paidlate == 1) are classified for 'PaidLateAM'.
predicted_late = test[test_pred_paidlate == 1]  # filter once, reuse for x and y

y_test_paidlate_am = predicted_late[y_columns]

# Apply the scaler that was fitted on the stage-2 TRAINING rows. Fitting a new
# scaler on this test subset would scale the features with different statistics
# than the ones the model was trained on.
qt_test_paidlate_am = qt_train_paidlate_am  # name kept for any later cell that reads it
x_test_paidlate_am = pd.DataFrame(
    qt_test_paidlate_am.transform(predicted_late[x_columns].to_numpy()),
    columns=x_columns,
    index=predicted_late.index,
)

# Predict on a plain array, matching how the model was fitted inside cross_validation.
test_pred_paidlate_am = clf.predict(x_test_paidlate_am.to_numpy())

plot_confusion_matrix(y_test_paidlate_am['PaidLateAM'], test_pred_paidlate_am, cmap='YlOrBr')