In [None]:
# Shell command: print the NVIDIA GPU/driver status visible to this runtime.
!nvidia-smi

In [None]:
# Colab-only module; this cell cannot run outside Google Colab.
from google.colab import drive

# Mount Google Drive under ./gdrive (relative to the current working directory).
drive.mount('./gdrive', force_remount=True)
# Change into the project folder so relative paths used later (e.g. the CSV file) resolve there.
# NOTE(review): both paths are relative, so re-running this cell after the %cd
# resolves './gdrive' against the new working directory -- confirm before re-running.
%cd './gdrive/My Drive/Colab Notebooks/cubricks'

In [None]:
!pip -q install imbalanced-learn xgboost tensorflow-gpu pandas-profiling --upgrade

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from pandas_profiling import ProfileReport

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Seed passed as `random_state` to the classifiers (and the optional resamplers) below.
SEED = 42

In [None]:
def plot_countplot(df, cols=[0], title=None, rotation=0):
    """Draw one seaborn count plot per entry of ``cols``.

    Args:
        df: Data source. Indexed as ``df[col]`` when ``col`` is a string and
            as ``df[:, col]`` otherwise (the positional form suits a 2-D array).
        cols: Column names or positional column indices to plot. The default
            list is only read, never mutated.
        title: Title applied to every generated axes.
        rotation: Forwarded as ``rotation`` when re-applying the x tick labels.
    """
    for column in cols:
        # A fresh, wide and short figure for every plotted column.
        _, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 2))

        if isinstance(column, str):
            values = df[column]
        else:
            values = df[:, column]

        chart = sns.countplot(x=np.squeeze(values), ax=axis)
        chart.set_xticklabels(labels=chart.get_xticklabels(), rotation=rotation)
        chart.set_title(title)

def plot_confuncion_matrix(y_test, predict, title='Confusion Matrix', report=True):
    """Plot the confusion matrix of ``predict`` against ``y_test``.

    Args:
        y_test: Ground-truth labels.
        predict: Predicted labels, aligned with ``y_test``.
        title: Title of the heatmap.
        report: When true, also print sklearn's ``classification_report``
            for the same pair of label arrays.
    """
    if report:
        # Report on the `predict` argument; the previous code read a notebook-level
        # `y_predict` global, so the report could describe a different prediction.
        print(classification_report(y_test, predict, zero_division=True))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    heatmap = sns.heatmap(confusion_matrix(y_test, predict), fmt='d', square=True, annot=True, cmap='Blues', ax=ax)
    heatmap.set_title(title)

def plot_feature_importance(features, importances):
    """Print and plot feature importances from highest to lowest.

    Args:
        features: Sequence of feature names.
        importances: Sequence of importance scores aligned with ``features``
            (a list or an ndarray such as ``clf.feature_importances_``).
    """
    features = np.array(features)
    # Coerce to ndarray so a plain Python list also supports the
    # index-array lookup `importances[indices]` used for the bar chart.
    importances = np.asarray(importances)
    # Positions of the scores sorted in descending order.
    indices = np.argsort(importances)[::-1]
    print('Feature ranking:')

    for rank in range(len(features)):
        print(f'{importances[indices[rank]]}\t{features[indices[rank]]}')

    plt.figure(figsize=(10, 8))
    plt.barh(range(len(features)), importances[indices])
    plt.yticks(range(len(features)), features[indices])
    plt.title('Feature Importance')
    # Put the most important feature at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
def setup_buckets(df, col, bins, sufix='Bkt'):
    """Bucket a numeric column and add label and code columns to ``df``.

    The inner edges in ``bins`` are extended with -inf and +inf, and each
    interval is closed on the left. Two columns are written to ``df`` in place:
    ``f'{col}{sufix}Label'`` (categorical interval label) and
    ``f'{col}{sufix}'`` (integer code of that label).

    Args:
        df: DataFrame to modify; the same object is returned.
        col: Name of the numeric column to bucket.
        bins: List of inner bin edges.
        sufix: Suffix used to build the two new column names.

    Returns:
        Tuple ``(df, labels)`` where ``labels`` lists the bucket labels in order.
    """
    label_col = f'{col}{sufix}Label'
    code_col = f'{col}{sufix}'

    edges = [-np.inf] + bins + [np.inf]
    # Label "lo to hi-1": the upper edge itself belongs to the next bucket.
    labels = [f'{lower} to {upper - 1}' for lower, upper in zip(edges[:-1], edges[1:])]

    def _to_codes(column):
        return pd.Categorical(column, ordered=True).codes

    df[label_col] = pd.cut(df[col], bins=edges, labels=labels, right=False, include_lowest=True)
    df[[code_col]] = df[[label_col]].apply(_to_codes)
    return df, labels

def split_data(df, date_column, split_date, train_months=6, test_months=1):
    """Split ``df`` chronologically around ``split_date``.

    Rows dated within ``train_months`` before ``split_date`` form the train
    set; rows dated from ``split_date`` (inclusive) up to ``test_months``
    after it (exclusive) form the test set.

    Args:
        df: Source DataFrame; it is not modified.
        date_column: Name of the column holding the dates to compare.
        split_date: Boundary between train and test, parsed with ``pd.to_datetime``.
        train_months: Length of the train window in calendar months.
        test_months: Length of the test window in calendar months.

    Returns:
        Tuple ``(train, test)``; both are slices of one windowed copy whose
        index was reset to 0..n-1.
    """
    boundary = pd.to_datetime(split_date)
    window_start = boundary - pd.DateOffset(months=train_months)
    window_end = boundary + pd.DateOffset(months=test_months)

    inside_window = (df[date_column] >= window_start) & (df[date_column] < window_end)
    window = df[inside_window].copy().reset_index(drop=True)

    before_boundary = window[date_column] < boundary
    from_boundary = window[date_column] >= boundary
    return window[before_boundary], window[from_boundary]

def resample(df, x_column, y_column, func):
    """Resample ``df`` with a sampler and rebuild a typed DataFrame.

    Args:
        df: Source DataFrame.
        x_column: List-like of feature column names.
        y_column: List-like of target column names.
        func: Object exposing ``fit_resample(x, y)`` that returns the
            resampled ``(x, y)`` arrays (e.g. an imbalanced-learn sampler).

    Returns:
        A new DataFrame with the columns ``x_column + y_column`` cast back to
        their original dtypes. If resampling fails, a ``RuntimeWarning`` is
        issued and the original rows are returned.
    """
    import warnings

    # Remember the original dtypes; the numpy round-trip below loses them.
    dtypes = df[x_column].dtypes.to_dict()
    dtypes.update(df[y_column].dtypes.to_dict())
    x, y = df[x_column].values, df[y_column].values

    try:
        x, y = func.fit_resample(x, y)
    except Exception as error:
        # Best-effort: keep the data as-is, but say so instead of failing silently.
        # (A bare `except:` would also have swallowed KeyboardInterrupt.)
        warnings.warn(
            f'Resampling failed ({error!r}); returning the data unchanged.',
            RuntimeWarning,
            stacklevel=2,
        )

    # The sampler may return a 1-D target; make it a column so it can be
    # concatenated with the features. An already 2-D target is left alone.
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.expand_dims(y, axis=1)

    xy = np.concatenate((x, y), axis=1)
    data = pd.DataFrame(xy, columns=np.concatenate((x_column, y_column)))
    data = data.astype(dtypes)
    return data

def preprocess_data(train, test, x_column, y_column, nominal_columns=None):
    """Encode nominal columns and scale features for model training.

    When ``nominal_columns`` is given, ``train`` and ``test`` are copied
    first. Missing values in each nominal column are replaced by the most
    frequent value of that column in ``train``, and every value is turned
    into an integer. Features are then scaled with a ``RobustScaler`` fitted
    on the train features only.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        x_column: Feature column names.
        y_column: Target column names.
        nominal_columns: Optional list of columns to impute and integer-encode.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of arrays; the ``x``
        arrays are scaled, the ``y`` arrays are the raw target values.
    """
    def _encode(values):
        # Numeric-looking values go through int(); anything else becomes the
        # concatenated decimal code points of its lower-cased characters.
        encoded = []
        for value in values:
            text = str(value)
            if text.isnumeric():
                encoded.append(int(value))
            else:
                encoded.append(int(''.join(str(ord(char)) for char in text.lower())))
        return encoded

    if nominal_columns is not None:
        train, test = train.copy(), test.copy()

        for column in nominal_columns:
            # Impute both sets with the mode observed in the train set.
            most_frequent = train[column].value_counts().idxmax()
            train[column] = train[column].fillna(most_frequent)
            test[column] = test[column].fillna(most_frequent)

        train[nominal_columns] = train[nominal_columns].apply(_encode)
        test[nominal_columns] = test[nominal_columns].apply(_encode)

    x_train, y_train = train[x_column].values, train[y_column].values
    x_test, y_test = test[x_column].values, test[y_column].values

    scaler = RobustScaler(quantile_range=(25.0, 75.0))
    scaler.fit(x_train, y_train)

    scaled_train = scaler.transform(x_train)
    scaled_test = scaler.transform(x_test)
    return scaled_train, y_train, scaled_test, y_test

In [None]:
def feature_selection(train, test, x_column, y_column, nominal_columns=None, target_class='macro avg', random_state=None):
    """Greedy forward selection over groups of feature columns.

    Steps:
      1. Score a baseline that uses every feature at once.
      2. Score each group on its own and keep the best one.
      3. For each remaining group, in the given order, try inserting it at
         every position of the current selection and keep the best insertion
         only when it strictly improves the score.
      4. Return the baseline instead if it beats the greedy result.

    Each candidate is scored by fitting a 256-tree ``RandomForestClassifier``
    on ``train`` and reading ``classification_report(...)[target_class]['f1-score']``
    on ``test``.

    NOTE(review): the selection is driven by scores measured on ``test``, so
    the returned score is optimistic for that same set.

    Args:
        train: Training DataFrame.
        test: Test DataFrame used for scoring.
        x_column: Feature groups: a 2-D ndarray or a list of lists of column
            names. Groups may have different sizes.
        y_column: Target column names.
        nominal_columns: Forwarded to ``preprocess_data``.
        target_class: Key of the classification report to read the F1 from.
        random_state: Seed for the random forest.

    Returns:
        Tuple ``(score, features)`` with the best F1 score and the selected
        groups (a list of lists of column names).

    Raises:
        ValueError: If ``x_column`` contains no feature group.
    """
    def _flatten(nested):
        return [item for sublist in nested for item in sublist]

    def _insertions(selected, group):
        # Every selection obtained by inserting `group` at one position.
        return [selected[:i] + [group] + selected[i:] for i in range(len(selected) + 1)]

    # Work on plain Python lists so groups of different sizes are supported
    # (building an ndarray from ragged groups fails on recent NumPy).
    if hasattr(x_column, 'tolist'):
        remaining = x_column.tolist()
    else:
        remaining = [list(group) for group in x_column]

    if not remaining:
        raise ValueError('x_column must contain at least one feature group')

    def _loop_selection(candidates, score=-1, features=None):
        candidate_scores = []

        for candidate in candidates:
            feature_names = np.array(_flatten(candidate))
            x_train, y_train, x_test, _ = preprocess_data(train, test, feature_names, y_column, nominal_columns)

            clf = RandomForestClassifier(256, criterion='entropy', class_weight='balanced', random_state=random_state, n_jobs=-1)
            clf.fit(x_train, np.squeeze(y_train))
            y_predict = clf.predict(x_test)

            cr = classification_report(test[y_column].values, y_predict, output_dict=True, zero_division=True)
            candidate_scores.append(cr[target_class]['f1-score'])

        local_score = max(candidate_scores)
        # On ties the first best candidate wins.
        local_features = list(candidates[candidate_scores.index(local_score)])
        print(f'Score: {local_score:.8f} >>> {local_features}')

        if local_score > score:
            return local_score, local_features

        return score, ([] if features is None else features)

    # Baseline: a single candidate made of one group holding every feature.
    in_score, in_features = _loop_selection([[_flatten(remaining)]])
    # Best single group: each candidate is a selection containing one group.
    score, features = _loop_selection([[group] for group in remaining])
    remaining.remove(features[0])

    for group in remaining:
        score, features = _loop_selection(_insertions(features, group), score, features)

    if in_score > score:
        return in_score, in_features

    return score, features

In [None]:
### Read dataset ###
df = pd.read_csv('InvoicedDocuments_full.csv', sep=';', parse_dates=['DocumentDate', 'DueDate', 'ClearingDate'], low_memory=False)
df = df.sort_values(by=['DueDate'], ascending=False)

### First filters ###
# Keep cleared invoices whose due and clearing dates fall after the document date.
df = df.dropna(subset=['ClearingDate'])
# .copy() so the column assignments below write to an independent frame, not a slice.
df = df[(df['DueDate'] > df['DocumentDate']) & (df['ClearingDate'] > df['DocumentDate'])].copy()

### 'Days to' columns ###
# Whole days of a timedelta Series. `.dt.days` works across pandas versions,
# whereas `astype('timedelta64[D]')` is rejected by pandas >= 2.0.
date_int = lambda x: x.dt.days.astype(int)
df['DueDateToClearingDate'] = date_int(df['ClearingDate'] - df['DueDate'])

### Date columns ###
for col in ['DocumentDate', 'DueDate']:
    df[col + 'Day'] = pd.DatetimeIndex(df[col]).day
    df[col + 'WeekDay'] = pd.DatetimeIndex(df[col]).weekday

### Ratio columns ###
later_columns = ['TotalLatePaidInvoices', 'TotalPendingLateInvoices', 'SumAmountLatePaidInvoices', 'SumAmountPendingLateInvoices']
total_columns = ['TotalPaidInvoices', 'TotalPendingInvoices', 'SumAmountPaidInvoices', 'SumAmountPendingInvoices']
ratio_columns = ['RatioLatePaidInvoices', 'RatioPendingLateInvoices', 'RatioAmountPaidLateInvoices', 'RatioAmountPendingLateInvoices']

for late_col, total_col, ratio_col in zip(later_columns, total_columns, ratio_columns):
    # 0/0 yields NaN and x/0 yields +/-inf; both mean "no total", so map them to 0.
    # (inf would otherwise survive fillna and be rejected by the scaler later on.)
    df[ratio_col] = (df[late_col] / df[total_col]).replace([np.inf, -np.inf], np.nan).fillna(0)

### Fix customer columns ###
# Fall back to the main customer key when the ship-to / sold-to key is missing.
for col in ['CustomerShipToKey', 'CustomerSoldToKey']:
    df[col] = df[col].fillna(df['CustomerKey'])

df = df.fillna(0)
df.head()

In [None]:
# Binary target: one cut at 4 days -> columns 'DueDateToClearingDateBin' (code) and '...BinLabel' (label).
df, bin_class_names = setup_buckets(df, col='DueDateToClearingDate', bins=[4], sufix='Bin')
# Four-bucket target with inner edges 1, 4 and 8 days -> 'DueDateToClearingDateBkt' / '...BktLabel'.
df, bucket_class_names = setup_buckets(df, col='DueDateToClearingDate', bins=[1, 4, 8], sufix='Bkt')

# Class distribution of both label columns.
plot_countplot(df, cols=['DueDateToClearingDateBinLabel', 'DueDateToClearingDateBktLabel'])

In [None]:
# Column roles used to build the model inputs:
#   'target'  - label column(s) to predict (the binary bucket code)
#   'numeric' - features used as-is
#   'nominal' - key-like features that preprocess_data imputes and integer-encodes
#   'date'    - day / weekday features derived from the date columns
columns = {
    'target': [
        'DueDateToClearingDateBin',
    ],
    'numeric': [
        'AvgDaysLatePaidInvoices',
        'StdevDaysLatePaidInvoices',
        'AvgDaysLatePendingInvoices',
        'StdevDaysLatePendingInvoices',
        'RatioLatePaidInvoices',
        'TotalLatePaidInvoices',
        'TotalPaidInvoices',
        'RatioAmountPaidLateInvoices',
        'SumAmountLatePaidInvoices',
        'SumAmountPaidInvoices',
        'RatioPendingLateInvoices',
        'TotalPendingLateInvoices',
        'TotalPendingInvoices',
        'RatioAmountPendingLateInvoices',
        'SumAmountPendingLateInvoices',
        'SumAmountPendingInvoices',
        'InvoiceAmount',
        'PaymentTerm',
        'PaymentFrequency',
    ],
    'nominal': [
        'CompanyKey',
        'CorporateDivision',
        'CustomerShipToKey',
        'CustomerSoldToKey',
        'CustomerKey',
    ],
    'date': [
        'DocumentDateDay',
        'DocumentDateWeekDay',
        'DueDateDay',
        'DueDateWeekDay',
    ],
}

# Arrays of column names; the helper functions index DataFrames with them.
y_column = np.array(columns['target'])
x_column = np.array(columns['numeric'] + columns['nominal'] + columns['date'])

# Train on the 6 months of due dates before 2020-08-01, test on the month starting that day.
train, test = split_data(df, date_column='DueDate', split_date='2020-08-01', train_months=6, test_months=1)

# Optional oversampling of the train set; enable at most one of the lines below.
# train = resample(train, x_column, y_column, SMOTE(sampling_strategy='auto', random_state=SEED))
# train = resample(train, x_column, y_column, ADASYN(sampling_strategy='auto', random_state=SEED))
# train = resample(train, x_column, y_column, BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=SEED))
# train = resample(train, x_column, y_column, BorderlineSMOTE(sampling_strategy='auto', kind='borderline-2', random_state=SEED))

# Target distribution in each split.
plot_countplot(train, cols=y_column, title='Train')
plot_countplot(test, cols=y_column, title='Test')

In [None]:
# Encode and scale the features; the date-derived columns are passed as nominal too,
# so they get the same imputation and integer conversion as the key columns.
x_train, y_train, x_test, y_test = preprocess_data(train, test, x_column, y_column, nominal_columns=columns['nominal'] + columns['date'])

# Random forest with class weighting; y_train is squeezed from a column to a 1-D vector.
clf = RandomForestClassifier(n_estimators=256, criterion='entropy', min_samples_leaf=1, class_weight='balanced', random_state=SEED, n_jobs=-1)
clf.fit(x_train, np.squeeze(y_train))

# Predict on the test split and show the report and confusion matrix.
y_predict = clf.predict(x_test)
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# test[(test[y_column[0]] == 1) & (y_predict == 0)].head(50)

# y_predict = (clf.predict_proba(x_test)[:,1] >= 0.9999).astype('int')
# plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# plot_feature_importance(x_column, clf.feature_importances_)

In [None]:
# Feature groups for feature_selection: each inner list is one group that is added to the
# selection as a unit. Commented-out groups are excluded from the search.
# NOTE(review): uncommenting the multi-feature groups makes the rows unequal in length;
# recent NumPy versions refuse to build such an array without dtype=object -- confirm before enabling.
# This rebinds `x_column`, replacing the flat array of names defined in an earlier cell.
x_column = np.array([
    # ['StdevDaysLatePaidInvoices',
    # 'AvgDaysLatePaidInvoices',
    # 'StdevDaysLatePendingInvoices',
    # 'AvgDaysLatePendingInvoices'],


    ['TotalLatePaidInvoices'],
    ['RatioLatePaidInvoices'],

    ['SumAmountLatePaidInvoices'],
    ['RatioAmountPaidLateInvoices'],

    # ['TotalPendingLateInvoices'],
    # ['RatioPendingLateInvoices'],

    # ['SumAmountPendingLateInvoices'],
    # ['RatioAmountPendingLateInvoices'],


    # ['PaymentFrequency',
    # 'PaymentTerm',
    # 'InvoiceAmount'],

    # ['DocumentDateWeekDay',
    # 'DueDateWeekDay',
    # 'DocumentDateDay',
    # 'DueDateDay'],
])


# Greedy search over the groups above, scored by the 'macro avg' F1 on the test split.
score, features = feature_selection(train, test, x_column, y_column, nominal_columns=columns['nominal'] + columns['date'], random_state=SEED)

print(f'\nFinal score: {score:.8f} >>> {features}')