In [None]:
# Colab-only setup: mount Google Drive at ./gdrive so the project files are reachable.
from google.colab import drive

drive.mount('./gdrive', force_remount=True)
# Make the project folder the working directory; the CSV read later in this
# notebook uses a relative path and therefore depends on this `%cd`.
%cd './gdrive/My Drive/Colab Notebooks/cubricks'

In [None]:
!pip -q install imbalanced-learn --upgrade

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Random seed handed to the classifiers below as `random_state` so fits are repeatable.
seed = 42

In [None]:
def plot_data_distribution(train, test, labels=None):
    """Draw side-by-side count plots of the train and test values.

    Parameters
    ----------
    train, test : array-like or pandas.Series
        Values whose frequencies are plotted (squeezed to one dimension).
        When the squeezed object carries a ``name`` attribute (e.g. a
        pandas Series) it is appended to the subplot title.
    labels : list, optional
        Category order for the x axis; ``None`` keeps seaborn's default order.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 2))

    for axis, (split_name, values) in zip(axes, (('Train', train), ('Test', test))):
        values = np.squeeze(values)
        # Take the title from the data itself instead of a notebook-global
        # loop variable that may be undefined or hold an unrelated column.
        name = getattr(values, 'name', None)
        title = split_name if name is None else f'{split_name} - {name}'
        # Pass the vector as `x=` explicitly: the first positional argument of
        # countplot is not `x` in every seaborn version.
        sns.countplot(x=values, order=labels, ax=axis).set_title(title)

def plot_confuncion_matrix(y_test, predict, title='Confuncion Matrix'):
    """Render the confusion matrix of ``predict`` against ``y_test`` as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    predict : array-like
        Predicted labels.
    title : str
        Title placed above the heatmap.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    matrix = confusion_matrix(y_test, predict)
    heatmap = sns.heatmap(
        matrix,
        annot=True,   # write the count in every cell
        fmt='d',      # counts are integers
        cmap='Blues',
        square=True,
        ax=axis,
    )
    heatmap.set_title(title)

def plot_feature_importance(features, importances, plot=True):
    """Print features ranked by importance and optionally draw a bar chart.

    Parameters
    ----------
    features : array-like of str
        Feature names, aligned with ``importances``.
    importances : array-like of float
        Importance score of each feature.
    plot : bool
        When True, also draw a horizontal bar chart (most important on top).
    """
    # Work on arrays so plain lists are accepted and fancy indexing works.
    features = np.asarray(features)
    importances = np.asarray(importances)

    # Indices of the features from most to least important.
    indices = np.argsort(importances)[::-1]
    print(f'Feature ranking:')

    # Iterate over the ranking itself; the number of features comes from the
    # arguments, not from a notebook-global training matrix.
    for idx in indices:
        print(f'{importances[idx]}\t{features[idx]}')

    if plot:
        positions = range(len(indices))
        plt.figure(figsize=(10, 8))
        plt.barh(positions, importances[indices])
        plt.yticks(positions, features[indices])
        plt.title('Feature Importance')
        plt.gca().invert_yaxis()  # highest importance at the top
        plt.show()

def setup_buckets(df, bins, columns):
    """Bucket numeric columns into labelled, left-closed intervals.

    The edges ``bins`` are extended with -inf and +inf, so ``len(bins) + 1``
    buckets are produced. The first bucket (everything below ``bins[0]``) is
    labelled ``'on-time'``, inner buckets ``'<low>-<high - 1>'`` and the last
    one ``'<low>+'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    bins : iterable of numbers
        Increasing inner bucket edges (list, tuple, array, ...).
    columns : iterable of str
        Columns to bucket. For each ``col`` two columns are written:
        ``col + 'Bucket'`` (ordered categorical) and ``col + 'BucketCT'``
        (integer category code, -1 for missing values).

    Returns
    -------
    (pandas.DataFrame, list of str)
        The same ``df`` object and the bucket labels in order.
    """
    # Unpacking accepts any iterable of edges, not only a list.
    edges = [-np.inf, *bins, np.inf]
    labels = [f'{low}-{high - 1}' for low, high in zip(edges[:-1], edges[1:])]
    labels[0], labels[-1] = 'on-time', labels[-1].replace('-inf', '+')

    for col in columns:
        bucket = pd.cut(df[col], bins=edges, labels=labels, right=False, include_lowest=True)
        df[col + 'Bucket'] = bucket
        # pd.cut already returns an ordered categorical; its codes follow label order.
        df[col + 'BucketCT'] = bucket.cat.codes
    return (df, labels)

def split_data_month_window(df, col, date, month_window):
    """Split ``df`` into a training window and a one-month test window.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    col : str
        Name of the datetime column used for the split.
    date : str or datetime-like
        Start of the test month (inclusive) and end of the training window
        (exclusive).
    month_window : int
        Number of months before ``date`` included in the training set.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Rows with ``date - month_window months <= col < date`` and rows with
        ``date <= col < date + 1 month``.
    """
    test_start = pd.to_datetime(date)
    train_start = test_start - pd.DateOffset(months=month_window)
    test_end = test_start + pd.DateOffset(months=1)

    stamps = df[col]
    in_train = (stamps >= train_start) & (stamps < test_start)
    in_test = (stamps >= test_start) & (stamps < test_end)
    return df[in_train], df[in_test]

def resample(df, x_column, y_column, func):
    """Resample ``df`` with ``func`` and rebuild a frame with the original dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    x_column : sequence of str
        Feature column names.
    y_column : sequence of str
        Target column name(s), given as a sequence.
    func : object
        Sampler exposing ``fit_resample(x, y)`` that returns the resampled
        pair; the returned ``y`` is expected to be one-dimensional, since a
        column axis is added to it before stacking.

    Returns
    -------
    pandas.DataFrame
        Resampled features followed by the target, cast back to the dtypes
        the columns had in ``df``.
    """
    features, target = df[x_column], df[y_column]
    # Remember the column dtypes: stacking into one ndarray loses them.
    original_dtypes = {**features.dtypes.to_dict(), **target.dtypes.to_dict()}

    x_res, y_res = func.fit_resample(features.values, target.values)

    stacked = np.concatenate((x_res, np.expand_dims(y_res, axis=1)), axis=1)
    names = np.concatenate((x_column, y_column))
    return pd.DataFrame(stacked, columns=names).astype(original_dtypes)


def features_selection(train, test, x_column, y_column, random_state=None):
    """Greedy backward feature elimination scored by macro-averaged F1.

    The full feature set is evaluated first and the features are reordered by
    that model's importances. Then, for every smaller size, each subset that
    leaves out one of the remaining features is scored by ``_selection``; the
    best subset of that size is recorded and becomes the pool for the next
    round.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames used to fit and to score each candidate subset.
    x_column : array-like of str
        Candidate feature names.
    y_column : array-like of str
        Target column name(s).
    random_state : int or None
        Forwarded to ``_selection``.

    Returns
    -------
    (predicts, index_max)
        ``predicts`` is an object array with one row per subset size, each row
        being ``[f1, features, importances]``, sorted by score descending;
        ``index_max`` is the row index of the best score in that array.

    Raises
    ------
    ValueError
        If ``x_column`` is empty.
    """
    from itertools import combinations
    import multiprocessing
    import functools

    # Accept lists as well: the fancy indexing below needs an ndarray.
    x_column = np.asarray(x_column)
    total = len(x_column)
    if total == 0:
        raise ValueError('x_column must contain at least one feature')

    predicts = []

    # One worker pool for the whole search instead of a new pool per round.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        for size in range(total, 0, -1):
            print(f'>>>\r{round((1-((size-1)/total)) * 100, 1)}% complete', end='')
            candidates = [list(subset) for subset in combinations(x_column, size)]

            evaluate = functools.partial(_selection, train, test, x_column, y_column, random_state)
            # Rows are [f1, features, importances]; named `results` so the
            # pandas alias `pd` is not shadowed inside this function.
            results = np.array(pool.map(evaluate, candidates), dtype=object)

            if size == total:
                # Single candidate (all features): order the pool by its importances.
                x_column = x_column[np.argsort(results[:, 2][0])[::-1]]

            best = np.argmax(results[:, 0])
            kept = results[best][1]
            dropped = [pos for pos, item in enumerate(x_column) if item not in kept]

            predicts.append(results[best])
            x_column = np.delete(x_column, dropped)

        pool.close()
        pool.join()

    predicts = np.array(predicts, dtype=object)
    predicts = predicts[predicts[:, 0].argsort()][::-1]
    index_max = np.argmax(predicts[:, 0])
    return (predicts, index_max)

def _selection(*args):
    """Fit a one-tree random forest on one feature subset and score it on the test frame.

    ``args`` is the positional tuple
    ``(train, test, x_column, y_column, random_state, features)``;
    ``x_column`` is accepted but not used here.

    Returns
    -------
    list
        ``[macro-averaged f1-score, features, feature importances]``.
    """
    train, test, _unused_columns, y_column, random_state, features = args

    train_x, train_y = train[features].values, train[y_column].values
    test_x, test_y = test[features].values, test[y_column].values

    model = RandomForestClassifier(n_estimators=1, criterion='entropy', random_state=random_state)
    model.fit(train_x, np.squeeze(train_y))

    predicted = model.predict(test_x)
    report = classification_report(test_y, predicted, output_dict=True, zero_division=True)
    macro_f1 = report['macro avg']['f1-score']
    return [macro_f1, features, model.feature_importances_]

In [None]:
### READ ###
# Load the invoices ('N/I' marks missing values) and order them by document date.
df = pd.read_csv(
    'InvoicedDocuments_v4.csv',
    sep=';',
    na_values=['N/I'],
    parse_dates=['DocumentDate', 'DueDate', 'ClearingDate'],
)
df = df.sort_values(by=['DocumentDate'], ascending=True, ignore_index=True)


### FILTERS ###
# Drop the unused review column and rows without a clearing date or payment terms.
df = df.drop(columns=['CustomerLastCreditReview'])
df = df.dropna(subset=['ClearingDate', 'PaymentTerms'])

# Keep documents above 1000 whose due and clearing dates both fall after the document date.
large_amount = df['DocumentAmount'] > 1000
due_after_issue = df['DueDate'] > df['DocumentDate']
cleared_after_issue = df['ClearingDate'] > df['DocumentDate']
df = df[large_amount & due_after_issue & cleared_after_issue].reset_index(drop=True)


### PREPROCESSING ###
# Remaining missing values become -1; purely numeric region values are mapped to -1 as well.
df = df.fillna(-1)
df['CustomerRegion'] = df['CustomerRegion'].map(
    lambda region: -1 if str(region).replace('-', '').isnumeric() else region
)

integer_cols = ['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments']
df[integer_cols] = df[integer_cols].apply(pd.to_numeric, downcast='integer')


### FEATURE GENERATION ###
# Replace the categorical columns with their integer category codes.
category_cols = ['CustomerRegion', 'PaymentTerms']
df[category_cols] = df[category_cols].apply(lambda x: pd.Categorical(x, ordered=True).codes)


# Amount-per-document ratio for each (amount, count) pair of history columns.
amount_cols = ['InvoicedAmount', 'PaidAmount', 'PaidPastAmount', 'OpenAmount', 'PastDueAmount']
count_cols = ['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments']

for amount, count in zip(amount_cols, count_cols):
    ratio_col = 'Ratio' + amount + count
    # A zero count gives +/-inf (x / 0) or NaN (0 / 0); both are mapped to 0.
    # The result is assigned back to the frame rather than filled through a
    # chained `df[col].fillna(..., inplace=True)`, which can act on a copy.
    df[ratio_col] = (df[amount] / df[count]).replace([np.inf, -np.inf], np.nan).fillna(0)

# Calendar features of the document and due dates.
for col in ['DocumentDate', 'DueDate']:
    dates = pd.DatetimeIndex(df[col])
    df[col + 'DayOfYear'] = dates.dayofyear
    df[col + 'WeekDay'] = dates.weekday
    df[col + 'Day'] = dates.day
    # NOTE(review): MonthEnd(1) moves a date that already is a month end to the
    # following month end - confirm this is intended (MonthEnd(0) would keep it).
    df[col + 'MonthEnd'] = df[col] + pd.offsets.MonthEnd(1)

# Day differences between the key dates. `.dt.days` extracts whole days
# directly; casting to 'timedelta64[D]' is rejected by recent pandas versions.
df['DueDateToClearingDate'] = (df['ClearingDate'] - df['DueDate']).dt.days
df['DueDateToMonthEnd'] = (df['DueDateMonthEnd'] - df['DueDate']).dt.days
df['DocumentDateToDueDate'] = (df['DueDate'] - df['DocumentDate']).dt.days
df['DocumentDateToMonthEnd'] = (df['DocumentDateMonthEnd'] - df['DocumentDate']).dt.days

# Target: payment delay relative to the due date, bucketed by week.
df, labels = setup_buckets(df, bins=[1, 8, 15, 22, 29], columns=['DueDateToClearingDate'])

In [None]:
# Re-bucket the payment delay with edges at 1, 4 and 7 days. This overwrites
# `labels` and the 'DueDateToClearingDateBucket' / 'DueDateToClearingDateBucketCT'
# columns written by the earlier setup_buckets call that used
# bins=[1, 8, 15, 22, 29].
df, labels = setup_buckets(df, bins=[1, 4, 7], columns=['DueDateToClearingDate'])
# df, labels = setup_buckets(df, bins=[1], columns=['DueDateToClearingDate'])

# df.isnull().sum()
# df['CustomerKey'].unique()

# df.head()

In [None]:
# Target: integer code of the payment-delay bucket.
y_column = np.array(['DueDateToClearingDateBucketCT'])

# Candidate model features.
x_column = np.array([
    # identifiers and document attributes
    'CompanyKey',
    'CustomerKey',
    'CustomerRegion',
    'PaymentTerms',
    'DocumentAmount',
    # customer history: document counts and amounts
    'InvoicedDocuments',
    'InvoicedAmount',
    'PaidDocuments',
    'PaidAmount',
    'PaidPastDocuments',
    'PaidPastAmount',
    'OpenDocuments',
    'OpenAmount',
    'PastDueDocuments',
    'PastDueAmount',
    'AvgDSOPastDueDocuments',
    'PastDueDays',
    # amount-per-document ratios
    'RatioInvoicedAmountInvoicedDocuments',
    'RatioPaidAmountPaidDocuments',
    'RatioPaidPastAmountPaidPastDocuments',
    'RatioOpenAmountOpenDocuments',
    'RatioPastDueAmountPastDueDocuments',
    # calendar features
    'DocumentDateDayOfYear',
    'DocumentDateWeekDay',
    'DocumentDateDay',
    'DueDateDayOfYear',
    'DueDateWeekDay',
    'DueDateDay',
    'DueDateToMonthEnd',
    'DocumentDateToDueDate',
    'DocumentDateToMonthEnd',
])

# x_column = np.array([
#                     #  'CompanyKey',
#                     #  'CustomerKey',
#                     #  'CustomerRegion',
#                     #  'DueDateWeekDay',
#                     #  'DocumentDateToDueDate',
#                      'CompanyKey',
#                      'PaymentTerms',
#                      'DueDateWeekDay',
#                      'CustomerKey',
#                      'DocumentDateToDueDate',
#                      'CustomerRegion',
#                      ])


# Train on the 12 months of documents dated before 2020-06-01; test on the
# month that starts on 2020-06-01.
train, test = split_data_month_window(df, col='DocumentDate', date='2020-06-01', month_window=12)

# Compare how the delay buckets are distributed in the two splits.
plot_data_distribution(train['DueDateToClearingDateBucket'], test['DueDateToClearingDateBucket'])

In [None]:
# Baseline: fit on every candidate feature and evaluate on the test month.
x_train, y_train = train[x_column].values, train[y_column].values
x_test, y_test = test[x_column].values, test[y_column].values

# A forest with a single tree (n_estimators=1), seeded for repeatability.
clf = RandomForestClassifier(n_estimators=1, criterion='entropy', random_state=seed, n_jobs=-1)
# y_train is selected with the one-element `y_column`, so it has a trailing
# axis of length 1; squeeze it to the 1-D shape passed to fit().
clf.fit(x_train, np.squeeze(y_train))

predict = clf.predict(x_test)
plot_confuncion_matrix(y_test, predict)

# plot_feature_importance(x_column, clf.feature_importances_)
print(classification_report(y_test, predict))

In [None]:
# Search for the feature subset with the best macro f1-score on the test month.
predicts, index_max = features_selection(train, test, x_column, y_column, random_state=seed)

# Each row of `predicts` is [f1-score, features, importances]; report the best
# row first, then the full list of attempts.
print(f'\n>>> Max f1-score: {predicts[index_max,0]}, {predicts[index_max,1]}')
print(f'\n>>> Attempts:\n{predicts}')