In [None]:
from google.colab import drive

# Mount Google Drive into the Colab runtime under ./gdrive.
drive.mount('./gdrive', force_remount=True)
# Switch the working directory to the project folder; the CSV read later in
# this notebook uses a relative path.
%cd './gdrive/My Drive/cubricks'

In [None]:
# Quietly install / upgrade pandas-profiling in the runtime.
# NOTE(review): no visible cell imports pandas-profiling - confirm it is still
# needed, and consider pinning a version for reproducibility.
!pip -q install pandas-profiling --upgrade

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
def plot_feature_importance(features, importances, plot=False):
    """Print features ranked by importance and optionally draw a bar chart.

    Args:
        features: Sequence of feature names, aligned with ``importances``.
        importances: Sequence of importance scores, one per feature.
        plot: When True, also draw a horizontal bar chart with the most
            important feature at the top.

    Raises:
        ValueError: If ``features`` and ``importances`` differ in length.
    """
    # Size everything from the arguments themselves rather than from the
    # notebook-global ``x_train``, so the function works for any feature set
    # and does not depend on kernel state.
    features = np.asarray(features)
    importances = np.asarray(importances)
    if len(features) != len(importances):
        raise ValueError(
            f'features ({len(features)}) and importances ({len(importances)}) must have the same length'
        )

    # Descending order of importance.
    indices = np.argsort(importances)[::-1]
    print('Feature ranking:')

    for idx in indices:
        print(f'{importances[idx]}\t{features[idx]}')

    if plot:
        positions = range(len(indices))
        plt.figure(figsize=(10, 8))
        plt.barh(positions, importances[indices])
        plt.yticks(positions, features[indices])
        plt.title('Feature Importance')
        # barh draws position 0 at the bottom; flip so rank 1 is on top.
        plt.gca().invert_yaxis()
        plt.show()


def _draw_normalized_heatmap(matrix, labels, axis, title):
    """Draw ``matrix`` on ``axis`` with every cell divided by its column total."""
    axis.set_title(title)
    # A column that sums to zero yields 0/0; silence those warnings only for
    # this division instead of changing NumPy's global error state.
    with np.errstate(divide='ignore', invalid='ignore'):
        normalized = matrix / matrix.sum(axis=0)
    sns.heatmap(normalized, square=True, annot=True, fmt='.2%', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=axis)


def plot_report(y_test, predict, labels=None, dummy=None, plot=False):
    """Print accuracy figures and optionally plot confusion-matrix heatmaps.

    Args:
        y_test: True labels.
        predict: Model predictions for the same items.
        labels: Tick labels passed to the heatmap axes.
        dummy: Optional baseline predictions; when given, its accuracy is
            printed and (with ``plot=True``) its heatmap is drawn next to the
            model's.
        plot: When True, draw the heatmap(s). Cells are normalized by column
            totals (``axis=0``).
    """
    print(f'Total items: {len(y_test)}')

    dummy_cm = None
    if dummy is not None:
        dummy_cm = confusion_matrix(y_test, dummy)
        print(f'Dummy accuracy: {accuracy_score(y_test, dummy) * 100:.2f}%')

    cm = confusion_matrix(y_test, predict)

    print(f'Model accuracy: {accuracy_score(y_test, predict) * 100:.2f}%\n')

    if not plot:
        return

    if dummy_cm is not None:
        _, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
        _draw_normalized_heatmap(dummy_cm, labels, ax[0], 'Dummy prediction')
        _draw_normalized_heatmap(cm, labels, ax[1], 'Model prediction')
    else:
        _, ax = plt.subplots(nrows=1, ncols=1, figsize=(3, 3))
        _draw_normalized_heatmap(cm, labels, ax, 'Model prediction')


def accuracy_by_class(y_test, predict, plot=True):
    """Compute overall and per-class accuracy of ``predict`` against ``y_test``.

    Args:
        y_test: True labels. Flattened to 1-D, so column vectors are accepted.
        predict: Predicted labels, one per item of ``y_test``.
        plot: When True, print a summary table.

    Returns:
        Tuple ``(acc, acc_class)`` of object arrays. ``acc`` is
        ``[total_items, overall_accuracy]``. ``acc_class`` holds one
        ``[class_items, class_accuracy]`` row per distinct label found in
        ``y_test``, in sorted label order.

    Raises:
        ValueError: If ``y_test`` and ``predict`` differ in number of items.
    """
    truth = np.ravel(y_test)
    guess = np.ravel(predict)
    if truth.shape != guess.shape:
        raise ValueError(
            f'y_test ({truth.shape[0]}) and predict ({guess.shape[0]}) must have the same number of items'
        )

    total = len(truth)
    hits = truth == guess
    total_hits = int(hits.sum())
    overall = total_hits / total if total else float('nan')

    # Count matches per true label directly. Indexing a confusion-matrix
    # diagonal by position is wrong when predictions contain a label that never
    # occurs in y_test, because the matrix then has extra rows/columns.
    class_rows, class_hits = [], []
    for label in np.unique(truth):
        members = truth == label
        count = members.sum()
        matched = int((members & hits).sum())
        class_rows.append([count, matched / count])
        class_hits.append(matched)

    # ``object`` replaces the ``np.object`` alias removed from NumPy.
    acc = np.array([total, overall], dtype=object)
    acc_class = np.array(class_rows, dtype=object)

    if plot:
        print(f'\t\tAcc.\t Match\t/ Total')
        # Print exact match counts instead of int(total * accuracy), which can
        # truncate to one less because of floating-point error.
        print(f'Classes: \t{acc[1]*100:.2f}%\t({total_hits}\t/ {acc[0]})')
        for i, (row, matched) in enumerate(zip(class_rows, class_hits)):
            print(f'Class {i}: \t{row[1]*100:.2f}%\t({matched}\t/ {row[0]})')

    return (acc, acc_class)


def split_data_month_window(df, col, date, month_window):
    """Split ``df`` into a trailing training window and a one-month test window.

    Args:
        df: DataFrame holding a datetime column ``col``.
        col: Name of the datetime column used for the split.
        date: Start of the test month (anything ``pd.to_datetime`` accepts).
        month_window: Number of months before ``date`` included in training.

    Returns:
        ``(train, test)`` where train covers ``[date - month_window, date)``
        and test covers ``[date, date + 1 month)``.
    """
    test_start = pd.to_datetime(date)
    train_start = test_start - pd.DateOffset(months=month_window)
    test_end = test_start + pd.DateOffset(months=1)

    stamps = df[col]
    in_train = stamps.ge(train_start) & stamps.lt(test_start)
    in_test = stamps.ge(test_start) & stamps.lt(test_end)
    return df[in_train], df[in_test]

In [None]:
# Read the semicolon-separated invoice export; 'N/I' entries count as missing.
df = pd.read_csv('InvoicedDocuments_v4.csv', sep=';', na_values=['N/I'])

# Keep only rows that have a ClearingDate, then zero-fill every remaining gap.
df = df.dropna(subset=['ClearingDate']).fillna(0)


number_cols = ['CustomerRegion', 'PaymentTerms']


def _encode_as_number(value):
    """Return ``value`` unchanged when its text is numeric, else an integer
    formed by concatenating the code points of its characters."""
    text = str(value)
    if text.isnumeric():
        return value
    return int(''.join(str(ord(ch)) for ch in text))


# Map element-wise so the result keeps df's own index. Returning plain lists
# from DataFrame.apply builds a frame with a fresh 0..n-1 index, and assigning
# that back aligns on index - which scrambles values (and leaves NaN) whenever
# df's index has gaps, e.g. after rows were dropped above.
df[number_cols] = df[number_cols].apply(lambda column: column.map(_encode_as_number))


# Document-count columns: parse as numbers and downcast to an integer dtype.
int_cols = ['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments']
df[int_cols] = df[int_cols].apply(lambda counts: pd.to_numeric(counts, downcast='integer'))


# Date columns: values that cannot be parsed become NaT.
date_cols = ['CustomerLastCreditReview', 'DocumentDate', 'DueDate', 'ClearingDate']
df[date_cols] = df[date_cols].apply(lambda raw: pd.to_datetime(raw, errors='coerce'))

# Calendar features for every date column except CustomerLastCreditReview.
for col in date_cols[1:]:
    stamps = pd.DatetimeIndex(df[col])
    df[col + 'Month'] = stamps.month
    df[col + 'Day'] = stamps.day
    df[col + 'WeekDay'] = stamps.weekday
    # NOTE(review): MonthEnd(1) appears to move a date that already is a month
    # end to the *next* month end - confirm that is intended for the label.
    df[col + 'MonthEnd'] = df[col] + pd.offsets.MonthEnd(1)


# Average amount per document for each amount / document-count pair.
dividend_cols = ['InvoicedAmount', 'PaidAmount', 'PaidPastAmount', 'OpenAmount', 'PastDueAmount']
divisor_cols = ['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments']

for dividend, divisor in zip(dividend_cols, divisor_cols):
    ratio_col = 'Ratio' + dividend + divisor
    ratio = df[dividend] / df[divisor]
    # A zero document count gives NaN (0/0) or +/-inf (x/0); map both to 0.
    # Assign the result back explicitly: in-place fillna on a column selection
    # is not guaranteed to modify df (it is a no-op under copy-on-write).
    df[ratio_col] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Whole-day gaps between pairs of dates: DaysTo<target> = target - source.
# NOTE(review): 'DueDateMonthEnd' appears twice in target_cols, so
# DaysToDueDateMonthEnd is written twice and ends up measured from DueDate
# (the later pair) - confirm the DocumentDate-based value is not needed.
source_cols = ['DocumentDate', 'DocumentDate', 'DueDate', 'DocumentDate', 'CustomerLastCreditReview']
target_cols = ['DueDate', 'DueDateMonthEnd', 'DueDateMonthEnd', 'ClearingDate', 'DocumentDate']

for src, tgt in zip(source_cols, target_cols):
    delta_col = 'DaysTo' + tgt
    elapsed = df[tgt] - df[src]
    # .dt.days replaces astype('timedelta64[D]'), which pandas >= 2 rejects.
    # Missing gaps (NaT on either side) count as 0 days and negative gaps are
    # clipped to 0. The result is assigned back rather than filled in place.
    df[delta_col] = elapsed.dt.days.fillna(0).astype(int).clip(lower=0)


# Chronological order with a fresh 0..n-1 index.
df = df.sort_values(by=['DocumentDate'], ascending=True, ignore_index=True)

In [None]:
# Drop rows whose document date falls after the due date or the clearing date.
df = df[(df['DocumentDate'] <= df['DueDate']) & (df['DocumentDate'] <= df['ClearingDate'])]

# Keep documents cleared in the due month or the month right after it. The
# difference is taken in whole calendar months *including the year*; comparing
# bare month numbers drops December -> January and admits payments that are
# 12 or 13 months late.
months_after_due = (
    (df['ClearingDate'].dt.year - df['DueDate'].dt.year) * 12
    + (df['ClearingDate'].dt.month - df['DueDate'].dt.month)
)
# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the unfiltered one.
df = df[months_after_due.between(0, 1)].copy()

# Target label: 1 when the document was cleared after DueDateMonthEnd.
df['AfterDueDateMonthEnd'] = (df['ClearingDate'] > df['DueDateMonthEnd']) * 1

In [None]:
# Target column, kept as a one-element array so selecting with it yields a 2-D block.
y_column = np.array(['AfterDueDateMonthEnd'])

# Model input columns. Order matters: feature importances are reported by position.
features = np.array([
    'AvgDSOPastDueDocuments',
    'CompanyKey',
    'CustomerKey',
    'CustomerRegion',
    'DaysToDocumentDate',
    'DaysToDueDate',
    'DaysToDueDateMonthEnd',
    'DocumentAmount',
    'DocumentDateDay',
    'DocumentDateMonth',
    'DocumentDateWeekDay',
    'DueDateDay',
    'DueDateMonth',
    'DueDateWeekDay',
    'InvoicedAmount',
    'InvoicedDocuments',
    'OpenAmount',
    'OpenDocuments',
    'PaidAmount',
    'PaidDocuments',
    'PaidPastAmount',
    'PaidPastDocuments',
    'PastDueAmount',
    'PastDueDays',
    'PastDueDocuments',
    'PaymentTerms',
    'RatioInvoicedAmountInvoicedDocuments',
    'RatioOpenAmountOpenDocuments',
    'RatioPaidAmountPaidDocuments',
    'RatioPaidPastAmountPaidPastDocuments',
    'RatioPastDueAmountPastDueDocuments',
])


# Train on the two months before April 2020 and test on April 2020.
train, test = split_data_month_window(df, col='DocumentDate', date='2020-04-01', month_window=2)

# t1, _ = split_data_month_window(df, col='DocumentDate', date='2020-07-01', month_window=2)
# train = train[train['AfterDueDateMonthEnd'] == 1].append(t1[t1['AfterDueDateMonthEnd'] == 0][75000:], ignore_index=False)

# t1, _ = split_data_month_window(df, col='DocumentDate', date='2020-02-01', month_window=3)
# train = train.append(t1[t1['AfterDueDateMonthEnd'] == 1], ignore_index=False)


# Class balance of the target in each split. The Series is passed as ``x=``:
# recent seaborn versions treat a lone positional argument as ``data``.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 2))
sns.countplot(x=train[y_column[0]], ax=ax[0]).set_title("Train - AfterDueDateMonthEnd")
sns.countplot(x=test[y_column[0]], ax=ax[1]).set_title("Test - AfterDueDateMonthEnd")


# fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 5))
# sns.heatmap(train[features].corr(), square=True, annot=False, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='Pastel1', ax=ax[0])
# sns.heatmap(test[features].corr(), square=True, annot=False, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='Pastel1', ax=ax[1])

# Plain arrays for scikit-learn; the y arrays are 2-D with a single column
# because y_column is a one-element array.
x_train, y_train = train[features].values, train[y_column].values
x_test, y_test = test[features].values, test[y_column].values

In [None]:
from sklearn.utils import resample

# Find the larger class from the data instead of assuming it is class 1, so
# the minority class is the one that actually gets up-sampled.
class_counts = train['AfterDueDateMonthEnd'].value_counts()
majority_label = class_counts.idxmax()

# Separate majority and minority classes
df_majority = train[train['AfterDueDateMonthEnd'] == majority_label]
df_minority = train[train['AfterDueDateMonthEnd'] != majority_label]

if len(df_minority) == 0:
    # Only one class present: nothing to balance.
    df_upsampled = df_majority.copy()
else:
    # Upsample minority class
    df_minority_upsampled = resample(df_minority,
                                     replace=True,                  # sample with replacement
                                     n_samples=len(df_majority),    # to match majority class
                                     random_state=32)               # reproducible results

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(df_upsampled['AfterDueDateMonthEnd'].value_counts())

In [None]:
# LogisticRegression is not imported anywhere else in the notebook; without
# this import the cell fails with NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Train on the class-balanced frame; this replaces the x_train / y_train
# built from the unbalanced split.
x_train, y_train = df_upsampled[features].values, df_upsampled[y_column].values

# y_train is a single-column 2-D array; squeeze it to the 1-D shape fit() expects.
clf = LogisticRegression().fit(x_train, np.squeeze(y_train))
predict = clf.predict(x_test)

# Distinct labels the model actually predicted.
print(np.unique(predict))
print('\nModel prediction:')
_ = accuracy_by_class(y_test, predict)

In [None]:
# def select_features(train, test, features, y_column):
#     cb, predicts = [], []

#     for y in range(len(features)):
#         for x in range(y, len(features)):
#             cb.append(cb[-1] + [features[x]] if x > y else [features[y]])

#     cb = np.array(sorted(cb, key=len))
#     print(f'Combinations: {len(cb)}')

#     for features_comb in cb:
#         random_forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=32)

#         x_train, y_train = train[features_comb].values, train[y_column].values
#         x_test, y_test = test[features_comb].values, test[y_column].values

#         random_forest.fit(x_train, np.squeeze(y_train))
#         predict = random_forest.predict(x_test)

#         acc, acc_class = accuracy_by_class(y_test, predict, plot=False)
#         predicts.append([acc_class[1][1], features_comb])

#     return np.array(predicts, dtype=object)


# Max accuracy: 0.33461261693451666, features: ['CompanyKey', 'CustomerKey', 'CustomerRegion', 'DaysToDocumentDate', 'DaysToDueDate', 'DaysToDueDateMonthEnd', 'DocumentAmount', 'DocumentDateDay', 'DocumentDateMonth', 'DocumentDateWeekDay', 'DueDateDay', 'DueDateMonth', 'DueDateWeekDay', 'InvoicedAmount', 'InvoicedDocuments', 'OpenAmount', 'OpenDocuments', 'PaidAmount', 'PaidDocuments', 'PaidPastAmount', 'PaidPastDocuments', 'PastDueAmount', 'PastDueDays', 'PastDueDocuments', 'PaymentTerms', 'RatioInvoicedAmountInvoicedDocuments', 'RatioOpenAmountOpenDocuments', 'RatioPaidAmountPaidDocuments']
# predicts = select_features(train, test, features, y_column)

# print(f'\nMax accuracy: {np.max(predicts[:,0])}, features: {predicts[np.argmax(predicts[:,0])][1]}')
# print(f'\nAttempts:\n{predicts}')

In [None]:
# random_forest = RandomForestClassifier(n_estimators=10, criterion='entropy', min_weight_fraction_leaf=1e-4, random_state=32)
# Fit a 15-tree entropy forest on the current x_train / y_train (fixed seed).
random_forest = RandomForestClassifier(
    n_estimators=15,
    criterion='entropy',
    random_state=32,
).fit(x_train, np.squeeze(y_train))

predict = random_forest.predict(x_test)

# Baseline for comparison: predict class 0 for every test item.
baseline = np.zeros(y_test.shape)
print('Dummy prediction:')
_ = accuracy_by_class(y_test, baseline)

print('\nModel prediction:')
_ = accuracy_by_class(y_test, predict)

# plot_report(y_test, predict, labels=[0, 1], dummy=np.zeros(predict.shape), plot=True)
# plot_feature_importance(features, random_forest.feature_importances_, plot=True)