In [None]:
# Print the status of the GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
from google.colab import drive

# Mount Google Drive under ./gdrive (forcing a fresh mount), then move into the project folder.
# NOTE(review): both paths are relative, so they resolve against whatever the working
# directory is when this cell runs. Re-running the cell after the %cd below will look for
# ./gdrive inside the project folder — confirm the cell is only executed once per session.
drive.mount('./gdrive', force_remount=True)
%cd './gdrive/My Drive/Colab Notebooks/cubricks'

In [None]:
!pip -q install pandas-profiling imbalanced-learn tensorflow-gpu --upgrade

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Random seed shared by the shuffling, splitting and model-fitting code below.
seed = 42

In [None]:
def plot_countplot(df, cols, title=None, rotation=0):
    """Draw one seaborn count plot (its own 8x2 figure) per entry of ``cols``.

    Parameters
    ----------
    df : object supporting ``df[col]`` for string keys or ``df[:, col]`` otherwise
        Source of the values to count.
    cols : iterable
        Columns to plot. String entries are read with ``df[col]``; any other
        entry is read with ``df[:, col]``.
    title : str, optional
        Title applied to every plot.
    rotation : float
        Rotation of the x tick labels.
    """
    for col in cols:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 2))

        # String keys select by label; anything else selects a column by position.
        if isinstance(col, str):
            values = df[col]
        else:
            values = df[:, col]

        chart = sns.countplot(x=np.squeeze(values), ax=ax)
        chart.set_xticklabels(labels=chart.get_xticklabels(), rotation=rotation)
        chart.set_title(title)

def plot_confuncion_matrix(y_test, predict, title='Confusion Matrix', report=True):
    """Plot the confusion matrix of ``predict`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    predict : array-like
        Predicted labels.
    title : str
        Title of the heatmap.
    report : bool
        When true, print the classification report before plotting.
    """
    if report:
        # Report on the ``predict`` argument. The previous code read a global named
        # ``y_predict`` here, so it either failed with NameError or silently reported
        # on whatever that global currently held.
        print(classification_report(y_test, predict))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    g = sns.heatmap(confusion_matrix(y_test, predict), fmt='d', square=True, annot=True, cmap='Blues', ax=ax)
    g.set_title(title)

def plot_feature_importance(features, importances):
    """Print and plot the features ranked by decreasing importance.

    Parameters
    ----------
    features : sequence of str
        Feature names.
    importances : sequence of float
        One importance value per feature, in the same order as ``features``.

    Raises
    ------
    ValueError
        If ``features`` and ``importances`` have different lengths.
    """
    # Coerce both inputs so plain lists can be fancy-indexed just like arrays.
    features = np.asarray(features)
    importances = np.asarray(importances)
    if len(features) != len(importances):
        raise ValueError(
            f'features ({len(features)}) and importances ({len(importances)}) must have the same length')

    # Indices of the features from most to least important.
    indices = np.argsort(importances)[::-1]
    print('Feature ranking:')

    for idx in indices:
        print(f'{importances[idx]}\t{features[idx]}')

    plt.figure(figsize=(10, 8))
    plt.barh(range(len(features)), importances[indices])
    plt.yticks(range(len(features)), features[indices])
    plt.title('Feature Importance')
    # Put the most important feature at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()

def setup_buckets(df, col, bins, sufix='Bucket'):
    """Bucket ``df[col]`` into left-closed intervals and store labels and codes.

    The cut points in ``bins`` are extended with -inf and +inf, so every value
    falls into an interval ``[lower, upper)``. Two columns are written to ``df``:

    * ``col + sufix + 'Category'`` - categorical label ``'<lower> to <upper - 1>'``
    * ``col + sufix``              - integer code of that category (0, 1, ...)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    col : str
        Name of the numeric column to bucket.
    bins : sequence of numbers
        Inner cut points in increasing order (list, tuple or array).
    sufix : str
        Suffix used to build the names of the two new columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added.
    """
    # ``list(bins)`` lets callers pass tuples or arrays, not only lists.
    edges = [-np.inf] + list(bins) + [np.inf]
    labels = [f'{lower} to {upper - 1}' for lower, upper in zip(edges[:-1], edges[1:])]

    category_col = col + sufix + 'Category'
    df[category_col] = pd.cut(df[col], bins=edges, labels=labels, right=False, include_lowest=True)
    # The categories are already ordered by interval, so their codes are the bucket ids.
    df[col + sufix] = df[category_col].cat.codes
    return df

def split_data_month_window(df, col, date, month_window):
    """Split ``df`` into a training window and a one-month test window by date.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col : str
        Name of the datetime column used for the split.
    date : str or datetime-like
        First day of the test window (parsed with ``pd.to_datetime``).
    month_window : int
        Number of months before ``date`` covered by the training window.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` holds rows with ``date - month_window months <= df[col] < date``;
        ``test`` holds rows with ``date <= df[col] < date + 1 month``.
        Both have a fresh 0..n-1 index.
    """
    test_start = pd.to_datetime(date)
    train_start = test_start - pd.DateOffset(months=month_window)
    test_end = test_start + pd.DateOffset(months=1)

    stamps = df[col]
    # reset_index without inplace returns new frames instead of mutating the filtered
    # selections, so callers can modify the results without chained-assignment warnings.
    train = df[(stamps >= train_start) & (stamps < test_start)].reset_index(drop=True)
    test = df[(stamps >= test_start) & (stamps < test_end)].reset_index(drop=True)
    return train, test

def prepare_data(train, test, x_column, y_column, random_state=None):
    """Turn the train/test frames into shuffled, log-transformed, scaled arrays.

    Steps: shuffle the training rows, apply ``log(1 + x)`` to every feature,
    then scale with a ``RobustScaler`` fitted on the training features.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the feature and target columns.
    x_column : list-like of str
        Feature column names.
    y_column : list-like of str
        Target column name(s).
    random_state : int, optional
        Seed for the shuffle of the training rows.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        Features are 2-D (rows, len(x_column)); targets are 2-D (rows, len(y_column)).
        Test rows keep their original order.
    """
    # Shuffle with a private generator so the global NumPy RNG state is left alone.
    order = np.random.RandomState(random_state).permutation(train.shape[0])

    x_train, y_train = train[x_column].values[order], train[y_column].values[order]
    x_test, y_test = test[x_column].values, test[y_column].values

    # log(1 + x) on the whole array at once (no per-element Python loop).
    # NOTE(review): values <= -1 produce -inf/NaN here — confirm features are non-negative.
    x_train, x_test = np.log1p(x_train), np.log1p(x_test)

    # Fit the scaler on the training features only; including the test rows would
    # leak test-set statistics into the preprocessing.
    scaler = RobustScaler()
    scaler.fit(x_train)

    x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

    return x_train, y_train, x_test, y_test

def classifier_predict(clf, x_test, threshold=0.5, network=False):
    """Return 0/1 predictions by thresholding column 1 of the model's scores.

    Parameters
    ----------
    clf : model object
        Must provide ``predict`` (used when ``network`` is true) or
        ``predict_proba`` (used otherwise), returning a 2-D score array.
    x_test : array-like
        Samples to score.
    threshold : float
        A sample is labelled 1 when its column-1 score is >= ``threshold``.
    network : bool
        Selects ``clf.predict`` instead of ``clf.predict_proba``.

    Returns
    -------
    numpy.ndarray of int
    """
    if network:
        scores = clf.predict(x_test)
    else:
        scores = clf.predict_proba(x_test)

    positive_score = scores[:, 1]
    return (positive_score >= threshold).astype('int')

# def resample(df, x_column, y_column, func):
#     dtypes = df[x_column].dtypes.to_dict()
#     dtypes.update(df[y_column].dtypes.to_dict())

#     x, y = df[x_column].values, df[y_column].values
#     x, y = func.fit_resample(x, y)
#     xy = np.concatenate((x, np.expand_dims(y, axis=1)), axis=1)
    
#     data = pd.DataFrame(xy, columns=np.concatenate((x_column, y_column)))
#     data = data.astype(dtypes)
#     return data

In [None]:
### Read dataset ###
# Semicolon-separated file; 'N/I' marks missing values; the three date columns are parsed.
df = pd.read_csv(
    'InvoicedDocuments_v6.csv',
    sep=';',
    na_values=['N/I'],
    parse_dates=['DocumentDate', 'DueDate', 'ClearingDate'],
)

### First filters ###
# Drop rows without a clearing date or payment terms.
df = df.dropna(subset=['ClearingDate', 'PaymentTerms'])

# Keep rows whose due date and clearing date both fall after the document date,
# and whose document date is on or after 2018-03-01.
due_after_document = df['DueDate'] > df['DocumentDate']
cleared_after_document = df['ClearingDate'] > df['DocumentDate']
df = df[due_after_document & cleared_after_document]
df = df[df['DocumentDate'] >= '2018-03-01']

# Chronological order with a fresh 0..n-1 index.
df = df.sort_values(by=['DocumentDate'], ascending=True, ignore_index=True)
df = df.reset_index(drop=True)

### Fix numerical columns ###
# Each amount column is divided by its matching document-count column (amount per
# document); missing values in either column are then replaced by 0.
count_column_of = {
    'InvoicedAmount': 'InvoicedDocuments',
    'PaidAmount': 'PaidDocuments',
    'PaidPastAmount': 'PaidPastDocuments',
    'OpenAmount': 'OpenDocuments',
    'PastDueAmount': 'PastDueDocuments',
}
for amount, count in count_column_of.items():
    df[amount] = df[amount].div(df[count])
    pair = [amount, count]
    df[pair] = df[pair].fillna(0)

# The average columns only need their missing values filled.
avg_cols = ['AvgDSOPastDueDocuments', 'AvgPastDueDays']
df[avg_cols] = df[avg_cols].fillna(0)

### Extract date information ###
# Convert a timedelta Series to whole days. Dividing by one day works on every pandas
# version, whereas ``astype('timedelta64[D]')`` is rejected by pandas >= 2.0 (only
# s/ms/us/ns resolutions are supported). Any fractional day is truncated toward zero
# by the final integer cast.
date_int = lambda x: (x / np.timedelta64(1, 'D')).astype(int)

for col in ['DocumentDate', 'DueDate']:
    # Build the DatetimeIndex once per column instead of once per derived feature.
    stamps = pd.DatetimeIndex(df[col])
    df[col + 'DayOfYear'] = stamps.dayofyear
    df[col + 'Month'] = stamps.month
    df[col + 'Day'] = stamps.day
    df[col + 'WeekDay'] = stamps.weekday

### Days to DueDate ###
df['DocumentDateToDueDate'] = date_int(df['DueDate'] - df['DocumentDate'])

### Days to ClearingDate ###
# Days between due date and clearing date (positive when cleared after the due date).
df['DueDateToClearingDate'] = date_int(df['ClearingDate'] - df['DueDate'])
df['DocumentDateToClearingDate'] = date_int(df['ClearingDate'] - df['DocumentDate'])

### Categorical columns ###
category_cols = ['CompanyKey', 'CustomerKey', 'CorporateDivision', 'CustomerRegion', 'PaymentTerms']

# Fill missing regions with the most frequent region. The result is assigned back to
# the column: calling fillna(inplace=True) on the ``df['CustomerRegion']`` selection is
# chained assignment, which pandas deprecates and which does not update ``df`` under
# copy-on-write.
most_common_region = df['CustomerRegion'].value_counts().idxmax()
df['CustomerRegion'] = df['CustomerRegion'].fillna(most_common_region)

# Replace every category by its integer code (a missing value gets code -1).
df[category_cols] = df[category_cols].apply(lambda x: pd.Categorical(x, ordered=False).codes)

df.head()

In [None]:
# Derive two targets from the days between due date and clearing date:
#   'Bin'    -> two classes, split at 1 day
#   'Bucket' -> four classes, split at 1, 3 and 8 days
for bucket_sufix, bucket_bins in (('Bin', [1]), ('Bucket', [1, 3, 8])):
    df = setup_buckets(df, col='DueDateToClearingDate', bins=bucket_bins, sufix=bucket_sufix)

# Class balance of both targets.
plot_countplot(
    df,
    cols=['DueDateToClearingDateBinCategory', 'DueDateToClearingDateBucketCategory'],
)

In [None]:
# Target: the two-class code written by setup_buckets(..., sufix='Bin').
y_column = np.array(['DueDateToClearingDateBin'])

# Features, grouped by kind. Commented-out entries are currently disabled.
history_features = [
    'AvgPastDueDays',
    'AvgDSOPastDueDocuments',
    # 'PaidDocuments',
    # 'PaidAmount',
    # 'InvoicedDocuments',
    # 'InvoicedAmount',
    # 'OpenDocuments',
    # 'OpenAmount',
    # 'PaidPastDocuments',
    'PastDueAmount',
    'PaidPastAmount',
    # 'PastDueDocuments',
]
document_features = [
    'DocumentAmount',
    'DocumentDateToDueDate',
]
categorical_features = [
    'CompanyKey',
    'PaymentTerms',
    'CorporateDivision',
    'CustomerKey',
    'CustomerRegion',
]
calendar_features = [
    'DocumentDateDay',
    'DocumentDateWeekDay',
    'DocumentDateDayOfYear',
    'DueDateDay',
    'DueDateWeekDay',
    'DueDateDayOfYear',
]

# The concatenation order is the column order of the feature matrix.
x_column = np.array(history_features + document_features + categorical_features + calendar_features)


# Train on the 12 months of due dates before 2020-08-01; test on the month starting that day.
train, test = split_data_month_window(df, col='DueDate', date='2020-08-01', month_window=12)


# from imblearn.over_sampling import SMOTE, ADASYN
# train = resample(train, x_column, y_column, ADASYN(random_state=seed))


x_train, y_train, x_test, y_test = prepare_data(
    train=train,
    test=test,
    x_column=x_column,
    y_column=y_column,
    random_state=seed,
)

# Class balance of the target in each split.
for split_title, split_frame in (('Train', train), ('Test', test)):
    plot_countplot(split_frame, cols=['DueDateToClearingDateBin'], title=split_title)

In [None]:
# Random-forest baseline with balanced class weights.
clf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    class_weight='balanced',
    random_state=seed,
    n_jobs=-1,
)
# The target array is 2-D (rows, 1); squeeze it to the 1-D shape the classifier is given.
clf.fit(x_train, np.squeeze(y_train))

# Score the test split at the default 0.5 threshold.
y_predict = classifier_predict(clf=clf, x_test=x_test, threshold=0.5)
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# Same forest, but class 1 is predicted only when its probability is at least 0.9999.
y_predict = classifier_predict(clf=clf, x_test=x_test, threshold=0.9999)
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# Rank the input features by the importance the fitted forest assigns to them.
plot_feature_importance(features=x_column, importances=clf.feature_importances_)

In [None]:
# from pandas_profiling import ProfileReport

# cols = np.concatenate((x_column, y_column), axis=0)
# df_train = np.concatenate((x_train, y_train), axis=1)
# df_test = np.concatenate((x_test, y_test), axis=1)
# df_local = np.concatenate((df_train, df_test), axis=0)

# df_local = pd.DataFrame(df_local, columns=cols)
# profile = ProfileReport(df_local)

# profile

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
import tensorflow as tf

def make_model(input_dim=None, num_classes=None):
    """Build and compile the dense softmax classifier.

    The network is four Dense -> PReLU -> BatchNormalization stages
    (256, 512, 256, 128 units, each Dense with L1/L2 kernel regularisation)
    followed by a softmax output layer. It is compiled with Adam (AMSGrad),
    categorical cross-entropy and a weighted F1 metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to ``x_train.shape[1]`` read from the
        notebook's global ``x_train``.
    num_classes : int, optional
        Number of output classes. Defaults to the number of distinct values in
        the notebook's global ``y_train``.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    # Fall back to the notebook globals so existing ``make_model()`` calls keep working.
    if input_dim is None:
        input_dim = x_train.shape[1]
    if num_classes is None:
        num_classes = np.unique(y_train).shape[0]
    input_dim, num_classes = int(input_dim), int(num_classes)

    model = tf.keras.models.Sequential(name='cubricks')
    # The per-sample input shape is passed as a tuple rather than a bare int.
    model.add(tf.keras.layers.Input(shape=(input_dim,)))

    # Four identical hidden stages that differ only in width.
    # A Dropout(rate=0.1) after the first two stages is left disabled (it was commented out).
    for units in (256, 512, 256, 128):
        model.add(tf.keras.layers.Dense(
            units,
            kernel_initializer='glorot_normal',
            kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
        model.add(tf.keras.layers.PReLU())
        model.add(tf.keras.layers.BatchNormalization(renorm=False))

    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-8, amsgrad=True),
        # Alternative kept for reference: CategoricalCrossentropy(label_smoothing=0.1).
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tfa.metrics.F1Score(num_classes=num_classes, average='weighted')])

    return model


# Build the network once to inspect its architecture.
model = make_model()
model.summary()

# Rebuild the month-window split and the scaled feature arrays ...
train, test = split_data_month_window(df, col='DueDate', date='2020-08-01', month_window=12)
x_train, y_train, x_test, y_test = prepare_data(
    train=train,
    test=test,
    x_column=x_column,
    y_column=y_column,
    random_state=seed,
)

# ... then hold out 10% of the training rows, stratified by label, for validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    shuffle=True,
    random_state=seed,
    stratify=y_train,
)

In [None]:
# Training hyper-parameters.
BATCH, EPOCHS, PATIENCE = 256, 10000, 10000

# Save the weights whenever the validation F1 score improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='./logs/weights.hdf5',
    monitor='val_f1_score',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop after PATIENCE epochs without improvement and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_f1_score',
    mode='max',
    patience=PATIENCE,
    restore_best_weights=True,
    verbose=1,
)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs', profile_batch=0)

# 'balanced' class weights, keyed by class label.
class_labels = np.unique(y_train)
balanced_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train[:,0])
class_weight = dict(zip(class_labels, balanced_weights))

model = make_model()
# model.load_weights(filepath='./logs/weights.hdf5')

# Labels are one-hot encoded to match the softmax output and categorical loss.
model_history = model.fit(
    x_train,
    tf.keras.utils.to_categorical(y_train),
    validation_data=(x_valid, tf.keras.utils.to_categorical(y_valid)),
    epochs=EPOCHS,
    batch_size=BATCH,
    shuffle=True,
    callbacks=[checkpoint, early_stopping, tensorboard],
    class_weight=class_weight,
)

In [None]:
def plot_model_history(history):
    """Plot training and validation curves for loss and F1 score, side by side.

    Parameters
    ----------
    history : object with ``epoch`` and ``history`` attributes
        ``history.history`` must contain the keys 'loss', 'val_loss',
        'f1_score' and 'val_f1_score'.
    """
    # Both curves of a panel share the first colour of the active colour cycle;
    # the validation curve is told apart by its dashed line.
    line_color = plt.rcParams['axes.prop_cycle'].by_key()['color'][0]
    plt.figure(figsize=(15, 3))

    for position, metric in enumerate(['loss', 'f1_score'], start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.epoch, history.history[metric], color=line_color, label='Train')
        plt.plot(history.epoch, history.history['val_' + metric], color=line_color, linestyle='--', label='Val')

        lower, upper = plt.ylim()
        if position > 1:
            # Score panel: cap the axis at 1.
            plt.ylim([lower, 1])
        else:
            # Loss panel: start the axis at 0.
            plt.ylim([0, upper])

        plt.ylabel(metric.replace('_', ' ').capitalize())
        plt.xlabel('Epoch')
        plt.legend()


# Learning curves of the training run above.
plot_model_history(history=model_history)

In [None]:
# Load the checkpointed weights (saved when validation F1 improved) before scoring the test split.
model.load_weights(filepath='./logs/weights.hdf5')

# network=True: scores come from model.predict rather than predict_proba.
y_predict = classifier_predict(clf=model, x_test=x_test, threshold=0.5, network=True)
plot_confuncion_matrix(y_test, y_predict)