In [None]:
# Print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi

In [None]:
from google.colab import drive

# Mount Google Drive under ./gdrive (forcing a remount) and move into the
# project folder, so the relative file names used later resolve there.
drive.mount('./gdrive', force_remount=True)
%cd './gdrive/My Drive/Colab Notebooks/cubricks'

## Modules

In [None]:
!pip -q install pandas-profiling imbalanced-learn tensorflow-gpu --upgrade

In [None]:
from pandas_profiling import ProfileReport

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Seed handed to the estimators below as `random_state` so runs are repeatable.
SEED = 42

## Data Functions

### Plot

In [None]:
def plot_countplot(df, cols=(0,), title=None, rotation=0):
    """Draw one count plot per requested column.

    Args:
        df: pandas DataFrame (columns selected by name) or 2-D NumPy array
            (columns selected by integer position).
        cols: iterable of column names and/or integer column positions.
        title: title applied to every generated plot.
        rotation: rotation, in degrees, of the x tick labels.
    """
    for col in cols:
        # String keys address DataFrame columns; anything else is treated as a
        # positional column index into a 2-D array.
        values = df[col] if isinstance(col, str) else df[:, col]

        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 2))
        sns.countplot(x=np.squeeze(values), ax=ax)
        # Rotate the labels in place rather than reading them back and
        # re-assigning them to the axis.
        ax.tick_params(axis='x', labelrotation=rotation)
        ax.set_title(title)

def plot_confuncion_matrix(y_test, predict, title='Confusion Matrix', report=True):
    """Plot the confusion matrix of `predict` against `y_test` as a heatmap.

    Args:
        y_test: true labels.
        predict: predicted labels, aligned with `y_test`.
        title: title of the heatmap.
        report: when True, also print scikit-learn's classification report.
    """
    if report:
        # The report must describe the `predict` argument itself, not whatever
        # notebook-level variable happens to be called `y_predict`.
        print(classification_report(y_test, predict, zero_division=True))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    g = sns.heatmap(confusion_matrix(y_test, predict), fmt='d', square=True, annot=True, cmap='Blues', ax=ax)
    g.set_title(title)

def plot_feature_importance(features, importances):
    """Print and plot the features ordered from most to least important.

    Args:
        features: sequence of feature names.
        importances: NumPy array of importance values, one per feature.
    """
    names = np.array(features)
    order = np.argsort(importances)[::-1]  # descending importance
    count = len(names)

    print(f'Feature ranking:')
    for rank in range(count):
        position = order[rank]
        print(f'{importances[position]}\t{names[position]}')

    ticks = range(count)
    plt.figure(figsize=(10, 8))
    plt.barh(ticks, importances[order])
    plt.yticks(ticks, names[order])
    plt.title('Feature Importance')
    # Bar 0 holds the largest value; flipping the axis puts it at the top.
    plt.gca().invert_yaxis()
    plt.show()

### Data Manager

In [None]:
def setup_buckets(df, col, bins, sufix='Bucket'):
    """Bucket a numeric column and add the result to `df` in place.

    Two columns are written: `<col><sufix>Category` with a readable label per
    bucket and `<col><sufix>` with the integer code of that bucket.

    Args:
        df: DataFrame to extend (modified in place and returned).
        col: name of the numeric column to bucket.
        bins: list of inner bucket edges; open-ended buckets are added on
            both sides.
        sufix: suffix used to name the new columns.

    Returns:
        The same DataFrame object that was passed in.
    """
    edges = [-np.inf] + bins + [np.inf]
    # Buckets are left-closed, so the label's upper bound is "next edge - 1".
    labels = [f'{lower} to {upper - 1}' for lower, upper in zip(edges[:-1], edges[1:])]

    category_col = col + sufix + 'Category'
    df[category_col] = pd.cut(df[col], bins=edges, labels=labels, right=False, include_lowest=True)
    df[col + sufix] = df[category_col].cat.codes
    return df

def split_data(df, date_column, split_date, train_months=6, test_months=1, category_cols=None):
    """Cut a time window around `split_date` and split it into train and test.

    The window spans `train_months` before `split_date` (inclusive) up to
    `test_months` after it (exclusive). Rows dated before `split_date` form
    the train set, the remaining rows the test set.

    Args:
        df: source DataFrame (not modified).
        date_column: name of the datetime column used for the split.
        split_date: boundary date, anything `pd.to_datetime` accepts.
        train_months: number of months kept before the boundary.
        test_months: number of months kept from the boundary onwards.
        category_cols: optional list of columns to replace by integer
            category codes, computed over the whole window.

    Returns:
        Tuple `(train, test)` of DataFrames.
    """
    boundary = pd.to_datetime(split_date)
    window_start = boundary - pd.DateOffset(months=train_months)
    window_end = boundary + pd.DateOffset(months=test_months)

    dates = df[date_column]
    inside_window = (dates >= window_start) & (dates < window_end)
    df_ranged = df[inside_window].copy().reset_index(drop=True)

    if category_cols is not None:
        # Encode on the whole window so train and test share the same codes.
        encoded = df_ranged[category_cols].apply(lambda x: pd.Categorical(x, ordered=False).codes)
        df_ranged[category_cols] = encoded

    before_boundary = df_ranged[date_column] < boundary
    from_boundary = df_ranged[date_column] >= boundary
    return df_ranged[before_boundary], df_ranged[from_boundary]

def preprocess_data(train, test, x_column, y_column):
    """Turn the train/test frames into scaled feature matrices and targets.

    Features go through `log(1 + x)` and are then scaled with a
    `RobustScaler` whose statistics come from the training rows only, so no
    information from the test period leaks into the transformation.

    Args:
        train: training DataFrame.
        test: test DataFrame.
        x_column: feature column names.
        y_column: target column names.

    Returns:
        Tuple `(x_train, y_train, x_test, y_test)` of NumPy arrays. The
        training arrays are in reversed row order relative to `train`.
    """
    x_train, y_train = train[x_column].values[::-1], train[y_column].values[::-1]
    x_test, y_test = test[x_column].values, test[y_column].values

    # Cast to float before the transform: `x + 1` on small integer dtypes can
    # overflow, and log1p is vectorised and also accepts empty arrays.
    x_train = np.log1p(x_train.astype(float))
    x_test = np.log1p(x_test.astype(float))

    scaler = RobustScaler(quantile_range=(25.0, 75.0))
    scaler.fit(x_train)  # train only: the test set must stay unseen

    return scaler.transform(x_train), y_train, scaler.transform(x_test), y_test

### Deprecated ###
def resample(df, x_column, y_column, func):
    """Resample `df` with a sampler and return the result as a DataFrame.

    Args:
        df: source DataFrame.
        x_column: array of feature column names.
        y_column: array of target column names.
        func: object exposing `fit_resample(x, y)` returning `(x, y)`.

    Returns:
        DataFrame with the feature and target columns, cast back to the
        dtypes they had in `df`. If the sampler fails, a `RuntimeWarning` is
        emitted and the original rows are returned.
    """
    import warnings

    dtypes = df[x_column].dtypes.to_dict()
    dtypes.update(df[y_column].dtypes.to_dict())
    x, y = df[x_column].values, df[y_column].values

    try:
        x, y = func.fit_resample(x, y)
    except Exception as error:
        # Best effort: keep the original rows, but make the failure visible
        # instead of swallowing it.
        warnings.warn(f'Resampling failed ({error!r}); returning the data unchanged.',
                      RuntimeWarning, stacklevel=2)

    # Samplers may hand the target back as a flat vector; concatenation below
    # needs a column.
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.expand_dims(y, axis=1)

    xy = np.concatenate((x, y), axis=1)
    data = pd.DataFrame(xy, columns=np.concatenate((x_column, y_column)))
    return data.astype(dtypes)

In [None]:
def features_selection(train, test, x_column, y_column, target_class='macro avg', random_state=None):
    """Greedy forward selection of feature groups scored by a random forest.

    Every group is first evaluated on its own and the best one is kept. Each
    remaining group is then tried, in the given order, at every position of
    the current selection; the best placement is kept only if it improves
    the score.

    Args:
        train: training DataFrame.
        test: test DataFrame.
        x_column: sequence (list or ndarray) of feature groups, each group
            being a sequence of column names.
        y_column: target column names.
        target_class: key of the classification report whose F1 score is
            maximised (a class label or an average such as 'macro avg').
        random_state: seed forwarded to the random forest.

    Returns:
        Tuple `(score, features)` with the best F1 score found and the
        selected list of feature groups.

    Raises:
        ValueError: if `x_column` holds no feature group.
    """
    def _insertions(selection, group):
        # Every list obtained by inserting `group` at one position of `selection`.
        return [selection[:i] + [group] + selection[i:] for i in range(len(selection) + 1)]

    def _flatten(nested):
        return [name for group in nested for name in group]

    def _loop_selection(groups, score=-1, features=None):
        # Score each candidate selection and keep the best if it beats `score`.
        scores_groups = []

        for feature_group in groups:
            columns = np.array(_flatten(feature_group))
            x_train, y_train, x_test, y_test = preprocess_data(train, test, columns, y_column)

            clf = RandomForestClassifier(n_estimators=100, criterion='entropy', class_weight='balanced', random_state=random_state, n_jobs=-1)
            clf.fit(x_train, np.squeeze(y_train))
            y_predict = clf.predict(x_test)

            cr = classification_report(y_test, y_predict, output_dict=True, zero_division=True)
            scores_groups.append(cr[target_class]['f1-score'])

        local_score = max(scores_groups)
        local_features = list(groups[scores_groups.index(local_score)])
        print(f'Score: {local_score:.8f} >>> {local_features}')

        if local_score > score:
            return local_score, local_features

        return score, ([] if features is None else features)

    # Plain nested lists keep the printed output free of NumPy scalar types
    # and allow groups of different sizes.
    if isinstance(x_column, np.ndarray):
        pending = x_column.tolist()
    else:
        pending = [list(group) for group in x_column]

    if not pending:
        raise ValueError('x_column must contain at least one feature group')

    # First round: every group alone, wrapped as a one-group selection.
    score, features = _loop_selection([[group] for group in pending])
    pending.remove(features[0])

    for group in pending:
        score, features = _loop_selection(_insertions(features, group), score, features)

    return score, features

## Read Dataset

In [None]:
### Read dataset ###
df = pd.read_csv('InvoicedDocuments_v7_Buckets.csv', sep=';', na_values=['N/I'], parse_dates=['DocumentDate', 'DueDate', 'ClearingDate'])
# df = pd.read_csv('InvoicedDocuments_v7_Default.csv', sep=';', na_values=['N/I'], parse_dates=['DocumentDate', 'DueDate', 'ClearingDate'])
df = df.sort_values(by=['DocumentDate'], ascending=True, ignore_index=True)

### First filters ###
df = df.dropna(subset=['ClearingDate', 'PaymentTerms', 'CustomerRegion'])
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered selection of the previous one.
df = df[(df['DueDate'] > df['DocumentDate']) & (df['ClearingDate'] > df['DocumentDate'])].copy()

### Feature engineer ('Days to' columns) ###
# `.dt.days` returns the whole-day count as integers; casting through
# 'timedelta64[D]' is rejected by pandas >= 2.0.
date_int = lambda x: x.dt.days

df['DocumentDateToDueDate'] = date_int(df['DueDate'] - df['DocumentDate'])
df['DueDateToClearingDate'] = date_int(df['ClearingDate'] - df['DueDate'])
df['DocumentDateToClearingDate'] = date_int(df['ClearingDate'] - df['DocumentDate'])

### Feature engineer (numeric columns) ###
for col in ['Invoiced', 'Paid', 'Open']:
# for col in ['Invoiced', 'Paid', 'Open', 'PastDue', 'PaidPast']:
    ratio = df[f'{col}Amount'] / df[f'{col}Documents']
    # A zero document count yields +/-inf, which fillna() would not catch.
    df[f'Ratio{col}Amount'] = ratio.replace([np.inf, -np.inf], np.nan)

df = df.fillna(0)

### Feature engineer (date columns) ###
for col in ['DocumentDate', 'DueDate']:
    df[col + 'Day'] = df[col].dt.day
    df[col + 'WeekDay'] = df[col].dt.weekday
    df[col + 'Month'] = df[col].dt.month
    df[col + 'DayOfYear'] = df[col].dt.dayofyear

df.head()

In [None]:
# Two encodings of the payment delay: a two-way split at 1 day ('Bin') and a
# four-way split at 1, 4 and 8 days ('Bucket').
df = setup_buckets(
    setup_buckets(df, 'DueDateToClearingDate', [1], 'Bin'),
    'DueDateToClearingDate', [1, 4, 8], 'Bucket',
)

plot_countplot(
    df,
    cols=[
        'DueDateToClearingDateBinCategory',
        'DueDateToClearingDateBucketCategory',
    ],
)

## Train and Test Data

In [None]:
# Target: integer bucket code of the days between due date and clearing date.
y_column = np.array(['DueDateToClearingDateBucket'])

# Model inputs, listed group by group; the order of the resulting array is the
# column order of the feature matrix.
x_column = np.concatenate([
    # Past-due days
    ['PastDueDays', 'AvgPastDueDays', 'StDevPastDueDays'],
    # DSO of past-due documents
    ['DSOPastDueDocuments', 'AvgDSOPastDueDocuments', 'StDevDSOPastDueDocuments'],
    # Paid documents (counts)
    ['PaidDocuments', 'PaidDocumentsOnTime', 'PaidDocuments1to3', 'PaidDocuments4to7', 'PaidDocuments7more'],
    # Paid amounts
    ['PaidAmount', 'PaidAmountOnTime', 'PaidAmount1to3', 'PaidAmount4to7', 'PaidAmount7more'],
    ['AvgPaidAmount', 'AvgPaidAmountOnTime', 'AvgPaidAmount1to3', 'AvgPaidAmount4to7', 'AvgPaidAmount7more'],
    ['StDevPaidAmount', 'StDevPaidAmountOnTime', 'StDevPaidAmount1to3', 'StDevPaidAmount4to7', 'StDevPaidAmount7more'],
    # Invoiced documents
    ['InvoicedDocuments', 'InvoicedAmount', 'AvgInvoicedAmount', 'StDevInvoicedAmount'],
    # Open documents
    ['OpenDocuments', 'OpenAmount', 'AvgOpenAmount', 'StDevOpenAmount'],
    # The document itself
    ['DocumentAmount', 'DocumentDateToDueDate'],
    # Amount-per-document ratios
    ['RatioInvoicedAmount', 'RatioPaidAmount', 'RatioOpenAmount'],
    # Categorical keys (replaced by integer codes in split_data)
    ['CompanyKey', 'PaymentTerms', 'CorporateDivision', 'CustomerKey', 'CustomerRegion'],
    # Calendar parts of the document date
    ['DocumentDateDay', 'DocumentDateWeekDay', 'DocumentDateMonth', 'DocumentDateDayOfYear'],
    # Calendar parts of the due date
    ['DueDateDay', 'DueDateWeekDay', 'DueDateMonth', 'DueDateDayOfYear'],
])


train, test = split_data(
    df,
    date_column='DueDate',
    split_date='2020-06-01',
    train_months=6,
    test_months=1,
    category_cols=['CompanyKey', 'CorporateDivision', 'CustomerKey', 'CustomerRegion', 'PaymentTerms'],
)

# Optional oversampling of the training set (disabled):
# train = resample(train, x_column, y_column, SMOTE(sampling_strategy='auto', random_state=SEED))

# Optional restriction to a single customer (disabled):
# 100:   75945
# 1000:  380664
# 10000: 309053
# train = train[train['CustomerKey'] == 75945]
# test = test[test['CustomerKey'] == 75945]

plot_countplot(train, title='Train', cols=y_column)
plot_countplot(test, title='Test', cols=y_column)

## Data Analysis

In [None]:
# x_train, y_train, x_test, y_test = prepare_data(train, test, x_column, y_column, random_state=SEED)

# cols = np.concatenate((x_column, y_column), axis=0)
# df_train = np.concatenate((x_train, y_train), axis=1)
# df_test = np.concatenate((x_test, y_test), axis=1)
# df_local = np.concatenate((df_train, df_test), axis=0)

# df_local = pd.DataFrame(df_local, columns=cols)
# profile = ProfileReport(df_local)

# profile

## Random Forest

In [None]:
x_train, y_train, x_test, y_test = preprocess_data(train, test, x_column, y_column)

# fit() returns the estimator itself, so building and training can be chained.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1,
).fit(x_train, np.squeeze(y_train))

y_predict = clf.predict(x_test)
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# Rank the model inputs by the importance the fitted forest assigns to them.
plot_feature_importance(features=x_column, importances=clf.feature_importances_)

In [None]:
# y_predict = (clf.predict_proba(x_test)[:,1] >= 0.9999).astype('int')
# plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# One single-column group per candidate feature.
x_column = np.array([[name] for name in (
    'PastDueDays',
    'AvgPastDueDays',
    'StDevPastDueDays',
    'DSOPastDueDocuments',
    'AvgDSOPastDueDocuments',
    'StDevDSOPastDueDocuments',
)])

# target_class='3',
score, features = features_selection(
    train, test, x_column, y_column,
    random_state=SEED,
)

# Result of an earlier run, kept for comparison:
# Final score: 0.56753507 >>> [['StDevDSOPastDueDocuments'], ['StDevPastDueDays'], ['DSOPastDueDocuments'], ['AvgPastDueDays'], ['AvgDSOPastDueDocuments']]

print(f'\nFinal score: {score:.8f} >>> {features}')

## Neural Network

In [None]:
# from sklearn.utils.class_weight import compute_class_weight
# from sklearn.model_selection import train_test_split

# import tensorflow_addons as tfa
# import tensorflow as tf

# def make_model():
#     model = tf.keras.models.Sequential(name='cubricks')
#     model.add(tf.keras.layers.Input(shape=x_train.shape[1]))

#     model.add(tf.keras.layers.Dense(256, kernel_initializer='glorot_normal'))
#     model.add(tf.keras.layers.PReLU())
#     model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     model.add(tf.keras.layers.Dense(512, kernel_initializer='glorot_normal'))
#     model.add(tf.keras.layers.PReLU())
#     model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     model.add(tf.keras.layers.Dense(256, kernel_initializer='glorot_normal'))
#     model.add(tf.keras.layers.PReLU())
#     model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     # model.add(tf.keras.layers.Dense(32, kernel_initializer='glorot_normal', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
#     # model.add(tf.keras.layers.PReLU())
#     # model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     # model.add(tf.keras.layers.Dense(64, kernel_initializer='glorot_normal', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
#     # model.add(tf.keras.layers.PReLU())
#     # model.add(tf.keras.layers.BatchNormalization(renorm=False))
#     # # model.add(tf.keras.layers.Dropout(rate=0.1))

#     # model.add(tf.keras.layers.Dense(128, kernel_initializer='glorot_normal', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
#     # model.add(tf.keras.layers.PReLU())
#     # model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     model.add(tf.keras.layers.Dense(np.unique(y_train).shape[0], activation='softmax'))

#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-8, amsgrad=True),
#         loss=tf.keras.losses.CategoricalCrossentropy(),
#         # loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
#         metrics=[tfa.metrics.F1Score(num_classes=np.unique(y_train).shape[0], average='weighted')])

#     return model


# model = make_model()
# model.summary()

# train, test = split_data_month_window(df, col='DueDate', date='2020-08-01', month_window=12)
# x_train, y_train, x_test, y_test = prepare_data(train, test, x_column, y_column, random_state=SEED)

# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, shuffle=True, random_state=SEED, stratify=y_train)

In [None]:
# BATCH = 256
# EPOCHS = 10000
# PATIENCE = 10000

# tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs', profile_batch=0)

# checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='./logs/weights.hdf5',
#                                                 monitor='val_f1_score', mode='max',
#                                                 save_best_only=True, save_weights_only=True, verbose=1)

# early_stopping = tf.keras.callbacks.EarlyStopping(patience=PATIENCE,
#                                                   monitor='val_f1_score', mode='max',
#                                                   restore_best_weights=True, verbose=1)

# class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train[:,0])
# class_weight = dict(zip(np.unique(y_train), class_weight))

# model = make_model()
# # model.load_weights(filepath='./logs/weights.hdf5')

# model_history = model.fit(x_train, tf.keras.utils.to_categorical(y_train),
#                           validation_data=(x_valid, tf.keras.utils.to_categorical(y_valid)),
#                           epochs=EPOCHS, batch_size=BATCH, shuffle=True,
#                           callbacks=[checkpoint, early_stopping, tensorboard],
#                           class_weight=class_weight)

In [None]:
# def plot_model_history(history):
#     colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
#     plt.figure(figsize=(15, 3))

#     for n, metric in enumerate(['loss', 'f1_score']):
#         plt.subplot(1, 2, n+1)
#         plt.plot(history.epoch,  history.history[metric], color=colors[0], label='Train')
#         plt.plot(history.epoch, history.history['val_' + metric], color=colors[0], linestyle='--', label='Val')
#         plt.ylim([plt.ylim()[0], 1] if n > 0 else [0, plt.ylim()[1]])
#         plt.ylabel(metric.replace('_', ' ').capitalize())
#         plt.xlabel('Epoch')
#         plt.legend()


# plot_model_history(model_history)

In [None]:
# model.load_weights(filepath='./logs/weights.hdf5')

# y_predict = classifier_predict(model, x_test, threshold=0.5, network=True)
# plot_confuncion_matrix(y_test, y_predict)