In [None]:
# Run the `nvidia-smi` shell command and show its output in the cell.
!nvidia-smi

In [None]:
# Colab-only helper used to expose Google Drive inside the runtime.
from google.colab import drive

# Mount Drive under ./gdrive; force_remount=True requests a fresh mount even if one exists.
drive.mount('./gdrive', force_remount=True)
# Switch to the project folder so the relative paths used later in the notebook
# (the input CSV, ./output) resolve inside it.
# NOTE(review): both paths here are relative, so re-running this cell starts from inside
# the project folder — confirm that a second run still mounts and changes directory as intended.
%cd './gdrive/My Drive/cubricks'

In [None]:
!pip install -q tensorflow-gpu

In [None]:
%tensorflow_version 2.x

import os
import numpy as np
import pandas as pd
import tensorflow as tf

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def set_ratio(df, cols):
    """Add one ``Ratio<numerator><denominator>`` column per pair in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns. It is modified in place.
    cols : iterable of (str, str)
        ``(numerator, denominator)`` column-name pairs.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ratio columns added.

    Notes
    -----
    Undefined ratios are stored as 0. This covers both ``0 / 0`` (NaN) and
    ``x / 0`` (+/-inf), so no non-finite value reaches the model features.
    """
    for col in cols:
        numerator, denominator = col[0], col[1]
        ratio = df[numerator] / df[denominator]
        # Division by zero yields +/-inf, which fillna alone would leave in place.
        ratio = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
        # Plain column assignment instead of a chained in-place fill on a column selection.
        df['Ratio' + numerator + denominator] = ratio
    return df

def set_daysto(df, cols):
    """Add a ``DaysTo<end>`` column holding the whole days from ``start`` to ``end``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with datetime columns. It is modified in place.
    cols : iterable of (str, str)
        ``(start, end)`` column-name pairs; the new column is named after ``end``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the integer day-count columns added.

    Notes
    -----
    A missing date on either side counts as a gap of 0 days, and negative gaps
    (``end`` before ``start``) are clipped to 0.
    """
    for col in cols:
        start, end = col[0], col[1]
        # NaT differences are replaced before extracting days so the result stays integral.
        delta = (df[end] - df[start]).fillna(pd.Timedelta(seconds=0))
        # `.dt.days` replaces `astype('timedelta64[D]')`, which pandas 2.x rejects
        # as an unsupported resolution.
        df['DaysTo' + end] = delta.dt.days.astype(int).clip(lower=0)
    return df

def set_days(df, cols):
    """Split each datetime column in ``cols`` into month, day and weekday columns.

    For a column ``X`` the frame gains ``XMonth``, ``XDay`` and ``XWeekDay``,
    read from the ``.dt`` accessor. ``df`` is modified in place and returned.
    """
    parts = (('Month', 'month'), ('Day', 'day'), ('WeekDay', 'weekday'))
    for name in cols:
        stamps = df[name].dt
        for suffix, attribute in parts:
            df[name + suffix] = getattr(stamps, attribute)
    return df

def set_range_cols(df, bins, labels, cols):
    """Bucket numeric columns into labelled ranges and add their integer codes.

    For a column ``X`` the frame gains ``XRange`` (the ``pd.cut`` result, using
    left-closed bins) and ``XRangeCT`` (the ordered-categorical code of that
    range). Values outside ``bins`` have a missing range and code -1.
    ``df`` is modified in place and returned.
    """
    for name in cols:
        range_col = name + 'Range'
        df[range_col] = pd.cut(df[name], bins=bins, labels=labels, right=False, include_lowest=True)
        df[range_col + 'CT'] = pd.Categorical(df[range_col], ordered=True).codes
    return df

def get_data_range(df, col, date, month_range):
    """Split ``df`` into a training window and a one-month test window around ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    col : str
        Datetime column used for the split.
    date : str or datetime-like
        First day of the test window (parsed with ``pd.to_datetime``).
    month_range : int
        Length of the training window, in months, ending just before ``date``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` holds rows in ``[date - month_range months, date)`` and
        ``test`` holds rows in ``[date, date + 1 month)``.
    """
    split_at = pd.to_datetime(date)
    train_start = split_at - pd.DateOffset(months=month_range)
    test_end = split_at + pd.DateOffset(months=1)

    stamps = df[col]
    in_train = stamps.ge(train_start) & stamps.lt(split_at)
    in_test = stamps.ge(split_at) & stamps.lt(test_end)

    return df[in_train], df[in_test]

In [None]:
### Read data ###
# Semicolon-separated file; the literal 'N/I' is read as a missing value.
df = pd.read_csv('InvoicedDocuments_v4.csv', sep=';', na_values=['N/I'])

### Fill NaN data ###
# Rows without a ClearingDate are dropped; every other missing cell becomes 0.
# NOTE(review): this blanket fill also touches the date columns that are only parsed
# further down — confirm how pd.to_datetime(..., errors='coerce') treats those 0 placeholders.
df.dropna(subset=['ClearingDate'], inplace=True)
df.fillna(0, inplace=True)

### Generate ratio ###
# One 'Ratio<Amount><Documents>' column per (amount, document-count) pair.
df = set_ratio(df, cols=[('InvoicedAmount', 'InvoicedDocuments'),
                         ('PaidAmount', 'PaidDocuments'),
                         ('PaidPastAmount', 'PaidPastDocuments'),
                         ('OpenAmount', 'OpenDocuments'),
                         ('PastDueAmount', 'PastDueDocuments')])

### String to number ###
# Each value is converted to str, every character is replaced by its decimal code point,
# and the concatenated digits are parsed as a single integer.
cols = ['CustomerRegion', 'PaymentTerms']
df = df.apply(lambda x: [int(''.join(format(ord(w), '') for w in str(y))) for y in x] if x.name in cols else x)

### Convert columns to integer columns ###
# downcast='integer' asks pandas for the smallest integer dtype that can hold the values.
cols = ['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments']
df = df.apply(lambda x: pd.to_numeric(x, downcast='integer') if x.name in cols else x)

### Convert columns to date columns ###
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
cols = ['CustomerLastCreditReview', 'DocumentDate', 'DueDate', 'ClearingDate']
df = df.apply(lambda x: pd.to_datetime(x, errors='coerce') if x.name in cols else x)

### Setup 'DaysTo' columns ###
# Keep only documents dated strictly before both their due date and their clearing date.
# The explicit .copy() detaches the filtered rows from the original frame, so the column
# assignments made by the helpers below act on an independent DataFrame
# (without it pandas emits SettingWithCopyWarning).
df = df[(df['DocumentDate'] < df['DueDate']) & (df['DocumentDate'] < df['ClearingDate'])].copy()

# Each pair is (start, end); the helper adds an integer 'DaysTo<end>' column.
df = set_daysto(df, cols=[('DocumentDate', 'DueDate'),
                          ('DocumentDate', 'ClearingDate'),
                          ('CustomerLastCreditReview', 'DocumentDate')])

### Setup Month, Day and WeekDay columns ###
df = set_days(df, cols=['DocumentDate', 'DueDate'])

### Setup range columns ###
# Left-closed bins [1, 8), [8, 15), ... with an open-ended last bin.
# Labels read '<first day>-<last day>', i.e. '1-7' ... '29-inf'.
bins = [1, 8, 15, 22, 29, np.inf]
labels = [f'{bins[i]}-{bins[i+1]-1}' for i in range(len(bins[:-1]))]
df = set_range_cols(df, bins, labels, cols=['DaysToDueDate', 'DaysToClearingDate'])

df.head(10)

In [None]:
# Bin edges can be changed here and the range columns rebuilt without re-running the
# whole preparation cell; the commented line is an alternative 3-day binning.
bins = [1, 8, 15, 22, 29, np.inf]
# bins = list(range(1, 31, 3)) + [np.inf]

labels = [f'{bins[i]}-{bins[i+1]-1}' for i in range(len(bins[:-1]))]
df = set_range_cols(df, bins, labels, cols=['DaysToDueDate', 'DaysToClearingDate'])

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 4))
# Pass the series as `x=`: newer seaborn accepts only `data` positionally, so a
# positional series is no longer interpreted as the variable to count.
sns.countplot(x=df['DaysToClearingDateRange'], ax=ax)

In [None]:
# Target: integer code of the clearing-date range.
y_column = np.array(['DaysToClearingDateRangeCT'])

# Model inputs. Commented-out names are columns currently left out of the model;
# uncomment a line to include that column.
features = np.array([
    'CompanyKey',
    'CustomerKey',
    'CustomerRegion',
    ##  'CustomerLastCreditReview',
    ##  'DocumentDate',
    ##  'DueDate',
    ##  'ClearingDate',
    'PaymentTerms',
    ##  'DocumentNumber',
    'DocumentAmount',
    #  'InvoicedDocuments',
    #  'InvoicedAmount',
    #  'PaidDocuments',
    #  'PaidAmount',
    #  'PaidPastDocuments',
    #  'PaidPastAmount',
    #  'OpenDocuments',
    #  'OpenAmount',
    #  'PastDueDocuments',
    #  'PastDueAmount',
    'AvgDSOPastDueDocuments',
    'PastDueDays',
    'DaysToDueDate',
    #  'DaysToDocumentDate',
    ##  'DaysToClearingDate',
    #  'DocumentDateMonth',
    #  'DocumentDateDay',
    'DocumentDateWeekDay',
    #  'DueDateMonth',
    #  'DueDateDay',
    'DueDateWeekDay',
    ##  'DaysToDueDateRange',
    #  'DaysToDueDateRangeCT',
    ##  'DaysToClearingDateRange',
    #  'DaysToClearingDateRangeCT',
    'RatioInvoicedAmountInvoicedDocuments',
    #  'RatioPaidAmountPaidDocuments',
    #  'RatioPaidPastAmountPaidPastDocuments',
    #  'RatioOpenAmountOpenDocuments',
    'RatioPastDueAmountPastDueDocuments',
])

In [None]:
# Train on the six months before 2019-01-01 and test on the month starting that day.
train, test = get_data_range(df, col='DocumentDate', date='2019-01-01', month_range=6)

# Plain NumPy views of the inputs and of the single target column.
x_train = train[features].values
y_train = train[y_column].values
x_test = test[features].values
y_test = test[y_column].values

In [None]:
# Feature correlation matrices: training window on the left, test window on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 5))
sns.heatmap(train[features].corr(), square=True, annot=False, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='Pastel1', ax=ax[0])
sns.heatmap(test[features].corr(), square=True, annot=False, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='Pastel1', ax=ax[1])

# Class balance of the target range in each window.
# The series is passed as `x=` because newer seaborn accepts only `data` positionally.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 4))
sns.countplot(x=train['DaysToClearingDateRange'], ax=ax[0])
sns.countplot(x=test['DaysToClearingDateRange'], ax=ax[1])

# Baseline: predict that each document clears in the same range as its due date.
print(f'Total items: {len(y_test)}')
print(f'Due date predict')
print(f'Always predicting the range of "DueDateRange", accuracy is {accuracy_score(y_test, test["DaysToDueDateRangeCT"].values) * 100:.2f}%')

In [None]:
# Forest of 40 entropy-split trees; the fixed seed keeps the fit repeatable.
random_forest = RandomForestClassifier(
    n_estimators=40,
    criterion='entropy',
    min_weight_fraction_leaf=1e-4,
    random_state=42,
)
# y_train is a single-column 2-D array; squeeze drops the extra axis before fitting.
random_forest.fit(x_train, np.squeeze(y_train))

In [None]:
predict = random_forest.predict(x_test)

# Feature indices ordered from most to least important.
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]

print(f'Total items: {len(y_test)}')
print(f'Accuracy: {accuracy_score(y_test, predict) * 100:.2f}%\n')
print(f'Feature ranking:')

for feature_index in indices:
    print(f'{importances[feature_index]}\t{features[feature_index]}')

# Horizontal bars, most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 8))
bar_positions = range(x_train.shape[1])
ax.barh(bar_positions, importances[indices])
ax.set_yticks(bar_positions)
ax.set_yticklabels(features[indices])
ax.set_title('Feature Importance')
ax.invert_yaxis()
plt.show()

In [None]:
# Range codes run 0..len(labels)-1. Passing them explicitly keeps the report rows and the
# confusion-matrix axes aligned with `labels` even when a range never occurs in the test
# window or in the predictions. Without this the report fails on a label-count mismatch,
# or the matrix ends up smaller than its tick labels.
class_codes = np.arange(len(labels))

cm_report = classification_report(y_test, predict, labels=class_codes, target_names=labels)
cm = confusion_matrix(y_test, predict, labels=class_codes)
print(cm_report)

plt.figure(figsize=(10, 10))
sns.heatmap(cm, square=True, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)

In [None]:
def binary_encoding(df, cols):
    """Replace integer columns by fixed-width binary indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified; the result is a new frame.
    cols : iterable of (int, str)
        ``(width, column)`` pairs. ``column`` is replaced by ``width`` int8
        columns named ``<column>0`` .. ``<column>{width-1}``, most significant
        bit first. The bit columns are appended after the remaining columns.

    Returns
    -------
    pandas.DataFrame
        Encoded frame with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If a value is negative or needs more than ``width`` bits.
    """
    # reset_index returns a new frame, so the caller's DataFrame keeps its own index.
    encoded = df.reset_index(drop=True)

    for col in cols:
        width, name = col[0], col[1]
        bits = np.zeros((len(encoded), width), dtype=np.int8)

        for row, value in enumerate(encoded[name].values):
            digits = '{0:b}'.format(value)
            if digits.startswith('-') or len(digits) > width:
                raise ValueError(f'{name}={value!r} cannot be encoded as {width} unsigned bits')
            # Right-align the digits so the last column is the least significant bit.
            bits[row, width - len(digits):] = [int(digit) for digit in digits]

        header = [f'{name}{i}' for i in range(width)]
        bit_frame = pd.DataFrame(bits, columns=header, index=encoded.index)
        encoded = pd.concat([encoded.drop(columns=[name]), bit_frame], axis=1)

    return encoded


# Identifier-like columns get 32 indicator bits each, weekday columns get 3.
cols = [(32, key) for key in ('CompanyKey', 'CustomerKey', 'CustomerRegion', 'PaymentTerms')]
cols += [(3, key) for key in ('DocumentDateWeekDay', 'DueDateWeekDay')]

# Encode independent copies of the feature columns of each window.
dfNN_train, dfNN_test = (binary_encoding(window[features].copy(), cols) for window in (train, test))

dfNN_train.head(5)

In [None]:
# Hold out 10% of the training window for validation, stratified on the target code.
x_train, x_valid, y_train, y_valid = train_test_split(dfNN_train.values,
                                                      train[y_column].values,
                                                      test_size=0.1,
                                                      shuffle=True,
                                                      random_state=42,
                                                      stratify=train[y_column].values)

# Fix the one-hot width to the number of range labels. Left to itself, to_categorical sizes
# the output from the largest code present, so a split lacking the highest range would
# produce targets narrower than the len(labels)-wide softmax output.
y_train_categorical = tf.keras.utils.to_categorical(y_train, num_classes=len(labels))
y_valid_categorical = tf.keras.utils.to_categorical(y_valid, num_classes=len(labels))

In [None]:
def create_model(input_dim=None, n_classes=None, hidden_units=(512, 512, 512), l2_factor=0.001, dropout=0.1):
    """Build the fully connected softmax classifier used for the range prediction.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to the column count of the
        notebook-level ``dfNN_train`` frame.
    n_classes : int, optional
        Width of the softmax output. Defaults to ``len(labels)``, the
        notebook-level list of range labels.
    hidden_units : sequence of int
        Size of each hidden Dense layer, in order.
    l2_factor : float
        L2 kernel-regularisation factor applied to every hidden layer.
    dropout : float
        Dropout rate applied after every hidden block.

    Returns
    -------
    tf.keras.Model
        Uncompiled ``Sequential`` model named ``'cubricks'``.

    Notes
    -----
    Called without arguments, the function builds the same architecture as before:
    input -> BatchNorm -> 3 x (Dense 512 relu + L2, BatchNorm, Dropout 0.1) -> softmax.
    NOTE(review): ``renorm=True`` may not be accepted by every Keras release —
    confirm against the TensorFlow version actually used.
    """
    if input_dim is None:
        input_dim = dfNN_train.values.shape[1]
    if n_classes is None:
        n_classes = len(labels)

    model = tf.keras.models.Sequential(name='cubricks')

    # `shape` is given as a tuple, the form documented by Keras, rather than a bare int.
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    model.add(tf.keras.layers.BatchNormalization(renorm=True))

    for units in hidden_units:
        model.add(tf.keras.layers.Dense(units,
                                        activation='relu',
                                        kernel_regularizer=tf.keras.regularizers.l2(l2_factor)))
        model.add(tf.keras.layers.BatchNormalization(renorm=True))
        model.add(tf.keras.layers.Dropout(rate=dropout))

    model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))
    return model


model = create_model()

# AMSGrad variant of Adam; label-smoothed cross-entropy summed (not averaged) over the batch.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, amsgrad=True),
    loss=tf.keras.losses.CategoricalCrossentropy(
        label_smoothing=0.1,
        reduction=tf.keras.losses.Reduction.SUM,
    ),
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Output locations for the optional logging / checkpoint callbacks listed below.
logdir = os.path.join('.', 'output')
training_log, model_checkpoint = (os.path.join(logdir, leaf) for leaf in ('training.txt', 'model.hdf5'))

# os.makedirs(logdir, exist_ok=True)

# if os.path.isfile(model_checkpoint):
#     model.load_weights(model_checkpoint)

# Only early stopping is active: training stops after 40 epochs without a val_loss
# improvement and the best weights are restored.
callbacks = [
    # tf.keras.callbacks.TensorBoard(logdir, profile_batch=0),
    # tf.keras.callbacks.CSVLogger(training_log, separator=',', append=True),
    # tf.keras.callbacks.ModelCheckpoint(model_checkpoint, monitor='val_loss', save_best_only=True, verbose=1),
    # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_delta=1e-8, factor=0.1, patience=10, verbose=1),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=1e-8,
        patience=40,
        restore_best_weights=True,
        verbose=1,
    ),
]

In [None]:
# The epoch count is only an upper bound; the early-stopping callback ends training.
model.fit(
    x_train,
    y_train_categorical,
    validation_data=(x_valid, y_valid_categorical),
    batch_size=256,
    epochs=10000,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Predicted class = index of the largest softmax output for each test row.
predict = model.predict(dfNN_test.values).argmax(axis=1)
expected = test[y_column].values

print(f'Total: {len(expected)}')
print(f'Accuracy: {accuracy_score(expected, predict) * 100:.2f}%')

In [None]:
# cm_report = classification_report(test[y_column].values, predict, target_names=labels)
# print(cm_report)

In [None]:
# cm = confusion_matrix(test[y_column].values, predict)

# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, square=True, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)