In [None]:
# Shell escape: print the output of `nvidia-smi` for this runtime.
!nvidia-smi

In [None]:
from google.colab import drive

# Mount Google Drive under ./gdrive (force_remount=True is passed on every run),
# then change the working directory to the project folder; later cells use
# relative paths such as 'InvoicedDocuments_v2.csv' and './output'.
drive.mount('./gdrive', force_remount=True)
%cd './gdrive/My Drive/cubricks'

In [None]:
# Quietly (-q) install the `tensorflow-gpu` package into the runtime.
# NOTE(review): the version is not pinned, so the installed release depends on
# when this cell is run — consider pinning (and `%pip` instead of `!pip`).
!pip install -q tensorflow-gpu

In [None]:
%tensorflow_version 2.x

import tensorflow as tf
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
### Load csv file ###
# 'N/I' entries are read as missing values.
df = pd.read_csv('InvoicedDocuments_v2.csv', na_values=['N/I'])

### Trim string/Remove duplicates ###
df.columns = df.columns.str.replace(' ', '')
# keep=False removes *every* row whose Document id occurs more than once.
df = df.drop_duplicates('Document', keep=False)

### String to datetime format ###
# Unparseable dates become NaT (errors='coerce').
date_columns = ['DocumentDate', 'DueDate', 'ClearingDate', 'LastCreditReview']
for column in date_columns:
    df[column] = pd.to_datetime(df[column], errors='coerce')

### Drop/Fill null values ###
df = df.dropna(subset=['DocumentDate', 'Document', 'DueDate', 'ClearingDate'])
# The fill has to be assigned back: `df[cols].fillna(0, inplace=True)` only
# filled a temporary copy and left `df` untouched. Date columns are skipped:
# three of them have no nulls left after the dropna above, and a missing
# LastCreditReview must stay NaT so it is handled when the day deltas are built.
fill_columns = [name for name in df.columns if name not in date_columns]
df = df.fillna({name: 0 for name in fill_columns})

### Float to integer ###
for column in ['Company', 'Document', 'CustomerCode']:
    df[column] = pd.to_numeric(df[column], downcast='integer')

### String to number ###
def string_number(x):
    """Encode a value as one integer: the decimal code points of str(x), concatenated."""
    return int(''.join(str(ord(w)) for w in str(x)))

for column in ['Country', 'Region', 'Terms']:
    df[column] = df[column].apply(string_number)

### Amount treatment/filter ###
# Amount is parsed as an integer after stripping thousands separators.
df['Amount'] = df['Amount'].apply(lambda x: int(str(x).replace(',', '')))
# .copy() so that later cells can add columns without SettingWithCopyWarning.
df = df[df['Amount'] > 20].copy()

### Summary ###
df.info()
df.head(100)

In [None]:
#########################
### Generate DaysTo__ ###
#########################
# `.dt.days` returns the whole-day component as integers. It replaces
# `.astype('timedelta64[D]').astype(int)`, a day-resolution cast that pandas 2.x
# no longer accepts.
# Rows without a credit review (NaT) get a delta of 0 days.
df['DaysToLastCreditReview'] = (
    (df['DocumentDate'] - df['LastCreditReview'])
    .fillna(pd.Timedelta(seconds=0))
    .dt.days
)

# Keep only invoices that are due after the document date.
df['DaysToDue'] = (df['DueDate'] - df['DocumentDate']).dt.days
df = df[df['DaysToDue'] > 0].copy()

# Keep only invoices that were cleared after the document date.
df['DaysToClearingDate'] = (df['ClearingDate'] - df['DocumentDate']).dt.days
df = df[df['DaysToClearingDate'] > 0].copy()

df.head(10)

In [None]:
#####################
### Define IsLate ###
#####################
# Whole days between the clearing date and the due date (positive = cleared
# before it was due). `.dt.days` is used instead of the day-resolution
# `astype('timedelta64[D]')` cast, which pandas 2.x no longer accepts.
days_before_due = (df['DueDate'] - df['ClearingDate']).dt.days

# 0 = cleared strictly before the due date, 1 = otherwise.
# NOTE(review): an invoice cleared exactly on its due date is labelled late (1),
# as in the previous version — confirm that this is intended.
df['IsLate'] = (days_before_due <= 0).astype(int)

# Pass the series as `x=`: how a positional first argument is interpreted
# differs between seaborn releases.
sns.countplot(x=df['IsLate'])
df.head(10)

In [None]:
###############################################
### Define ClearingDateRange and Categorize ###
###############################################
# Left-closed bins in days: [0, 8), [8, 15), [15, 22), [22, 29), [29, 61), [61, inf).
bins = [0, 8, 15, 22, 29, 61, np.inf]
# Labels show the inclusive bounds of each bin, e.g. '0-7'; the last is '61-inf'.
labels = [f'{lower}-{upper - 1}' for lower, upper in zip(bins[:-1], bins[1:])]

print(f'Labels: {labels}')

df['ClearingDateRange'] = pd.cut(df['DaysToClearingDate'], bins=bins, labels=labels, right=False, include_lowest=True)
# Integer code of each range, in the order of `labels` (-1 for unbinned values).
df['ClearingDateRangeCT'] = df['ClearingDateRange'].cat.codes

# Pass the series as `x=`: how a positional first argument is interpreted
# differs between seaborn releases.
sns.countplot(x=df['ClearingDateRange'])
df.head(10)

In [None]:
############################################
### Extract Document month, day, weekday ###
############################################
df['DocumentMonth'] = df['DocumentDate'].dt.month
df['DocumentDay'] = df['DocumentDate'].dt.day
df['DocumentWeekDay'] = df['DocumentDate'].dt.weekday


# One count plot per extracted field, side by side.
# The series are passed as `x=`: how a positional first argument is interpreted
# differs between seaborn releases.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(30, 4))
sns.countplot(x=df['DocumentMonth'], ax=ax[0])
sns.countplot(x=df['DocumentDay'], ax=ax[1])
sns.countplot(x=df['DocumentWeekDay'], ax=ax[2])

df.head(10)

In [None]:
# Target column: the integer code of the clearing-date range.
y_column = np.array(['ClearingDateRangeCT'])

# Model inputs. Deliberately left out (they were disabled in the feature list):
# ClearingDateRangeCT, DaysToClearingDate, Document and Country.
# NOTE(review): IsLate is derived from ClearingDate, which also defines the
# target — this looks like target leakage; confirm it should stay a feature.
features = np.array([
    'Company', 'CustomerCode',
    'Region', 'Terms',
    'Amount',
    'DaysToLastCreditReview', 'DaysToDue',
    'IsLate',
    'DocumentMonth', 'DocumentDay', 'DocumentWeekDay',
])


# Annotated correlation matrix of the selected features.
plt.figure(figsize=(10, 10))
sns.heatmap(
    df[features].corr(),
    vmin=-1, vmax=1, center=0,
    annot=True, fmt='.1g',
    cmap='Pastel1',
    square=True,
)

In [None]:
# Shuffled 90/10 split of the raw feature matrix, stratified on the target so
# both parts keep the class proportions; seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    df[features].values,
    df[y_column].values,
    stratify=df[y_column].values,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [None]:
# dumb predict: baseline accuracy obtained by always answering the same class
print(f'Total items: {len(y_test)}')

for i in range(len(labels)):
    # np.full with the builtin `int` replaces `np.ones(..., dtype=np.int) * i`;
    # the `np.int` alias has been removed from NumPy and raises AttributeError.
    dumb_data = np.full(y_test.shape, i, dtype=int)
    print(f'Always predicting the range "{i}",', f'accuracy is {accuracy_score(y_test, dumb_data) * 100:.2f}%')

In [None]:
# Small random forest (10 trees, Gini criterion, fixed seed, n_jobs=-1).
random_forest = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=42, n_jobs=-1)
# y_train comes from a one-column frame; np.squeeze removes the length-1 axis before fitting.
random_forest.fit(x_train, np.squeeze(y_train))

In [None]:
predict = random_forest.predict(x_test)

# Feature importances and the feature positions sorted from most to least important.
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]

print(f'Total items: {len(y_test)}')
print(f'Accuracy: {accuracy_score(y_test, predict) * 100:.2f}%\n')
print('Feature ranking:')

for position in indices:
    print(f'{importances[position]}\t{features[position]}')

# Horizontal bars, most important feature at the top.
bar_slots = range(x_train.shape[1])
plt.barh(bar_slots, importances[indices])
plt.yticks(bar_slots, features[indices])
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()

In [None]:
def binary_encoding(df, columns, left_padding=0):
  """Replace integer columns by one 0/1 column per binary digit.

  Every column named in `columns` is expanded into int8 columns
  `<name>0 ... <name>k` holding the binary digits of its values, most
  significant digit first. The width is the digit count of the largest value
  in that column plus `left_padding`; shorter values are zero-padded on the
  left. The source columns are removed, the remaining columns keep their
  order, and the digit columns are appended in the order of `columns`.

  Args:
    df: Frame holding the columns to encode. It is left unmodified.
    columns: Names of the columns to expand (non-negative integers).
    left_padding: Number of extra all-zero leading digit columns per column.

  Returns:
    A new DataFrame indexed 0..n-1.

  Raises:
    KeyError: If a name in `columns` is not a column of `df`.
    ValueError: If a value is negative or cannot be formatted as binary.
  """
  columns = list(columns)

  # reset_index without inplace returns a new frame, so the caller's index is
  # preserved while the digit frames (built with a default index) still align.
  base = df.reset_index(drop=True)
  pieces = [base.drop(columns=columns)]

  for item in columns:
    bits = ['{0:b}'.format(x) for x in base[item].values]
    if any(b.startswith('-') for b in bits):
      raise ValueError(f'binary_encoding: column {item!r} contains negative values')

    # default=0 keeps an empty frame from failing in max().
    maxlength = max((len(b) for b in bits), default=0) + left_padding

    header = [f'{item}{i}' for i in range(maxlength)]
    newcol = np.zeros((len(bits), maxlength), dtype=np.int8)

    for row, b in enumerate(bits):
      # Right-align the digits; the leading positions stay 0.
      newcol[row, maxlength - len(b):] = [int(digit) for digit in b]

    pieces.append(pd.DataFrame(newcol, columns=header))

  # Single concat instead of growing the frame once per encoded column.
  return pd.concat(pieces, axis=1)


# Columns expanded into binary-digit columns for the network; the remaining
# features are passed through unchanged.
category = ['Company', 'CustomerCode', 'Region', 'Terms', 'DocumentMonth', 'DocumentDay', 'DocumentWeekDay']

# Standard scaling of the numeric columns is currently disabled:
# scalar = ['Amount', 'DaysToLastCreditReview', 'DaysToDue']
# df_network[scalar] = StandardScaler().fit_transform(df_network[scalar])

# Encode a copy of the feature columns so `df` itself is not modified.
df_network = binary_encoding(df[features].copy(), category)

df_network.head(10)

In [None]:
# Same 90/10 stratified, seeded split as before, now on the binary-encoded
# feature matrix. This rebinds x_train/x_test/y_train/y_test.
x_train, x_test, y_train, y_test = train_test_split(df_network.values,
                                                    df[y_column].values,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=42,
                                                    stratify=df[y_column].values)

# One-hot targets. num_classes is fixed to the number of ranges: left to its
# default, the width is inferred from the largest label present, so a split
# (or dataset) without the last range would yield fewer columns than the
# len(labels) outputs of the network.
y_train_categorical = tf.keras.utils.to_categorical(y_train, num_classes=len(labels))
y_test_categorical = tf.keras.utils.to_categorical(y_test, num_classes=len(labels))

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with a warm-up phase.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  The two terms of the minimum are equal at step == warmup_steps; before that
  the second (linear in step) term is the smaller one, afterwards the first.

  Args:
    d_model: Scale of the schedule; the rate is multiplied by d_model**-0.5.
    warmup_steps: Step at which the rate stops growing and starts decaying.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # The raw value is kept for get_config(); the float tensor is used in the math.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for the given optimizer step."""
    # The optimizer may pass an integer step counter; rsqrt needs a float.
    step = tf.cast(step, tf.float32)

    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized.

    Without this override the base class raises NotImplementedError whenever
    the optimizer configuration is serialized (e.g. when saving a full model).
    """
    return {'d_model': self._d_model_config, 'warmup_steps': self.warmup_steps}


def create_model(input_dim=None, num_classes=None):
    """Build the (uncompiled) dense softmax classifier.

    Layout: input -> BatchNorm -> Dense(256) -> Dense(512) -> Dense(512)
    -> Dense(128) -> Dense(num_classes, softmax). Every hidden Dense layer uses
    ReLU with He-normal initialization and is followed by BatchNorm; the two
    512-unit layers are additionally followed by Dropout(0.1).

    Args:
        input_dim: Number of input features. Defaults to the width of the
            notebook-level `x_train`.
        num_classes: Number of output classes. Defaults to the length of the
            notebook-level `labels`.

    Returns:
        A `tf.keras.Sequential` model named 'cubricks'.
    """
    if input_dim is None:
        input_dim = x_train.shape[1]
    if num_classes is None:
        num_classes = len(labels)

    model = tf.keras.models.Sequential(name='cubricks')

    # `shape` is documented as a tuple, so pass one rather than a bare int.
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    model.add(tf.keras.layers.BatchNormalization())

    # (units, dropout rate or None) for each hidden layer, in order.
    hidden_layers = ((256, None), (512, 0.1), (512, 0.1), (128, None))
    for units, dropout_rate in hidden_layers:
        model.add(tf.keras.layers.Dense(units, activation='relu', kernel_initializer='he_normal'))
        model.add(tf.keras.layers.BatchNormalization())
        if dropout_rate is not None:
            model.add(tf.keras.layers.Dropout(rate=dropout_rate))

    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
    return model


model = create_model()

# Warm-up learning-rate schedule, scaled by the number of input features.
lr = CustomSchedule(x_train.shape[1])

# NOTE(review): reduction='none' makes the loss return one value per sample
# instead of a batch mean — confirm this is intended for model.fit.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1, reduction='none'),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr, epsilon=1e-8),
    metrics=['accuracy'],
)

model.summary()

In [None]:
# All training artefacts (TensorBoard events, CSV log, checkpoint) go to ./output.
logdir = os.path.join('.', 'output')
os.makedirs(logdir, exist_ok=True)

training_log = os.path.join(logdir, 'training.txt')
model_checkpoint = os.path.join(logdir, 'model.hdf5')

# Resume from an existing checkpoint file: only the weights are loaded.
if os.path.isfile(model_checkpoint):
    model.load_weights(model_checkpoint)

callbacks = [
    tf.keras.callbacks.TensorBoard(logdir, profile_batch=0),
    # append=True: the CSV log is extended, not overwritten, on later runs.
    tf.keras.callbacks.CSVLogger(training_log, separator=',', append=True),
    # Overwrites the checkpoint only when val_loss improves.
    tf.keras.callbacks.ModelCheckpoint(model_checkpoint, monitor='val_loss', save_best_only=True, verbose=1),
    # Currently disabled:
    # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_delta=1e-8, factor=0.2, patience=15, verbose=1),
    # tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-8, patience=100, restore_best_weights=True, verbose=1),
]

In [None]:
# Train the network. The 10% hold-out set is passed as validation data, so the
# val_loss that drives the checkpoint callback is computed on it.
# NOTE(review): the same hold-out set is used for the final evaluation below —
# consider a separate validation split.
# NOTE(review): with EarlyStopping disabled, training runs for all 10000 epochs.
model.fit(x_train,
          y_train_categorical,
          validation_data=(x_test, y_test_categorical),
          callbacks=callbacks,
          batch_size=256,
          epochs=10000,
          verbose=1)

In [None]:
# Predicted class = position of the largest softmax output in each row.
predict = model.predict(x_test).argmax(axis=1)

print(f'Total: {len(y_test)}')
print(f'Accuracy: {accuracy_score(y_test, predict) * 100:.2f}%')

In [None]:
# The text is stored under its own name: assigning it to `classification_report`
# overwrote the imported sklearn function, so re-running the cell failed with
# "'str' object is not callable".
# `labels=` lists every class code explicitly so `target_names` always matches,
# even when a range is absent from both y_test and the predictions.
report = classification_report(y_test, predict, labels=np.arange(len(labels)), target_names=labels)
print(report)

In [None]:
# Confusion matrix of the network: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, predict)

plt.figure(figsize=(10, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    square=True,
)