In [None]:
# Show the GPU (if any) visible to this runtime.
!nvidia-smi

In [None]:
from google.colab import drive

# Mount Google Drive under ./gdrive; force_remount=True re-mounts even if it is already mounted.
drive.mount('./gdrive', force_remount=True)

# Work from the project folder on the mounted drive and list its contents.
%cd './gdrive/My Drive/cubricks'
!ls -l

In [None]:
!pip install -q tensorflow-gpu

In [None]:
# Request the TensorFlow 2.x runtime before TensorFlow is imported (presumably a Colab-specific magic — TODO confirm).
%tensorflow_version 2.x

import matplotlib.pyplot as plt
import seaborn as sns

import calendar
import numpy as np
import pandas as pd
import itertools

import sklearn.model_selection as skms
import sklearn.metrics as skm
import tensorflow as tf

# Report the TensorFlow version that was actually loaded.
print("Tensorflow version " + tf.__version__)

In [None]:
def setup_count_chart(quant, w=30, h=15, rotation=0):
    """Create a figure holding ``quant`` vertically stacked axes.

    Args:
        quant: number of rows (one axes per row, single column).
        w: figure width passed to ``figsize``.
        h: figure height passed to ``figsize``.
        rotation: rotation applied to the x tick labels of every axes.

    Returns:
        The ``(figure, axes)`` pair produced by ``plt.subplots``.
    """
    figure, axes = plt.subplots(nrows=quant, ncols=1, figsize=(w, h))

    # Generous padding so rotated tick labels of one row do not run into the next.
    figure.tight_layout(pad=7.5)

    for axis in figure.axes:
        axis.tick_params(axis='x', labelrotation=rotation)

    return figure, axes


def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=None, normalize=True):
    """Plot a confusion matrix as an annotated heat map.

    Args:
        cm: square confusion matrix (rows = true labels, columns = predicted
            labels), array-like of counts.
        target_names: tick labels for both axes, or ``None`` to keep the
            default numeric ticks.
        title: figure title.
        cmap: matplotlib colormap; defaults to ``'Blues'``.
        normalize: when true, each row is divided by its total so the cells
            show per-class fractions; otherwise raw counts are shown.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Accuracy is always derived from the raw counts, before any normalization.
    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalize BEFORE drawing so the image, the printed numbers and the
        # text-colour threshold all refer to the same values. Rows without
        # samples stay at zero instead of producing NaN.
        row_totals = cm.sum(axis=1, keepdims=True).astype('float')
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.figure(figsize=(20, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text to stay readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = '{:0.4f}' if normalize else '{:,}'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()


def plot_correlation_heatmap(corrmat):
    """Draw an annotated heat map of a correlation matrix.

    Args:
        corrmat: correlation matrix to plot (e.g. the result of
            ``DataFrame.corr()``).

    The matrix passed in is plotted directly; the function no longer reaches
    for a global DataFrame to recompute the correlations.
    """
    plt.figure(figsize=(20, 10))

    sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Load the invoices; the marker 'N/I' is read as a missing value.
df = pd.read_csv('InvoicedDocuments_v2.csv', na_values=['N/I'])

# Parse every date column; entries that cannot be parsed become NaT.
for date_column in ('Last Credit Review', 'Document Date', 'Due Date', 'Clearing Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# 'Last Credit Review' may stay empty; rows missing any other field are dropped.
required_columns = df.columns.drop('Last Credit Review')
df.dropna(subset=required_columns, inplace=True)

df.info()
df.head(10)

In [None]:
# Convert the identifier columns to the smallest fitting integer type;
# values that are not numeric become NaN.
for id_column in ('Company', 'CustomerCode', 'Document'):
    df[id_column] = pd.to_numeric(df[id_column], downcast='integer', errors='coerce')

# Drop the rows the coercion invalidated ('Last Credit Review' may stay empty).
complete_columns = df.columns.drop('Last Credit Review')
df.dropna(subset=complete_columns, inplace=True)

df.info()
df.head(10)

In [None]:
# Number of invoices per company.
f, axes = setup_count_chart(quant=1, h=5)
sns.countplot(x='Company', data=df, ax=axes)

In [None]:
def string_to_number(x):
    """Encode a value as one integer built from its characters' code points.

    The value is converted with ``str``; the decimal code point of every
    character is then concatenated and parsed as an int, e.g. ``'AB'`` ->
    ``6566``. An empty string raises ``ValueError`` because there are no
    digits to parse.
    """
    code_points = [str(ord(character)) for character in str(x)]
    return int(''.join(code_points))


# Replace the text categories by their numeric encoding.
for encoded_column in ('Country', 'Region', 'Terms'):
    df[encoded_column] = df[encoded_column].apply(string_to_number)

df.head(10)

In [None]:
# Count plots of the encoded country, region and payment-terms columns (x labels rotated 90 degrees).
f, axes = setup_count_chart(quant=3, h=10, rotation=90)
sns.countplot(x='Country', data=df, ax=axes[0])
sns.countplot(x='Region', data=df, ax=axes[1])
sns.countplot(x='Terms', data=df, ax=axes[2])

In [None]:
def generate_range(data, until, step):
    """Bucket numeric values into consecutive, right-closed ranges.

    The edges are ``0, step, 2*step, ...`` up to ``until`` plus a final open
    ended bucket, so with ``until=28, step=7`` the buckets are (0, 7],
    (7, 14], (14, 21], (21, 28] and (28, inf).

    Args:
        data: pandas Series of numbers.
        until: upper limit for the regular edges (truncated to int).
        step: bucket width (truncated to int).

    Returns:
        Series of integer codes: 0 for values outside every bucket (zero,
        negative or missing), otherwise the 1-based bucket number.
    """
    edges = list(range(0, int(until) + 1, int(step)))
    edges.append(np.inf)

    # labels=False yields the 0-based bucket index, NaN when no bucket matches.
    codes = pd.cut(data, bins=edges, labels=False)

    # Shift by one so that 0 is free to mean "no bucket".
    codes = codes.fillna(-1) + 1

    return pd.to_numeric(codes, downcast='integer')


# Remove thousands separators so the amounts can be parsed as numbers.
df['Amount'] = df['Amount'].apply(lambda x: str(x).replace(',', ''))
df['Amount'] = pd.to_numeric(df['Amount'], downcast='integer', errors='coerce')

# Keep only invoices above 20. The explicit copy makes the filtered frame
# independent of the original, so the column assignments that follow do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['Amount'] > 20].copy()

# Bucket the amounts in steps of 10,000 up to 100,000 (plus an open-ended bucket).
df['AmountRange'] = generate_range(df['Amount'], until=1e5, step=1e4)

df.head(10)

In [None]:
# Number of invoices per amount bucket.
f, axes = setup_count_chart(quant=1, h=5)
sns.countplot(x='AmountRange', data=df, ax=axes)

In [None]:
def get_amount_bin(x):
    """Return 1 for amounts strictly above 20000, otherwise 0."""
    return 1 if x > 20000 else 0


# Binary flag: 1 when the amount is above 20000, else 0.
df['AmountBin'] = df['Amount'].apply(get_amount_bin)

# Number of invoices on each side of the threshold.
f, axes = setup_count_chart(quant=1, h=5)
sns.countplot(x='AmountBin', data=df, ax=axes)

In [None]:
def get_magnitude(x):
    """Return the integer part of ``log10(x)``, capped at 5.

    An earlier variant (left disabled in the notebook) also clamped the
    result to a minimum of 2.
    """
    exponent = int(np.log10(x))
    return exponent if exponent < 5 else 5

def get_magnitude_relative(x):
    """Divide ``x`` by the power of ten one above its capped magnitude.

    The magnitude comes from ``get_magnitude``, so it never exceeds 5 and the
    divisor never exceeds 10**6.
    """
    scale = 10 ** (get_magnitude(x) + 1)
    return x / scale


# Capped order of magnitude of each amount, and the amount scaled by the next power of ten.
df['AmountMagnitude'] = df['Amount'].apply(get_magnitude)
df['AmountRelativeMagnitude'] = df['Amount'].apply(get_magnitude_relative)

# Disabled filters on the magnitude range (the trailing numbers presumably
# count the affected rows — TODO confirm).
# df = df[df['AmountMagnitude'] > 1] # 12190
# df = df[df['AmountMagnitude'] < 6] # 159

df.head(10)

In [None]:
# Number of invoices per order of magnitude.
f, axes = setup_count_chart(quant=1, h=5)
sns.countplot(x='AmountMagnitude', data=df, ax=axes)

In [None]:
def is_late(dates):
    """Return 1 when an invoice was cleared after its due date, else 0.

    Args:
        dates: sequence ``(due_date, clearing_date)`` — a list, tuple or a
            DataFrame row. When only the due date is present, the current
            time stands in for the clearing date.

    Returns:
        ``1`` if the clearing date is at least one whole day after the due
        date, ``0`` otherwise (including when either date is missing).
    """
    # Copy into a list: positional access then works for label-indexed rows,
    # and the caller's sequence is never modified.
    values = list(dates)

    due_date = values[0]
    clearing_date = values[1] if len(values) > 1 else pd.Timestamp.now()

    # Late means clearing AFTER due, so subtract in that order. ``.days`` is
    # NaN when a date is NaT; the comparison is then False and yields 0.
    return int((clearing_date - due_date).days > 0)


# Apply is_late row by row to the (due date, clearing date) pair of every invoice.
df['IsLate'] = df[['Due Date', 'Clearing Date']].apply(is_late, axis=1)

df.head(10)

In [None]:
# Number of invoices per IsLate value.
f, axes = setup_count_chart(quant=1, h=5)
sns.countplot(x='IsLate', data=df, ax=axes)

In [None]:
def diff_dates(x):
    """Return the absolute number of whole days between two dates.

    Args:
        x: sequence holding the two dates (list, tuple or DataFrame row).

    Returns:
        The absolute day difference, or 0 when either date is missing (NaT).
    """
    # Unpack by position; indexing a label-indexed row with [0]/[1] relies on
    # a positional fallback that newer pandas versions deprecate.
    first, second = list(x)[:2]

    diff = abs((first - second).days)
    return 0 if np.isnan(diff) else diff


# New column -> the two date columns whose absolute day gap it stores.
day_gap_features = (
    ('DaysToLastCreditReview', ['Last Credit Review', 'Document Date']),
    ('DaysToDue', ['Document Date', 'Due Date']),
    ('DaysToClearingDate', ['Document Date', 'Clearing Date']),
)

for gap_column, date_pair in day_gap_features:
    df[gap_column] = df[date_pair].apply(diff_dates, axis=1)

df.head(10)

In [None]:
# (new column, source column, upper limit, bucket width) for each bucketed day count.
day_range_features = (
    ('LastCreditReviewRange', 'DaysToLastCreditReview', 720, 91),
    ('DueDateRange', 'DaysToDue', 28, 7),
    ('ClearingDateRange', 'DaysToClearingDate', 28, 7),
)

for range_column, days_column, upper_limit, bucket_width in day_range_features:
    df[range_column] = generate_range(df[days_column], until=upper_limit, step=bucket_width)

df.head(10)

In [None]:
# Number of invoices per bucket for the three bucketed day counts.
f, axes = setup_count_chart(quant=3, h=10)

sns.countplot(x='LastCreditReviewRange', data=df, ax=axes[0])
sns.countplot(x='DueDateRange', data=df, ax=axes[1])
sns.countplot(x='ClearingDateRange', data=df, ax=axes[2])

In [None]:
# Start calendar weeks on Sunday (weekday code 6); this changes the layout
# returned by calendar.monthcalendar for the helpers below.
calendar.setfirstweekday(6)

def get_quarter(date):
    """Return the quarter (1-4) that ``date.month`` belongs to.

    Computed arithmetically instead of building and searching a month table
    on every call, since the function is applied once per row.
    """
    return (date.month - 1) // 3 + 1

def get_quarter_month(date):
    """Return the position (1-3) of ``date.month`` inside its quarter.

    Computed arithmetically instead of building and searching a month table
    on every call, since the function is applied once per row.
    """
    return (date.month - 1) % 3 + 1

def get_week_month(date):
    """Return the 1-based calendar-week row of ``date`` in its month, capped at 5.

    Rows follow the layout of ``calendar.monthcalendar``: weeks start on the
    weekday configured through ``calendar.setfirstweekday``. A sixth row is
    reported as 5. The row is computed arithmetically rather than by building
    the month calendar for every call.
    """
    # Column (0-6) occupied by the first day of the month in a week that
    # starts on the configured first weekday.
    lead = (calendar.weekday(date.year, date.month, 1) - calendar.firstweekday()) % 7

    return min((date.day + lead - 1) // 7 + 1, 5)

def get_day_week(date):
    """Return the 1-based position of ``date`` within its calendar week.

    Position 1 is the weekday configured through
    ``calendar.setfirstweekday``, matching the columns of
    ``calendar.monthcalendar``. Computed arithmetically rather than by
    building the month calendar for every call.
    """
    return (date.weekday() - calendar.firstweekday()) % 7 + 1

In [None]:
# Calendar features derived from the document date: new column -> extractor.
document_date_features = (
    ('DocumentQuarter', get_quarter),
    ('DocumentQuarterMonth', get_quarter_month),
    ('DocumentWeekMonth', get_week_month),
    ('DocumentDayWeek', get_day_week),
)

for document_feature, document_extractor in document_date_features:
    df[document_feature] = df['Document Date'].apply(document_extractor)

df.head(10)

In [None]:
# Number of invoices per value of each document-date calendar feature.
f, axes = setup_count_chart(quant=4, h=10)

sns.countplot(x='DocumentQuarter', data=df, ax=axes[0])
sns.countplot(x='DocumentQuarterMonth', data=df, ax=axes[1])
sns.countplot(x='DocumentWeekMonth', data=df, ax=axes[2])
sns.countplot(x='DocumentDayWeek', data=df, ax=axes[3])

In [None]:
# Calendar features derived from the due date: new column -> extractor.
due_date_features = (
    ('DueQuarter', get_quarter),
    ('DueQuarterMonth', get_quarter_month),
    ('DueWeekMonth', get_week_month),
    ('DueDayWeek', get_day_week),
)

for due_feature, due_extractor in due_date_features:
    df[due_feature] = df['Due Date'].apply(due_extractor)

df.head(10)

In [None]:
# Number of invoices per value of each due-date calendar feature.
f, axes = setup_count_chart(quant=4, h=10)

sns.countplot(x='DueQuarter', data=df, ax=axes[0])
sns.countplot(x='DueQuarterMonth', data=df, ax=axes[1])
sns.countplot(x='DueWeekMonth', data=df, ax=axes[2])
sns.countplot(x='DueDayWeek', data=df, ax=axes[3])

In [None]:
# df.drop(columns=['Last Credit Review',
#                  'Document Date',
#                  'Due Date',
#                  'Clearing Date',
#                  'DaysToClearingDate'], inplace=True, errors='ignore')

# df.head(10)

In [None]:
# The frame still holds the raw datetime columns, which cannot take part in a
# correlation; select the numeric (and boolean) columns explicitly instead of
# relying on DataFrame.corr to drop the others.
plot_correlation_heatmap(df.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# df.drop(columns=['Amount', 'DaysToLastCreditReview', 'DaysToDue'], inplace=True, errors='ignore')

# df.head(10)

In [None]:
# plot_correlation_heatmap(df.corr())

In [None]:
# 0.12900018757757797	DaysToDue
# 0.09115122672129665	DueDateRange
# 0.07961419908960324	CustomerCode
# 0.0792714135303946	Document
# 0.07529140921166559	IsLate
# 0.06660832727751806	Amount
# 0.060009351491705464	Region
# 0.058083238863443964	AmountRelativeMagnitude
# 0.054214386669862925	DaysToLastCreditReview
# 0.051381339689836526	Terms
# 0.036840021328726824	Company
# 0.030951642614065793	DocumentDayWeek
# 0.030487544048197056	DueDayWeek
# 0.026566185886496042	LastCreditReviewRange
# 0.02191523864205556	DueWeekMonth
# 0.02138776728423403	DocumentWeekMonth
# 0.015610550822677576	AmountRange
# 0.013891003262061557	DueQuarter
# 0.013661185196872917	DocumentQuarter
# 0.012986905640220179	DueQuarterMonth
# 0.012751398738939398	DocumentQuarterMonth
# 0.010089984188227905	Country
# 0.008235492224320153	AmountMagnitude

# Candidate feature set: every column except the raw dates and the columns
# derived directly from the clearing date.
x_columns = df.columns.drop([
                             'Last Credit Review',
                             'Document Date',
                             'Due Date',
                             'Clearing Date',
                             'DaysToClearingDate',
                             'ClearingDateRange',
                             ])
# Target: the bucketed number of days between document date and clearing date.
y_columns = 'ClearingDateRange'


# Hand-picked feature subset; this assignment replaces the candidate set built above.
# NOTE(review): 'IsLate' is computed from 'Clearing Date', the same column the
# target is derived from — confirm this is available at prediction time.
x_columns= np.array(['DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'Country', 'Company', 'DocumentQuarter', 'DocumentDayWeek'])


x = df[x_columns].values
# x = df[x_columns].values
y = df[y_columns].values

# Hold out 10% for testing, shuffled with a fixed seed and stratified on the target.
x_train, x_test, y_train, y_test = skms.train_test_split(x, y, test_size=0.1, shuffle=True, random_state=42, stratify=y)

In [None]:
# Baseline ("dumb") predictor: accuracy obtained by always answering one range.
print(f'Total items: {len(y_test)}\n')

# The loop variable gets its own name so it does not overwrite the feature
# matrix `x`; the builtin int replaces the removed `np.int` alias.
for constant_label in range(6):
    dumb_data = np.full(y_test.shape, constant_label, dtype=int)
    print(f'Always predicting the range "{constant_label}",', f'accuracy is {skm.accuracy_score(y_test, dumb_data) * 100:.2f}%')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees with a fixed seed, trained on all available cores.
random_forest = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1)
random_forest.fit(x_train, y_train)

# Feature positions ordered from most to least important.
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]

print('Feature ranking:')

for feature_index in indices:
    print(f'{importances[feature_index]}\t{x_columns[feature_index]}')

# Bar chart of the same ranking, labelled with the feature names.
bar_positions = range(x_train.shape[1])
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, x_columns[indices], rotation=90)
plt.title('Feature Importance')
plt.show()

In [None]:
# Predict the clearing-date range of the held-out invoices with the random forest.
predict = random_forest.predict(x_test)

# All: 86.21%
# 'DaysToDue': 64.10%
# 'DaysToDue', 'DueDateRange': 64.10%
# 'DueDateRange': 63.72%

# 'DaysToDue', 'CustomerCode', 'Document': 81.57%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate': 85.61%
# 'DueDateRange', 'CustomerCode', 'Document', 'IsLate': 84.75%

# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Amount': 80.99%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'AmountRelativeMagnitude': 80.24%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'AmountRange': 82.72%

# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms': 86.53%

# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek': 87.31%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'DueDayWeek': 87.21%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DueDayWeek': 87.27%

# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'LastCreditReviewRange', 'Terms', 'DocumentDayWeek': 87.18%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'LastCreditReviewRange', 'Terms', 'DueDayWeek': 87.13%


# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Amount': 86.53%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'AmountMagnitude': 86.85%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'AmountRelativeMagnitude': 86.18%

# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'AmountMagnitude', 'AmountRelativeMagnitude': 86.16%

# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country': 87.40%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Company': 87.81%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company': 87.77%


#### 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter': 87.85%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DueQuarter': 87.84%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter', 'DueDayWeek': 87.62% 


# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter', 'Amount': 87.07%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter', 'AmountBin':  87.40%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter', 'AmountRange': 87.12%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter', 'AmountMagnitude': 87.27%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'Company', 'DocumentQuarter', 'AmountRelativeMagnitude': 86.87%


# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Company', 'DocumentQuarter': 87.83%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Company', 'DueQuarter': 87.82%


# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'DocumentQuarter': 87.39%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'DueQuarter': 87.36%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'DocumentQuarterMonth': 87.38%
# 'DaysToDue', 'CustomerCode', 'Document', 'IsLate', 'Region', 'DaysToLastCreditReview', 'Terms', 'DocumentDayWeek', 'Country', 'DueQuarterMonth': 87.43%

#### 1 | 0.1    :: 85.19%
#### 10 | 0.1   :: 87.85%
#### 100 | 0.1  :: 88.21%

#### 1 | 0.2    :: 84.85%
#### 10 | 0.2   :: 87.59%
#### 100 | 0.2  :: 87.96%


# Size of the test set and share of exactly matching predictions.
print(f'Total: {len(y_test)}')
print(f'Accuracy: {skm.accuracy_score(y_test, predict) * 100:.2f}%')

In [None]:
def create_model():
    """Build the fully connected classifier named 'cubricks'.

    The network normalizes its input, then stacks four ReLU dense layers
    (256, 512, 512 and 128 units, He-normal initialised), each followed by
    batch normalization, and ends in a softmax layer.

    Reads the globals ``x_train`` (number of input features) and ``y_train``
    (number of classes).

    Returns:
        The uncompiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential(name='cubricks')

    # The input shape is given as a tuple rather than a bare integer.
    model.add(tf.keras.layers.Input(shape=(x_train.shape[1],)))
    model.add(tf.keras.layers.BatchNormalization())

    # Dropout(rate=0.1) after each dense layer was tried and left disabled.
    for units in (256, 512, 512, 128):
        model.add(tf.keras.layers.Dense(units, activation='relu', kernel_initializer='he_normal'))
        model.add(tf.keras.layers.BatchNormalization())

    # The targets are one-hot encoded with to_categorical, which produces
    # max(label) + 1 columns; size the output the same way so the two always
    # agree, even if some intermediate label never occurs.
    num_classes = int(np.max(y_train)) + 1
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    return model


model = create_model()

# Adam optimizer with categorical cross-entropy (label smoothing 0.1), tracking accuracy.
# NOTE(review): reduction='none' leaves the loss unreduced per sample — confirm
# this is intended rather than the default reduction.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005, epsilon=1e-8),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1, reduction='none'),
              metrics=['accuracy'])

model.summary()

In [None]:
import os

# Output locations for logs and checkpoints; creating the directory is currently disabled.
logdir = os.path.join('.', 'output')
# os.makedirs(logdir, exist_ok=True)

training_log = os.path.join(logdir, 'training.txt')
model_checkpoint = os.path.join(logdir, 'model.hdf5')

# Resuming from an existing checkpoint is currently disabled.
# if os.path.isfile(model_checkpoint):
#     model.load_weights(model_checkpoint)

# Every callback below is commented out, so this list is empty: nothing is
# logged, checkpointed or stopped early during training.
callbacks = [
    # tf.keras.callbacks.TensorBoard(logdir, profile_batch=0),
    # tf.keras.callbacks.CSVLogger(training_log, separator=',', append=True),
    # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', min_delta=1e-8, factor=0.2, patience=100, verbose=1),
    # tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=1e-8, patience=200, restore_best_weights=True, verbose=1),
    # tf.keras.callbacks.ModelCheckpoint(model_checkpoint, monitor='val_accuracy', save_best_only=True, verbose=1),
]

In [None]:
# One-hot encode both target vectors with the width of the softmax layer, so
# training and validation targets always match the model output even if the
# highest label is absent from one of the two splits.
# NOTE(review): the test split doubles as validation data, and with the
# callbacks list as defined in this notebook (all entries commented out)
# nothing stops training before the 10000 epochs are done.
num_classes = model.output_shape[-1]

model.fit(x_train,
          tf.keras.utils.to_categorical(y_train, num_classes=num_classes),
          validation_data=(x_test, tf.keras.utils.to_categorical(y_test, num_classes=num_classes)),
          callbacks=callbacks,
          batch_size=128,
          epochs=10000,
          verbose=1)

In [None]:
# Class with the highest softmax score per test invoice; this overwrites the
# random-forest predictions previously stored in `predict`.
predict = np.argmax(model.predict(x_test), axis=1)

print(f'Total: {len(y_test)}')
print(f'Accuracy: {skm.accuracy_score(y_test, predict) * 100:.2f}%')

In [None]:
# Load the TensorBoard notebook extension and open it on the log directory.
%load_ext tensorboard
%tensorboard --logdir={logdir}

In [None]:
# Display names of the ClearingDateRange codes 0-5. The buckets are built with
# until=28, step=7, i.e. (0, 7], (7, 14], (14, 21], (21, 28] and (28, inf), so
# the fifth label covers days 22-28 (day 21 belongs to '15-21').
labels=['0', '1-7', '8-14', '15-21', '22-28', '29+']

# Per-class precision, recall and F1 for the latest predictions.
classification_report = skm.classification_report(y_test, predict, target_names=labels)
print(classification_report)

In [None]:
# Confusion matrix of the latest predictions, plotted with raw counts.
cm = skm.confusion_matrix(y_test, predict)
plot_confusion_matrix(cm, target_names=labels, normalize=False)