In [None]:
# Shell escape: run `nvidia-smi` and show its output in the notebook.
!nvidia-smi

In [None]:
from google.colab import drive

# Mount Google Drive at ./gdrive (forcing a re-mount) and switch the working
# directory to the project folder so relative file paths resolve inside it.
drive.mount('./gdrive', force_remount=True)
%cd './gdrive/My Drive/cubricks'

In [None]:
!pip install -q tensorflow-gpu

In [None]:
# Colab line magic requesting the TensorFlow 2.x runtime (only meaningful on Colab).
%tensorflow_version 2.x

# Standard library and numeric/tabular tooling.
import calendar
import itertools
import numpy as np
import pandas as pd

# Plotting.
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling and evaluation.
import sklearn.model_selection as skms
import sklearn.metrics as skm
import tensorflow as tf

# Report the TensorFlow version that was actually imported.
print("Tensorflow version " + tf.__version__)

In [None]:
# Read the raw invoice export; the literal 'N/I' is treated as a missing value.
df = pd.read_csv('InvoicedDocuments_v2.csv', na_values=['N/I'])
df.columns = df.columns.str.replace(' ', '')

# Parse every date column; entries that cannot be parsed become NaT.
for date_column in ('LastCreditReview', 'DocumentDate', 'DueDate', 'ClearingDate'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# A row is kept only when every column except LastCreditReview is present.
required_columns = df.columns.drop('LastCreditReview')
df.dropna(subset=required_columns, inplace=True)

# Convert the identifier columns to numbers; values that fail to convert
# become NaN and their rows are dropped by the second dropna.
for id_column in ('Company', 'CustomerCode', 'Document'):
    df[id_column] = pd.to_numeric(df[id_column], downcast='integer', errors='coerce')
df.dropna(subset=required_columns, inplace=True)

# Strip the comma separators from Amount, convert it to a number and keep only
# rows whose amount is greater than 20.
amount_text = df['Amount'].map(lambda raw: str(raw).replace(',', ''))
df['Amount'] = pd.to_numeric(amount_text, downcast='integer', errors='coerce')
df = df[df['Amount'] > 20]

df.info()
df.head(10)

In [None]:
def is_late(dates):
    """Flag whether a document was cleared after its due date.

    Args:
        dates: Sequence or pandas Series holding the due date first and,
            optionally, the clearing date second. When only the due date is
            present, the current time stands in for the clearing date.

    Returns:
        int: 1 when the clearing date is at least one whole day after the due
        date, otherwise 0.
    """
    # Materialise the values so the lookups below are positional for plain
    # sequences and for label-indexed Series rows alike (integer `[]` lookups
    # on label-indexed Series are deprecated in recent pandas).
    values = list(dates)
    due = values[0]
    # `pd.datetime` was removed in pandas 2.0; `pd.Timestamp.now()` replaces it.
    cleared = values[1] if len(values) > 1 else pd.Timestamp.now()
    # Clamp the signed whole-day difference into {0, 1}.
    return max(0, min((cleared - due).days, 1))


# Evaluate is_late row by row on the (DueDate, ClearingDate) pair.
due_and_clearing = df[['DueDate', 'ClearingDate']]
df['IsLate'] = due_and_clearing.apply(is_late, axis=1)

df.head(10)

In [None]:
# Pass the Series as `x` explicitly: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(x=df['IsLate'])

In [None]:
def get_days(dates):
    """Return the absolute number of whole days between two dates.

    Args:
        dates: Sequence or pandas Series whose first two items are the dates
            to compare.

    Returns:
        The absolute day difference, or -1 when it is undefined (for example
        when one of the dates is NaT).
    """
    # Materialise the values so the lookups are positional even for Series
    # rows indexed by column labels (integer `[]` lookups on such rows are
    # deprecated in recent pandas).
    values = list(dates)
    x = abs((values[1] - values[0]).days)
    # A NaT operand yields NaN days; report that case as -1.
    return -1 if np.isnan(x) else x


# Each new column is the day span between the two listed date columns.
day_spans = {
    'DaysToLastCreditReview': ['LastCreditReview', 'DocumentDate'],
    'DaysToDue': ['DocumentDate', 'DueDate'],
    'DaysToClearingDate': ['DocumentDate', 'ClearingDate'],
}
for span_column, date_pair in day_spans.items():
    df[span_column] = df[date_pair].apply(get_days, axis=1)

df.head(10)

In [None]:
# Left-closed day intervals (right=False): [0, 1), [1, 8), [8, 15), [15, 22),
# [22, 29) and [29, inf), each paired with the label at the same position.
bins = [0, 1, 8, 15, 22, 29, np.inf]
labels = ['ontime', '1-7', '8-14', '15-21', '22-28', '29+']

# Bucket DaysToClearingDate (days between DocumentDate and ClearingDate) into
# the labelled ranges above.
df['ClearingDateRange'] = pd.cut(df['DaysToClearingDate'], bins=bins, labels=labels, right=False, include_lowest=True)

df.head(10)

In [None]:
# Pass the Series as `x` explicitly: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(x=df['ClearingDateRange'])

In [None]:
# Columns to encode, and the names of the integer-coded copies ('<name>CAT').
to_categorize = ['Company', 'CustomerCode', 'Terms', 'Region', 'Country', 'Document', 'ClearingDateRange']
categorized = [f'{name}CAT' for name in to_categorize]


def _ordered_codes(column):
    """Return the integer codes of ``column`` viewed as an ordered categorical."""
    return pd.Categorical(column, ordered=True).codes


df[categorized] = df[to_categorize].apply(_ordered_codes)

df.head(10)

In [None]:
# Make Sunday the first day of the week for the `calendar` module.
calendar.setfirstweekday(calendar.SUNDAY)

def get_quarter(date):
    """Return the zero-based calendar quarter of ``date``.

    Args:
        date: Any object exposing a ``month`` attribute in the range 1-12.

    Returns:
        int: 0 for January-March, 1 for April-June, 2 for July-September and
        3 for October-December.
    """
    # Integer arithmetic avoids building and searching an array on every call.
    return (date.month - 1) // 3

def get_day_week(date):
    """Return the day of the week of ``date`` with Sunday as day 0.

    The numbering matches the column a day occupies in a month grid whose
    weeks start on Sunday (``calendar.setfirstweekday(6)``), but it is derived
    from the date itself, so the result does not depend on the global
    ``calendar`` configuration.

    Args:
        date: Any object exposing ``weekday()`` (Monday == 0 ... Sunday == 6).

    Returns:
        int: 0 for Sunday, 1 for Monday, ... 6 for Saturday.
    """
    # Rotate the Monday-based weekday so that Sunday comes first.
    return (date.weekday() + 1) % 7


# Derive the quarter and the day-of-week of each document date.
document_dates = df['DocumentDate']
for feature_name, extractor in (('DocumentQuarter', get_quarter),
                                ('DocumentDayWeek', get_day_week)):
    df[feature_name] = document_dates.apply(extractor)

df.head(10)

In [None]:
# Side-by-side counts of the document quarter and the document day-of-week.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,4))
# Pass each Series as `x` explicitly: recent seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer read as x.
sns.countplot(x=df['DocumentQuarter'], ax=ax[0])
sns.countplot(x=df['DocumentDayWeek'], ax=ax[1])

In [None]:
# #### # 
# TEMP #
# #### #

def get_amount_bin(x):
    """Return 1 when ``x`` is strictly greater than 20000, otherwise 0."""
    return int(x > 20000)

def get_magnitude(x):
    """Return the truncated base-10 exponent of ``x``, capped at 5."""
    exponent = int(np.log10(x))
    return exponent if exponent < 5 else 5

def get_magnitude_relative(x):
    """Scale ``x`` by ten raised to one more than its (capped) magnitude."""
    scale = 10 ** (get_magnitude(x) + 1)
    return x / scale

def get_quarter_month(date):
    """Return the zero-based position of ``date``'s month within its quarter.

    Args:
        date: Any object exposing a ``month`` attribute in the range 1-12.

    Returns:
        int: 0 for the first month of a quarter (January, April, July,
        October), 1 for the second and 2 for the third.
    """
    # Integer arithmetic avoids building and searching an array on every call.
    return (date.month - 1) % 3

def get_week_month(date):
    """Return the zero-based week row of ``date`` within its month.

    Rows follow a month grid whose weeks start on Sunday (the layout produced
    after ``calendar.setfirstweekday(6)``), but the row is derived from the
    date itself, so the result does not depend on the global ``calendar``
    configuration.

    Args:
        date: Any object exposing ``day``, ``weekday()`` and
            ``replace(day=...)`` (e.g. ``datetime.date`` or ``pd.Timestamp``).

    Returns:
        int: Row index in the range 0-5 (a month spans at most six rows).
    """
    # Number of leading blank cells in the first row of a Sunday-first grid.
    leading_blanks = (date.replace(day=1).weekday() + 1) % 7
    return (date.day - 1 + leading_blanks) // 7


# (new column, source column, transform), listed in the order the columns are created.
derived_features = [
    ('AmountBin', 'Amount', get_amount_bin),
    ('AmountMagnitude', 'Amount', get_magnitude),
    ('AmountRelativeMagnitude', 'Amount', get_magnitude_relative),
    ('DocumentQuarterMonth', 'DocumentDate', get_quarter_month),
    ('DocumentWeekMonth', 'DocumentDate', get_week_month),
    ('DueQuarter', 'DueDate', get_quarter),
    ('DueQuarterMonth', 'DueDate', get_quarter_month),
    ('DueWeekMonth', 'DueDate', get_week_month),
    ('DueDayWeek', 'DueDate', get_day_week),
]
for new_column, source_column, transform in derived_features:
    df[new_column] = df[source_column].apply(transform)

In [None]:
# Target column: the integer-coded clearing-date range.
y_column = np.array(['ClearingDateRangeCAT'])
# Model inputs. Entries that are commented out are currently excluded.
# NOTE(review): 'IsLate' is computed from ClearingDate, the same column the
# target is derived from (via DaysToClearingDate -> ClearingDateRange) --
# confirm it is meant to be a model input.
features = np.array([
                    #  'Amount',
                    #  'AmountBin',
                    #  'AmountMagnitude',
                    #  'AmountRelativeMagnitude',
                    #  'DocumentQuarterMonth',
                    #  'DocumentWeekMonth',
                    #  'DueQuarter',
                    #  'DueQuarterMonth',
                    #  'DueWeekMonth',
                    #  'DueDayWeek',
                     'CompanyCAT',
                     'CustomerCodeCAT',
                     'TermsCAT',
                     'RegionCAT',
                     'CountryCAT',
                     'DocumentCAT',
                     'IsLate',
                     'DaysToDue',
                     'DaysToLastCreditReview',
                     'DocumentQuarter',
                     'DocumentDayWeek',
                     ])

# Annotated heatmap of the pairwise correlations between the selected features.
plt.figure(figsize=(15, 10))
sns.heatmap(df[features].corr(), annot=True, cmap='Blues')

In [None]:
# Hold out 10% of the rows, shuffled with a fixed seed and stratified on the target.
feature_values = df[features].values
target_values = df[y_column].values

x_train, x_test, y_train, y_test = skms.train_test_split(
    feature_values,
    target_values,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=target_values,
)

# One-hot encoded copies of the integer targets.
y_train_categorical, y_test_categorical = (
    tf.keras.utils.to_categorical(target_part) for target_part in (y_train, y_test)
)

In [None]:
# Baseline ("dumb") prediction: the accuracy obtained by always answering with
# one single class, reported for every class in turn.
print(f'Total items: {len(y_test)}')

for i in range(y_test_categorical.shape[1]):
    # `np.int` was removed in NumPy 1.24; the builtin `int` it aliased is
    # used instead.
    dumb_data = np.ones(y_test.shape, dtype=int) * i
    print(f'Always predicting the range "{i}",', f'accuracy is {skm.accuracy_score(y_test, dumb_data) * 100:.2f}%')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, fixed seed, single worker; the column-vector target is squeezed
# before fitting.
forest_settings = dict(n_estimators=100, random_state=42, n_jobs=1)
random_forest = RandomForestClassifier(**forest_settings)
random_forest.fit(x_train, np.squeeze(y_train))

In [None]:
# Score the held-out split and rank the features by the forest's importances.
predict = random_forest.predict(x_test)

importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]  # largest importance first

print(f'Total items: {len(y_test)}')
print(f'Accuracy: {skm.accuracy_score(y_test, predict) * 100:.2f}%\n')
print('Feature ranking:')

for feature_index in indices:
    print(f'{importances[feature_index]}\t{features[feature_index]}')

# Horizontal bars, most important feature at the top.
bar_positions = range(x_train.shape[1])
plt.barh(bar_positions, importances[indices])
plt.yticks(bar_positions, features[indices])
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()