In [None]:
# Colab-only setup: mount Google Drive under ./gdrive (force_remount=True asks for a fresh mount).
from google.colab import drive

drive.mount('./gdrive', force_remount=True)
# Move into the project folder so that relative paths used later (e.g. the CSV file) resolve against it.
%cd './gdrive/My Drive/Colab Notebooks/cubricks'

## Modules

In [None]:
!pip -q install imbalanced-learn xgboost --upgrade

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Shared random seed handed to the classifiers below (as `random_state` / `seed`) for repeatable runs.
SEED = 42

## Plot Functions

In [None]:
def plot_countplot(df, cols=[0], title=None, rotation=0):
    """Draw one seaborn count plot per requested column.

    Args:
        df: Data holder. String entries of `cols` are looked up as `df[col]`;
            any other entry is used as a positional column index (`df[:, col]`).
        cols: Column names or column positions to plot, one figure each.
        title: Title applied to every generated plot.
        rotation: Rotation (degrees) applied to the x tick labels.
    """
    for column in cols:
        figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 2))

        # Name-based lookup for string keys, positional slicing otherwise.
        if isinstance(column, str):
            values = df[column]
        else:
            values = df[:, column]

        chart = sns.countplot(x=np.squeeze(values), ax=axis)
        tick_labels = chart.get_xticklabels()
        chart.set_xticklabels(labels=tick_labels, rotation=rotation)
        chart.set_title(title)

def plot_confuncion_matrix(y_test, predict, title='Confusion Matrix', report=True):
    """Plot the confusion matrix of `predict` against `y_test` as a heatmap.

    Args:
        y_test: Ground-truth labels.
        predict: Predicted labels, same length as `y_test`.
        title: Title of the heatmap.
        report: When true, also print the scikit-learn classification report.
    """
    if report:
        # Use the `predict` argument (not a notebook-level variable) so the printed
        # report always describes the same predictions as the heatmap below.
        print(classification_report(y_test, predict, digits=4, zero_division=True))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    g = sns.heatmap(confusion_matrix(y_test, predict), fmt='d', square=True, annot=True, cmap='Blues', ax=ax)
    g.set_title(title)

def plot_feature_importance(features, importances):
    """Print and plot feature importances, most important first.

    Args:
        features: Sequence of feature names.
        importances: Sequence of importance values, aligned with `features`.
            Lists are accepted as well as NumPy arrays.
    """
    features = np.asarray(features)
    # Coerce to an array so the fancy indexing below also works for plain lists.
    importances = np.asarray(importances)
    # Indices that sort the importances in descending order.
    indices = np.argsort(importances)[::-1]
    print('Feature ranking:')

    for f in range(len(features)):
        print(f'{importances[indices[f]]}\t{features[indices[f]]}')

    plt.figure(figsize=(10, 8))
    plt.barh(range(len(features)), importances[indices])
    plt.yticks(range(len(features)), features[indices])
    plt.title('Feature Importance')
    # Put the highest-ranked feature at the top of the horizontal bar chart.
    plt.gca().invert_yaxis()
    plt.show()

## Data Manager

In [None]:
def setup_buckets(df, col, bins, sufix='Bkt'):
    """Bucket a numeric column into labelled, left-closed intervals.

    Two columns are added to `df` in place: `<col><sufix>Label` holding the
    categorical interval label and `<col><sufix>` holding its integer code
    (-1 for values that fall in no bucket, e.g. missing values).

    Args:
        df: DataFrame to extend.
        col: Name of the numeric column to bucket.
        bins: List of inner bucket edges; -inf and +inf are added around it.
        sufix: Suffix used to build the names of the two new columns.

    Returns:
        Tuple `(df, labels)` with the same DataFrame object and the list of
        bucket labels, ordered from the lowest to the highest bucket.
    """
    edges = [-np.inf] + bins + [np.inf]

    # Each label reads "<lower edge> to <upper edge - 1>".
    labels = []
    for lower, upper in zip(edges[:-1], edges[1:]):
        labels.append(f'{lower} to {upper-1}')

    label_column = f'{col}{sufix}Label'
    code_column = f'{col}{sufix}'

    df[label_column] = pd.cut(df[col], bins=edges, labels=labels, right=False, include_lowest=True)
    df[code_column] = pd.Categorical(df[label_column], ordered=True).codes
    return df, labels

def split_data(df, date_column, split_date, train_months=6, test_months=1):
    """Split a DataFrame into train/test parts around a date.

    Rows are kept when their `date_column` value lies in
    `[split_date - train_months, split_date + test_months)`. The kept rows
    are re-indexed from 0; rows before `split_date` form the train part and
    the remaining rows form the test part.

    Args:
        df: Source DataFrame.
        date_column: Name of the datetime column used for the split.
        split_date: First date of the test period (anything `pd.to_datetime` accepts).
        train_months: Number of months before `split_date` kept for training.
        test_months: Number of months from `split_date` kept for testing.

    Returns:
        Tuple `(train, test)` of DataFrames.
    """
    cutoff = pd.to_datetime(split_date)
    window_start = cutoff - pd.DateOffset(months=train_months)
    window_end = cutoff + pd.DateOffset(months=test_months)

    dates = df[date_column]
    inside_window = (dates >= window_start) & (dates < window_end)
    windowed = df[inside_window].reset_index(drop=True)

    is_train = windowed[date_column] < cutoff
    is_test = windowed[date_column] >= cutoff
    return windowed[is_train], windowed[is_test]

## Data Preprocessing

In [None]:
def preprocess_data(train, test, x_column, y_column, resample=None, nominal_columns=None):
    """Build scaled feature matrices and target arrays from train/test frames.

    Steps: (1) optionally encode the nominal columns as integers, (2) optionally
    resample the training data, (3) fit a `RobustScaler` on the training
    features and apply it to both the train and test features.

    Args:
        train: Training DataFrame (copied, never modified).
        test: Test DataFrame (copied, never modified).
        x_column: List-like of feature column names.
        y_column: List-like of target column names.
        resample: Optional object exposing `fit_resample(x, y)`; applied to
            the training data only. If it fails, a `RuntimeWarning` is emitted
            and the original training data is used.
        nominal_columns: Optional list of columns to encode as integers.

    Returns:
        Tuple `(x_train, y_train, x_test, y_test)` of NumPy arrays; the `x`
        arrays are scaled, the `y` arrays are the raw target values.
    """
    import warnings

    train, test = train.copy(), test.copy()

    def _resample(df, x_column, y_column, func):
        """Resample `df` with `func` and rebuild a DataFrame with the original dtypes."""
        dtypes = df[x_column].dtypes.to_dict()
        dtypes.update(df[y_column].dtypes.to_dict())
        x, y = df[x_column].values, df[y_column].values

        try:
            x, y = func.fit_resample(x, y)
        except Exception as error:
            # Best effort: keep going with the original data, but say so instead of
            # silently hiding the failure (e.g. a wrong object passed as `resample`).
            warnings.warn(f'Resampling failed ({error!r}); using the original training data.', RuntimeWarning)

        # The target must be a column vector to be concatenated next to the features.
        y = np.asarray(y)
        if y.ndim == 1:
            y = np.expand_dims(y, axis=1)

        xy = np.concatenate((x, y), axis=1)
        data = pd.DataFrame(xy, columns=np.concatenate((x_column, y_column)))
        data = data.astype(dtypes)
        return data

    if nominal_columns is not None:
        # Numeric-looking values are cast to int; any other value is lower-cased and
        # encoded as the integer formed by concatenating the code points of its characters.
        to_number = lambda x: [int(w) if str(w).isnumeric() else int(''.join(format(ord(c), '') for c in str(w).lower())) for w in x]
        train[nominal_columns] = train[nominal_columns].apply(lambda x: to_number(x))
        test[nominal_columns] = test[nominal_columns].apply(lambda x: to_number(x))

    if resample is not None:
        train = _resample(train, x_column, y_column, resample)

    x_train, y_train = train[x_column].values, train[y_column].values
    x_test, y_test = test[x_column].values, test[y_column].values

    # The scaler is fitted on the training features only and reused for the test features.
    qt = RobustScaler(quantile_range=(25.0, 75.0))
    qt.fit(x_train, y_train)

    x_train = qt.transform(x_train)
    x_test = qt.transform(x_test)
    return x_train, y_train, x_test, y_test

## Feature Selection

In [None]:
def feature_selection(train, test, x_column, y_column, nominal_columns=None, target_class='macro avg', random_state=None):
    """Greedy search for the ordered set of feature groups with the best F1 score.

    Every candidate is scored by training a random forest on `train` and
    reading the F1 score of `target_class` from the classification report on
    `test`. The search starts from the best single group, then tries to
    insert each remaining group at every position of the current selection,
    keeping an insertion only when it improves the score. The result is
    finally compared with using all groups at once.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        x_column: NumPy array of feature groups.
            NOTE(review): each entry appears to be expected to be a sequence of
            column names (entries are flattened one level) - confirm with callers.
        y_column: List-like of target column names.
        nominal_columns: Columns to integer-encode, forwarded to `preprocess_data`.
        target_class: Key of the classification report whose F1 score is optimised.
        random_state: Seed for the random forest.

    Returns:
        Tuple `(score, features)` with the best F1 score and the matching list of groups.
    """
    # Imported locally: the notebook only imports RandomForestClassifier in a later
    # cell, so calling this function on a fresh kernel would otherwise raise NameError.
    from sklearn.ensemble import RandomForestClassifier

    insertion = lambda l, x: [l[:i] + [x] + l[i:] for i in range(len(l) + 1)]
    flatten = lambda l: [item for sublist in l for item in sublist]
    x_column = x_column.tolist()

    def _loop_selection(groups, score=-1, features=None):
        """Score every candidate in `groups`; return the best one if it beats `score`."""
        features = [] if features is None else features
        scores_groups = []

        for feature_group in groups:
            x_column = np.array(flatten(feature_group))
            # `nominal_columns` must be passed by keyword: the fifth positional
            # parameter of `preprocess_data` is `resample`.
            x_train, y_train, x_test, y_test = preprocess_data(train, test, x_column, y_column, nominal_columns=nominal_columns)

            clf = RandomForestClassifier(256, criterion='entropy', class_weight='balanced', random_state=random_state, n_jobs=-1)
            clf.fit(x_train, np.squeeze(y_train))
            y_predict = clf.predict(x_test)

            cr = classification_report(test[y_column].values, y_predict, output_dict=True, zero_division=True)
            scores_groups.append(cr[target_class]['f1-score'])

        local_score = max(scores_groups)
        local_features = list(groups[scores_groups.index(local_score)])
        print(f'Score: {local_score:.8f} >>> {local_features}')

        if local_score > score:
            return local_score, local_features

        return score, features

    # Baseline: all groups together.
    in_score, in_features = _loop_selection([[flatten(x_column)]])
    # Best single group (each candidate is a one-group selection); a plain list
    # comprehension also works when the groups have different lengths.
    score, features = _loop_selection([[group] for group in x_column])
    x_column.remove(features[0])

    for group in x_column:
        score, features = _loop_selection(insertion(features, group), score, features)

    if in_score > score:
        return in_score, in_features

    return score, features

## Read Dataset

In [None]:
### Read dataset ###
df = pd.read_csv('InvoicedDocuments_full.csv', sep=';', parse_dates=['DocumentDate', 'DueDate', 'ClearingDate'], low_memory=False)
df.sort_values(by=['DueDate'], ascending=False, ignore_index=False, inplace=True)

### Fill in the ClearingDate column ###
# Flag invoices without a clearing date as open (1) before the column is filled in below.
df['Open'] = df['ClearingDate'].isnull() * 1

# Month end that follows "now minus one month"; open invoices already due before it
# receive it as their clearing date. NOTE: depends on the execution date.
current_last_month = pd.to_datetime('now') - pd.DateOffset(months=1) + pd.tseries.offsets.MonthEnd(1)
df.loc[(df['Open'] == 1) & (df['DueDate'] < current_last_month), 'ClearingDate'] = current_last_month

### 'Days to' columns ###
# Whole days as float64 (NaN where a date is missing). `.dt.days` replaces
# `astype('timedelta64[D]')`, which pandas >= 2.0 rejects.
date_int = lambda x: x.dt.days.astype('float64')
df['DueDateToClearingDate'] = date_int(df['ClearingDate'] - df['DueDate'])

### Date columns ###
for col in ['DocumentDate', 'DueDate']:
    df[col + 'Day'] = pd.DatetimeIndex(df[col]).day
    df[col + 'WeekDay'] = pd.DatetimeIndex(df[col]).weekday

### Fill in customer columns ###
# Missing ship-to / sold-to keys fall back to the customer key.
for col in ['CustomerShipToKey', 'CustomerSoldToKey']:
    df[col] = df[col].fillna(df['CustomerKey'])

for col in ['CustomerRegion']:
    # Numeric-looking values (ignoring '-') are treated as missing, then every
    # missing value is replaced with the most frequent value of the column.
    is_numeric = lambda x: np.nan if str(x).replace('-', '').isnumeric() else x
    df[col] = df[col].apply(is_numeric).fillna(df[col].value_counts().idxmax())

### Ratio columns ###
later_columns = ['TotalLatePaidInvoices', 'TotalPendingLateInvoices', 'SumAmountLatePaidInvoices', 'SumAmountPendingLateInvoices']
total_columns = ['TotalPaidInvoices', 'TotalPendingInvoices', 'SumAmountPaidInvoices', 'SumAmountPendingInvoices']
ratio_columns = ['RatioLatePaidInvoices', 'RatioPendingLateInvoices', 'RatioAmountPaidLateInvoices', 'RatioAmountPendingLateInvoices']

# ratio = late / total, with undefined results (NaN) set to 0.
for l, t, r in zip(later_columns, total_columns, ratio_columns):
    df[r] = (df[l] / df[t]).fillna(0)

average_columns = ['AvgDaysLatePaidInvoices', 'AvgDaysLatePendingInvoices', 'StdevDaysLatePaidInvoices', 'StdevDaysLatePendingInvoices', 'PaymentFrequency']
df[average_columns] = df[average_columns].fillna(0)

# Rows without a value in the first total column get -1 in every history column.
history_columns = later_columns + total_columns + ratio_columns + average_columns
df.loc[df[total_columns[0]].isnull(), history_columns] = -1

df.head()

In [None]:
# Binary target: a single cut at 4 days splits DueDateToClearingDate into two buckets.
df, bin_class_names = setup_buckets(df, col='DueDateToClearingDate', bins=[4], sufix='Bin')
# Multi-class target: cuts at 1, 4 and 8 days give four buckets.
df, bucket_class_names = setup_buckets(df, col='DueDateToClearingDate', bins=[1, 4, 8], sufix='Bkt')

# Show how many rows fall into each bucket for both targets.
plot_countplot(df, cols=['DueDateToClearingDateBinLabel', 'DueDateToClearingDateBktLabel'])

## Train and Test Data

In [None]:
# Column names grouped by role: the target plus the numeric, nominal and date-derived features.
columns = {
    'target': [
        'DueDateToClearingDateBin',
    ],
    'numeric': [
        'StdevDaysLatePendingInvoices',
        'AvgDaysLatePendingInvoices',
        'StdevDaysLatePaidInvoices',
        'AvgDaysLatePaidInvoices',
        'RatioPendingLateInvoices',
        'RatioAmountPendingLateInvoices',
        'RatioLatePaidInvoices',
        'RatioAmountPaidLateInvoices',
        'PaymentFrequency',
        'PaymentTerm',
        'InvoiceAmount',
    ],
    'nominal': [
        'CompanyKey',
        'CustomerKey',
        'CustomerSoldToKey',
        'CustomerShipToKey',
        'CustomerRegion',
        'CorporateDivision',
    ],
    'date': [
        'DocumentDateWeekDay',
        'DueDateWeekDay',
        'DocumentDateDay',
        'DueDateDay',
    ],
}

# Target and feature column names as arrays; features are numeric + nominal + date, in that order.
y_column = np.array(columns['target'])
x_column = np.array(columns['numeric'] + columns['nominal'] + columns['date'])

# Train on the 6 months of due dates before 2020-01-01 and test on the month starting at that date.
train, test = split_data(df, date_column='DueDate', split_date='2020-01-01', train_months=6, test_months=1)

# Optional resamplers are kept below as commented-out alternatives; none is active.
# from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
x_train, y_train, x_test, y_test = preprocess_data(train, test, x_column, y_column,
                                                #    resample=ADASYN(sampling_strategy='auto', random_state=SEED),
                                                #    resample=SMOTE(sampling_strategy='auto', random_state=SEED),
                                                #    resample=BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=SEED),
                                                #    resample=BorderlineSMOTE(sampling_strategy='auto', kind='borderline-2', random_state=SEED),
                                                   nominal_columns=columns['nominal'] + columns['date'])

# Class distribution of the target (column 0 of the target arrays) in each split.
plot_countplot(y_train, cols=[0], title='Train')
plot_countplot(y_test, cols=[0], title='Test')

## Feature Selection

In [None]:
# x_column = np.array([...])

# score, features = feature_selection(train, test, x_column, y_column, nominal_columns=columns['nominal'] + columns['date'], random_state=SEED)
# print(f'\nFinal score: {score:.8f} >>> {features}')

## Methods

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 512 entropy trees with balanced class weights, seeded with SEED.
clf = RandomForestClassifier(n_estimators=512,
                             criterion='entropy',
                             min_samples_split=8,
                             class_weight='balanced',
                             random_state=SEED,
                             n_jobs=-1)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
# ### save csv ###
# cols = test.columns[:25].tolist() + ['DueDateToClearingDateBin']
# table = test[cols].copy().rename(columns={cols[-1]: 'GroundTruth'})

# table.loc[test['Open'] == 1, 'ClearingDate'] = np.nan
# table.sort_values(by=['DueDate'], ascending=True, ignore_index=False, inplace=True)

# y_prob = clf.predict_proba(x_test)
# table[['OnTime', 'Later4+']] = pd.DataFrame(y_prob, index=table.index)

# table.to_csv(f"{table['DueDate'].min().strftime('%Y-%m')}.csv", sep=';', index=False)

# ### plot feature importance ###
# plot_feature_importance(x_column, clf.feature_importances_)

# ### apply threshold inference ###
# y_predict = (clf.predict_proba(x_test)[:,1] >= 0.9999).astype('int')
# plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging of 512 class-weighted entropy trees. The tree is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in newer scikit-learn releases;
# the first positional argument works with both spellings.
clf = BaggingClassifier(DecisionTreeClassifier(criterion='entropy', min_samples_split=8, class_weight='balanced'),
                        n_estimators=512,
                        random_state=SEED,
                        n_jobs=-1)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over class-weighted entropy trees. The tree is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in newer scikit-learn releases;
# the first positional argument works with both spellings.
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy', min_samples_split=8, class_weight='balanced'),
                         n_estimators=512,
                         learning_rate=0.3,
                         random_state=SEED)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 512 stages of depth-6 trees with learning rate 0.3, seeded with SEED.
clf = GradientBoostingClassifier(n_estimators=512,
                                 learning_rate=0.3,
                                 min_samples_split=8,
                                 max_depth=6,
                                 random_state=SEED)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from xgboost import XGBClassifier

# XGBoost binary classifier. `eval_metric` is set on the estimator itself: recent xgboost
# releases no longer accept it as an argument of `fit`.
clf = XGBClassifier(objective='binary:logistic',
                    n_estimators=512,
                    learning_rate=0.3,
                    scale_pos_weight=0.9,
                    eval_metric='logloss',
                    seed=SEED,
                    n_jobs=-1)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# imbalanced-learn random forest: 512 entropy trees, balanced class weights,
# sampling_strategy=0.7, seeded with SEED.
clf = BalancedRandomForestClassifier(n_estimators=512,
                                     criterion='entropy',
                                     min_samples_split=8,
                                     class_weight='balanced',
                                     sampling_strategy=0.7,
                                     random_state=SEED,
                                     n_jobs=-1)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Balanced bagging of 512 class-weighted entropy trees. The tree is passed positionally because
# the keyword was renamed from `base_estimator` to `estimator` in newer imbalanced-learn
# releases; the first positional argument works with both spellings.
clf = BalancedBaggingClassifier(DecisionTreeClassifier(criterion='entropy', min_samples_split=8, class_weight='balanced'),
                                n_estimators=512,
                                sampling_strategy=0.7,
                                random_state=SEED,
                                n_jobs=-1)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# RUSBoost over class-weighted entropy trees. The tree is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in newer imbalanced-learn
# releases; the first positional argument works with both spellings.
clf = RUSBoostClassifier(DecisionTreeClassifier(criterion='entropy', min_samples_split=8, class_weight='balanced'),
                         n_estimators=512,
                         sampling_strategy=0.7,
                         learning_rate=0.3,
                         random_state=SEED)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))
y_predict = clf.predict(x_test)

# Evaluate the predictions against the target column of the test split.
plot_confuncion_matrix(test[y_column].values, y_predict)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers (128, 64, 32), tanh activation and the Adam
# solver. Early stopping holds out 10% of the training data and stops after 20 iterations
# without improvement.
clf = MLPClassifier(hidden_layer_sizes=(128, 64, 32),
                    activation='tanh', 
                    batch_size='auto',
                    max_iter=1000,
                    solver='adam',
                    learning_rate_init=0.003,
                    learning_rate='adaptive',
                    alpha=0.0001,
                    early_stopping=True,
                    n_iter_no_change=20,
                    validation_fraction=0.1,
                    shuffle=True,
                    random_state=SEED,
                    verbose=False)

# y_train is squeezed to drop its singleton axis before fitting.
clf.fit(x_train, np.squeeze(y_train))

# Evaluate the predictions against the target column of the test split.
y_predict = clf.predict(x_test)
plot_confuncion_matrix(test[y_column].values, y_predict)

## Neural Network

In [None]:
# from sklearn.utils.class_weight import compute_class_weight
# from sklearn.model_selection import train_test_split

# import tensorflow_addons as tfa
# import tensorflow as tf

# def make_model():
#     model = tf.keras.models.Sequential(name='cubricks')
#     model.add(tf.keras.layers.Input(shape=x_train.shape[1]))

#     model.add(tf.keras.layers.Dense(256, kernel_initializer='glorot_normal'))
#     model.add(tf.keras.layers.PReLU())
#     model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     model.add(tf.keras.layers.Dense(512, kernel_initializer='glorot_normal'))
#     model.add(tf.keras.layers.PReLU())
#     model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     model.add(tf.keras.layers.Dense(256, kernel_initializer='glorot_normal'))
#     model.add(tf.keras.layers.PReLU())
#     model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     # model.add(tf.keras.layers.Dense(32, kernel_initializer='glorot_normal', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
#     # model.add(tf.keras.layers.PReLU())
#     # model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     # model.add(tf.keras.layers.Dense(64, kernel_initializer='glorot_normal', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
#     # model.add(tf.keras.layers.PReLU())
#     # model.add(tf.keras.layers.BatchNormalization(renorm=False))
#     # # model.add(tf.keras.layers.Dropout(rate=0.1))

#     # model.add(tf.keras.layers.Dense(128, kernel_initializer='glorot_normal', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-3, l2=1e-2)))
#     # model.add(tf.keras.layers.PReLU())
#     # model.add(tf.keras.layers.BatchNormalization(renorm=False))

#     model.add(tf.keras.layers.Dense(np.unique(y_train).shape[0], activation='softmax'))

#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-8, amsgrad=True),
#         loss=tf.keras.losses.CategoricalCrossentropy(),
#         # loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
#         metrics=[tfa.metrics.F1Score(num_classes=np.unique(y_train).shape[0], average='weighted')])

#     return model


# model = make_model()
# model.summary()

# train, test = split_data_month_window(df, col='DueDate', date='2020-08-01', month_window=12)
# x_train, y_train, x_test, y_test = prepare_data(train, test, x_column, y_column, random_state=SEED)

# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, shuffle=True, random_state=SEED, stratify=y_train)

In [None]:
# BATCH = 256
# EPOCHS = 10000
# PATIENCE = 10000

# tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs', profile_batch=0)

# checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='./logs/weights.hdf5',
#                                                 monitor='val_f1_score', mode='max',
#                                                 save_best_only=True, save_weights_only=True, verbose=1)

# early_stopping = tf.keras.callbacks.EarlyStopping(patience=PATIENCE,
#                                                   monitor='val_f1_score', mode='max',
#                                                   restore_best_weights=True, verbose=1)

# class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train[:,0])
# class_weight = dict(zip(np.unique(y_train), class_weight))

# model = make_model()
# # model.load_weights(filepath='./logs/weights.hdf5')

# model_history = model.fit(x_train, tf.keras.utils.to_categorical(y_train),
#                           validation_data=(x_valid, tf.keras.utils.to_categorical(y_valid)),
#                           epochs=EPOCHS, batch_size=BATCH, shuffle=True,
#                           callbacks=[checkpoint, early_stopping, tensorboard],
#                           class_weight=class_weight)

In [None]:
# def plot_model_history(history):
#     colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
#     plt.figure(figsize=(15, 3))

#     for n, metric in enumerate(['loss', 'f1_score']):
#         plt.subplot(1, 2, n+1)
#         plt.plot(history.epoch,  history.history[metric], color=colors[0], label='Train')
#         plt.plot(history.epoch, history.history['val_' + metric], color=colors[0], linestyle='--', label='Val')
#         plt.ylim([plt.ylim()[0], 1] if n > 0 else [0, plt.ylim()[1]])
#         plt.ylabel(metric.replace('_', ' ').capitalize())
#         plt.xlabel('Epoch')
#         plt.legend()


# plot_model_history(model_history)

In [None]:
# model.load_weights(filepath='./logs/weights.hdf5')

# y_predict = classifier_predict(model, x_test, threshold=0.5, network=True)
# plot_confuncion_matrix(y_test, y_predict)