In [None]:
# Print the NVIDIA driver/GPU status of the current runtime (shell command).
!nvidia-smi

In [None]:
from google.colab import drive

# Mount Google Drive under ./gdrive and switch into the project folder so the
# relative CSV paths used by later cells resolve.
# NOTE(review): both paths are relative to the current working directory, so
# re-running this cell after the %cd resolves them against the project folder
# instead of the original directory.
drive.mount('./gdrive', force_remount=True)
%cd './gdrive/My Drive/cubricks'

In [None]:
import datetime
import functools
import multiprocessing
import numpy as np
import pandas as pd


class CSVManager():
    """Stateful helper that loads a CSV into ``self.df`` and derives feature columns.

    Every method reads and updates ``self.df``. ``set_range`` additionally stores
    the bucket edges and labels in ``self.bins`` / ``self.labels``, which
    ``calculate_per_bucket`` needs.
    """

    def __init__(self):
        self.df = None      # working DataFrame, populated by read()
        self.bins = None    # bucket edges, populated by set_range()
        self.labels = None  # bucket labels, populated by set_range()

    def read(self, filename, sep=';', na_values=['N/I'], orderby=None, ascending=True):
        """Load ``filename`` into ``self.df``; optionally sort by ``orderby``.

        ``na_values`` lists the extra strings parsed as NaN. When ``orderby`` is
        given the frame is sorted and its index renumbered from 0.
        """
        self.df = pd.read_csv(filename, sep=sep, na_values=na_values)

        if orderby is not None:
            self.df = self.df.sort_values(by=orderby, ascending=ascending, ignore_index=True)

    def save(self, name, sep=';', na_rep=-1):
        """Write ``self.df`` to ``name`` without the index; NaN is written as ``na_rep``."""
        self.df.to_csv(name, sep=sep, na_rep=na_rep, float_format='%g', index=False)

    def info(self):
        """Print the DataFrame summary."""
        # DataFrame.info() prints by itself and returns None; wrapping it in
        # print() only added a stray "None" line.
        self.df.info()

    def head(self, value=10):
        """Print the first ``value`` rows."""
        print(self.df.head(value))

    def null_sum(self):
        """Print the number of missing values per column."""
        print(self.df.isnull().sum())

    def ppnan(self, dropna_cols=None, fillna_cols=None, fillna_value=None):
        """Pre-process missing values.

        Rows with NaN in any of ``dropna_cols`` are dropped first. Then, if
        ``fillna_value`` is given, NaNs are replaced by it either in
        ``fillna_cols`` only or, when ``fillna_cols`` is None, in every column.
        """
        if dropna_cols:
            self.df = self.df.dropna(subset=dropna_cols)
        if fillna_value is not None:
            if fillna_cols is not None:
                self.df[fillna_cols] = self.df[fillna_cols].fillna(fillna_value)
            else:
                self.df = self.df.fillna(value=fillna_value)

    def set_ratio(self, dividend_cols, divisor_cols):
        """Add ``Ratio<dividend><divisor>`` columns (pairwise dividend / divisor).

        NaN results (e.g. 0 / 0) become 0; a non-zero value divided by 0 stays inf.
        """
        for dividend, divisor in zip(dividend_cols, divisor_cols):
            ratio_col = 'Ratio' + dividend + divisor
            # Assign the filled result back: an in-place fillna on the
            # ``self.df[col]`` accessor is not guaranteed to reach the frame.
            self.df[ratio_col] = (self.df[dividend] / self.df[divisor]).fillna(0)

    def set_daysto(self, source_cols, target_cols):
        """Add ``DaysTo<target>`` columns holding whole days from source to target.

        Missing differences count as 0 days and negative differences are
        clipped to 0.
        """
        for src, tgt in zip(source_cols, target_cols):
            delta_col = 'DaysTo' + tgt
            delta = (self.df[tgt] - self.df[src]).fillna(pd.Timedelta(seconds=0))
            # ``.dt.days`` instead of ``astype('timedelta64[D]')``: the latter is
            # rejected by pandas >= 2 (only s/ms/us/ns resolutions are supported).
            self.df[delta_col] = delta.dt.days.astype(int).clip(lower=0)

    def set_range(self, bins, cols):
        """Bucket ``cols`` into half-open intervals ``[bins[i], bins[i+1])``.

        An open-ended last bucket is appended. For each column ``<col>Range``
        receives the label and ``<col>RangeCT`` its integer code (-1 when the
        value falls outside every bucket).
        """
        self.bins = list(bins) + [np.inf]
        self.labels = [f'{self.bins[i]}-{self.bins[i+1]-1}' for i in range(len(self.bins[:-1]))]

        for col in cols:
            self.df[col + 'Range'] = pd.cut(self.df[col], bins=self.bins, labels=self.labels,
                                            right=False, include_lowest=True)
            self.df[col + 'RangeCT'] = self.df[col + 'Range'].cat.codes

    def extract_days(self, cols):
        """Add ``<col>Month``, ``<col>Day`` and ``<col>WeekDay`` for each datetime column."""
        for col in cols:
            self.df[col + 'Month'] = self.df[col].dt.month
            self.df[col + 'Day'] = self.df[col].dt.day
            self.df[col + 'WeekDay'] = self.df[col].dt.weekday

    def minmax_filter(self, min_cols, max_cols):
        """Keep only rows where each ``min`` column is strictly below its ``max`` column."""
        for mi, ma in zip(min_cols, max_cols):
            # copy() so later column assignments do not write to a slice
            # (SettingWithCopyWarning).
            self.df = self.df[self.df[mi] < self.df[ma]].copy()

    @staticmethod
    def _text_to_number(value):
        """Return ``value`` unchanged if its text is numeric, else the int built
        by concatenating the code points of its characters."""
        text = str(value)
        if text.isnumeric():
            return value
        return int(''.join(str(ord(ch)) for ch in text))

    def cast_to_number(self, cols):
        """Replace non-numeric entries of ``cols`` by integers derived from their characters."""
        self.df = self.df.apply(
            lambda x: [self._text_to_number(y) for y in x] if x.name in cols else x)

    def cast_to_integer(self, cols):
        """Convert ``cols`` to the smallest integer dtype that fits."""
        self.df = self.df.apply(lambda x: pd.to_numeric(x, downcast='integer') if x.name in cols else x)

    def cast_to_date(self, cols):
        """Parse ``cols`` as datetimes; unparseable entries become NaT."""
        self.df = self.df.apply(lambda x: pd.to_datetime(x, errors='coerce') if x.name in cols else x)

    def get_data_range(self, col, date, month_window):
        """Split by date: train is the ``month_window`` months before ``date``,
        test is the month starting at ``date``. Returns ``(train, test)``."""
        date_0 = pd.to_datetime(date)
        date_1 = date_0 - pd.DateOffset(months=month_window)
        date_2 = date_0 + pd.DateOffset(months=1)
        train = self.df[(self.df[col] >= date_1) & (self.df[col] < date_0)]
        test = self.df[(self.df[col] >= date_0) & (self.df[col] < date_2)]
        return train, test

    def calculate_per_bucket(self, bucket_col, amount_col, date_col, key_col, month_window):
        """Add per-bucket history features ``Range<i>Amount`` / ``Range<i>Count``.

        For every row, the rows sharing its ``key_col`` value and dated within
        the ``month_window`` months before it are summarised per bucket code
        (see ``apply_multiprocessing``). Work is done one calendar month at a
        time across a process pool. Rows whose ``date_col`` is missing get NaN
        features.

        Raises:
            ValueError: if ``set_range`` has not been called yet.
        """
        if self.labels is None:
            raise ValueError('set_range() must be called before calculate_per_bucket()')

        self.df = self.df.reset_index(drop=True)
        dflocal = self.df[[bucket_col, amount_col, date_col, key_col]].copy()

        month_window = pd.DateOffset(months=month_window)
        arange_labels = np.arange(len(self.labels))
        new_cols = [name for i in arange_labels for name in (f'Range{i}Amount', f'Range{i}Count')]

        min_month = dflocal[date_col].min()
        max_month = dflocal[date_col].max()
        one_month = pd.DateOffset(months=1)
        results = []
        positions = []  # index label of the row each result belongs to

        # A single pool serves every month instead of one pool per iteration.
        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            while min_month <= max_month:
                # Rows of the current month plus the look-back window before it.
                records = dflocal[(dflocal[date_col] >= min_month - month_window)
                                  & (dflocal[date_col] < min_month + one_month)]
                current = records[records[date_col] >= min_month]
                curr_month = current.values

                batch = f'{min_month.year}-{min_month.month}/{max_month.year}-{max_month.month}'
                print(f'Preprocessing {curr_month.shape[0]} items ({batch})', end=' ')

                start_time = datetime.datetime.now()
                worker = functools.partial(self.apply_multiprocessing, records, arange_labels,
                                           month_window, bucket_col, amount_col, date_col, key_col)
                r = pool.map(worker, curr_month)
                print(f'~ {datetime.datetime.now() - start_time}')

                results.extend(r)
                positions.extend(current.index)
                min_month += one_month

        # Join on the row index rather than on list position, so the features
        # stay aligned even when the frame is not sorted by date_col.
        features = pd.DataFrame(results, index=positions, columns=new_cols, dtype=float)
        features = features.reindex(self.df.index)

        self.df = self.df.drop(columns=new_cols, errors='ignore')
        self.df = pd.concat([self.df, features], axis=1)

    @staticmethod
    def apply_multiprocessing(*args):
        """Compute the bucket features of one row (process-pool worker).

        ``args`` is ``(records, arange_labels, month_window, bucket_col,
        amount_col, date_col, key_col, row)`` where ``row`` holds the values
        ``[bucket, amount, date, key]``. Returns a flat list
        ``[amount_0, count_0, amount_1, count_1, ...]``: per bucket, its mean
        amount relative to the overall mean and its share of the row count.
        A ratio is -1 when the history is empty (or the overall mean is not
        positive).
        """
        records, arange_labels, month_window, bucket_col, amount_col, date_col, key_col, row = args
        row_date, row_key = row[2], row[3]
        t = records[(records[date_col] >= row_date - month_window)
                    & (records[date_col] < row_date)
                    & (records[key_col] == row_key)]

        total_a = t[amount_col].mean()
        total_b = t[bucket_col].count()
        result = []

        for i in arange_labels:
            in_bucket = t[t[bucket_col] == i]  # filter once, reuse for both stats
            a = in_bucket[amount_col].mean()
            b = in_bucket[bucket_col].count()

            mean = (a / total_a) if total_a > 0 else -1
            count = (b / total_b) if total_b > 0 else -1
            result.extend([mean, count])

        return result

In [None]:
# csv = CSVManager()
# csv.read('InvoicedDocuments_v4.csv', orderby=['DocumentDate'])
# csv.ppnan(dropna_cols=['ClearingDate'], fillna_value=0)

# csv.cast_to_number(cols=['CustomerRegion', 'PaymentTerms'])
# csv.cast_to_integer(cols=['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments'])
# csv.cast_to_date(cols=['CustomerLastCreditReview', 'DocumentDate', 'DueDate', 'ClearingDate'])

# csv.minmax_filter(min_cols=['DocumentDate', 'DocumentDate'], max_cols=['DueDate', 'ClearingDate'])
# csv.extract_days(cols=['DocumentDate', 'DueDate'])

# csv.set_ratio(dividend_cols=['InvoicedAmount', 'PaidAmount', 'PaidPastAmount', 'OpenAmount', 'PastDueAmount'],
#               divisor_cols=['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments'])

# csv.set_daysto(source_cols=['DocumentDate', 'DocumentDate', 'CustomerLastCreditReview'],
#                 target_cols=['DueDate', 'ClearingDate', 'DocumentDate'])

# csv.set_range(bins=list(range(1, 31, 28)), cols=['DaysToDueDate', 'DaysToClearingDate'])

# csv.calculate_per_bucket(bucket_col='DaysToClearingDateRangeCT', amount_col='DocumentAmount',
#                           date_col='DocumentDate', key_col='CustomerKey', month_window=2)

# csv.save(name='InvoicedDocuments_v4_pp.csv')

In [None]:
# Load 'InvoicedDocuments_v4_pp.csv' with CSVManager's defaults (';' separator,
# 'N/I' parsed as NaN) and sort the rows by DocumentDate.
# NOTE(review): the variable name `csv` would shadow the standard-library
# `csv` module if that module were imported in this notebook.
csv = CSVManager()
csv.read('InvoicedDocuments_v4_pp.csv', orderby=['DocumentDate'])

In [None]:
!pip install -q tensorflow-gpu

In [None]:
%tensorflow_version 2.x

import os
import numpy as np
import pandas as pd
import tensorflow as tf

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def get_feature_importance(features, importances, plot=False):
  """Print (and optionally plot) features ranked by decreasing importance.

  Args:
    features: sequence of feature names, aligned with ``importances``.
    importances: sequence of importance scores, one per feature.
    plot: when True, also draw a horizontal bar chart of the ranking.
  """
  features = np.asarray(features)
  importances = np.asarray(importances)
  indices = np.argsort(importances)[::-1]
  print('Feature ranking:')

  # Iterate over the ranking itself; the feature count comes from the
  # arguments rather than from a global training matrix.
  for idx in indices:
    print(f'{importances[idx]}\t{features[idx]}')

  if plot:
    positions = range(len(indices))
    plt.figure(figsize=(10, 8))
    plt.barh(positions, importances[indices])
    plt.yticks(positions, features[indices])
    plt.title('Feature Importance')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.show()

def get_report(y_test, predict, plot=False, labels=None):
  """Print the item count and accuracy; optionally plot the confusion matrix.

  Args:
    y_test: true class codes.
    predict: predicted class codes, aligned with ``y_test``.
    plot: when True, draw the confusion matrix as a heatmap in which every
      column (predicted class) is normalised to sum to 1.
    labels: tick labels for the heatmap. When omitted, the notebook-level
      ``labels`` variable is used if it exists, otherwise seaborn's automatic
      tick labels.
  """
  print(f'Total items: {len(y_test)}')
  print(f'Accuracy: {accuracy_score(y_test, predict) * 100:.2f}%\n')

  # cm_report = classification_report(y_test, predict, target_names=labels)
  # print(cm_report)

  if plot:
    if labels is None:
      labels = globals().get('labels', 'auto')

    cm = confusion_matrix(y_test, predict)
    col_totals = cm.sum(axis=0, keepdims=True)
    # A class that is never predicted has an all-zero column; leave it at 0
    # instead of dividing by zero.
    normalized = np.divide(cm, col_totals, out=np.zeros(cm.shape, dtype=float), where=col_totals != 0)

    plt.figure(figsize=(10, 10))
    sns.heatmap(normalized, square=True, annot=True, fmt='.2%', cmap='Blues', xticklabels=labels, yticklabels=labels)

def plot_buckets(x1, x2=None):
  """Draw count plots of one or two value series.

  Args:
    x1: values counted in the (left) plot.
    x2: optional second series, counted in a plot to the right of ``x1``.
  """
  # Pass the series as ``x=``: a positional argument is interpreted as ``data``
  # by recent seaborn releases.
  if x2 is None:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 4))
    sns.countplot(x=x1, ax=ax)
  else:
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 4))
    sns.countplot(x=x1, ax=ax[0])
    sns.countplot(x=x2, ax=ax[1])

def set_ratio(df, cols):
    """Add ``Ratio<dividend><divisor>`` columns to ``df`` and return it.

    Args:
        df: frame to extend (modified in place).
        cols: iterable of ``(dividend, divisor)`` column-name pairs.

    NaN results (e.g. 0 / 0) become 0; a non-zero value divided by 0 stays inf.
    """
    for col in cols:
        dividend, divisor = col[0], col[1]
        ratio_col = 'Ratio' + dividend + divisor
        # Assign the filled result back: an in-place fillna on the ``df[col]``
        # accessor is not guaranteed to reach the frame.
        df[ratio_col] = (df[dividend] / df[divisor]).fillna(0)
    return df

def set_daysto(df, cols):
    """Add ``DaysTo<target>`` columns (whole days from source to target) and return ``df``.

    Args:
        df: frame to extend (modified in place).
        cols: iterable of ``(source, target)`` datetime column-name pairs.

    Missing differences count as 0 days; negative differences are clipped to 0.
    """
    for col in cols:
        src, tgt = col[0], col[1]
        delta_col = 'DaysTo' + tgt
        delta = (df[tgt] - df[src]).fillna(pd.Timedelta(seconds=0))
        # ``.dt.days`` instead of ``astype('timedelta64[D]')``: the latter is
        # rejected by pandas >= 2 (only s/ms/us/ns resolutions are supported).
        df[delta_col] = delta.dt.days.astype(int).clip(lower=0)
    return df

def set_days(df, cols):
    """Split each datetime column in ``cols`` into calendar parts.

    For every column three new columns are written to ``df``: ``<col>Month``,
    ``<col>Day`` and ``<col>WeekDay``. The same ``df`` is returned.
    """
    parts = (('Month', 'month'), ('Day', 'day'), ('WeekDay', 'weekday'))
    for name in cols:
        for suffix, attribute in parts:
            df[name + suffix] = getattr(df[name].dt, attribute)
    return df

def set_range_cols(df, bins, labels, cols):
    """Bucket every column in ``cols`` and store both label and integer code.

    ``<col>Range`` receives the label of the half-open interval
    ``[bins[i], bins[i+1])`` the value falls into, and ``<col>RangeCT`` the
    matching category code. The same ``df`` is returned.
    """
    for name in cols:
        range_col = name + 'Range'
        code_col = name + 'RangeCT'

        buckets = pd.cut(df[name], bins=bins, labels=labels, right=False, include_lowest=True)
        df[range_col] = buckets
        df[[code_col]] = df[[range_col]].apply(lambda series: pd.Categorical(series, ordered=True).codes)
    return df

def get_data_range(df, col, date, window_month):
    """Split ``df`` into a training window and the following test month.

    Training rows have ``col`` in the ``window_month`` months before ``date``
    (``date`` excluded); test rows have ``col`` in the month starting at
    ``date``. Returns ``(train, test)``.
    """
    split_date = pd.to_datetime(date)
    train_start = split_date - pd.DateOffset(months=window_month)
    test_end = split_date + pd.DateOffset(months=1)

    stamps = df[col]
    in_train = (stamps >= train_start) & (stamps < split_date)
    in_test = (stamps >= split_date) & (stamps < test_end)

    return df[in_train], df[in_test]

def binary_encoding(df, cols):
    """Replace integer columns by fixed-width binary digit columns.

    Args:
        df: source frame. It is not modified; the returned frame has a fresh
            0..n-1 index.
        cols: iterable of ``(width, name)`` pairs. Column ``name`` is replaced
            by ``width`` columns ``name0 .. name<width-1>`` holding its binary
            digits, most significant bit first, left-padded with zeros.

    Returns:
        A new DataFrame with the encoded columns appended and the originals dropped.

    Raises:
        ValueError: if a column holds non-integer or negative values, or a
            value that needs more than ``width`` bits.
    """
    df = df.reset_index(drop=True)  # new object: the caller's frame keeps its index

    for col in cols:
        width, name = col[0], col[1]
        raw = df[name].to_numpy()

        if not np.issubdtype(raw.dtype, np.integer) and not all(
                isinstance(v, (int, np.integer)) for v in raw):
            raise ValueError(f'{name}: binary encoding needs integer values, got dtype {raw.dtype}')
        try:
            values = raw.astype(np.int64)
        except OverflowError as exc:
            raise ValueError(f'{name}: values do not fit in 63 bits') from exc

        if values.size:
            if values.min() < 0:
                raise ValueError(f'{name}: negative values cannot be binary encoded')
            if int(values.max()).bit_length() > width:
                raise ValueError(f'{name}: value {int(values.max())} needs more than {width} bits')

        # Non-negative int64 values use at most 63 bits; wider encodings are
        # simply left-padded with zeros.
        nbits = min(width, 63)
        shifts = np.arange(nbits - 1, -1, -1, dtype=np.int64)
        encoded = np.zeros((values.shape[0], width), dtype=np.int8)
        encoded[:, width - nbits:] = (values[:, None] >> shifts) & 1

        header = [f'{name}{i}' for i in range(width)]
        bits = pd.DataFrame(encoded, columns=header, index=df.index)
        df = pd.concat([df, bits], axis=1).drop(columns=[name])
    return df

In [None]:
### Read data ###
df = pd.read_csv('InvoicedDocuments_v4.csv', sep=';', na_values=['N/I'])
df.sort_values(by=['DocumentDate'], ascending=True, ignore_index=True, inplace=True)

### Fill NaN data ###
# Rows without a clearing date are dropped; every other missing value becomes 0.
df.dropna(subset=['ClearingDate'], inplace=True)
df.fillna(0, inplace=True)

### Generate ratio ###
df = set_ratio(df, cols=[('InvoicedAmount', 'InvoicedDocuments'),
                         ('PaidAmount', 'PaidDocuments'),
                         ('PaidPastAmount', 'PaidPastDocuments'),
                         ('OpenAmount', 'OpenDocuments'),
                         ('PastDueAmount', 'PastDueDocuments')])

### String to number ###
# Each value is replaced by the integer formed by concatenating the code
# points of its characters.
cols = ['CustomerRegion', 'PaymentTerms']
df = df.apply(lambda x: [int(''.join(format(ord(w), '') for w in str(y))) for y in x] if x.name in cols else x)

### Convert columns to integer columns ###
cols = ['InvoicedDocuments', 'PaidDocuments', 'PaidPastDocuments', 'OpenDocuments', 'PastDueDocuments']
df = df.apply(lambda x: pd.to_numeric(x, downcast='integer') if x.name in cols else x)

### Convert columns to date columns ###
cols = ['CustomerLastCreditReview', 'DocumentDate', 'DueDate', 'ClearingDate']
df = df.apply(lambda x: pd.to_datetime(x, errors='coerce') if x.name in cols else x)

### Setup 'DaysTo' columns ###
# Keep only documents issued strictly before both their due and clearing dates.
# copy() so the helpers below add columns to an independent frame rather than
# to a slice (avoids SettingWithCopyWarning).
df = df[(df['DocumentDate'] < df['DueDate']) & (df['DocumentDate'] < df['ClearingDate'])].copy()

df = set_daysto(df, cols=[('DocumentDate', 'DueDate'),
                          ('DocumentDate', 'ClearingDate'),
                          ('CustomerLastCreditReview', 'DocumentDate')])

### Setup Month, Day and WeekDay columns ###
df = set_days(df, cols=['DocumentDate', 'DueDate'])

### Setup range columns ###
bins = [1, 8, 15, 22, 29, np.inf]
labels = [f'{bins[i]}-{bins[i+1]-1}' for i in range(len(bins[:-1]))]
df = set_range_cols(df, bins, labels, cols=['DaysToDueDate', 'DaysToClearingDate'])

df.head(10)

In [None]:
# Re-bucket with coarser bins, overwriting the *Range / *RangeCT columns.
# range(1, 31, 28) yields [1, 29], so the edges are [1, 29, inf] and the
# labels computed below are '1-28' and '29-inf'.
# bins = [1, 8, 15, 22, 29, np.inf]
bins = list(range(1, 31, 28)) + [np.inf]

labels = [f'{bins[i]}-{bins[i+1]-1}' for i in range(len(bins[:-1]))]
df = set_range_cols(df, bins, labels, cols=['DaysToDueDate', 'DaysToClearingDate'])

# Show how many documents fall into each clearing-date bucket.
plot_buckets(df['DaysToClearingDateRange'])

In [None]:
# Target column: integer code of the clearing-date bucket.
y_column = np.array(['DaysToClearingDateRangeCT'])
# Columns used as model inputs. Commented-out entries are columns that are
# currently excluded from the feature set.
features = np.array([
                     'CompanyKey',
                     'CustomerKey',
                     'CustomerRegion',
                    ##  'CustomerLastCreditReview',
                    ##  'DocumentDate',
                    ##  'DueDate',
                    ##  'ClearingDate',
                     'PaymentTerms',
                    ##  'DocumentNumber',
                     'DocumentAmount',
                    #  'InvoicedDocuments',
                    #  'InvoicedAmount',
                    #  'PaidDocuments',
                    #  'PaidAmount',
                    #  'PaidPastDocuments',
                    #  'PaidPastAmount',
                    #  'OpenDocuments',
                    #  'OpenAmount',
                    #  'PastDueDocuments',
                    #  'PastDueAmount',
                     'AvgDSOPastDueDocuments',
                     'PastDueDays',
                     'DaysToDueDate',
                    #  'DaysToDocumentDate',
                    ##  'DaysToClearingDate',
                    #  'DocumentDateMonth',
                    #  'DocumentDateDay',
                     'DocumentDateWeekDay',
                    #  'DueDateMonth',
                    #  'DueDateDay',
                     'DueDateWeekDay',
                    ##  'DaysToDueDateRange',
                    #  'DaysToDueDateRangeCT',
                    ##  'DaysToClearingDateRange',
                    #  'DaysToClearingDateRangeCT',
                     'RatioInvoicedAmountInvoicedDocuments',
                    #  'RatioPaidAmountPaidDocuments',
                    #  'RatioPaidPastAmountPaidPastDocuments',
                    #  'RatioOpenAmountOpenDocuments',
                     'RatioPastDueAmountPastDueDocuments',
                     ])

In [None]:
import multiprocessing

class BucketGenerator:
  """Adds per-customer payment-history features, one pair per clearing bucket.

  For every document, the documents of the same ``CustomerKey`` dated within
  the look-back window before it are summarised per bucket code into
  ``Range<i>Amount`` (bucket mean amount / overall mean amount) and
  ``Range<i>Count`` (bucket share of the document count).
  """

  def __init__(self, labels, window_month=2):
    """
    Args:
      labels: bucket labels; only their number is used (codes 0..len-1).
      window_month: length of the look-back window in months.
    """
    self.labels = np.arange(len(labels))
    self.window_month = pd.DateOffset(months=window_month)
    # Output column names: one [amount, count] pair per bucket code.
    self.new_features = [[f'Range{i}Amount', f'Range{i}Count'] for i in self.labels]
    # Frame searched for each row's prior documents; set by clearing_per_bucket().
    self.history = None

  def clearing_per_bucket(self, df, workers=4, history=None):
    """Compute the bucket features for every row of ``df``.

    Args:
      df: rows to enrich. Needs the columns ``DocumentDate``, ``CustomerKey``,
        ``DocumentAmount`` and ``DaysToClearingDateRangeCT``.
      workers: number of worker processes; 1 runs in the current process.
      history: frame searched for each row's prior documents. Defaults to
        ``df`` itself; pass the full dataset so that the first rows of ``df``
        can look back beyond its start.

    Returns:
      ``(frame, feature_names)``: the enriched frame and a flat array with the
      names of the added columns. Features that could not be computed are -1.
    """
    self.history = df if history is None else history
    names = [name for pair in self.new_features for name in pair]

    df = self.apply_by_multiprocessing(df, self.per_bucket, axis=1, workers=workers)

    # An empty input yields no feature columns from apply(); add them so the
    # returned schema is always the same.
    for name in names:
      if name not in df.columns:
        df[name] = -1
    # Only the generated columns are filled; other columns keep their NaNs.
    df[names] = df[names].fillna(-1)
    return df, np.array(names)

  def apply_by_multiprocessing(self, df, func, **kwargs):
    """Apply ``func`` to ``df`` via ``DataFrame.apply`` on row chunks in parallel.

    ``workers`` (default 1) is popped from ``kwargs``; the remaining keyword
    arguments are forwarded to ``DataFrame.apply``. The chunk results are
    concatenated in the original row order.
    """
    workers = max(1, int(kwargs.pop('workers', 1)))
    # Split row positions rather than the frame itself (np.array_split on a
    # DataFrame is deprecated) and skip empty chunks.
    bounds = np.array_split(np.arange(len(df)), workers)
    chunks = [df.iloc[idx] for idx in bounds if len(idx)]

    if len(chunks) <= 1:
      return self._apply_df((df, func, kwargs))

    # The context manager releases the pool even if a worker raises.
    with multiprocessing.Pool(processes=len(chunks)) as pool:
      result = pool.map(self._apply_df, [(chunk, func, kwargs) for chunk in chunks])
    return pd.concat(result)

  def _apply_df(self, args):
    """Unpack ``(df, func, kwargs)`` and run ``df.apply(func, **kwargs)``."""
    df, func, kwargs = args
    return df.apply(func, **kwargs)

  def per_bucket(self, x):
    """Return a copy of row ``x`` extended with its bucket features.

    Raises:
      RuntimeError: if no history frame is set (call ``clearing_per_bucket``
        or assign ``self.history`` first).
    """
    history = self.history
    if history is None:
      raise RuntimeError('no history frame set; call clearing_per_bucket() first')

    x = x.copy()
    doc_date = x['DocumentDate']
    t = history[(history['DocumentDate'] >= doc_date - self.window_month)
                & (history['DocumentDate'] < doc_date)
                & (history['CustomerKey'] == x['CustomerKey'])]
    total_m = t['DocumentAmount'].mean()
    total_c = t['DaysToClearingDateRangeCT'].count()

    for i in self.labels:
      in_bucket = t[t['DaysToClearingDateRangeCT'] == i]
      m = in_bucket['DocumentAmount'].mean()
      c = in_bucket['DaysToClearingDateRangeCT'].count()
      amount_col, count_col = self.new_features[i]
      x[amount_col] = (m/total_m) if total_m > 0 else -1
      x[count_col] = (c/total_c) if total_c > 0 else -1
    return x


# Train on the four months before 2020-08-01, test on August 2020.
train, test = get_data_range(df, col='DocumentDate', date='2020-08-01', window_month=4)

bg = BucketGenerator(labels, 2)
# clearing_per_bucket returns (frame, feature_names); unpack both so `train`
# stays a DataFrame.
train, new_features = bg.clearing_per_bucket(train)
# NOTE(review): the model cells further down read x_train / y_train / x_test /
# y_test, which are only assigned in the commented-out lines of this cell.

# train, new_features = clearing_per_bucket(train, ct=labels, window_month=2)
# # test, new_features = clearing_per_bucket(test, ct=labels, window_month=2)

# train, test = get_data_range(train, col='DocumentDate', date='2020-07-01', window_month=2)
# test, new_features = clearing_per_bucket(test, ct=labels, window_month=2)

# features = np.append(features, new_features)

# x_train, y_train = train[features].values, train[y_column].values
# x_test, y_test = test[features].values, test[y_column].values

# plot_buckets(train['DaysToClearingDateRange'], test['DaysToClearingDateRange'])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier and evaluate it on the test split.
# NOTE(review): the variable is called `random_forest` although it holds a
# GradientBoostingClassifier; x_train / y_train / x_test / y_test are not
# assigned by any active line above — confirm where they come from.
random_forest = GradientBoostingClassifier(n_estimators=1000, criterion='friedman_mse', min_weight_fraction_leaf=1e-4, random_state=42)
random_forest.fit(x_train, np.squeeze(y_train))

predict = random_forest.predict(x_test)

get_report(y_test, predict, plot=True)
get_feature_importance(features, random_forest.feature_importances_, plot=False)

In [None]:
# fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 5))
# sns.heatmap(train[features].corr(), square=True, annot=False, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='Pastel1', ax=ax[0])
# sns.heatmap(test[features].corr(), square=True, annot=False, fmt='.1g', vmin=-1, vmax=1, center=0, cmap='Pastel1', ax=ax[1])

# Baseline: score the due-date bucket code as if it were the prediction of the
# clearing-date bucket.
print(f'Due Date Predict')
get_report(y_test, test["DaysToDueDateRangeCT"].values, plot=True)

In [None]:
# Fit a 10-tree random forest (entropy criterion) on the same split and report
# its accuracy, confusion matrix and feature ranking.
random_forest = RandomForestClassifier(n_estimators=10, criterion='entropy', min_weight_fraction_leaf=1e-4, random_state=42)
random_forest.fit(x_train, np.squeeze(y_train))

predict = random_forest.predict(x_test)

get_report(y_test, predict, plot=True)
get_feature_importance(features, random_forest.feature_importances_, plot=False)

In [None]:
# from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV
# from scipy.stats import uniform, randint
# import xgboost as xgb


# def display_scores(scores):
#     print("Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, np.mean(scores), np.std(scores)))

# def report_best_scores(results, n_top=3):
#     for i in range(1, n_top + 1):
#         candidates = np.flatnonzero(results['rank_test_score'] == i)
#         for candidate in candidates:
#             print("Model with rank: {0}".format(i))
#             print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
#                   results['mean_test_score'][candidate],
#                   results['std_test_score'][candidate]))
#             print("Parameters: {0}".format(results['params'][candidate]), "\n")


# xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
# params = {
#     "colsample_bytree": uniform(0.7, 0.3),
#     "gamma": uniform(0, 0.5),
#     "learning_rate": uniform(0.003, 0.3), 
#     "max_depth": randint(2, 6),
#     "n_estimators": randint(10, 150),
#     "subsample": uniform(0.6, 0.4)
# }


# # search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=100, cv=3, verbose=1, n_jobs=-1, return_train_score=True)
# # search.fit(x_train, np.squeeze(y_train))
# # report_best_scores(search.cv_results_, 1)


# # x_train, x_valid, y_train, y_valid = train_test_split(train[features].values,
# #                                                       train[y_column].values,
# #                                                       test_size=0.1, 
# #                                                       shuffle=True,
# #                                                       random_state=42,
# #                                                       stratify=train[y_column].values)

# # xgb_model.fit(x_train, y_train, early_stopping_rounds=10, eval_set=[(x_valid, y_valid)])


# xgb_model = xgb.XGBClassifier(max_depth=5,
#                               learning_rate=0.2830308924238449,
#                               n_estimators=10,
#                               subsample=0.815750979360025,
#                               colsample_bytree=0.7782680870025142,
#                               gamma=0.007652270145192375,
#                               objective="binary:logistic",
#                               random_state=42)

# xgb_model.fit(x_train, np.squeeze(y_train))
# predict = xgb_model.predict(x_test)

# # xgb.plot_importance(xgb_model)

# get_report(y_test, predict, plot=True)
# # get_feature_importance(features, xgb_model.feature_importances_, plot=False)

In [None]:
# ### Binary encoding columns ###
# dfNN_train = train[features].copy()
# dfNN_test = test[features].copy()

# cols = [(32, 'CompanyKey'),
#         (32, 'CustomerKey'),
#         (32, 'CustomerRegion'),
#         (32, 'PaymentTerms'),
#         (3, 'DocumentDateWeekDay'),
#         (3, 'DueDateWeekDay')]

# dfNN_train = binary_encoding(dfNN_train, cols)
# dfNN_test = binary_encoding(dfNN_test, cols)

# dfNN_train.head(5)

In [None]:
# x_train, x_valid, y_train, y_valid = train_test_split(dfNN_train.values,
#                                                       train[y_column].values,
#                                                       test_size=0.1, 
#                                                       shuffle=True,
#                                                       random_state=42,
#                                                       stratify=train[y_column].values)

# y_train_categorical = tf.keras.utils.to_categorical(y_train)
# y_valid_categorical = tf.keras.utils.to_categorical(y_valid)

In [None]:
# def create_model():
#     model = tf.keras.models.Sequential(name='cubricks')

#     model.add(tf.keras.layers.Input(shape=dfNN_train.values.shape[1]))
#     model.add(tf.keras.layers.BatchNormalization(renorm=True))

#     model.add(tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.001)))
#     # model.add(tf.keras.layers.Dense(512, activation='relu'))
#     model.add(tf.keras.layers.BatchNormalization(renorm=True))
#     model.add(tf.keras.layers.Dropout(rate=0.1))

#     model.add(tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.001)))
#     # model.add(tf.keras.layers.Dense(512, activation='relu'))
#     model.add(tf.keras.layers.BatchNormalization(renorm=True))
#     model.add(tf.keras.layers.Dropout(rate=0.1))

#     model.add(tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.001)))
#     # model.add(tf.keras.layers.Dense(512, activation='relu'))
#     model.add(tf.keras.layers.BatchNormalization(renorm=True))
#     model.add(tf.keras.layers.Dropout(rate=0.1))

#     model.add(tf.keras.layers.Dense(len(labels), activation='softmax'))
#     return model


# model = create_model()

# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, amsgrad=True),
#               loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1, reduction=tf.keras.losses.Reduction.SUM),
#               metrics=['accuracy'])

# model.summary()

In [None]:
# logdir = os.path.join('.', 'output')
# training_log = os.path.join(logdir, 'training.txt')
# model_checkpoint = os.path.join(logdir, 'model.hdf5')

# # os.makedirs(logdir, exist_ok=True)

# # if os.path.isfile(model_checkpoint):
# #     model.load_weights(model_checkpoint)

# callbacks = [
#     # tf.keras.callbacks.TensorBoard(logdir, profile_batch=0),
#     # tf.keras.callbacks.CSVLogger(training_log, separator=',', append=True),
#     # tf.keras.callbacks.ModelCheckpoint(model_checkpoint, monitor='val_loss', save_best_only=True, verbose=1),
#     # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_delta=1e-8, factor=0.1, patience=10, verbose=1),
#     tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-8, patience=40, restore_best_weights=True, verbose=1),
# ]

In [None]:
# model.fit(x_train,
#           y_train_categorical,
#           validation_data=(x_valid, y_valid_categorical),
#           callbacks=callbacks,
#           batch_size=1024,
#           epochs=10000,
#           verbose=1)

In [None]:
# predict = np.argmax(model.predict(dfNN_test.values), axis=1)
# print(f'Total: {len(test[y_column].values)}')
# print(f'Accuracy: {accuracy_score(test[y_column].values, predict) * 100:.2f}%\n')

# cm_report = classification_report(y_test, predict, target_names=labels)
# print(cm_report)

# cm = confusion_matrix(y_test, predict)
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm/cm.sum(axis=0), square=True, annot=True, fmt='.2%', cmap='Blues', xticklabels=labels, yticklabels=labels)