In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import spacy
import tensorflow as tf
# Load the spaCy pipeline registered under the 'en' shortcut; it is used below
# to flag extra stop words and to tokenise the description / vendor-name text.
nlp = spacy.load('en')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show at most 6 rows whenever a DataFrame/Series is displayed.
pd.options.display.max_rows = 6
# Use seaborn's 'darkgrid' plot style.
sns.set(style='darkgrid')

# Set Parameters

In [None]:
# Minimum number of rows an account code needs in order to be kept in the dataset.
FREQ_THRESHOLD = 20
# Training classes with fewer rows than this are resampled (with replacement) to exactly this size.
MIN_SAMPLES_PER_CLASS = 400

In [None]:
# Extra lower-case tokens to flag as stop words: month abbreviations, full month
# names ('may' is already covered by the abbreviation row) and a few short tokens.
# Every row must end with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two words into one bogus entry.
ADD_STOPWORDS = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'june', 'july', 'august', 'september', 'october', 'november', 'december',
    'fy', 'mr', 'ms', 'mrs', 'pte', 'ltd',
]
# Mark each additional token as a stop word in the loaded pipeline's vocabulary,
# so that the `is_stop` checks further down treat it like a built-in stop word.
for extra_word in ADD_STOPWORDS:
    vocab_entry = nlp.vocab[extra_word]
    vocab_entry.is_stop = True

# Read Data

In [None]:
# Column dtypes passed to pd.read_csv: the fiscal year is parsed as an integer,
# every other column (including the amount) is read as text.
dtypes = {'Fiscal Year (Accounting Date)': np.int64}
dtypes.update(dict.fromkeys(
    [
        'Business Unit',
        'Account Code',
        'Account Description',
        'Voucher ID',
        'Voucher Description',
        'Voucher Origin',
        'Vendor ID',
        'Vendor First Name',
        'Voucher Line Description',
        'Voucher Line Long Description',
        'Payment Voucher Line Amount S$ (Excluding GST, Including Freight S$)',
    ],
    str,
))
# Load both raw exports and stack them into one frame.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(
    [
        pd.read_csv('data/raw/Raw Data for COA classification_einv1.csv', dtype=dtypes),
        pd.read_csv('data/raw/Raw Data for COA classification_einv2.csv', dtype=dtypes),
    ],
    ignore_index=True,
)
# Rename the columns positionally to short snake_case identifiers.
# NOTE(review): assumes the CSV columns arrive in the same order as the keys of
# the `dtypes` mapping - confirm against the raw files.
df.columns = (
    'fy business_unit '
    'acc_code acc_descr '
    'voucher_id voucher_descr voucher_origin '
    'vendor_id vendor_name '
    'voucher_line_descr voucher_line_long_descr '
    'payment_voucher_amt'
).split()
# Normalise column dtypes and derive the combined description column.
df = df.assign(
    fy = df.fy.astype('int64'),
    business_unit = df.business_unit.astype('category'),
    acc_code = df.acc_code.astype('object'),
    acc_descr = df.acc_descr.astype('object'),
    voucher_id = df.voucher_id.astype('object'),
    voucher_descr = df.voucher_descr.astype('object'),
    vendor_id = df.vendor_id.astype('category'),
    vendor_name = df.vendor_name.astype('object'),
    voucher_line_descr = df.voucher_line_descr.astype('object'),
    voucher_line_long_descr = df.voucher_line_long_descr.astype('object'),
    # The amount is read as text: strip commas, turn "(123.45)" into "-123.45",
    # then convert to float. `regex` is passed explicitly because the default of
    # Series.str.replace changed to regex=False in pandas 2.0, which would leave
    # the parenthesised values untouched and make the float conversion fail.
    payment_voucher_amt = (
        df.payment_voucher_amt
        .str.replace(',', '', regex=False)
        .str.replace(r'\(([\d.]+)\)', r'-\1', regex=True)
        .astype('float64')
    ),
    # Voucher header description and line long description joined by one space;
    # the result is missing wherever either part is missing.
    voucher_full_descr = df.voucher_descr + ' ' + df.voucher_line_long_descr
)
df

# Remove Stopwords

In [None]:
%%time
proc_descrs, proc_vendor_names = [], []
for doc in nlp.pipe(df.voucher_full_descr.str.lower().astype('unicode').values, batch_size=64, n_threads=-1, disable=['tagger', 'parser', 'ner']):
    try:
        proc_descrs.append(' '.join([word.text for word in doc if word.is_alpha and not word.is_stop and len(word.text) > 1]))
    except Exception as e:
        proc_descrs.append('')
for doc in nlp.pipe(df.vendor_name.str.lower().astype('unicode').values, batch_size=64, n_threads=-1, disable=['tagger', 'parser', 'ner']):
    try:
        proc_vendor_names.append(' '.join([word.text for word in doc if word.is_alpha and not word.is_stop and len(word.text) > 1]))
    except Exception as e:
        proc_vendor_names.append('')
df = df.assign(voucher_descr_proc = proc_descrs, vendor_name_proc = proc_vendor_names)

In [None]:
# Display the frame to inspect the newly added voucher_descr_proc / vendor_name_proc columns.
df

In [None]:
# Print the dtype of every column, with the display row limit temporarily raised to 20.
with pd.option_context('display.max_rows', 20):
    print(df.dtypes)

In [None]:
# Print summary statistics for all columns (numeric and non-numeric alike),
# with the display row limit temporarily raised to 20.
with pd.option_context('display.max_rows', 20):
    print(df.describe(include='all'))

# Remove Infrequent Account Codes

In [None]:
# Feature columns and target column (the target is kept as a list so that
# df[y_var] is a one-column DataFrame).
x_vars = ['voucher_full_descr', 'voucher_descr_proc', 'vendor_name', 'vendor_name_proc', 'payment_voucher_amt', 'business_unit']
y_var = ['acc_code']

# Drop incomplete rows BEFORE counting frequencies. Counting first (as was done
# previously) lets a code pass the threshold thanks to rows that are then dropped,
# leaving classes with fewer than FREQ_THRESHOLD usable rows for the stratified split.
df = df[x_vars + y_var + ['acc_descr']].dropna()

# remove infrequent acc codes globally
freq_codes = df.groupby('acc_code').size().reset_index(name='freq')
freq_codes = freq_codes[freq_codes.freq >= FREQ_THRESHOLD][['acc_code']]
df = df.merge(freq_codes, how='inner', on='acc_code')

# Lookup table of the remaining account codes: one row per distinct
# (code, description) pair, with the description rewritten as "<code> - <description>"
# and the row position exposed as an `index` column.
acc_mapping = (
    df[['acc_code', 'acc_descr']]
    .drop_duplicates(subset=['acc_code', 'acc_descr'])
    .reset_index(drop=True)
)
acc_mapping = (
    acc_mapping
    .assign(acc_descr = acc_mapping.acc_code + ' - ' + acc_mapping.acc_descr)
    .reset_index()
    [['acc_code', 'acc_descr', 'index']]
)

# The description lives in acc_mapping from here on; keep only features + target.
df = df[x_vars + y_var]
acc_mapping

# Split Train and Test Set (Stratified)

In [None]:
x = df[x_vars]
y = df[y_var]
# Hold out 8% of the rows, then halve the hold-out into validation and test;
# both splits are stratified by account code and seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.08, random_state=42, stratify=y)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# check that stratification worked successfully: validation and test must contain
# exactly the same set of account codes as the training set. np.array_equal also
# copes with sets of different sizes, where the previous element-wise `==`/`sum`
# check failed with an unrelated error instead of an AssertionError.
train_codes = np.sort(y_train.acc_code.unique())
assert np.array_equal(train_codes, np.sort(y_val.acc_code.unique())), \
    'validation set does not contain the same account codes as the training set'
assert np.array_equal(train_codes, np.sort(y_test.acc_code.unique())), \
    'test set does not contain the same account codes as the training set'

print(
    'x_train.shape = ', x_train.shape, '\n',
    'y_train.shape = ', y_train.shape, '\n',
    'x_val.shape = ', x_val.shape, '\n',
    'y_val.shape = ', y_val.shape, '\n',
    'x_test.shape = ', x_test.shape, '\n',
    'y_test.shape = ', y_test.shape,
    sep = ''
)

# Over Sample Training Set

In [None]:
# Oversample the training set so every account code has at least
# MIN_SAMPLES_PER_CLASS rows: classes that are already large enough are kept
# untouched, smaller ones are resampled with replacement to exactly that size.
train = pd.concat([x_train, y_train], axis=1)
samples = []
for acc_code in y_train.acc_code.unique():
    df_with_acc_code = train[train.acc_code == acc_code]
    if len(df_with_acc_code) >= MIN_SAMPLES_PER_CLASS:
        sample = df_with_acc_code
    else:
        sample = df_with_acc_code.sample(n=MIN_SAMPLES_PER_CLASS, replace=True, random_state=42)
    samples.append(sample)
# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop re-copies all accumulated rows on every iteration.
result = pd.concat(samples, ignore_index=True)
x_train = result[x_vars]
y_train = result[y_var]
# Per-class row counts after oversampling.
y_train.groupby('acc_code').size()

In [None]:
# Persist the filtered dataset, the three splits and the account-code lookup as
# feather files. Each frame gets a fresh default index right before it is written.
_feather_outputs = [
    (df, 'data/processed/coa_einv.feather'),
    (x_train, 'data/processed/train/x_train.feather'),
    (x_val, 'data/processed/val/x_val.feather'),
    (x_test, 'data/processed/test/x_test.feather'),
    (y_train, 'data/processed/train/y_train.feather'),
    (y_val, 'data/processed/val/y_val.feather'),
    (y_test, 'data/processed/test/y_test.feather'),
    (acc_mapping, 'data/misc/acc_mapping.feather'),
]
for _frame, _target in _feather_outputs:
    _frame.reset_index(drop=True).to_feather(_target)