In [None]:
%%bash
# Install pandas-profiling, which provides the ProfileReport used in the Data Profiling section.
pip install pandas-profiling

In [None]:
from __future__ import division, print_function, absolute_import
import os
from glob import glob
import pandas as pd
from pandas_profiling import ProfileReport
import csv
from sklearn.model_selection import train_test_split
from config import REGION, BUCKET, PROJECT, DELIM, RAW_DATA_COLS, RENAMED_COLS, LABEL_COL, STRING_COLS, FLOAT_COLS

import sys
reload(sys)
sys.setdefaultencoding('utf8')

%matplotlib inline
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

# Cloud Setup
This section is only required when running on the cloud.

In [None]:
# Publish the cloud settings from `config` as environment variables so that
# shell cells can read them as $BUCKET, $PROJECT and $REGION.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [None]:
%%bash
# Point the gcloud CLI at the project and compute region taken from the
# PROJECT and REGION environment variables.
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

# Data Profiling
Data profiling is done to better understand the data and to check whether it contains any invalid values (e.g. out-of-bounds values or unexpected data types). No data preprocessing should be done here; it should be done in tf.transform so that the data pipeline stays consistent.

In [None]:
# NOTE(review): disabled loader for the UTF-16, comma-separated raw export; it applies the same
# "at least 30 rows per acc_code" filter to a single combined frame. Appears to be kept for reference only.
# files = glob('data/raw/raw_data_invoices_2015-2017.csv')
# df = pd.concat([pd.read_csv(
#     f, usecols=RAW_DATA_COLS, quoting=csv.QUOTE_ALL, sep=',', encoding='utf-16', dtype='str'
#     ) for f in files], ignore_index=True)
# df.columns = RENAMED_COLS
# acc_code_freq = df.groupby('acc_code').size().rename('count').reset_index()
# acc_codes_to_include = list(acc_code_freq[acc_code_freq['count'] >= 30].acc_code)
# df = df[df['acc_code'].isin(acc_codes_to_include)]
# df

In [None]:
def _parse_amount(amount):
    """Convert accounting-formatted amount text to floats.

    Thousands separators (``,``) are dropped and a value containing ``(`` is treated as
    negative, e.g. ``'(1,234.50)'`` becomes ``-1234.5``. Missing values stay ``NaN``, and a
    column that pandas already parsed as numeric is only cast to float.
    """
    if pd.api.types.is_numeric_dtype(amount):
        # No commas or parentheses were present, so read_csv already produced numbers.
        return amount.astype('float')

    # Literal replacement: the comma is not a pattern.
    text = amount.str.replace(',', '', regex=False)
    # na=False keeps missing amounts out of the "negative" mask.
    negative = text.str.contains('(', regex=False, na=False)
    # Explicit regex=True with a raw pattern: the implicit default differs between pandas versions.
    text = text.str.replace(r'[()]', '', regex=True)
    return text.mask(negative, '-' + text).astype('float')


def read_data(filename):
    """Load a tab-separated raw invoice extract into a DataFrame.

    Args:
        filename: Path to the tab-separated text file to read.

    Returns:
        A DataFrame restricted to ``RAW_DATA_COLS``, with columns renamed to
        ``RENAMED_COLS`` and the ``amount`` column converted to float.
    """
    df = pd.read_csv(
        filename,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        usecols=RAW_DATA_COLS
    )
    # NOTE(review): read_csv keeps the file's column order regardless of the order of
    # `usecols`, so this positional rename assumes RENAMED_COLS follows the file order
    # - confirm against config.
    df.columns = RENAMED_COLS
    df['amount'] = _parse_amount(df['amount'])

    return df

# Load the raw extracts: 2015-2017 invoices for training, 2018 invoices for evaluation.
train_df = read_data('data/raw/raw_data_invoices_2015-2017_20181110.txt')
eval_df = read_data('data/raw/raw_data_invoices_2018_20181110.txt')

# Count rows per account code over both extracts combined.
df = pd.concat([train_df, eval_df])
acc_code_freq = (
    df.groupby('acc_code')
    .size()
    .rename('count')
    .reset_index()
)

# Keep only the account codes that occur at least 30 times overall,
# and apply that same whitelist to each extract.
acc_codes_to_include = list(acc_code_freq.loc[acc_code_freq['count'] >= 30, 'acc_code'])
train_df = train_df.loc[train_df['acc_code'].isin(acc_codes_to_include)]
eval_df = eval_df.loc[eval_df['acc_code'].isin(acc_codes_to_include)]

train_df

In [None]:
# Build the profiling report for the training data and show it as the cell output.
ProfileReport(train_df)

In [None]:
# Save standalone HTML profiling reports for the training and evaluation data.
# Create the output directory first so the export does not fail when `img/` is missing.
if not os.path.exists('img'):
    os.makedirs('img')
ProfileReport(train_df).to_file('img/train.html')
ProfileReport(eval_df).to_file('img/eval.html')

# Split Data
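The example 80-10-10 random split for train, eval and test is kept below, commented out, for reference - change if necessary. The active split instead uses the 2015-2017 extract for training, the 2018 extract for evaluation, and the `CCY` and `MND` business units of the evaluation data for testing.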

In [None]:
# NOTE(review): disabled stratified random split (80% train, then the remainder halved into
# eval and test). Appears to be kept for reference only.
# RANDOM_SEED = 42
# x = df.drop(LABEL_COL, axis=1)
# y = df[[LABEL_COL]]
# x_train, x_eval, y_train, y_eval = train_test_split(x, y, random_state=RANDOM_SEED, train_size=0.8, stratify=y)
# x_eval, x_test, y_eval, y_test = train_test_split(x_eval, y_eval, random_state=RANDOM_SEED, train_size=0.5, stratify=y_eval)
# train_df = pd.concat([x_train, y_train], axis=1)
# eval_df = pd.concat([x_eval, y_eval], axis=1)
# test_df = pd.concat([x_test, y_test], axis=1)

# # reorder columns
# train_df = train_df[RENAMED_COLS]
# eval_df = eval_df[RENAMED_COLS]
# test_df = test_df[RENAMED_COLS]

# len(train_df), len(eval_df), len(test_df)

In [None]:
# Use the evaluation rows of the CCY and MND business units as the test set.
# NOTE(review): these rows are not removed from eval_df, so the eval and test sets
# overlap - confirm this is intended.
test_df = eval_df.loc[eval_df['business_unit'].isin(['CCY', 'MND'])]
test_df

In [None]:
def export_datasets(on_cloud=False):
    """Write the train/eval/test splits as TSV files plus the label vocabulary.

    Reads the notebook-level ``train_df``, ``eval_df`` and ``test_df`` frames.

    Args:
        on_cloud: If True, the TSV files are written under
            ``gs://{BUCKET}/{PROJECT}/data/split``. Otherwise they are written to the
            local ``data/split`` directory, which is created when missing.

    The label vocabulary is always written locally to ``./data/misc/labels.txt`` as the
    unique ``LABEL_COL`` values (as strings) joined by ``DELIM``.
    """
    if on_cloud:
        data_dir = 'gs://{bucket}/{project}/data/split'.format(bucket=BUCKET, project=PROJECT)
    else:
        data_dir = 'data/split'
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

    def export_df(frame, filename):
        """Write one split to ``data_dir`` as an unquoted, tab-separated file without the index."""
        full_path = os.path.join(data_dir, filename)
        frame.to_csv(full_path, sep='\t', quoting=csv.QUOTE_NONE, index=False)

    export_df(train_df, 'train.tsv')
    export_df(eval_df, 'eval.tsv')
    export_df(test_df, 'test.tsv')

    # makedirs (not mkdir) so this also works when `data/` does not exist yet,
    # which is the case on the on_cloud path where no local split directory is created.
    if not os.path.exists('data/misc'):
        os.makedirs('data/misc')

    # Build the vocabulary from the splits that are actually exported. The notebook-level
    # `df` is the unfiltered concatenation and would add labels that were filtered out of
    # every split. test_df is derived from eval_df, so train + eval covers all exported rows.
    exported = pd.concat([train_df, eval_df])
    label_vocab = DELIM.join(list(exported[LABEL_COL].astype('str').unique()))
    with open('./data/misc/labels.txt', 'w') as f:
        f.write(label_vocab)
  
# Write the splits locally; with on_cloud=True the TSV files go to the gs:// bucket path instead.
export_datasets(on_cloud=False)