In [None]:
%%bash
# Install the package that provides the `pandas_profiling` module imported in the next cell.
pip install pandas-profiling

In [None]:
from __future__ import division, print_function, absolute_import
import os
from glob import glob
import pandas as pd
from pandas_profiling import ProfileReport
import csv
from config import REGION, BUCKET, PROJECT, DELIM, RAW_DATA_COLS, RENAMED_COLS, LABEL_COL, STRING_COLS, NUMERIC_COLS

import sys
# Python 2 only: make UTF-8 the interpreter's default codec (presumably so that
# str() on non-ASCII unicode values does not raise - confirm against the data).
# reload(sys) is needed because site.py removes sys.setdefaultencoding at start-up.
# Python 3 has no builtin reload() and no setdefaultencoding, and already
# defaults to UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

%matplotlib inline
# Notebook display settings: truncate long frames to 10 rows and render
# floats with a single decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

# Cloud Setup
This section is only required if running on cloud

In [None]:
# Publish the config values as environment variables so that the %%bash
# cell below can read them as $BUCKET, $PROJECT and $REGION.
os.environ.update(BUCKET=BUCKET, PROJECT=PROJECT, REGION=REGION)

In [None]:
%%bash
# Stop at the first failing command or unset variable; without this a failure
# of the first gcloud call is hidden by the exit status of the second.
set -eu
# Point the gcloud CLI at the project and compute region taken from the environment.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

# Data Profiling
Data profiling is done to better understand the data and to check whether it contains any invalid values (e.g. out-of-bounds data, unexpected data types). No data preprocessing should be done here; it should be done in tf.transform so as to have a consistent data pipeline.

In [None]:
# Read the raw exports in a fixed (sorted) order: glob() returns paths in
# arbitrary, OS-dependent order, and the seeded sampling performed later in the
# notebook depends on row order.
files = sorted(glob('data/raw/Raw Data for COA classification201*.csv'))
if not files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise ValueError('No raw data files found matching data/raw/Raw Data for COA classification201*.csv')
df = pd.concat([pd.read_csv(
    f, usecols=RAW_DATA_COLS, quoting=csv.QUOTE_ALL, sep=',', encoding='utf-16', dtype='str'
    ) for f in files], ignore_index=True)
# NOTE(review): read_csv's `usecols` does not reorder columns, so this positional
# rename assumes RAW_DATA_COLS / RENAMED_COLS are listed in file column order - confirm.
df.columns = RENAMED_COLS
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the concatenated frame (SettingWithCopyWarning).
df = df[df.voucher_origin == 'EIN'].copy()
# Amounts are read as strings: rewrite parenthesised amounts "(1,234.5)" as
# "-1,234.5", strip the thousands separators, then convert to float.
# `regex` is passed explicitly because its default changed to False in pandas 2.0.
df['payment_voucher_amt'] = df['payment_voucher_amt'] \
    .str.replace(r'\(([\d,\.]+)\)', r'-\1', regex=True) \
    .str.replace(',', '', regex=False) \
    .astype(float)
# Keep only the account codes that occur in at least 30 rows.
acc_code_freq = df.groupby('acc_code').size().rename('count').reset_index()
acc_codes_to_include = list(acc_code_freq[acc_code_freq['count'] >= 30].acc_code)
df = df[df['acc_code'].isin(acc_codes_to_include)]
df

In [None]:
# Build the profiling report for `df`; leaving it as the cell's last
# expression makes the notebook render it inline.
profile_report = ProfileReport(df)
profile_report

# Split Data
Example uses a 90-5-5 split for train, eval and test - change if necessary

In [None]:
RANDOM_SEED = 42

# Sample 90% of the rows for training; the remaining rows form a holdout that
# is halved (same seed) into the test and eval sets.
train_df = df.sample(frac=0.9, random_state=RANDOM_SEED)
holdout_df = df.drop(train_df.index)
test_df = holdout_df.sample(frac=0.5, random_state=RANDOM_SEED)
eval_df = holdout_df.drop(test_df.index)

# Show the size of each split as (train, eval, test).
tuple(len(part) for part in (train_df, eval_df, test_df))

In [None]:
def export_datasets(on_cloud=False):
    """Write the train/eval/test splits and the label vocabulary to disk.

    Reads the notebook-level frames ``train_df``, ``eval_df``, ``test_df`` and
    ``df``. Each split is written without header or index to
    ``data/split/<name>.csv``, one record per line with the fields joined by
    ``DELIM``. The unique values of ``df[LABEL_COL]`` are written,
    ``DELIM``-joined, to ``data/misc/labels.txt``.

    Args:
        on_cloud: when true, the split files are also copied to
            ``gs://<BUCKET>/<PROJECT>/data/split/`` using ``gsutil``
            (assumes the Cloud SDK's ``gsutil`` is on PATH).

    Raises:
        subprocess.CalledProcessError: if the ``gsutil`` upload fails.
    """
    # Local import: nothing else in the notebook needs subprocess.
    import subprocess

    local_dir = 'data/split'
    misc_dir = 'data/misc'
    # makedirs also creates missing parents; os.mkdir('data/misc') would fail
    # if 'data' did not exist yet.
    for directory in (local_dir, misc_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    def export_df(df, filename):
        """Write `df` to local_dir/filename and return the path written."""
        full_path = os.path.join(local_dir, filename)
        # NOTE: values are joined verbatim - a field that itself contains DELIM
        # or a newline is neither quoted nor escaped.
        csv_str = '\n'.join(DELIM.join(str(r) for r in rec) for rec in df.to_records(index=False))
        with open(full_path, 'w') as f:
            f.write(csv_str)
        return full_path

    exported_paths = [
        export_df(train_df, 'train.csv'),
        export_df(eval_df, 'eval.csv'),
        export_df(test_df, 'test.csv'),
    ]

    if on_cloud:
        # The built-in open() cannot write to gs:// URLs, so the files are
        # written locally first and then uploaded.
        gcs_dir = 'gs://{bucket}/{project}/data/split/'.format(bucket=BUCKET, project=PROJECT)
        subprocess.check_call(['gsutil', 'cp'] + exported_paths + [gcs_dir])

    # The label vocabulary is taken from the full (notebook-level) frame, not
    # from a single split.
    with open(os.path.join(misc_dir, 'labels.txt'), 'w') as f:
        f.write(DELIM.join(df[LABEL_COL].astype('str').unique()))


export_datasets(on_cloud=False)