In [None]:
%%bash
# Install the pandas-profiling package, which provides the ProfileReport class imported in the next cell.
# NOTE(review): the version is not pinned, so the installed release (and the report layout) may differ between environments.
pip install pandas-profiling

In [1]:
import os
from glob import glob
import pandas as pd
from pandas_profiling import ProfileReport
import csv
from config import REGION, BUCKET, PROJECT, DELIM, RAW_DATA_COLS, RENAMED_COLS

%matplotlib inline
# Notebook-wide display settings: show at most 10 rows of any frame and
# render floats with a single decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

# Cloud Setup
This section is only required when running on the cloud; skip it for local runs.

In [None]:
# Export the project settings from config as environment variables so the
# %%bash cell below can read them.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [None]:
%%bash
# Set the gcloud CLI's default project and compute region from the PROJECT and REGION environment variables.
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

# Data Profiling
Data profiling is done to better understand the data and to check for invalid values (e.g. out-of-bounds values, unexpected data types). No data preprocessing should be done here; it should be done in tf.transform so that the data pipeline stays consistent.

In [2]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the concatenated row order -- and therefore the seeded split further down --
# is reproducible across machines and runs.
raw_data_pattern = 'data/raw/Raw Data for COA classification_einv*.csv'
files = sorted(glob(raw_data_pattern))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise ValueError('No raw data files match {!r}'.format(raw_data_pattern))

# NOTE(review): read_csv(usecols=...) keeps the file's column order, not the
# order of RAW_DATA_COLS, so the positional rename below assumes RENAMED_COLS
# is listed in file order -- confirm against config.py.
df = pd.concat([pd.read_csv(f, usecols=RAW_DATA_COLS) for f in files], ignore_index=True)
df.columns = RENAMED_COLS
df

Unnamed: 0,fy,business_unit,acc_code,acc_descr,voucher_id,voucher_descr,voucher_origin,vendor_id,vendor_name,voucher_line,voucher_line_descr,voucher_line_long_descr,payment_voucher_amt
0,2016,BU01,123456,DESCR 123456,V00950100,lorem ipsum dolor sit amet,EINV,VDNR2,VDNR2,6,some other text,,3453.0
1,2015,BU02,123457,DESCR 123457,V00741138,some text,EINV,VDNR2,VDNR2,4,some text,,6375.0
2,2016,BU02,123456,DESCR 123456,V00338789,hello hello,EINV,VDNR9,VDNR9,9,some text,,9870.1
3,2017,BU03,123457,DESCR 123457,V0026832,lorem ipsum dolor sit amet,EINV,VDNR8,VDNR8,1,hello world,,2868.7
4,2016,BU01,123456,DESCR 123456,V00497887,hello world,EINV,VDNR5,VDNR5,9,hello world,,8505.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2015,BU03,123456,DESCR 123456,V00844878,some other text,EINV,VDNR3,VDNR3,8,some text,,5852.1
96,2015,BU03,123457,DESCR 123457,V00865190,some text,EINV,VDNR4,VDNR4,8,some text,,8000.1
97,2015,BU02,123458,DESCR 123458,V00560899,hello hello,EINV,VDNR3,VDNR3,8,hello world,,2356.3
98,2017,BU01,123457,DESCR 123457,V00660038,hello world,EINV,VDNR4,VDNR4,4,some other text,,4389.4


In [3]:
# Build the profiling report for the loaded frame; as the cell's last expression it is rendered as the cell output.
ProfileReport(df)

0,1
Number of variables,13
Number of observations,100
Total Missing (%),0.0%
Total size in memory,10.2 KiB
Average record size in memory,104.0 B

0,1
Numeric,4
Categorical,6
Boolean,0
Date,0
Text (Unique),1
Rejected,2
Unsupported,0

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,123460
Minimum,123456
Maximum,123458
Zeros (%),0.0%

0,1
Minimum,123456
5-th percentile,123460
Q1,123460
Median,123460
Q3,123460
95-th percentile,123460
Maximum,123458
Range,2
Interquartile range,2

0,1
Standard deviation,0.80378
Coef of variation,6.5106e-06
Kurtosis,-1.4477
Mean,123460
MAD,0.6468
Skewness,0.03649
Sum,12345698
Variance,0.64606
Memory size,872.0 B

Value,Count,Frequency (%),Unnamed: 3
123457,36,0.0%,
123456,33,0.0%,
123458,31,0.0%,

Value,Count,Frequency (%),Unnamed: 3
123456,33,0.0%,
123457,36,0.0%,
123458,31,0.0%,

Value,Count,Frequency (%),Unnamed: 3
123456,33,0.0%,
123457,36,0.0%,
123458,31,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
DESCR 123457,36
DESCR 123456,33
DESCR 123458,31

Value,Count,Frequency (%),Unnamed: 3
DESCR 123457,36,0.0%,
DESCR 123456,33,0.0%,
DESCR 123458,31,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
BU01,40
BU02,32
BU03,28

Value,Count,Frequency (%),Unnamed: 3
BU01,40,0.0%,
BU02,32,0.0%,
BU03,28,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2016
Minimum,2015
Maximum,2017
Zeros (%),0.0%

0,1
Minimum,2015
5-th percentile,2015
Q1,2015
Median,2016
Q3,2017
95-th percentile,2017
Maximum,2017
Range,2
Interquartile range,2

0,1
Standard deviation,0.8278
Coef of variation,0.00041062
Kurtosis,-1.5387
Mean,2016
MAD,0.6912
Skewness,0.075356
Sum,201596
Variance,0.68525
Memory size,872.0 B

Value,Count,Frequency (%),Unnamed: 3
2015,36,0.0%,
2017,32,0.0%,
2016,32,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2015,36,0.0%,
2016,32,0.0%,
2017,32,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2015,36,0.0%,
2016,32,0.0%,
2017,32,0.0%,

0,1
Distinct count,100
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5353
Minimum,24.498
Maximum,9972.1
Zeros (%),0.0%

0,1
Minimum,24.498
5-th percentile,881.84
Q1,2819.7
Median,5665.8
Q3,7533.0
95-th percentile,9785.6
Maximum,9972.1
Range,9947.6
Interquartile range,4713.4

0,1
Standard deviation,2910.4
Coef of variation,0.54369
Kurtosis,-1.2154
Mean,5353
MAD,2528.6
Skewness,-0.057044
Sum,535300
Variance,8470200
Memory size,872.0 B

Value,Count,Frequency (%),Unnamed: 3
2067.53143404,1,0.0%,
7765.48142764,1,0.0%,
4365.02467153,1,0.0%,
1649.26142493,1,0.0%,
8079.50070859,1,0.0%,
3672.55585283,1,0.0%,
8448.52838983,1,0.0%,
6955.85491344,1,0.0%,
8948.2244473,1,0.0%,
7093.96679157,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
24.4980676469,1,0.0%,
470.073804825,1,0.0%,
529.233818382,1,0.0%,
601.350398347,1,0.0%,
727.004547783,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9800.33506904,1,0.0%,
9870.13101124,1,0.0%,
9884.84662312,1,0.0%,
9953.98040673,1,0.0%,
9972.0743265,1,0.0%,

0,1
Distinct count,11
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
VDNR2,14
VDNR5,13
VDNR8,12
Other values (8),61

Value,Count,Frequency (%),Unnamed: 3
VDNR2,14,0.0%,
VDNR5,13,0.0%,
VDNR8,12,0.0%,
VDNR9,12,0.0%,
VDNR3,11,0.0%,
VDNR4,10,0.0%,
VDNR1,9,0.0%,
VDNR6,8,0.0%,
VDNR7,7,0.0%,
VDNR0,3,0.0%,

0,1
Distinct count,11
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
VDNR2,14
VDNR5,13
VDNR8,12
Other values (8),61

Value,Count,Frequency (%),Unnamed: 3
VDNR2,14,0.0%,
VDNR5,13,0.0%,
VDNR8,12,0.0%,
VDNR9,12,0.0%,
VDNR3,11,0.0%,
VDNR4,10,0.0%,
VDNR1,9,0.0%,
VDNR6,8,0.0%,
VDNR7,7,0.0%,
VDNR0,3,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
hello hello,30
some text,25
lorem ipsum dolor sit amet,17
Other values (2),28

Value,Count,Frequency (%),Unnamed: 3
hello hello,30,0.0%,
some text,25,0.0%,
lorem ipsum dolor sit amet,17,0.0%,
hello world,16,0.0%,
some other text,12,0.0%,

First 3 values
V00271436
V00151285
V00252272

Last 3 values
V00159498
V00526437
V00543348

Value,Count,Frequency (%),Unnamed: 3
V00100101,1,0.0%,
V00104083,1,0.0%,
V0012305,1,0.0%,
V00128361,1,0.0%,
V00136287,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
V009686,1,0.0%,
V00975496,1,0.0%,
V00983811,1,0.0%,
V00984189,1,0.0%,
V00998272,1,0.0%,

0,1
Distinct count,10
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.74
Minimum,1
Maximum,10
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,4
Median,6
Q3,8
95-th percentile,10
Maximum,10
Range,9
Interquartile range,4

0,1
Standard deviation,2.7029
Coef of variation,0.47088
Kurtosis,-0.92802
Mean,5.74
MAD,2.246
Skewness,-0.17545
Sum,574
Variance,7.3055
Memory size,872.0 B

Value,Count,Frequency (%),Unnamed: 3
6,15,0.0%,
8,13,0.0%,
5,13,0.0%,
1,10,0.0%,
10,9,0.0%,
9,9,0.0%,
7,9,0.0%,
4,9,0.0%,
3,9,0.0%,
2,4,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,10,0.0%,
2,4,0.0%,
3,9,0.0%,
4,9,0.0%,
5,13,0.0%,

Value,Count,Frequency (%),Unnamed: 3
6,15,0.0%,
7,9,0.0%,
8,13,0.0%,
9,9,0.0%,
10,9,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
some text,25
hello world,21
hello hello,20
Other values (2),34

Value,Count,Frequency (%),Unnamed: 3
some text,25,0.0%,
hello world,21,0.0%,
hello hello,20,0.0%,
some other text,18,0.0%,
lorem ipsum dolor sit amet,16,0.0%,

0,1
Constant value,

0,1
Constant value,EINV

Unnamed: 0,fy,business_unit,acc_code,acc_descr,voucher_id,voucher_descr,voucher_origin,vendor_id,vendor_name,voucher_line,voucher_line_descr,voucher_line_long_descr,payment_voucher_amt
0,2016,BU01,123456,DESCR 123456,V00950100,lorem ipsum dolor sit amet,EINV,VDNR2,VDNR2,6,some other text,,3453.0
1,2015,BU02,123457,DESCR 123457,V00741138,some text,EINV,VDNR2,VDNR2,4,some text,,6375.0
2,2016,BU02,123456,DESCR 123456,V00338789,hello hello,EINV,VDNR9,VDNR9,9,some text,,9870.1
3,2017,BU03,123457,DESCR 123457,V0026832,lorem ipsum dolor sit amet,EINV,VDNR8,VDNR8,1,hello world,,2868.7
4,2016,BU01,123456,DESCR 123456,V00497887,hello world,EINV,VDNR5,VDNR5,9,hello world,,8505.2


# Split Data
This example uses an 80-10-10 split for train, eval and test; change the fractions if necessary.

In [4]:
RANDOM_SEED = 42

# 80% of the rows, sampled with a fixed seed, form the training set.
train_df = df.sample(frac=0.8, random_state=RANDOM_SEED)

# The rows not picked for training are halved (same seed) into test and eval.
holdout_df = df.loc[~df.index.isin(train_df.index)]
test_df = holdout_df.sample(frac=0.5, random_state=RANDOM_SEED)
eval_df = holdout_df.loc[~holdout_df.index.isin(test_df.index)]

In [5]:
def export_datasets(on_cloud=False):
    """Write the train, eval and test splits to delimited text files.

    Each row is exported as one line: its values are cast to ``str`` and
    joined with ``DELIM``. No header row and no index are written. The
    splits are read from the notebook-level ``train_df``, ``eval_df`` and
    ``test_df`` and saved as ``train.csv``, ``eval.csv`` and ``test.csv``.

    Args:
        on_cloud: If True, write under
            ``gs://<BUCKET>/spam-classification/data/split``. Otherwise
            write to the local ``data/split`` directory, which is created
            (including ``data``) when it does not exist yet.
    """
    if on_cloud:
        # NOTE(review): the 'spam-classification' prefix does not match the
        # COA-classification data loaded above -- confirm the bucket layout.
        data_dir = 'gs://{bucket}/spam-classification/data/split'.format(bucket=BUCKET)
    else:
        data_dir = 'data/split'
        # Create the whole local directory tree in one call.
        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)

    def export_df(df, filename):
        """Write ``df`` to ``data_dir/filename``, one DELIM-joined line per row."""
        lines = df.astype(str).apply(lambda row: DELIM.join(row), axis=1)
        # NOTE(review): to_csv still applies its default CSV quoting to each
        # joined line, so a line containing a comma or a double quote is
        # written wrapped in quotes -- confirm the downstream reader copes.
        lines.to_csv(
            # Join with '/' explicitly: gs:// URIs always use forward slashes,
            # whereas os.path.join uses the platform's separator.
            '{}/{}'.format(data_dir, filename),
            header=False,
            index=False,
            encoding='utf-8'
        )

    for split_df, filename in (
        (train_df, 'train.csv'),
        (eval_df, 'eval.csv'),
        (test_df, 'test.csv'),
    ):
        export_df(split_df, filename)


export_datasets(on_cloud=False)