In [None]:
%%bash
pip install pandas-profiling

In [1]:
import os
from glob import glob
import pandas as pd
from pandas_profiling import ProfileReport
import csv
from config import REGION, BUCKET, PROJECT, DELIM

%matplotlib inline
# Keep rendered frames short and show floats with a single decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

# Cloud Setup
This section is only required when running on the cloud; skip it when running locally.

In [None]:
# Expose the config values as environment variables so that the shell cells
# can read them as $BUCKET, $PROJECT and $REGION.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [None]:
%%bash
# Point the gcloud CLI at the project and compute region taken from the
# PROJECT and REGION environment variables.
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

# Data Profiling
Data profiling is done to better understand the data and to check for invalid data (e.g. out-of-bounds values or unexpected data types). No data preprocessing should be done here; it belongs in tf.transform so that the data pipeline stays consistent.

In [None]:
RAW_DATA_PATTERN = 'data/raw/Raw Data for COA classification_einv*.csv'

# Sort the matches: glob returns paths in arbitrary order, and the seeded
# split later in the notebook is only reproducible if the row order is stable.
files = sorted(glob(RAW_DATA_PATTERN))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError('No raw data files match {!r}'.format(RAW_DATA_PATTERN))

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
df

In [None]:
# Build a pandas-profiling report for the combined frame; as the last
# expression in the cell it is displayed as the cell's output.
ProfileReport(df)

# Split Data
This example uses an 80-10-10 split for the train, eval and test sets; change the fractions if necessary.

In [None]:
RANDOM_SEED = 42

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
train_df = df.sample(frac=0.8, random_state=RANDOM_SEED)

# The remaining 20% is halved into test and eval (10% of the total each).
holdout_df = df.drop(train_df.index)
test_df = holdout_df.sample(frac=0.5, random_state=RANDOM_SEED)
# Fixed: the original dropped `test.index`, an undefined name (NameError).
eval_df = holdout_df.drop(test_df.index)

# Every row must land in exactly one split (relies on df having a unique index).
assert len(train_df) + len(eval_df) + len(test_df) == len(df)

In [None]:
def export_datasets(on_cloud=False):
    """Write the train, eval and test splits to CSV files.

    Reads the notebook-level ``train_df``, ``eval_df`` and ``test_df`` frames
    and writes them as ``train.csv``, ``eval.csv`` and ``test.csv``, using
    ``DELIM`` as the separator, with no index column and no quoting.

    Args:
        on_cloud: If True, write under
            ``gs://<BUCKET>/spam-classification/data/split``. Otherwise write
            to the local ``data/split`` directory, creating it if needed.

    Returns:
        None.
    """
    if on_cloud:
        # NOTE(review): the prefix says "spam-classification" while the raw
        # files are named "COA classification" - confirm this is intended.
        data_dir = 'gs://{bucket}/spam-classification/data/split'.format(bucket=BUCKET)
    else:
        data_dir = 'data/split'
        # Creates missing parent directories and is a no-op if the path exists.
        os.makedirs(data_dir, exist_ok=True)

    def export_df(df, filename):
        # NOTE(review): QUOTE_NONE with no escapechar makes to_csv fail when a
        # value contains DELIM or a quote character - confirm the data is clean.
        df.to_csv(os.path.join(data_dir, filename), index=False, quoting=csv.QUOTE_NONE, sep=DELIM)

    for frame, filename in (
        (train_df, 'train.csv'),
        (eval_df, 'eval.csv'),
        (test_df, 'test.csv'),
    ):
        export_df(frame, filename)
  
# Write the splits locally; pass on_cloud=True to write to the GCS bucket instead.
export_datasets(on_cloud=False)