## Setup<a class="anchor" id="setup"></a>

In [None]:
%%capture
%pip install awswrangler

In [None]:
import os
import time
import boto3
import util
import sagemaker
import awswrangler as wr
import pandas as pd
import numpy as np

In [None]:
def loadRaw(bucket='', prefix=''):
    """Read the raw MT parquet dataset from S3 into a DataFrame.

    Args:
        bucket: Name of the S3 bucket holding the dataset. Defaults to an
            empty string, which matches the previous hard-coded value and
            must be filled in before the read can succeed.
        prefix: Key prefix under ``bucket`` where the parquet files live.

    Returns:
        The DataFrame returned by ``wr.s3.read_parquet``, restricted to the
        columns ``mt_sent_time``, ``mt_category``, ``operator``,
        ``shortcode``, ``mt_count`` and ``dn_count``.
    """
    path = f's3://{bucket}/{prefix}'
    # Only objects ending in this suffix are read from the prefix.
    suffix = '.gz.parquet'
    cols = ['mt_sent_time', 'mt_category', 'operator', 'shortcode', 'mt_count', 'dn_count']
    return wr.s3.read_parquet(path=path, path_suffix=suffix, columns=cols, dataset=True)

In [None]:
def featureEngineering(raw, *args):
    """Aggregate raw MT rows per hour and derive a cleaned ``scs_dn_rate``.

    Args:
        raw: DataFrame with at least the columns ``mt_sent_time``,
            ``mt_category``, ``mt_count`` and ``dn_count`` (plus any column
            named in ``args``). It is not modified.
        *args: Optional extra column names to group by, in addition to
            ``mt_category`` and the hour-floored ``mt_sent_time``.

    Returns:
        A new DataFrame with one row per group, holding the summed
        ``mt_count`` / ``dn_count``, the ratio ``scs_dn_rate`` (NaN where it
        was judged an outlier), and the helper columns
        ``mt_sent_time_hour`` and ``mt_sent_time_dayofweek``.
    """
    extra_keys = list(args)

    # Only these 2 categories are needed. Filter first and copy the result so
    # the column assignments below act on an independent frame instead of a
    # slice of ``raw``.
    mt_cat_list = ['schedule', 'retry1']
    df = raw.loc[raw['mt_category'].isin(mt_cat_list)].copy()

    # Convert date/time from string to datetime and truncate to the hour;
    # unparseable values become NaT and are dropped by the groupby below.
    df['mt_sent_time'] = pd.to_datetime(df['mt_sent_time'], errors='coerce').dt.floor('h')

    # Group entries
    df = (
        df.groupby(['mt_category', 'mt_sent_time', *extra_keys])
        .agg({'mt_count': 'sum', 'dn_count': 'sum'})
        .reset_index()
    )

    # Add scs_dn_rate: delivered notifications per message sent.
    df['scs_dn_rate'] = df['dn_count'] / df['mt_count']

    # Add columns describing different time frames.
    df['mt_sent_time_hour'] = df['mt_sent_time'].dt.hour
    df['mt_sent_time_dayofweek'] = df['mt_sent_time'].dt.weekday

    # Handle outliers with low support (fewer than 10 messages in the group).
    df.loc[df['mt_count'] < 10, 'scs_dn_rate'] = np.nan
    # Handle outliers with extreme values (outside [0.001, 0.2]).
    out_of_range = (df['scs_dn_rate'] > 0.2) | (df['scs_dn_rate'] < 0.001)
    df.loc[out_of_range, 'scs_dn_rate'] = np.nan

    # Within each (category, hour, weekday, *args) subset, replace an exact 1
    # with the largest value below 1 and an exact 0 with the smallest value
    # above 0.
    # NOTE(review): the range filter above already turns 0 and 1 into NaN, so
    # these two fills currently have nothing to replace; kept to preserve the
    # original pipeline.
    # ``transform`` returns a result indexed like ``df``; ``apply`` may
    # prepend the group keys to the index (depending on pandas version and
    # ``group_keys``), which breaks the aligned assignment.
    subset_keys = ['mt_category', 'mt_sent_time_hour', 'mt_sent_time_dayofweek', *extra_keys]
    df['scs_dn_rate'] = df.groupby(subset_keys)['scs_dn_rate'].transform(
        lambda s: s.replace(1, s[s < 1].max())
    )
    df['scs_dn_rate'] = df.groupby(subset_keys)['scs_dn_rate'].transform(
        lambda s: s.replace(0, s[s > 0].min())
    )
    return df

In [None]:
def outputCSV(df, category):
    """Write one category's time series to a local CSV and upload it to S3.

    Args:
        df: DataFrame containing at least ``mt_sent_time``, ``mt_category``
            and ``scs_dn_rate``.
        category: Value of ``mt_category`` to export; also used as the file
            name stem.

    Side effects:
        Creates ``data/`` if missing, writes ``data/<category>.csv`` without
        header or index, and uploads it to the SageMaker default bucket under
        ``puretech_data/<category>.csv``.
    """
    # Build the SageMaker session once and reuse it for both lookups.
    sm_session = sagemaker.Session()
    bucket_name = sm_session.default_bucket()
    s3 = boto3.Session(region_name=sm_session.boto_region_name).client('s3')

    subset = df.loc[df['mt_category'] == category, ['mt_sent_time', 'mt_category', 'scs_dn_rate']]

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    outdir = 'data'
    os.makedirs(outdir, exist_ok=True)

    csv_name = f'{outdir}/{category}.csv'
    subset.to_csv(csv_name, header=False, index=False)

    key = 'puretech_data/' + category + '.csv'
    s3.upload_file(Filename=csv_name, Bucket=bucket_name, Key=key)

In [None]:
# Record the wall-clock start time; a later cell subtracts it to report the elapsed minutes.
tStart = time.time()

In [None]:
# Run the pipeline: load the raw data, build the features, then export one CSV per category.
raw = loadRaw()
df = featureEngineering(raw)
for category in ('schedule', 'retry1'):
    outputCSV(df, category)

In [None]:
# Report the wall-clock time elapsed since tStart, in minutes.
tEnd = time.time()
elapsed_minutes = (tEnd - tStart) / 60
print("Spent %f minutes" % elapsed_minutes)

In [None]:
# Display the raw (unconverted) mt_sent_time column for inspection.
raw['mt_sent_time']