In [None]:
import pandas as pd
pd.__version__

In [None]:
# Root folder of the raw event dumps; every event type is read from its own
# sub-folder below it.
data_dir = 'data/'
# NOTE(review): "transanctions" is a misspelling of "transactions". The name is
# left as-is because other cells reference it with this spelling.
data_transanctions = data_dir + 'transactions/'
data_clicks = data_dir + 'clicks/'
data_impressions = data_dir + 'impressions/'
data_pageViews = data_dir + 'pageViews/'
# Number of part files loaded per event type (the load loop iterates
# range(0, NUM_DATAFILES) and builds names 'part-00', 'part-01', ...).
NUM_DATAFILES = 37

In [None]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as readable text.

    Args:
        pandas_obj: A ``pd.DataFrame``; any other object is treated as a
            ``pd.Series``.

    Returns:
        str: The size in units of 1024**2 bytes, two decimals, followed by
        ``" MB"`` (for example ``"8.00 MB"``).
    """
    bytes_used = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one figure per column (plus the index): add them up.
        bytes_used = bytes_used.sum()
    megabytes = bytes_used / 1024 ** 2
    return "{:03.2f} MB".format(megabytes)

In [None]:
def extract_info(dataFrame):
    """Collect client attributes from the ``info`` column into module globals.

    Every entry of ``dataFrame['info']`` is indexed with the keys
    ``'browserId'``, ``'os'``, ``'source'`` and ``'browser'``. The values are
    appended, in row order, to the module-level lists ``user_ids``, ``oss``,
    ``sources`` and ``browsers``, which are re-created empty on each call.

    Args:
        dataFrame: Object whose ``'info'`` item iterates over such entries.

    Raises:
        KeyError: If an entry lacks one of the four keys.
    """
    global user_ids, oss, sources, browsers
    user_ids, oss, sources, browsers = [], [], [], []
    # (key inside an info entry, list that receives the value)
    key_to_bucket = (
        ('browserId', user_ids),
        ('os', oss),
        ('source', sources),
        ('browser', browsers),
    )
    for record in dataFrame['info']:
        for key, bucket in key_to_bucket:
            bucket.append(record[key])

In [None]:
def append_info(dataFrame):
    """Store the module-level scratch lists as columns of ``dataFrame``.

    Reads the globals ``user_ids``, ``oss``, ``sources`` and ``browsers``
    (the lists that ``extract_info`` fills) and assigns them to the columns
    ``user_id``, ``os``, ``source`` and ``browser`` respectively.

    Args:
        dataFrame: Frame that receives the four columns; modified in place.
    """
    column_values = (
        ('user_id', user_ids),
        ('os', oss),
        ('source', sources),
        ('browser', browsers),
    )
    for column, values in column_values:
        dataFrame[column] = values

In [None]:
def clear_tmp_variables():
    """Reset every module-level scratch list used by the extract/clean helpers.

    Rebinds ``user_ids``, ``oss``, ``sources``, ``browsers``, ``groups`` and
    ``sessions`` to fresh empty lists so the per-row values they held can be
    garbage-collected.

    ``groups`` and ``sessions`` must be named in the ``global`` statement:
    without that, the assignment below only creates two throwaway locals and
    the module-level lists filled by ``clean_ab_values`` stay alive.
    """
    global user_ids, oss, sources, browsers, groups, sessions
    user_ids, oss, sources, browsers, groups, sessions = [], [], [], [], [], []

In [None]:
def clean_browser_values(dataFrame):
    """Reduce the ``browser`` column to its first word, in place.

    Every occurrence of ``'Mobile '`` is removed from the value, then only the
    text before the first space is kept (``'Mobile Safari 11.0'`` becomes
    ``'Safari'``). Missing values are passed through as missing instead of
    raising, which the previous per-row Python loop did.

    Args:
        dataFrame: Frame with a string ``browser`` column; modified in place.

    Side effects:
        The module-level list ``browsers`` is rebound to the cleaned values,
        as before, so code that reads it afterwards keeps working.
    """
    global browsers
    first_words = (
        dataFrame['browser']
        .str.replace('Mobile ', '')
        # Only the first token is needed, so stop splitting after one cut.
        .str.split(' ', n=1)
        .str[0]
    )
    browsers = first_words.tolist()
    # Assign the plain list (not the Series) so no index alignment takes place.
    dataFrame['browser'] = browsers

In [None]:
def clean_os_values(dataFrame):
    """Reduce the ``os`` column to its first word, in place.

    Only the text before the first space is kept (``'Windows 10'`` becomes
    ``'Windows'``). Missing values are passed through as missing instead of
    raising, which the previous per-row Python loop did.

    Args:
        dataFrame: Frame with a string ``os`` column; modified in place.

    Side effects:
        The module-level list ``oss`` is rebound to the cleaned values, as
        before, so code that reads it afterwards keeps working.
    """
    global oss
    # Only the first token is needed, so stop splitting after one cut.
    first_words = dataFrame['os'].str.split(' ', n=1).str[0]
    oss = first_words.tolist()
    # Assign the plain list (not the Series) so no index alignment takes place.
    dataFrame['os'] = oss

In [None]:
def clean_ab_values(dataFrame):
    """Split the ``ab`` column on ``'/'`` into ``group`` and ``session`` columns.

    The first piece becomes ``group`` and the second becomes ``session``; any
    further pieces are ignored. A value without a ``'/'`` yields a missing
    ``session``, and a missing ``ab`` yields a missing value in both columns.
    The previous per-row loop raised IndexError / TypeError in those cases.

    Args:
        dataFrame: Frame with a string ``ab`` column; modified in place (the
            ``ab`` column itself is left untouched).

    Side effects:
        The module-level lists ``groups`` and ``sessions`` are rebound to the
        extracted values, as before.
    """
    global groups, sessions
    pieces = dataFrame['ab'].str.split('/')
    # .str[i] returns a missing value when a row has fewer than i + 1 pieces.
    groups = pieces.str[0].tolist()
    sessions = pieces.str[1].tolist()
    dataFrame['group'] = groups
    dataFrame['session'] = sessions

In [None]:
def set_column_as_category(dataFrame, column_name):
    """Convert one column of ``dataFrame`` to the ``category`` dtype, in place.

    Args:
        dataFrame: Frame to modify.
        column_name: Label of an existing column.
    """
    as_category = dataFrame[column_name].astype('category')
    dataFrame[column_name] = as_category

In [None]:
def extract_shared_variables(dataFrame):
    """Run the extract-and-clean pipeline on one raw event frame, in place.

    The frame must provide the ``info`` column (read by ``extract_info``) and
    the ``ab`` column (read by ``clean_ab_values``). Afterwards it carries the
    columns ``user_id``, ``os``, ``source``, ``browser``, ``group`` and
    ``session``.

    The helpers exchange data through module-level lists, so the order of the
    calls below matters.
    """
    # Fill the module-level lists from 'info', then copy them into columns.
    extract_info(dataFrame)
    append_info(dataFrame)
    clean_browser_values(dataFrame)
    # Reset the scratch lists before the remaining cleaning steps run.
    clear_tmp_variables()
    clean_os_values(dataFrame)
    clean_ab_values(dataFrame)
    # Reset the scratch lists again once the frame is fully populated.
    clear_tmp_variables()

In [None]:
def reduce_memory_usage_pageViews(df_pageViews):
    """Shrink a pageViews frame in place, reporting its size along the way.

    The columns ``type``, ``info``, ``tags`` and ``ab`` are dropped when
    present. The columns ``group``, ``name``, ``source``, ``browser``, ``os``,
    ``id`` and ``session`` are then converted to ``category``. ``info()`` is
    printed before and after, and ``mem_usage`` after every step except the
    final ``session`` conversion.

    Args:
        df_pageViews: Frame to reduce; modified in place.
    """
    df_pageViews.info(memory_usage='deep')
    print(mem_usage(df_pageViews))

    # Columns that are dropped, but only when the frame actually has them.
    for unused in ('type', 'info', 'tags', 'ab'):
        if unused in df_pageViews.columns:
            df_pageViews.drop(unused, axis=1, inplace=True)
            print(mem_usage(df_pageViews))

    for column in ('group', 'name', 'source', 'browser', 'os', 'id'):
        set_column_as_category(df_pageViews, column)
        print(mem_usage(df_pageViews))

    # The last conversion is followed by the closing info() report only.
    set_column_as_category(df_pageViews, 'session')
    df_pageViews.info(memory_usage='deep')

In [None]:
def reduce_memory_usage_impressions(df_impressions):
    """Shrink an impressions frame in place, reporting its size along the way.

    The columns ``type``, ``vrlId``, ``ab`` and ``info`` are dropped when
    present. The columns ``algRef``, ``feature``, ``id``, ``page``,
    ``user_id``, ``os``, ``source``, ``browser``, ``group`` and ``session``
    are then converted to ``category``. ``info()`` is printed before and
    after, and ``mem_usage`` after every step.

    Args:
        df_impressions: Frame to reduce; modified in place.
    """
    df_impressions.info(memory_usage='deep')
    print(mem_usage(df_impressions))

    # Columns that are dropped, but only when the frame actually has them.
    for unused in ('type', 'vrlId', 'ab', 'info'):
        if unused in df_impressions.columns:
            df_impressions.drop(unused, axis=1, inplace=True)
            print(mem_usage(df_impressions))

    categorical_columns = (
        'algRef', 'feature', 'id', 'page', 'user_id',
        'os', 'source', 'browser', 'group', 'session',
    )
    for column in categorical_columns:
        set_column_as_category(df_impressions, column)
        print(mem_usage(df_impressions))

    df_impressions.info(memory_usage='deep')

In [None]:
def reduce_memory_usage_clicks(df_clicks):
    """Shrink a clicks frame in place, reporting its size along the way.

    The columns ``type``, ``vrlId``, ``ab`` and ``info`` are dropped when
    present. The columns ``feature``, ``id``, ``page``, ``user_id``,
    ``product``, ``os``, ``source``, ``browser``, ``group`` and ``session``
    are then converted to ``category``. ``info()`` is printed before and
    after, and ``mem_usage`` after every step.

    Args:
        df_clicks: Frame to reduce; modified in place.
    """
    df_clicks.info(memory_usage='deep')
    print(mem_usage(df_clicks))

    # Columns that are dropped, but only when the frame actually has them.
    for unused in ('type', 'vrlId', 'ab', 'info'):
        if unused in df_clicks.columns:
            df_clicks.drop(unused, axis=1, inplace=True)
            print(mem_usage(df_clicks))

    categorical_columns = (
        'feature', 'id', 'page', 'user_id', 'product',
        'os', 'source', 'browser', 'group', 'session',
    )
    for column in categorical_columns:
        set_column_as_category(df_clicks, column)
        print(mem_usage(df_clicks))

    df_clicks.info(memory_usage='deep')

In [None]:
def reduce_memory_usage_transactions(df_transactions):
    """Shrink a transactions frame in place, reporting its size along the way.

    The columns ``type``, ``ab`` and ``info`` are dropped when present. The
    columns ``id``, ``user_id``, ``paymentType``, ``os``, ``source``,
    ``browser``, ``group`` and ``session`` are then converted to
    ``category``. ``info()`` is printed before and after, and ``mem_usage``
    after every step.

    Args:
        df_transactions: Frame to reduce; modified in place.
    """
    df_transactions.info(memory_usage='deep')
    print(mem_usage(df_transactions))

    # Columns that are dropped, but only when the frame actually has them.
    for unused in ('type', 'ab', 'info'):
        if unused in df_transactions.columns:
            df_transactions.drop(unused, axis=1, inplace=True)
            print(mem_usage(df_transactions))

    categorical_columns = (
        'id', 'user_id', 'paymentType', 'os',
        'source', 'browser', 'group', 'session',
    )
    for column in categorical_columns:
        set_column_as_category(df_transactions, column)
        print(mem_usage(df_transactions))

    df_transactions.info(memory_usage='deep')

In [None]:
# Accumulators: one processed DataFrame per non-empty part file, per event type.
# The cells further down concatenate and pickle these lists.
df_pageViews_to_merge = []
df_impressions_to_merge = []
df_clicks_to_merge = []
df_transactions_to_merge = []
for i in range(0, NUM_DATAFILES):
    print()
    print()
    print("=========Iteration:", i, "==========")
    print()
    # Part files are named 'part-' plus the index zero-padded to two digits.
    datafile = "{:02.0f}".format(i)
    # All four event files of this part are loaded (as line-delimited JSON)
    # before any of them is processed.
    df_transactions = pd.read_json(data_transanctions + 'part-' + datafile, lines=True)
    df_clicks = pd.read_json(data_clicks + 'part-' + datafile, lines=True)
    df_impressions = pd.read_json(data_impressions + 'part-' + datafile, lines=True)
    df_pageViews = pd.read_json(data_pageViews + 'part-' + datafile, lines=True)
    
    # For each event type: skip a part with no rows; otherwise add the shared
    # columns, shrink the frame and queue it for the final concat.
    if not df_pageViews.empty:
        extract_shared_variables(df_pageViews)
        print('PageViews:', len(df_pageViews))
        reduce_memory_usage_pageViews(df_pageViews)
        df_pageViews_to_merge.append(df_pageViews)
    
    if not df_impressions.empty:
        extract_shared_variables(df_impressions)
        print('Impressions:', len(df_impressions))
        reduce_memory_usage_impressions(df_impressions)
        df_impressions_to_merge.append(df_impressions)
    
    if not df_clicks.empty:
        extract_shared_variables(df_clicks)
        print('Clicks:', len(df_clicks))
        reduce_memory_usage_clicks(df_clicks)
        df_clicks_to_merge.append(df_clicks)
    
    if not df_transactions.empty:
        extract_shared_variables(df_transactions)
        print('Transactions:', len(df_transactions))
        reduce_memory_usage_transactions(df_transactions)
        df_transactions_to_merge.append(df_transactions)
    

In [None]:
# Number of non-empty parts collected per event type, one count per line
# (pageViews, impressions, clicks, transactions).
print(
    len(df_pageViews_to_merge),
    len(df_impressions_to_merge),
    len(df_clicks_to_merge),
    len(df_transactions_to_merge),
    sep='\n',
)

### Saving the .pickle files

##### pageViews

In [None]:
# Concatenate the per-part pageViews frames into one DataFrame (ignore_index is
# not set, so each part keeps its own row labels), then show the total row count.
df_merged = pd.concat(df_pageViews_to_merge)
len(df_merged)

In [None]:
# Dtypes and deep memory footprint of the merged pageViews frame.
df_merged.info(memory_usage="deep")

In [None]:
# Re-apply the pageViews reduction to the merged frame.
# NOTE(review): presumably needed because concatenating per-part categorical
# columns can fall back to object dtype -- confirm against the info() output.
# reduce_memory_usage_pageViews already ends with the same info() call, so the
# second line prints that report once more.
reduce_memory_usage_pageViews(df_merged)
df_merged.info(memory_usage="deep")

In [None]:
# Persist the merged pageViews frame next to its part files
# (data/pageViews/full_pageViews.pickle).
df_merged.to_pickle(data_pageViews + 'full_pageViews.pickle')

##### impressions

In [None]:
# Concatenate the per-part impressions frames into one DataFrame (ignore_index
# is not set, so each part keeps its own row labels), then show the row count.
# Note that this rebinds df_merged, replacing the frame of the previous section.
df_merged = pd.concat(df_impressions_to_merge)
len(df_merged)

In [None]:
# Dtypes and deep memory footprint of the merged impressions frame.
df_merged.info(memory_usage="deep")

In [None]:
# Re-apply the impressions reduction to the merged frame.
# NOTE(review): presumably needed because concatenating per-part categorical
# columns can fall back to object dtype -- confirm against the info() output.
# reduce_memory_usage_impressions already ends with the same info() call, so
# the second line prints that report once more.
reduce_memory_usage_impressions(df_merged)
df_merged.info(memory_usage="deep")

In [None]:
# Persist the merged impressions frame next to its part files
# (data/impressions/full_impressions.pickle).
df_merged.to_pickle(data_impressions + 'full_impressions.pickle')

##### clicks

In [None]:
# Concatenate the per-part clicks frames into one DataFrame (ignore_index is
# not set, so each part keeps its own row labels), then show the row count.
# Note that this rebinds df_merged, replacing the frame of the previous section.
df_merged = pd.concat(df_clicks_to_merge)
len(df_merged)

In [None]:
# Dtypes and deep memory footprint of the merged clicks frame.
df_merged.info(memory_usage="deep")

In [None]:
# df_merged holds the clicks data at this point, so it needs the clicks
# reduction. The impressions variant that was called here converts an 'algRef'
# column and never converts 'product', which is the opposite of what
# reduce_memory_usage_clicks does.
# reduce_memory_usage_clicks already ends with the same info() call, so the
# second line prints that report once more.
reduce_memory_usage_clicks(df_merged)
df_merged.info(memory_usage="deep")

In [None]:
# Persist the merged clicks frame next to its part files
# (data/clicks/full_clicks.pickle).
df_merged.to_pickle(data_clicks + 'full_clicks.pickle')

##### transactions

In [None]:
# Concatenate the per-part transactions frames into one DataFrame (ignore_index
# is not set, so each part keeps its own row labels), then show the row count.
# Note that this rebinds df_merged, replacing the frame of the previous section.
df_merged = pd.concat(df_transactions_to_merge)
len(df_merged)

In [None]:
# Dtypes and deep memory footprint of the merged transactions frame.
df_merged.info(memory_usage="deep")

In [None]:
# df_merged holds the transactions data at this point, so it needs the
# transactions reduction. The impressions variant that was called here converts
# 'algRef', 'feature' and 'page' and never converts 'paymentType', unlike
# reduce_memory_usage_transactions.
# reduce_memory_usage_transactions already ends with the same info() call, so
# the second line prints that report once more.
reduce_memory_usage_transactions(df_merged)
df_merged.info(memory_usage="deep")

In [None]:
# Persist the merged transactions frame next to its part files
# (data/transactions/full_transactions.pickle). Writing to
# data_clicks + 'full_clicks.pickle', as this cell did before, overwrote the
# clicks pickle with transactions data and never saved a transactions file.
df_merged.to_pickle(data_transanctions + 'full_transactions.pickle')