In [ ]:
# standard imports
import json
from pathlib import Path
from itertools import chain

# third party imports
import dask.bag as db
import dask.dataframe as dd
import numpy as np

In [ ]:
# Filesystem locations used by this notebook (all relative to the working
# directory).

# Inputs
RAW_DATA = Path('./raw_data')
JSON_LINES = Path('./json_lines')      # one sub-directory of *.json files per event type
ALL_KEYS = Path('./all_keys/all_keys.json')

# Outputs
DATAFRAMES = Path('./dataframes')      # target directory of the HDF export

In [ ]:
# Full list of column names per event type.
# The keys must match the sub-directory names found under JSON_LINES, because
# these lists are used as the dataframe schema when the bags are converted.
columns = {
    'PageInits': [
        'browserRequest.browser',
        'browserRequest.browserFamily',
        'browserRequest.deviceFamily',
        'browserRequest.os',
        'browserRequest.osFamily',
        'browserRequest.referrerUrl',
        'browserRequest.screenHeight',
        'browserRequest.screenPixelDepth',
        'browserRequest.screenWidth',
        'browserRequest.userAgent',
        'campaignReferrer',
        'deviceVentileGroup',
        'eligiblePlacements',
        'forceAds',
        'pageId',
        'pageInitializationId',
        'placementTypes',
        'productCategoryType',
        'productPath',
        'publisherType',
        'publisherUserId',
        'pureGroupType',
        'request.buildNumber',
        'request.requestId',
        'request.requestedAt',
        'segmentationType',
        'sessionId',
        'siteCountry',
        'siteId',
        'siteType',
        'ventileGroup',
        'webuserId',
    ],
    'Conversions': [
        'advancePurchaseRangeType',
        'browserRequest.browser',
        'browserRequest.browserFamily',
        'browserRequest.deviceFamily',
        'browserRequest.os',
        'browserRequest.osFamily',
        'browserRequest.referrerUrl',
        'browserRequest.screenHeight',
        'browserRequest.screenPixelDepth',
        'browserRequest.screenWidth',
        'browserRequest.userAgent',
        'carDropoffAirportCode',
        'carPickupAirportCode',
        'conversionTransactionType',
        'cookieSize',
        'destination',
        'entityId',
        'isPublisherConversion',
        'knownHotelProperty',
        'orderId',
        'originalConversionCurrency',
        'origination',
        'pageId',
        'productCategoryType',
        'publisherType',
        'publisherUserId',
        'publisherUserIdPerPub',
        'request.buildNumber',
        'request.requestId',
        'request.requestedAt',
        'siteId',
        'siteType',
        'thirdPartyCookieAssigned',
        'thirdPartyWebuserId',
        'travelDateEnd',
        'travelDateStart',
        'travelers',
        'tripType',
    ],
    'ClientSideInfos': [
        'browserRequest.browser',
        'browserRequest.browserFamily',
        'browserRequest.deviceFamily',
        'browserRequest.os',
        'browserRequest.osFamily',
        'browserRequest.referrerUrl',
        'browserRequest.userAgent',
        'clientId',
        'clientTime',
        'customDimensions.clickTarget',
        'customDimensions.documentHeight',
        'customDimensions.documentWidth',
        'customDimensions.label',
        'customDimensions.pageX',
        'customDimensions.pageY',
        'customDimensions.selector',
        'customDimensions.viewportHeight',
        'customDimensions.viewportWidth',
        'name',
        'pageId',
        'pageInitializationId',
        'publisherType',
        'publisherUserId',
        'request.requestId',
        'request.requestedAt',
        'siteType',
    ],
}

In [ ]:
# Columns removed from each event type's dataframe after loading.
DROP_COLUMNS = {
    'PageInits': [
        'browserRequest.browser',
        'browserRequest.browserFamily',
        'browserRequest.deviceFamily',
        'browserRequest.os',
        'browserRequest.osFamily',
        'browserRequest.referrerUrl',
        'browserRequest.userAgent',
        'eligiblePlacements',
        'forceAds',
        'placementTypes',
        'publisherType',
        'pureGroupType',
        'request.buildNumber',
        'request.requestId',
        'sessionId',
        'siteCountry',
        'siteId',
        'siteType',
        'ventileGroup',
        'webuserId',
    ],
    'Conversions': [
        'browserRequest.browser',
        'browserRequest.browserFamily',
        'browserRequest.deviceFamily',
        'browserRequest.os',
        'browserRequest.osFamily',
        'browserRequest.referrerUrl',
        'browserRequest.userAgent',
        'conversionTransactionType',
        'knownHotelProperty',
        'originalConversionCurrency',
        'publisherType',
        'publisherUserIdPerPub',
        'request.buildNumber',
        'siteId',
        'siteType',
        'thirdPartyCookieAssigned',
        'thirdPartyWebuserId',
        'isPublisherConversion',
    ],
    'ClientSideInfos': [
        'browserRequest.browser',
        'browserRequest.browserFamily',
        'browserRequest.deviceFamily',
        'browserRequest.os',
        'browserRequest.osFamily',
        'browserRequest.referrerUrl',
        'browserRequest.userAgent',
        'publisherType',
        'request.requestId',
        'siteType',
    ],
}

In [ ]:
# Columns cast to ``int`` for each event type once the dataframes are built.
NUMERIC_COLUMNS = {
    'PageInits': [
        'browserRequest.screenHeight',
        'browserRequest.screenPixelDepth',
        'browserRequest.screenWidth',
        'deviceVentileGroup',
    ],
    'Conversions': [
        'browserRequest.screenHeight',
        'browserRequest.screenPixelDepth',
        'browserRequest.screenWidth',
        'cookieSize',
        'travelers',
    ],
    'ClientSideInfos': [
        'customDimensions.documentHeight',
        'customDimensions.documentWidth',
        'customDimensions.pageX',
        'customDimensions.pageY',
        'customDimensions.viewportHeight',
        'customDimensions.viewportWidth',
    ],
}

In [ ]:
# Columns parsed into datetimes for each event type.
DATETIME_COLUMNS = {
    'PageInits': [
        'request.requestedAt',
    ],
    'Conversions': [
        'request.requestedAt',
        'travelDateEnd',
        'travelDateStart',
    ],
    'ClientSideInfos': [
        'clientTime',
        'request.requestedAt',
    ],
}

In [ ]:
# import json files as dask bags
# Each sub-directory of JSON_LINES becomes one bag keyed by the directory
# name; every text line of its *.json files is parsed as one JSON document.
# Plain files lying next to those sub-directories (e.g. .DS_Store) are
# skipped: they would otherwise be turned into a '<file>/*.json' pattern that
# can never match, and into a key the later per-folder lookups do not know.
bags = {folder.name: db.read_text(f'{folder}/*.json').map(json.loads)
        for folder in JSON_LINES.glob('*') if folder.is_dir()}

In [ ]:
# Turn every bag into a dask dataframe.
# The meta declares each expected column of the event type with dtype ``str``;
# rows containing any missing value are discarded right away.
# NOTE(review): dropna() runs before the unused columns are removed, so a gap
# in a column that is dropped later still discards the row - confirm intended.
dfs = {
    name: records.to_dataframe(meta=dict.fromkeys(columns[name], str)).dropna()
    for name, records in bags.items()
}

In [ ]:
# Remove, per event type, the columns listed in DROP_COLUMNS.
dfs = {
    name: frame.drop(columns=DROP_COLUMNS[name])
    for name, frame in dfs.items()
}

In [ ]:
# Cast the columns listed in NUMERIC_COLUMNS to ``int`` for each event type.
dfs = {
    name: frame.astype(dict.fromkeys(NUMERIC_COLUMNS[name], int))
    for name, frame in dfs.items()
}

In [ ]:
# convert date columns
def convert_to_date_columns(df, columns):
    """Return ``df`` with each column named in ``columns`` parsed as datetimes.

    The conversion goes through ``assign`` so the dataframe passed in is left
    untouched. Assigning with ``df[col] = ...`` would modify the caller's
    object as a hidden side effect.

    Args:
        df: Dataframe holding the columns to convert (a dask dataframe in
            this notebook).
        columns: Iterable of column names to run through ``dd.to_datetime``.

    Returns:
        A dataframe in which the listed columns are converted; ``df`` itself
        when ``columns`` is empty.
    """
    converted = df
    for col in columns:
        # Names such as 'request.requestedAt' are not valid keyword
        # identifiers, hence the dict unpacking instead of a plain keyword.
        converted = converted.assign(**{col: dd.to_datetime(converted[col])})
    return converted

# Parse the columns listed in DATETIME_COLUMNS for every event type.
dfs = {
    name: convert_to_date_columns(frame, DATETIME_COLUMNS[name])
    for name, frame in dfs.items()
}

In [ ]:
# export to hdf file
# Make sure the target directory exists first; otherwise the export fails on a
# fresh checkout where ./dataframes has not been created yet.
DATAFRAMES.mkdir(parents=True, exist_ok=True)
for folder, df in dfs.items():
    # One HDF file per event type. The '*' in the key is filled in by dask
    # so that the partitions are stored under separate keys.
    # NOTE(review): the resulting node names contain a hyphen, which is
    # presumably what triggers the "natural naming" warnings printed by this
    # cell. The key is kept unchanged so existing readers of these files keep
    # working - switch to e.g. 'df_*' only together with them.
    df.to_hdf(f'{DATAFRAMES}/{folder}.hdf', key='df-*')

  check_attribute_name(name)
`; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
`; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  check_attribute_name(name)
`; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  check_attribute_name(name)
`; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
`; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
`; you will not be able to use natural naming to access this object; using ``getattr()`