In [ ]:
# standard imports
import json
from pathlib import Path
from itertools import chain

# third party imports
import dask.bag as db
import numpy as np

In [ ]:
# constants
RAW_DATA = Path('./raw_data')
DATAFRAMES = Path('./dataframes')
JSON_LINES = Path('./json_lines')
ALL_KEYS = Path('./all_keys/all_keys.json')

In [ ]:
# Per-folder record filters, keyed by raw-data folder name. Each inner dict
# maps a record field to the value it must hold for the record to be kept.
# load_json compares values case-insensitively, so the differing casing
# between the entries below is harmless.
filter_dic = {
    'ClientSideInfos': {
        "browserRequest.browserFamily": "Chrome",
        "browserRequest.deviceFamily": "Computer",
        "browserRequest.osFamily": "Windows",
    },
    'Conversions': {
        "browserRequest.browserFamily": "CHROME",
        "browserRequest.deviceFamily": "COMPUTER",
        "browserRequest.osFamily": "WINDOWS",
    },
    'PageInits': {
        "browserRequest.browserFamily": "CHROME",
        "browserRequest.deviceFamily": "COMPUTER",
        "browserRequest.osFamily": "WINDOWS",
    },
}

def load_json(json_path, filter_dic_, keys_list):
    """Yield the records of a JSON file that pass a filter, padded to a key set.

    Parameters
    ----------
    json_path : pathlib.Path
        File whose content is a JSON array of objects.
    filter_dic_ : dict
        Maps a record field to the string value it must hold. The comparison
        is case-insensitive. A record that lacks one of these fields, or holds
        a non-string value there, is treated as not matching.
    keys_list : iterable
        All keys every yielded record is expected to carry.

    Yields
    ------
    dict
        Each matching record, modified in place so that every key from
        ``keys_list`` it did not already have is present with value ``None``.
        Keys the record has beyond ``keys_list`` are left untouched.
    """
    with json_path.open('r') as f:
        list_of_dics = json.load(f)

    # Both are invariant across records, so build them once up front.
    expected_keys = set(keys_list)
    wanted = {k: v.lower() for k, v in filter_dic_.items()}

    for dic in list_of_dics:
        if all(
            isinstance(dic.get(k), str) and dic[k].lower() == v
            for k, v in wanted.items()
        ):
            # Add the keys this record is missing (expected minus present),
            # not the other way round, which would blank out real values.
            for k in expected_keys - dic.keys():
                dic[k] = None
            yield dic

In [ ]:
# Read the key lists stored in ALL_KEYS; later code indexes the parsed
# result by folder name.
all_keys = json.loads(ALL_KEYS.read_text())

In [ ]:
# Map every entry directly under RAW_DATA to a lazy glob of its *.json files.
json_files = {}
for raw_entry in RAW_DATA.glob('*'):
    json_files[raw_entry.name] = raw_entry.glob('*.json')

In [ ]:
# For each folder, pair every JSON file name with a lazy load_json generator.
# Nothing is read from disk here; the files are parsed when the generators
# are consumed.
json_data = {}
for folder, file_list in json_files.items():
    generators_by_name = {}
    for file_ in file_list:
        # The per-folder lookups stay inside this loop so that a folder
        # without any files never triggers them.
        generators_by_name[file_.name] = load_json(
            file_, filter_dic[folder], all_keys[folder]
        )
    json_data[folder] = generators_by_name

In [ ]:
# Write every filtered record as one JSON document per line, mirroring the
# raw-data layout under JSON_LINES (<folder>/<original file name>).
for folder_name, records_by_file in json_data.items():
    target_dir = JSON_LINES / folder_name
    target_dir.mkdir(parents=True, exist_ok=True)

    for file_name, records in records_by_file.items():
        # Append mode: an already existing output file is extended rather
        # than overwritten.
        with (target_dir / file_name).open('a') as sink:
            sink.writelines(json.dumps(record) + '\n' for record in records)