# Import libraries

In [None]:
import pandas as pd
from pandas import json_normalize
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import matplotlib.pyplot as plt
import json
import os
import re
from tqdm import tqdm

# Functions

In [None]:
def remove_trailing_commas(file_path):
    """Remove a dangling comma in front of the final closing bracket of a file.

    The whole file is read as text. A comma is dropped only when it is
    followed (ignoring whitespace) by a ``]`` or ``}`` that is itself the last
    non-whitespace character of the text; the pattern is anchored to the end
    of the content, so commas before closing brackets elsewhere in the
    document are left untouched.

    Args:
        file_path: Path of the file to clean up in place.

    Returns:
        None. The file is rewritten only when a comma was actually removed.
    """
    with open(file_path, 'r') as json_file:
        data = json_file.read()

    # subn() also reports how many substitutions were made, which tells us
    # whether the file needs to be touched at all
    corrected_data, removed_count = re.subn(
        r',(?=\s*[\]}]\s*[\r\n]*$)', '', data
    )

    # Opening with 'w' truncates the file before writing; skip that entirely
    # for files that are already clean so they are never modified needlessly.
    if removed_count:
        with open(file_path, 'w') as json_file:
            json_file.write(corrected_data)

# function to load in the json file        
def load_json(file_path):
    """Parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` when the file does not contain
        valid JSON (the decoding error is printed in that case).
    """
    try:
        with open(file_path, 'r') as handle:
            return json.load(handle)
    except json.JSONDecodeError as e:
        # malformed files are reported and skipped instead of aborting the run
        print(f"Error decoding JSON in {file_path}: {e}")
        return None

# function to create a dataframe from all the json files.
# trying it in batches since memory can't handle it
def create_dataframe(directory, batch_size=10, process_all_files=False):
    """Build a single DataFrame from the ``.json`` files in ``directory``.

    Each file is first cleaned with ``remove_trailing_commas`` (which edits
    the file in place), then loaded with ``load_json`` and flattened with
    ``pd.json_normalize(..., max_level=3)``. A ``file_name`` column holding
    the file name without its extension is added to every per-file frame
    before all frames are concatenated.

    Args:
        directory: Directory that is scanned (non-recursively) for files
            whose name ends with ``.json``.
        batch_size: Number of files handled per iteration of the outer loop.
            It only groups the iteration; every per-file frame is kept until
            the final concatenation.
        process_all_files: When False, only the first 1000 files (in sorted
            name order) are processed.

    Returns:
        The concatenated DataFrame, or ``None`` when the directory holds no
        JSON files or none of them could be decoded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 while there are files
            to process.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # 1000-file sample (and the row order of the result) reproducible
    json_files = sorted(f for f in os.listdir(directory) if f.endswith('.json'))

    # try it with the first 1000 files first to see if memory can handle it
    if not process_all_files:
        json_files = json_files[:1000]

    if not json_files:
        print("No JSON files found.")
        return None

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # store df's for each JSON file
    dfs = []

    # process JSON files in batches
    for i in range(0, len(json_files), batch_size):
        for current_json_file in json_files[i:i + batch_size]:
            file_path = os.path.join(directory, current_json_file)

            remove_trailing_commas(file_path)

            # load the content of the current JSON file
            data = load_json(file_path)
            if data is None:
                # undecodable file: load_json already reported it
                continue

            # create a df for the current JSON file
            df = pd.json_normalize(data, max_level=3)

            # add new column for the file name without ".json"
            df['file_name'] = os.path.splitext(current_json_file)[0]

            dfs.append(df)

    # pd.concat raises ValueError on an empty list, which happens when every
    # file failed to decode
    if not dfs:
        print("No JSON files could be loaded.")
        return None

    # concatenate once, after the loop, instead of growing a frame per file
    return pd.concat(dfs, ignore_index=True, sort=False)

In [None]:
# Folder containing the JSON files to combine (relative to the notebook)
json_directory = r'../data/output/'

# Build the frame from the 1000-file sample, five files per batch
result_df_first_1000 = create_dataframe(
    directory=json_directory,
    batch_size=5,
    process_all_files=False,
)

In [None]:
# create_dataframe returns None when no JSON files are found, so only print
# the combined frame when one was actually built
if result_df_first_1000 is not None:
    print(result_df_first_1000)

In [None]:
# Showcase the unique columns
# create_dataframe returns None when no JSON files are found; iterating None
# would raise a TypeError, so skip the listing in that case
if result_df_first_1000 is not None:
    # iterating a DataFrame yields its column labels
    for col in result_df_first_1000:
        print(col)

In [None]:
# Column whose first non-NaN row is inspected; the printed labels are derived
# from this name so they always match the column that is actually filtered
inspected_column = 'buff'

# keep only the rows that have a value in the inspected column
rows_with_value = result_df_first_1000[result_df_first_1000[inspected_column].notna()]

if rows_with_value.empty:
    # .iloc[0] on an empty selection would raise IndexError
    print(f"No rows where '{inspected_column}' is not NaN.")
else:
    filtered_row = rows_with_value.iloc[0]

    # display row
    print(f"Row where '{inspected_column}' is not NaN:")
    print(filtered_row)

    # print value in the row
    eventmetatriggered_by_value = filtered_row[inspected_column]
    print(f"\nValue in the '{inspected_column}' column: {eventmetatriggered_by_value}")


In [None]:
# Export the dataframe to a csv file
# result_df_first_1000.to_csv(r'../data/first_1000_JSON_files.csv', index=False)