# Import libraries

In [2]:
import pandas as pd
from pandas import json_normalize
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import matplotlib.pyplot as plt
import json
import os
import re
from tqdm import tqdm

# Functions

In [3]:
def remove_trailing_commas(file_path):
    """Remove a trailing comma before the file's final closing bracket, in place.

    The pattern is anchored to the end of the text, so only a comma that is
    followed (ignoring whitespace) by the last ``]`` or ``}`` of the file is
    removed. Trailing commas deeper inside the document are left untouched.

    The file is rewritten only when the substitution actually changed the
    text, so already-clean files are not touched on disk.

    Args:
        file_path: Path of the JSON file to repair.

    Returns:
        None.
    """
    with open(file_path, 'r') as json_file:
        data = json_file.read()

    # `$` ties the lookahead to the end of the text: comma, optional
    # whitespace, one closing bracket/brace, optional whitespace, end.
    corrected_data = re.sub(r',(?=\s*[\]}]\s*[\r\n]*$)', '', data)

    # Nothing to repair: leave the source file as it is instead of
    # truncating and rewriting it on every run.
    if corrected_data == data:
        return

    with open(file_path, 'w') as json_file:
        json_file.write(corrected_data)

def load_json(file_path):
    """Read ``file_path`` and return its parsed JSON content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or None when the text is not valid JSON
        (the decoding error is printed in that case).
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()

    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError as e:
        # Report the problem and let the caller skip this file.
        print(f"Error decoding JSON in {file_path}: {e}")
        return None

    return parsed

def create_dataframe(directory, batch_size=10, process_all_files=False, max_files=1000):
    """Build one DataFrame from the ``.json`` files found in ``directory``.

    Each file is passed through ``remove_trailing_commas`` (which may rewrite
    it on disk), loaded with ``load_json`` and flattened with
    ``pd.json_normalize(..., max_level=3)``. A ``file_name`` column holding
    the file's name without its extension is added to every per-file frame,
    and all frames are concatenated once at the end.

    Args:
        directory: Folder whose entries ending in ``.json`` are processed
            (sub-folders are not walked).
        batch_size: Number of files per iteration batch. Batching only groups
            the loop; every frame is still kept until the final concat, so it
            does not change the result.
        process_all_files: When False, only the first ``max_files`` files in
            sorted name order are processed.
        max_files: Cap on the number of files used when ``process_all_files``
            is False.

    Returns:
        The combined DataFrame, or None when the directory contains no
        ``.json`` files or none of them could be loaded.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # "first N files" subset (and the row order) reproducible between runs.
    json_files = sorted(f for f in os.listdir(directory) if f.endswith('.json'))

    # Try a capped subset first to see whether memory can handle it.
    if not process_all_files:
        json_files = json_files[:max_files]

    if not json_files:
        print("No JSON files found.")
        return None

    # One DataFrame per successfully loaded JSON file.
    dfs = []

    for i in range(0, len(json_files), batch_size):
        for current_json_file in json_files[i:i + batch_size]:
            file_path = os.path.join(directory, current_json_file)

            remove_trailing_commas(file_path)

            data = load_json(file_path)
            if data is None:
                # load_json already reported the decoding error; skip the file.
                continue

            df = pd.json_normalize(data, max_level=3)

            # Record which file each row came from (name without ".json").
            df['file_name'] = os.path.splitext(current_json_file)[0]
            dfs.append(df)

    # pd.concat raises ValueError on an empty list, which would happen when
    # every file failed to decode.
    if not dfs:
        print("No JSON files could be loaded.")
        return None

    # A single concat after the loop avoids repeatedly copying a growing frame.
    return pd.concat(dfs, ignore_index=True, sort=False)

In [4]:
# Relative path of the folder that holds the JSON files to load
json_directory = r'../data/output/'

# Load only the capped subset of files (process_all_files=False)
result_df_first_1000 = create_dataframe(
    json_directory,
    batch_size=5,
    process_all_files=False,
)

In [5]:
# create_dataframe returns None when no JSON files were found, so only
# print the frame when one was actually built.
if result_df_first_1000 is not None:
    print(result_df_first_1000)

                                         _id          imei  \
0       fc9878d4-f775-48d8-b8af-20da7c7775f2  8.685001e+14   
1       753da335-f4cb-45c5-b580-49c74da8be4c  8.685001e+14   
2       fa6c8dc8-d4eb-4c00-8cfa-6b7343d18bb5  8.685001e+14   
3       061af346-83ce-4a31-8209-f1f6820800e9  8.685001e+14   
4       f7295490-5b01-459a-98f7-ef45888b27e9  8.685001e+14   
...                                      ...           ...   
725856  9859b82d-cb15-4441-bd93-cd05c217f90f  8.685001e+14   
725857  6347c278-b3d0-4f11-9769-27ccc893352d  8.685001e+14   
725858  83903a2c-0296-4e0d-86bb-4457bb0c9fa6  8.685001e+14   
725859  ecc0713c-0aa5-4257-b3ea-d7b72dd033c1  8.685001e+14   
725860  96b42cc2-b3ea-407a-8822-45a309d6eba3  8.685001e+14   

                         dts                   dtd       event.key  \
0       2023-11-25T11:54:55Z  2023-11-25T11:54:47Z    battery_info   
1       2023-11-26T11:54:31Z  2023-11-26T11:54:29Z             NaN   
2       2023-11-27T11:54:50Z  2023-11-27T11:5

In [6]:
# List every column label of the combined frame, one per line
for column_label in result_df_first_1000.columns:
    print(column_label)

_id
imei
dts
dtd
event.key
event.dte
tracker.metric.bbatv
tracker.metric.bbatp
tracker.loc.dtg
tracker.loc.hdop
tracker.loc.alt
tracker.loc.ang
tracker.loc.sp
tracker.gsm.mcc
tracker.gsm.mnc
tracker.gsm.lac
tracker.gsm.cid
tracker.metric.rssi
tracker.metric.moving
device.metric.deculock
device.metric.dstatus
tracker.config.poutput
device.metric.bsocp
device.metric.bmv
device.config.bfcc
device.config.bfccp
device.metric.btemp
device.metric.bpackv
tracker.config.ecutype
device.metric.bcyc
device.config.dwheel
device.config.bdcc
device.config.bnomv
device.config.ecutype
device.metric.delectemp
device.metric.dpedcad
device.metric.dactualsp
device.metric.dlight
device.metric.dwheels
device.metric.bcur
device.metric.bstate
device.metric.msupp
device.metric.mpow
device.metric.dmostu
device.metric.dmostv
device.metric.dmostw
device.metric.dridem
device.metric.ddayl
device.metric.dcontodo
device.metric.dtrip
device.metric.dwalk
event.meta.status
device.metric.bdtlastc
device.config.dcontver
de

In [7]:
# Column to inspect. The printed labels are built from this name so they
# always match the column that is actually filtered on.
inspected_column = 'buff'

# Rows in which the inspected column holds a value
rows_with_value = result_df_first_1000[result_df_first_1000[inspected_column].notna()]

if rows_with_value.empty:
    # .iloc[0] would raise IndexError on an empty selection
    print(f"No row where '{inspected_column}' is not NaN.")
else:
    filtered_row = rows_with_value.iloc[0]

    # display row
    print(f"Row where '{inspected_column}' is not NaN:")
    print(filtered_row)

    # print value in the row
    # (variable name kept as-is in case other cells use it; it holds the
    # value of `inspected_column`)
    eventmetatriggered_by_value = filtered_row[inspected_column]
    print(f"\nValue in the '{inspected_column}' column: {eventmetatriggered_by_value}")


Row where 'event.meta.triggered_by' is not NaN:
_id                        c0182215-c6ef-41c2-93f2-86acf3234948
imei                                          868500050014075.0
dts                                        2023-11-26T11:40:00Z
dtd                                        2023-11-26T10:07:20Z
event.key                                          battery_info
                                           ...                 
device.metric.merr                                          NaN
tracker.config.fwver                                        NaN
tracker.config.gsmfwver                                     NaN
event.meta.message                                          NaN
event.meta.step                                             NaN
Name: 3297, Length: 99, dtype: object

Value in the 'event.meta.triggered_by' column: True


In [13]:
# Export the dataframe to a CSV file (currently disabled; uncomment the next line to write it)
# result_df_first_1000.to_csv(r'../data/first_1000_JSON_files.csv', index=False)

# ydata profiling

In [None]:
from ydata_profiling import ProfileReport

# Build the report in minimal mode for the loaded frame
profile = ProfileReport(
    result_df_first_1000,
    title="Profiling Report",
    minimal=True,
)

# Write the report next to the notebook as a standalone HTML file
profile.to_file("profiling_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]