In [6]:
#Import Libraries
import datetime
import json
import os
import csv
import numpy as np
import pandas as pd
import shapefile
import scripthelper

from glob import glob 
from enum import Enum
from functools import reduce
from itertools import count
from operator import add
from tqdm.notebook import tqdm
from numpy import inf
from pathlib import Path
from contextlib import contextmanager
from typing import Iterable, Dict, Tuple, List

# -----------------------------------------------------------------
# ---------------FUNCTION HELPERS-----------------
# -----------------------------------------------------------------

In [None]:
# ---------------------------------------------------------------
# ----------------------------CSVs IO----------------------------
# ---------------------------------------------------------------

@contextmanager
def write_csv(path) -> csv.writer:
    """Open ``path`` for writing and yield a ``csv.writer`` bound to it.

    The file is created (or truncated) as UTF-8 text with ``newline=''``, so
    no newline translation is applied on top of what the csv writer emits.
    The handle is closed when the ``with`` block ends, including when the
    body of that block raises.

    Args:
        path: Destination of the CSV file (anything ``open`` accepts).

    Yields:
        A ``csv.writer`` that writes rows into the opened file.
    """
    with open(path, 'w', encoding='utf8', newline='') as handle:
        yield csv.writer(handle)

# -----------------------------------------------------------------
# ---------------WORKING WITH JSON FILES-----------------
# -----------------------------------------------------------------

# Locate the raw JSON dumps inside the project data folder.
# Paths are assembled with pathlib and rendered with forward slashes
# (as_posix), which replaces the manual backslash concatenation + replace().
data_dir = Path.cwd() / 'Project Data-20221104'
json_pattern = data_dir / '*.json'
path = str(json_pattern)
str_path = json_pattern.as_posix()

# glob() returns files in filesystem order; sort so the row order of the
# generated CSV is the same on every run and every machine.
file_json = sorted(glob(str_path))

# Destination CSV, written next to the JSON files.
vehicle_positions_csv = (data_dir / 'vehicle_positions_csv.csv').as_posix()
print(vehicle_positions_csv)

csv_header = ['Timestamp', 'LineId', 'DirectionId', 'DistanceFromPoint', 'PointId']

# Flatten every JSON file into one CSV row per vehicle position.
# Nesting walked below: data[] -> Responses[] -> lines[] -> vehiclePositions[].
with write_csv(vehicle_positions_csv) as writer:
    writer.writerow(csv_header)
    for raw_json_path in tqdm(file_json):
        # `with` guarantees the handle is released even if the file is not
        # valid JSON or lacks the 'data' key.
        with open(raw_json_path, 'r', encoding='utf8') as raw_file:
            snapshots = json.load(raw_file)['data']
        for snapshot in snapshots:
            timestamp = snapshot['time']
            for response in snapshot['Responses']:
                if response is None:
                    # Skip if response is empty
                    continue
                for line in response['lines']:
                    line_id = line['lineId']
                    writer.writerows(
                        [
                            timestamp,
                            line_id,
                            position['directionId'],
                            position['distanceFromPoint'],
                            position['pointId'],
                        ]
                        for position in line['vehiclePositions']
                    )

# Read the freshly written CSV back and preview the first rows.
vehicle_positions_df = pd.read_csv(vehicle_positions_csv)
vehicle_positions_df.head()

In [7]:
# Load the vehicle-position CSV through a path relative to the current
# working directory, then preview the first five rows.
vehicle_positions_df = pd.read_csv(filepath_or_buffer='vehicle_positions_csv.csv')
vehicle_positions_df.head(n=5)

Unnamed: 0,Timestamp,LineId,DirectionId,DistanceFromPoint,PointId
0,1630914886924,1,8161,1,8012
1,1630914886924,1,8162,0,8142
2,1630914886924,1,8162,0,8282
3,1630914886924,1,8731,0,8111
4,1630914886924,1,8162,1,8062


In [8]:
# Summarise the vehicle-position frame: its (rows, columns) shape first,
# then the per-column report printed by DataFrame.info().
print("Data dimension:", vehicle_positions_df.shape)
vehicle_positions_df.info()

Data dimension: (19421883, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19421883 entries, 0 to 19421882
Data columns (total 5 columns):
 #   Column             Dtype
---  ------             -----
 0   Timestamp          int64
 1   LineId             int64
 2   DirectionId        int64
 3   DistanceFromPoint  int64
 4   PointId            int64
dtypes: int64(5)
memory usage: 740.9 MB


In [11]:
# The Timestamp column holds Unix-epoch milliseconds.
# Three columns are derived from it:
#   Date    -> 'YYYY-MM-DD' string
#   Time    -> 'HH:MM:SS' string
#   Weekday -> 0 = Monday-Friday, 1 = Saturday, 2 = Sunday, 3 = undetermined
# NOTE(review): the parsed values are timezone-naive (epoch interpreted as UTC);
# confirm UTC rather than local service time is what downstream steps expect.

# Day names covered by Weekday code 0.
weekdays = ['Monday','Tuesday','Wednesday', 'Thursday','Friday']

# Parse the epoch column once and reuse the result for all derived columns.
timestamps = pd.to_datetime(vehicle_positions_df['Timestamp'], unit='ms')

vehicle_positions_df['Date'] = timestamps.dt.strftime("%Y-%m-%d")
vehicle_positions_df['Time'] = timestamps.dt.strftime("%H:%M:%S")

# dt.dayofweek numbers Monday=0 ... Sunday=6. Subtracting 4 and clipping at 0
# collapses Monday-Friday to 0 and leaves Saturday=1, Sunday=2. Working on the
# integer day number keeps the encoding independent of the process locale
# (unlike comparing "%A" day names) and avoids a per-row Python callback.
# A missing timestamp has no day number and falls back to code 3.
vehicle_positions_df['Weekday'] = (
    (timestamps.dt.dayofweek - 4)
    .clip(lower=0)
    .fillna(3)
    .astype('int64')
)

In [12]:
# Write the frame with the timestamp-conversion columns to a new CSV.
# The row index is left out of the file; an existing file is overwritten.
vehicle_positions_df.to_csv(path_or_buf='vehicle_positions_time.csv', index=False)

In [13]:
# Load the CSV that carries the timestamp-derived columns and preview the
# first five rows.
vehicle_positions_time = pd.read_csv(filepath_or_buffer='vehicle_positions_time.csv')
vehicle_positions_time.head()

Unnamed: 0,Timestamp,LineId,DirectionId,DistanceFromPoint,PointId,Date,Time,Weekday
0,1630914886924,1,8161,1,8012,2021-09-06,07:54:46,0
1,1630914886924,1,8162,0,8142,2021-09-06,07:54:46,0
2,1630914886924,1,8162,0,8282,2021-09-06,07:54:46,0
3,1630914886924,1,8731,0,8111,2021-09-06,07:54:46,0
4,1630914886924,1,8162,1,8062,2021-09-06,07:54:46,0


## END

In [None]:
##LOAD DATA INTO DB using DM_LoadJsonIntoDb