In [1]:
import os
import sys
# Put ../app at the front of the module search path so the `lib` and `config`
# packages imported in the next cell can be resolved (presumably they live under ../app).
sys.path.insert(0, os.path.join('..', 'app'))

In [2]:
import csv
import json
import numpy as np
import os
import polars as pl

import glob
import pandas as pd
from random import randrange
from datetime import datetime, timedelta

from lib.data import DataObject, load_json
from config.data import *
from config.graph import *

# Base directory for every file this notebook reads or writes; all paths below
# are built with os.path.join(DATA_DIR, ...).
DATA_DIR = '../app/data'

In [3]:
# Project data accessor rooted at DATA_DIR; its `g_data` attribute is queried
# further down to look up the geo record of a bay.
data_obj = DataObject(data_dir=DATA_DIR)

In [4]:
# Load the JSON file named by EMB_JSON_NAME. The result is indexed by the
# device id string further down (presumably one embedding vector per device).
device_emb = load_json(os.path.join(DATA_DIR, EMB_JSON_NAME))

In [5]:
def random_date(start, end):
    """Pick a random calendar date between `start` and `end`, both inclusive.

    Args:
        start: datetime marking the first eligible day.
        end: datetime marking the last eligible day.

    Returns:
        A `datetime.date` obtained by adding a whole number of days to `start`.
    """
    # One extra day is added so that the day of `end` itself can be drawn.
    n_days = (end + timedelta(days=1) - start).days
    day_offset = randrange(n_days)
    picked = start + timedelta(days=day_offset)
    return picked.date()

def random_time(start, end):
    """Pick a random instant in the half-open interval [start, end).

    The offset from `start` is a whole number of seconds.

    Args:
        start: datetime-like lower bound (inclusive).
        end: datetime-like upper bound (exclusive).

    Returns:
        `start` plus a random whole number of seconds. When the interval is
        shorter than one second (including start == end), `start` is returned.

    Raises:
        ValueError: if `end` is earlier than `start`.
    """
    if end < start:
        raise ValueError(f'end ({end}) is earlier than start ({start})')

    # total_seconds() covers the whole span; `.seconds` is only the sub-day
    # component, which truncates multi-day intervals and is 0 for spans that
    # are an exact multiple of a day (making randrange fail).
    span_seconds = int((end - start).total_seconds())
    if span_seconds <= 0:
        # Degenerate interval: the only instant available is `start`.
        return start
    return start + timedelta(seconds=randrange(span_seconds))

def spark_s_data():
    """Build the date-partitioned parquet table of sensor events with Spark.

    Steps:
      1. Collect the distinct bay ids found in the geo CSV (G_DATA_NAME_CSV).
      2. Keep the devices listed in B_DATA_NAME whose bay is in that set.
      3. Read the sensor CSV (S_DATA_NAME), keep only rows of those devices,
         parse ArrivalTime / DepartureTime into timestamps and derive
         ArrivalDate from the parsed ArrivalTime.
      4. Strip spaces from all column names and write the result as parquet
         under DATA_DIR/S_DATA_TABLE, partitioned by ArrivalDate, overwriting
         whatever is already there.

    Returns:
        None. The only result is the dataset written to disk.
    """
    # Imported locally so the rest of the notebook runs without pyspark installed.
    from pyspark.sql import SparkSession, functions as f

    # Timestamp layout used by both time columns of the sensor CSV.
    ts_format = 'MM/dd/yyyy h:m:s a'

    spark = SparkSession \
        .builder \
        .appName("parking") \
        .getOrCreate()

    # Bays that have a geo record.
    g_df = spark.read.csv(os.path.join(DATA_DIR, G_DATA_NAME_CSV), header=True)
    g_df = g_df.select(f.col('bay_id').alias('BayID')).distinct()

    # Devices attached to one of those bays; renamed to match the sensor CSV column.
    b_df = spark.read.csv(os.path.join(DATA_DIR, B_DATA_NAME), header=True)
    b_df = b_df.select(['BayID', 'DeviceID']).distinct()
    b_df = b_df.join(g_df, on='BayID', how='inner').select(f.col('DeviceID').alias('DeviceId'))

    df = spark.read.csv(os.path.join(DATA_DIR, S_DATA_NAME), header=True)\
            .join(b_df, on='DeviceId', how='inner')\
            .withColumn("ArrivalTime", f.to_timestamp('ArrivalTime', ts_format))\
            .withColumn("DepartureTime", f.to_timestamp('DepartureTime', ts_format))\
            .withColumn("ArrivalDate", f.col('ArrivalTime').cast('date'))  # was derived from DepartureTime

    # Remove spaces from column names before writing.
    df = df.toDF(*[col.replace(' ', '') for col in df.columns])
    df.coalesce(1).write.partitionBy('ArrivalDate').mode('overwrite').format("parquet").save(os.path.join(DATA_DIR, S_DATA_TABLE))

# spark_s_data()

In [6]:
# Bay restriction table as a nested dict keyed by DeviceID:
# {DeviceID: {column name: value, ...}}.
b_data = pd.read_csv(os.path.join(DATA_DIR, B_DATA_NAME)).set_index('DeviceID').to_dict('index')

In [7]:
# Sampling window: every day of 2017 (random_date treats both bounds as inclusive).
start = datetime(year=2017, month=1, day=1)
end = datetime(year=2017, month=12, day=31)

# Pick one random day and locate its partition in the parquet table that
# spark_s_data() writes (partitioned by ArrivalDate).
batch_date = random_date(start, end)
batch_files = glob.glob(os.path.join(DATA_DIR, S_DATA_TABLE, f'ArrivalDate={batch_date}', '*.parquet'))
if not batch_files:
    # Fail with the offending date instead of an opaque IndexError on [0].
    raise FileNotFoundError(
        f'No parquet file found for ArrivalDate={batch_date} under {os.path.join(DATA_DIR, S_DATA_TABLE)}'
    )

batch_data_path = batch_files[0]
batch_data = pd.read_parquet(batch_data_path)

In [8]:
# Draw one random row of the day's batch. The index is a row position, so it is
# looked up with .iloc rather than the label-based .loc.
sample_idx = np.random.randint(batch_data.shape[0])
sample_data = batch_data.iloc[sample_idx]
# Random instant between the arrival and departure of the sampled record.
sample_dt = random_time(sample_data['ArrivalTime'], sample_data['DepartureTime'])

weekday = sample_dt.dayofweek  # 0 = Mon, 6 = Sun
duration = sample_data['DurationSeconds']
device_id = sample_data['DeviceId']

is_violation = sample_data['InViolation']
is_use = sample_data['VehiclePresent']

In [9]:
# Device id of the sampled sensor record.
device_id

'21867'

In [31]:
# Full sampled sensor record.
sample_data

DeviceId                         21867
ArrivalTime        2017-06-21 22:06:29
DepartureTime      2017-06-21 23:00:00
DurationSeconds                   3211
StreetMarker                     4527W
Sign                              None
Area                    West Melbourne
StreetId                           839
StreetName                 KING STREET
BetweenStreet1           BATMAN STREET
BetweenStreet2         JEFFCOTT STREET
SideOfStreet                         5
InViolation                      False
VehiclePresent                   False
Name: 339, dtype: object

In [30]:
# Geo record(s) whose bay_id equals the BayID that b_data lists for the sampled device.
data_obj.g_data.collect().filter(pl.col('bay_id') == b_data[int(device_id)]['BayID']).to_numpy()

array([['21937',
        'Clarendon Street between Wellington Parade and George Street',
        '11947W', 5264, 'IAW1', 20201223122248, 144.982644328797,
        -37.815125222171]], dtype=object)

# Note: the sensor data and the geo data disagree for the sampled device (the street name and street marker are different)

In [33]:
# Restriction record of the sampled device; device_id is a string, hence the int() cast for the lookup.
b_data[int(device_id)]

{'BayID': 5264,
 'Description1': '3P MTR M-F 10:00-15:00',
 'Description2': 'P10 M-F 1500-1600',
 'Description3': '3P MTR M-F 19:00-20:30',
 'Description4': '3P MTR SAT 7:30-20:30',
 'Description5': nan,
 'Description6': nan,
 'DisabilityExt1': 360,
 'DisabilityExt2': 20.0,
 'DisabilityExt3': 360.0,
 'DisabilityExt4': 360.0,
 'DisabilityExt5': nan,
 'DisabilityExt6': nan,
 'Duration1': 180,
 'Duration2': 10.0,
 'Duration3': 180.0,
 'Duration4': 180.0,
 'Duration5': nan,
 'Duration6': nan,
 'EffectiveOnPH1': 0,
 'EffectiveOnPH2': 0.0,
 'EffectiveOnPH3': 0.0,
 'EffectiveOnPH4': 0.0,
 'EffectiveOnPH5': nan,
 'EffectiveOnPH6': nan,
 'EndTime1': '15:00:00',
 'EndTime2': '16:00:00',
 'EndTime3': '20:30:00',
 'EndTime4': '20:30:00',
 'EndTime5': nan,
 'EndTime6': nan,
 'Exemption1': nan,
 'Exemption2': nan,
 'Exemption3': nan,
 'Exemption4': nan,
 'Exemption5': nan,
 'Exemption6': nan,
 'FromDay1': 1,
 'FromDay2': 1.0,
 'FromDay3': 1.0,
 'FromDay4': 6.0,
 'FromDay5': nan,
 'FromDay6': nan,
 '

In [16]:
# Entry of the embedding file for the sampled device (looked up by the string id).
device_emb[device_id]

[0.65949547290802,
 0.04236556217074394,
 -0.8195937275886536,
 0.17929166555404663,
 0.9067535400390625,
 0.25302648544311523,
 0.7187158465385437,
 0.2816624641418457,
 -0.029325049370527267,
 -0.42516499757766724,
 -0.9476456046104431,
 0.7799811959266663,
 -1.0594756603240967,
 -0.21646928787231445,
 0.8539930582046509,
 1.1114743947982788,
 -0.22079038619995117,
 0.04981782287359238,
 -0.5824868083000183,
 0.6851041316986084,
 0.607040524482727,
 -0.7498977184295654,
 0.7843365669250488,
 -0.19353941082954407,
 -0.5831809043884277,
 -0.3065613806247711,
 0.7278852462768555,
 -0.002504889154806733,
 -0.5381037592887878,
 0.46480509638786316,
 0.7337797284126282,
 -0.7526929974555969]