In [1]:
%matplotlib inline

import os
import sys
import numpy as np
import pandas as pd
import json
from pprint import pprint
from scipy import stats
import gzip

from tqdm.auto import tqdm

import matplotlib.pyplot as plt

In [2]:
# ! curl "https://data.transportation.gov/api/views/8ect-6jqj/rows.json?accessType=DOWNLOAD" | gzip -1 -c > data.json.gz

In [3]:
def get_header_df(fname):
    """Return the column metadata of a gzipped JSON export as a DataFrame.

    Only the lines that precede the top-level ``"data" :`` key are read, so
    the row payload is never loaded. Those lines form an unterminated JSON
    object (``{"meta": {...},``); the single trailing comma is dropped and a
    closing brace appended before parsing.

    Parameters
    ----------
    fname : str
        Path to the gzip-compressed JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``meta.view.columns``.

    Raises
    ------
    ValueError
        If the ``"data"`` key is the very first line (there is no header),
        or if the header is not valid JSON (``json.JSONDecodeError`` is a
        ``ValueError`` subclass).
    """
    header_lines = []
    found_data = False
    with gzip.open(fname, "rt", encoding="utf-8") as fh:
        for line in tqdm(fh):
            if line.strip().startswith('"data" :'):
                found_data = True
                break
            header_lines.append(line)

    header = "".join(header_lines).rstrip()
    if found_data:
        if not header:
            raise ValueError("no header found before the 'data' key in %r" % (fname,))
        # Remove only the one comma that separated the header from "data".
        # Replacing every comma on the last line would corrupt a header whose
        # final line holds more than the closing "},".
        if header.endswith(","):
            header = header[:-1]
        header += "}"
    # When no "data" key was seen, the whole file was read and is parsed as is.
    js = json.loads(header)
    return pd.DataFrame(js["meta"]["view"]["columns"])

def get_column_names(fname):
    """Return the ``fieldName`` of every column declared in the header of ``fname``, as a list."""
    header = get_header_df(fname)
    field_names = header["fieldName"]
    return field_names.tolist()

# Module-level list of column field names, read once from the file header.
# get_df_from_chunk (defined below) uses this global to label the parsed rows.
column_names = get_column_names("data.json.gz")
def get_df_from_chunk(lines, columns=None):
    """Parse a batch of raw row strings into a typed DataFrame.

    Parameters
    ----------
    lines : iterable of str
        Each item is the comma-separated interior of one JSON row array
        (without the enclosing ``[`` ``]``).
    columns : list of str, optional
        Column names, one per field in a row. Defaults to the module-level
        ``column_names``.

    Returns
    -------
    pandas.DataFrame
        Columns whose name starts with ``":"`` are dropped. The remaining
        columns are cast to int/float; ``location`` is replaced by an
        integer code.

    Raises
    ------
    ValueError / TypeError
        From ``astype`` if a strict int/float column holds a missing or
        non-numeric value.
    """
    if columns is None:
        columns = column_names

    rows = json.loads("[%s]" % ",".join("[%s]" % line for line in lines))
    df = pd.DataFrame(rows, columns=columns)
    # Copy the subset so the column assignments below modify an independent
    # frame rather than a slice of the original (avoids SettingWithCopyWarning).
    df = df.loc[:, ~df.columns.str.startswith(":")].copy()

    # Integer columns that may be missing or non-numeric: bad values become -1.
    for k in ["o_zone", "d_zone", "int_id", "section_id", "direction", "movement"]:
        df[k] = pd.to_numeric(df[k], errors="coerce").fillna(-1).astype(int)

    # Strict casts: these raise if a value cannot be converted.
    int_cols = ["vehicle_id", "frame_id", "total_frames", "global_time",
                "v_class", "lane_id", "preceding", "following"]
    float_cols = ["local_x", "local_y", "global_x", "global_y", "v_length",
                  "v_width", "v_vel", "v_acc", "space_headway", "time_headway"]
    dtypes = {k: int for k in int_cols}
    dtypes.update({k: float for k in float_cols})
    df = df.astype(dtypes)

    # Encode the location name as a small integer; names missing from the map
    # become NaN.
    location_map = {'i-80': 0, 'lankershim': 1, 'peachtree': 2, 'us-101': 3}
    df["location"] = df["location"].map(location_map)
    return df


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Parse the column metadata from the header of the gzipped download.
meta = get_header_df("data.json.gz")
# Save the metadata table as JSON next to the notebook.
# NOTE(review): pandas infers compression from the ".gz" suffix by default, so this file should be gzipped — confirm.
meta.to_json("meta.json.gz")
# Last expression of the cell: display the metadata table.
meta

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,id,name,dataTypeName,fieldName,position,renderTypeName,format,flags,description,tableColumnId,width,cachedContents
0,-1,sid,meta_data,:sid,0,meta_data,{},[hidden],,,,
1,-1,id,meta_data,:id,0,meta_data,{},[hidden],,,,
2,-1,position,meta_data,:position,0,meta_data,{},[hidden],,,,
3,-1,created_at,meta_data,:created_at,0,meta_data,{},[hidden],,,,
4,-1,created_meta,meta_data,:created_meta,0,meta_data,{},[hidden],,,,
5,-1,updated_at,meta_data,:updated_at,0,meta_data,{},[hidden],,,,
6,-1,updated_meta,meta_data,:updated_meta,0,meta_data,{},[hidden],,,,
7,-1,meta,meta_data,:meta,0,meta_data,{},[hidden],,,,
8,372619689,Vehicle_ID,number,vehicle_id,1,number,"{'precisionStyle': 'standard', 'noCommas': 'tr...",,Vehicle identification number (ascending by ti...,52723321.0,100.0,"{'largest': '3366', 'non_null': 11850526, 'ave..."
9,372619690,Frame_ID,number,frame_id,2,number,"{'precisionStyle': 'standard', 'noCommas': 'tr...",,Frame Identification number (ascending by sta...,52723322.0,100.0,"{'largest': '11691', 'non_null': 11850526, 'av..."


In [63]:
# Set to True to rebuild the parquet chunks from the raw gzipped JSON download.
remake_parquet = False
if remake_parquet:
    outdir = "outputs/"
    # A chunk is written once it holds more than this many rows.
    chunk_size = 250000
    os.makedirs(outdir, exist_ok=True)

    def write_chunk(chunk_lines, chunk_index):
        """Parse one batch of row strings and write it to <outdir>/df_<chunk_index>.parquet."""
        chunk_df = get_df_from_chunk(chunk_lines)
        chunk_path = os.path.join(outdir, "df_{}.parquet".format(chunk_index))
        chunk_df.to_parquet(chunk_path, compression="lz4")

    data_lines = []
    in_data = False
    ichunk = 0
    with gzip.open("data.json.gz") as fh:
        for line in tqdm(fh):
            line = line.decode()
            if not in_data:
                # Skip the header until the line that opens the "data" array.
                if not line.strip().startswith('"data" :'):
                    continue
                in_data = True

            # Reduce the line to the bare comma-separated fields of one row.
            line = line.replace('"data" : [ [ ', "", 1).lstrip(",[ ").rstrip(" ]\n")
            # Lines that reduce to "" or "}" hold no row (e.g. the brackets
            # closing the document) and would break parsing of the chunk.
            if line in ("", "}"):
                continue
            data_lines.append(line)

            if len(data_lines) > chunk_size:
                write_chunk(data_lines, ichunk)
                ichunk += 1
                data_lines = []

    # Write the rows left over after the last full chunk; without this the
    # tail of the file is silently dropped.
    if data_lines:
        write_chunk(data_lines, ichunk)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))