# Creating Data Formats
The original file is in CSV format.
Use pandas to convert it to:
- parquet with snappy and gzip compression. Dash only supports gzip; snappy is the default in pandas.
- hdf for modin
- hdf5 for vaex

In [7]:
import os
import sys
import yaml
import json
import numpy as np

# Load the project configuration. The file is opened as UTF-8 explicitly so
# the result does not depend on the platform's default text encoding.
with open("config.yaml", 'r', encoding="utf-8") as yml_file:
    cfg = yaml.safe_load(yml_file)

# Data directory and size divisors from the config. gb_div is what the cells
# below divide a byte count by when reporting file sizes in GB.
data_path = cfg['brian_laptop']['data_path']
mb_div = cfg['constant']['mb_div']
gb_div = cfg['constant']['gb_div']

# Load the metadata file; it is read below for the column names, dtypes and
# date columns of the yellow-trip dataset.
meta_data_file = os.path.join(data_path, cfg['data']['metadata'])
with open(meta_data_file, encoding="utf-8") as f:
    meta_data = json.load(f)

In [8]:
# Standard-library import first, then third-party.
import datetime as dt

import pandas as pd

# Record the pandas version this notebook was run with.
print(pd.__version__)

0.25.1


## Convert Data
- read in the csv
- print out the file sizes

In [3]:
# Locate the source CSV and report its on-disk size in GB.
yellow_trip_file = os.path.join(data_path, cfg['data']['nyc_yellow_trip_csv'])
print("File Size(GB):",round(os.path.getsize(yellow_trip_file)/gb_div,3))

# Column name -> dtype mapping taken from the metadata file.
yellow_taxi_dtype = dict(zip(
    meta_data['yellow_trip_taxi']['colnames'],
    meta_data['yellow_trip_taxi']['dtype']))

# Date columns are converted through parse_dates. read_csv does not accept
# datetime dtypes via `dtype`, so those columns are left out of the mapping
# that is handed to it.
# NOTE(review): assumes 'datecols' is a list of column names (not positions)
# and that the metadata dtypes are valid for every row -- confirm against the
# metadata file.
date_cols = meta_data['yellow_trip_taxi']['datecols']
csv_dtype = {
    col: col_dtype
    for col, col_dtype in yellow_taxi_dtype.items()
    if col not in date_cols
}

# Apply the declared dtypes instead of letting pandas infer them; previously
# the mapping was built but never passed to read_csv.
yellow_trip = pd.read_csv(yellow_trip_file,
                          dtype=csv_dtype,
                          parse_dates=date_cols)

File Size(GB): 2.72


In [49]:
# Parquet export, snappy codec (the default codec for DataFrame.to_parquet).
yellow_trip_file = os.path.join(
    data_path,
    cfg["data"]["nyc_yellow_trip_parquet_snappy"],
)
yellow_trip.to_parquet(yellow_trip_file, compression="snappy")

# Size of the file just written, converted from bytes to GB.
print("File Size(GB):", round(os.stat(yellow_trip_file).st_size / gb_div, 3))

File Size(GB): 0.699


In [50]:
# Parquet export, gzip codec.
yellow_trip_file = os.path.join(
    data_path,
    cfg["data"]["nyc_yellow_trip_parquet_gzip"],
)
yellow_trip.to_parquet(yellow_trip_file, compression="gzip")

# Size of the file just written, converted from bytes to GB.
print("File Size(GB):", round(os.stat(yellow_trip_file).st_size / gb_div, 3))

File Size(GB): 0.523


In [51]:
# HDF export through pandas; mode "w" creates the file fresh and the frame is
# stored under the key "year_2010_06".
yellow_trip_file = os.path.join(
    data_path,
    cfg["data"]["nyc_yellow_trip_hdf"],
)
yellow_trip.to_hdf(yellow_trip_file, key="year_2010_06", mode="w")

# Size of the file just written, converted from bytes to GB.
print("File Size(GB):", round(os.stat(yellow_trip_file).st_size / gb_div, 3))

File Size(GB): 2.16


## HDF5 created with VAEX

In [4]:
# vaex is used in the following cells to convert the pandas frame and write
# the HDF5 export; print its version for the record.
import vaex
print(vaex.__version__)

1.0.0-beta.6


In [5]:
# Show the frame's type before conversion...
pandas_frame_type = type(yellow_trip)
print(pandas_frame_type)

# ...build a vaex DataFrame from the pandas frame, and show the new type.
df = vaex.from_pandas(yellow_trip)
vaex_frame_type = type(df)
print(vaex_frame_type)

<class 'pandas.core.frame.DataFrame'>
<class 'vaex.dataframe.DataFrameArrays'>


In [9]:
# HDF5 export written by vaex from the converted frame.
yellow_trip_file = os.path.join(
    data_path,
    cfg["data"]["nyc_yellow_trip_hdf5"],
)
df.export_hdf5(yellow_trip_file)

# Size of the file just written, converted from bytes to GB.
print("File Size(GB):", round(os.stat(yellow_trip_file).st_size / gb_div, 3))

File Size(GB): 2.224


In [56]:
# Preview the first rows of the pandas frame to sanity-check the columns.
yellow_trip.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2010-06-18 10:43:46,2010-06-18 11:22:12,4,20.0,-73.972894,40.79281,2,0.0,-73.776308,40.645528,Cas,45.0,0.0,0.5,0.0,4.57,50.07
1,CMT,2010-06-26 15:02:57,2010-06-26 15:07:15,1,0.7,-73.987815,40.74848,1,0.0,-73.977535,40.75388,Cas,4.5,0.0,0.5,0.0,0.0,5.0
2,CMT,2010-06-24 07:36:37,2010-06-24 07:43:29,1,1.0,-74.006497,40.732921,1,0.0,-73.998122,40.725982,Cas,5.7,0.0,0.5,0.0,0.0,6.2
3,CMT,2010-06-23 18:42:21,2010-06-23 18:55:41,2,3.5,-73.987534,40.733397,1,0.0,-73.951097,40.770543,Cre,10.9,1.0,0.5,1.0,0.0,13.4
4,CMT,2010-06-23 14:10:43,2010-06-23 14:17:02,1,1.2,-73.955832,40.779476,1,0.0,-73.968117,40.765118,Cas,5.7,0.0,0.5,0.0,0.0,6.2


## TODO: 
- Find a CLI tool for streaming CSV files to different formats.