In [1]:
%run utils/utils_data_cleaning.py

In [2]:
import numpy as np
import pickle as pkl
import sys
import os

from IPython.display import clear_output

### Load data from csv file

In [3]:
%%time 
df = load_taxi_data_chunk(chunk=50000)

Data loaded with 50000 entries and 10 columns
Wall time: 20.7 s


### Dump it to a binary file

In [67]:
%%time
pkl.dump(df, open('file.pkl', 'wb'))

CPU times: user 6.94 ms, sys: 8 ms, total: 14.9 ms
Wall time: 268 ms


### Read it back 

In [68]:
%%time
data = pkl.load(open('file.pkl', 'rb'))

CPU times: user 0 ns, sys: 3.67 ms, total: 3.67 ms
Wall time: 3.94 ms


**Remark:** Loading the data using pickle, from a binary file, is much faster than doing from the .csv file directly.

Just delete the file:

In [70]:
os.remove('file.pkl')

----
### Generate pkl files from csv

The original csv file has 131165043 (**~130M**) rows. With a chunkesiz of **50k**, we should **2624 chunks**. 

In [3]:
chunk_size = 50000
chunk_number = int(131165043 / chunk_size + 1)
chunk_number

2624

Assuming that it takes 25 seconds to load one chunk with that size, the total would take **18 hours**. 

To avoid that for now, let's load a smaller number of chunks (e.g. 10). We can load more later using the parameter *skiprows*.

In [7]:
rows_to_skip = []#[i for i in range(1, 1 + 100 * 50000)]
chunks_to_load = 10
start = 1
rows_to_skip

[]

In [9]:
i = start
for data_chunk in pd.read_csv('../data/2016_Yellow_Taxi_Trip_Data.csv', chunksize=50000, skiprows=rows_to_skip, parse_dates=date_columns):
    
    clear_output()
    print("\rChunk {0}/{1} ".format(i, chunk_number))    
    
    data_chunk.drop(columns_to_drop, axis=1, inplace=True)
    data_chunk.rename(columns_to_rename,axis=1, inplace=True)
    data_chunk['duration'] = data_chunk.apply(lambda r: (r['do_t'] - r['pu_t']).seconds, axis=1)
    data_chunk['vec_dist'] = data_chunk.apply(lambda s : geopy.distance.geodesic((s.pu_lat, s.pu_lon),(s.do_lat, s.do_lon)).miles, axis=1)
    data_chunk['trip_ratio'] = data_chunk.trip_dist / data_chunk.vec_dist
    
    # Cleaning up
    prior_size = data_chunk.shape[0]
    handle_missing_data(data_chunk)
    handle_duration_outliers(data_chunk, 7200)
    handle_spatial_outliers(data_chunk)
    handle_invalid_trips(data_chunk)
    
    # Dump it
    pkl.dump(data_chunk, open('../data/bin_chunks/ttd_chunk_{0}.pkl'.format(i), 'wb'))

    print("Final chunk size is {0} ({1} dropped).\n".format(data_chunk.shape[0], prior_size - data_chunk.shape[0]), end='\n')
    
    i = i + 1
    if i - start + 1 > chunks_to_load:
        break

Chunk 10/2624 
Size reduction from 50000 to 48822 (1178 samples dropped for missing data)
Size reduction from 48822 to 48752 (70 samples dropped for having longer duration than 7200 seconds)
Size reduction from 48752 to 48721 (31 samples dropped for having outside the region of interest)
Size reduction from 48721 to 48621 (100 samples dropped for being invalid)
Final chunk size is 48621 (1379 dropped).



### Test one of the files

In [11]:
pkl.load(open('../data/bin_chunks/ttd_chunk_5.pkl', 'rb')).head()

Unnamed: 0,pu_t,do_t,trip_dist,pu_lon,pu_lat,do_lon,do_lat,duration,vec_dist,trip_ratio
200000,2016-02-26 09:05:13,2016-02-26 09:18:37,1.35,-73.978851,40.7621,-73.991226,40.750309,804,1.040968,1.29687
200001,2016-02-03 23:45:23,2016-02-04 00:03:34,4.16,-73.993233,40.755428,-73.945961,40.775242,1091,2.831954,1.46895
200002,2016-02-05 12:02:55,2016-02-05 12:09:11,0.8,-73.97802,40.786503,-73.986504,40.779713,376,0.646173,1.238058
200003,2016-02-08 17:25:44,2016-02-08 17:34:07,1.23,-73.967323,40.763527,-73.978668,40.747883,503,1.232742,0.997776
200004,2016-02-10 10:52:41,2016-02-10 11:05:29,0.8,-73.996033,40.732567,-73.986298,40.734493,768,0.527994,1.515167
