In [1]:
# Execute the helper script in this notebook's namespace; later cells use names
# that are not defined in the notebook itself (e.g. load_taxi_data_chunk) and are
# presumably provided by this script — TODO confirm.
%run utils/utils_data_cleaning.py

In [2]:
import numpy as np
import pickle as pkl
import sys
import os
import logging

from IPython.display import clear_output

### Load data from csv file

In [None]:
%%time 
# Time how long it takes to load one chunk straight from the csv file.
# NOTE(review): `chunk` is presumably the number of rows to read — confirm
# against the helper's definition.
df = load_taxi_data_chunk(chunk=50000)

### Dump it to a binary file

In [None]:
%%time
pkl.dump(df, open('file.pkl', 'wb'))

### Read it back 

In [None]:
%%time
data = pkl.load(open('file.pkl', 'rb'))

**Remark:** Loading the data from a binary pickle file is much faster than reading it from the .csv file directly.

Just delete the file:

In [None]:
# Clean up the temporary pickle file written by the timing experiment above.
os.remove('file.pkl')

----
### Generate pkl files from csv

The original csv file has 131165043 (**~130M**) rows. With a chunk size of **50k**, we should get **2624 chunks**. 

In [3]:
# Number of data rows in the original csv file.
total_rows = 131165043
chunk_size = 50000

# Integer ceiling division: the number of chunks needed to cover every row.
# Unlike int(total / size + 1), this does not count an extra (empty) chunk when
# total_rows is an exact multiple of chunk_size, and it avoids float rounding.
chunk_number = (total_rows + chunk_size - 1) // chunk_size
chunk_number

2624

Assuming that it takes about 25 seconds to process one chunk of that size, processing all of them would take roughly **18 hours** (2624 × 25 s ≈ 65,600 s). 

To avoid that for now, let's load a smaller number of chunks (e.g. 10). We can load more later using the parameter *skiprows*.

In [142]:
# Number of leading chunks to skip in the csv file before processing begins.
chunks_to_skip = 626

# Index used to label the first processed chunk (chunk numbering is 1-based).
start = 1 + chunks_to_skip

# The same offset expressed in csv data rows.
rows_to_skip = chunk_size * chunks_to_skip

# How many chunks to load (-1 for all)
chunks_to_load = -1

In [143]:
%run utils/utils_data_cleaning.py

logging.basicConfig(filename='_data_dumping.log',
                    format='%(asctime)s | %(levelname)s | %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    level=logging.INFO)

i = start
for data_chunk in pd.read_csv('../data/2016_Yellow_Taxi_Trip_Data.csv', chunksize=chunk_size, skiprows=range(1, rows_to_skip + 1), parse_dates=date_columns):
    
    clear_output()
    print("\rChunk {0}/{1} ".format(i, chunk_number))    
    
    data_chunk.drop(columns_to_drop, axis=1, inplace=True)
    data_chunk.rename(columns_to_rename,axis=1, inplace=True)
    
    # Handle Missing Data
    prior_size = data_chunk.shape[0]
    data_chunk = handle_missing_data(data_chunk)
    
    # Augment data
    data_chunk['duration'] = data_chunk.apply(lambda r: (r['do_t'] - r['pu_t']).seconds, axis=1)
    data_chunk['vec_dist'] = data_chunk.apply(lambda s : geopy.distance.geodesic((s.pu_lat, s.pu_lon),(s.do_lat, s.do_lon)).miles, axis=1)
    data_chunk['trip_ratio'] = data_chunk.trip_dist / data_chunk.vec_dist
    
    # Handle outliers and invalid trips
    handle_duration_outliers(data_chunk, 7200)
    handle_spatial_outliers(data_chunk)
    handle_invalid_trips(data_chunk)
    
    # Dump it
    pkl.dump(data_chunk, open('../data/bin_chunks/ttd_chunk_{0}.pkl'.format(i), 'wb'))

    print("Final chunk size is {0} ({1} dropped).\n".format(data_chunk.shape[0], prior_size - data_chunk.shape[0]), end='\n')

    logging.info("Chunk {0}: Size went from {1} to {2}".format(i, prior_size, data_chunk.shape[0]))
    
    i = i + 1
    if chunks_to_load is not -1:
        if i - start + 1 > chunks_to_load:
            break

Chunk 627/2624 
Size reduction from 17353 to 16932 (421 samples dropped for missing data)
Size reduction from 16932 to 16905 (27 samples dropped for having longer duration than 7200 seconds)
Size reduction from 16905 to 16891 (14 samples dropped for having outside the region of interest)
Size reduction from 16891 to 16861 (30 samples dropped for being invalid)
Final chunk size is 16861 (492 dropped).



### Test one of the files

In [3]:
# Sanity check: load one of the dumped chunk files and preview its first rows.
# The context manager closes the file handle once the frame is loaded.
with open('../data/bin_chunks/ttd_chunk_500.pkl', 'rb') as chunk_file:
    test = pkl.load(chunk_file)
test.head()

Unnamed: 0,pu_t,do_t,trip_dist,pu_lon,pu_lat,do_lon,do_lat,duration,vec_dist,trip_ratio
24950000,2016-03-16 00:42:57,2016-03-16 00:57:12,4.6,-74.000359,40.727055,-73.944664,40.691372,855,3.822906,1.203273
24950001,2016-03-07 13:45:13,2016-03-07 14:02:16,2.03,-73.987152,40.756435,-73.982407,40.736088,1023,1.425953,1.423609
24950002,2016-03-10 17:34:17,2016-03-10 17:45:44,1.11,-73.963234,40.768913,-73.973885,40.7616,687,0.752898,1.474303
24950003,2016-03-07 07:00:55,2016-03-07 07:19:18,6.6,-73.98632,40.76759,-73.944054,40.841949,1103,5.589228,1.180843
24950004,2016-03-27 12:30:35,2016-03-27 12:34:20,0.5,-73.952156,40.771736,-73.952454,40.766136,225,0.38673,1.292892


### Combine enough chunks to make a 10M size dataset

In [3]:
n_chunks = 200

# Load chunk files 1..n_chunks into a list, closing each file as soon as it has
# been read instead of leaving n_chunks handles open.
chunks = []
for chunk_idx in range(1, n_chunks + 1):
    with open('../data/bin_chunks/ttd_chunk_{0}.pkl'.format(chunk_idx), 'rb') as chunk_file:
        chunks.append(pkl.load(chunk_file))

In [6]:
# NOTE(review): axis=1 would place the chunks side by side (column-wise);
# stacking them as rows needs axis=0 — confirm before re-enabling.
#dataset = pd.concat(chunks,axis=1)

In [7]:
# print("Data shape:", dataset.shape)
# dataset.head()

In [7]:
# Build the combined frame here so this cell works on a fresh kernel (the only
# other definition of `dataset` in the notebook is commented out). axis=0 stacks
# the chunks as rows; each chunk keeps its original row labels.
dataset = pd.concat(chunks, axis=0)

# Keep the first 10M rows. reset_index() moves the original row labels into an
# 'index' column and renumbers the rows from 0.
dataset_10M = dataset[:10000000]
dataset_10M = dataset_10M.reset_index()

# Persist the result; the context manager flushes and closes the file.
with open('../data/dataset_10M.pkl', 'wb') as dataset_file:
    pkl.dump(dataset_10M, dataset_file)