In [2]:
import os
import pandas as pd 
import dask.dataframe as dd
from tqdm import tqdm_notebook

# Absolute path of the training CSV; the later cells open and read this file.
TRAIN_PATH = '/Volumes/transcend/大檔案/Taxi/NYT/train.csv'

In [3]:
%%time
# Assume we only know that the CSV file is large, not how large, and we want
# the exact number of lines in it.
#
# Iterating over the file handle yields one line at a time, so the file is
# never held in memory as a whole (file.readlines() would first build a list
# containing every line). Every physical line is counted, so a header line,
# if the file has one, is included in the total.
with open(TRAIN_PATH) as file:
    n_rows = sum(1 for _ in file)

# The f prefix is what makes {n_rows} interpolate; without it the braces are
# printed literally.
print(f'Exact number of rows: {n_rows}')

Exact number of rows: 55423857
CPU times: user 23 s, sys: 23.3 s, total: 46.3 s
Wall time: 1min 28s


In [5]:
# Column name -> dtype string, passed to pd.read_csv as `dtype` in a later cell.
traintypes = dict(
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

# The column names, in the same order as the mapping above
# (passed to pd.read_csv as `usecols`).
cols = [*traintypes]

In [8]:
# Rows read per chunk: 5 million at one go (10 million is another value to try).
chunksize = 5 * 10 ** 6

In [9]:
%%time
df_list = []  # collects every processed chunk

# Read the CSV in pieces of `chunksize` rows, restricted to the columns in
# `cols` and using the dtypes declared in `traintypes`.
chunk_reader = pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)

for df_chunk in tqdm_notebook(chunk_reader):

    # Trick from https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
    # Keep only the first 16 characters of each timestamp string, then parse
    # them with an explicit format. The original author notes that using
    # parse_dates would be much slower.
    minute_stamps = df_chunk['pickup_datetime'].str.slice(0, 16)
    df_chunk['pickup_datetime'] = pd.to_datetime(minute_stamps, utc=True, format='%Y-%m-%d %H:%M')

    # Per-chunk processing could happen here:
    # clean_data(), feature_engineer(), fit()

    # Alternatively, keep the chunk so all chunks can be merged afterwards.
    df_list.append(df_chunk)


CPU times: user 2min 8s, sys: 14.6 s, total: 2min 23s
Wall time: 2min 32s


In [10]:
# Merge all chunk dataframes into one dataframe.
# ignore_index=True gives the result a fresh 0..n-1 index instead of reusing
# the chunks' own index labels. DataFrame.to_feather, which this notebook
# calls on `df` afterwards, requires a default index, so this keeps the export
# working even if chunks are filtered or re-indexed before being appended.
df = pd.concat(df_list, ignore_index=True)

# Delete the list so the per-chunk dataframes are no longer referenced by it.
del df_list

# See what we have loaded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55423856 entries, 0 to 55423855
Data columns (total 7 columns):
fare_amount          float32
pickup_datetime      datetime64[ns, UTC]
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      uint8
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 1.5 GB


### 匯出成Feather格式

In [14]:
%%time
# Save into Feather format (about 1.5 GB), in the same directory as the
# training CSV. The directory is derived from TRAIN_PATH instead of being
# hard-coded a second time, so relocating the data needs only one edit.
feather_path = os.path.join(os.path.dirname(TRAIN_PATH), 'nyc_taxi_data_raw.feather')
df.to_feather(feather_path)

CPU times: user 4.31 s, sys: 4.37 s, total: 8.68 s
Wall time: 35.8 s
