In [31]:
import os
import pickle

import numpy as np
import pandas as pd


# Read, process, and save a sample for testing purposes

In [2]:
# constants

FILE_NAME = "train.csv"  # CSV handed to read_sample(); relative to the working directory
SAMPLE_SIZE = 100000  # number of data rows requested from read_sample()

In [27]:
# `read_sample` takes ~2 minutes

def read_sample(file_name, sample_size=100000):
    """Read a reproducible random sample of rows from a CSV file.

    The rows to discard are drawn at random and passed to ``pd.read_csv``
    through ``skiprows``, so only the sampled rows are parsed. NumPy's
    global random generator is re-seeded with 42 on every call, so the same
    file and ``sample_size`` always produce the same sample.

    Parameters
    ----------
    file_name : str
        Path of the CSV file. Its first line must be the header.
    sample_size : int, default 100000
        Number of data rows to keep. When the file holds no more than
        ``sample_size`` data rows, every row is returned.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in the order they appear in the file.
    """
    np.random.seed(42)  # fixed seed -> identical sample on every run

    # Count physical lines minus the header. This assumes one record per
    # line (no quoted fields containing embedded newlines).
    with open(file_name) as f:
        n_rows = sum(1 for _ in f) - 1

    n_skip = n_rows - sample_size
    if n_skip <= 0:
        # The file is not larger than the requested sample: keep everything
        # instead of asking np.random.choice for a negative number of rows.
        return pd.read_csv(file_name)

    # Candidate line numbers start at 1, so the header (line 0) can never
    # end up in the skip list.
    ix = np.arange(1, n_rows + 1)
    skip = np.sort(np.random.choice(a=ix, size=n_skip, replace=False))
    return pd.read_csv(file_name, skiprows=skip)
    
def process_df(df):
    """Return a cleaned copy of a raw sample frame.

    The input frame is not modified. The returned copy:

    * no longer has the ``key`` column,
    * has ``pickup_datetime`` as a timezone-aware datetime column expressed
      in ``America/New_York`` local time (input is interpreted as UTC),
    * has the fare and coordinate columns downcast to the smallest float
      dtype ``pd.to_numeric`` will pick.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``key``, ``pickup_datetime``,
        ``fare_amount`` and the four pickup/dropoff coordinate columns.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    df2 = df.copy()

    # drop not useful key
    df2 = df2.drop('key', axis=1)

    # convert timestamp from UTC to localtime New York.
    # utc=True yields a UTC-aware column whether the parsed values are
    # naive or already carry a UTC marker; the previous tz_localize('utc')
    # step raised TypeError on values that were parsed as tz-aware.
    time_col = pd.to_datetime(df2['pickup_datetime'], utc=True)
    df2['pickup_datetime'] = time_col.dt.tz_convert('America/New_York')

    # downcast float precision
    numerical_cols = ['fare_amount', 'pickup_longitude',
                      'pickup_latitude', 'dropoff_longitude',
                      'dropoff_latitude']
    df2[numerical_cols] = df2[numerical_cols].apply(pd.to_numeric, downcast='float')

    return df2


# Draw the sample from the raw CSV, clean it, and preview the first rows.
df = process_df(read_sample(file_name=FILE_NAME, sample_size=SAMPLE_SIZE))
df.head()
    

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,11.3,2010-04-16 13:02:00-04:00,-73.972412,40.781231,-73.989189,40.758373,5
1,13.3,2010-08-17 23:46:00-04:00,0.0,0.0,0.0,0.0,5
2,12.9,2011-04-09 10:11:00-04:00,-74.008331,40.725842,-73.9832,40.767673,1
3,10.0,2012-11-26 04:49:00-05:00,-73.994148,40.75156,-73.981911,40.771706,1
4,8.0,2013-12-27 17:35:00-05:00,-73.987686,40.732952,-73.987556,40.749565,1


In [29]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to
# measure the real memory footprint rather than estimate it.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
fare_amount          100000 non-null float32
pickup_datetime      100000 non-null datetime64[ns, America/New_York]
pickup_longitude     100000 non-null float32
pickup_latitude      100000 non-null float32
dropoff_longitude    100000 non-null float32
dropoff_latitude     100000 non-null float32
passenger_count      100000 non-null int64
dtypes: datetime64[ns, America/New_York](1), float32(5), int64(1)
memory usage: 3.4 MB


In [32]:
# save as pickle to keep format (column dtypes survive, unlike a CSV round-trip)

# Build the file name from SAMPLE_SIZE so it always matches the sample that
# was actually requested instead of repeating the number by hand.
sample_path = 'data/sample_{}.csv.pickle'.format(SAMPLE_SIZE)

# Create the output directory up front so the dump does not fail on a fresh
# checkout after the slow read.
os.makedirs(os.path.dirname(sample_path), exist_ok=True)

with open(sample_path, 'wb') as f:
    pickle.dump(df, f)