In [1]:
import os
import pickle

import numpy as np
import pandas as pd


# Read, process and save a sample for testing purposes

In [2]:
# Notebook-wide configuration

# CSV file the sample is drawn from
FILE_NAME = "train.csv"
# Number of rows to keep in the sample
SAMPLE_SIZE = 10_000_000

In [None]:
# `read_sample` takes ~2 minutes

def read_sample(file_name, sample_size=100000):
    """Read a reproducible random sample of rows from a CSV file.

    The file is scanned once to count its lines, then a random set of row
    numbers is passed to ``pd.read_csv(skiprows=...)`` so that only
    ``sample_size`` records are parsed. Row order of the file is preserved.

    Parameters
    ----------
    file_name : str
        Path of the CSV file. The first line is treated as the header.
        Rows are counted by physical lines, so this assumes one record per
        line (no newlines embedded in quoted fields).
    sample_size : int, default 100000
        Number of records to keep. If it is greater than or equal to the
        number of records in the file, the whole file is returned.

    Returns
    -------
    pandas.DataFrame
        The sampled records.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative.

    Notes
    -----
    Resets NumPy's global random seed to 42 on every call so the same rows
    are selected each time.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    np.random.seed(42)

    # number of records in file (excludes header); `with` closes the handle
    with open(file_name) as f:
        n_rows = sum(1 for _ in f) - 1

    # Nothing to skip when the request covers the whole file; this also avoids
    # asking `np.random.choice` for a negative number of rows.
    if sample_size >= n_rows:
        return pd.read_csv(file_name)

    # Candidate line numbers start at 1, so the 0-indexed header is never skipped
    ix = np.arange(1, n_rows + 1)
    skip = np.sort(np.random.choice(a=ix, size=n_rows - sample_size, replace=False))
    return pd.read_csv(file_name, skiprows=skip)
    
def process_df(df):
    """Return a cleaned copy of the raw frame.

    Steps performed:

    * drop the ``key`` column;
    * parse ``pickup_datetime`` as UTC and convert it to the
      ``America/New_York`` time zone;
    * downcast the fare and coordinate columns with
      ``pd.to_numeric(downcast='float')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``key``, ``pickup_datetime``, ``fare_amount`` and the
        four pickup/dropoff longitude/latitude columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied.
    """
    processed = df.copy()

    # drop not useful key
    processed = processed.drop('key', axis=1)

    # convert timestamp from UTC to localtime New York.
    # `utc=True` accepts both naive values and strings carrying a "UTC" suffix;
    # calling `tz_localize` on already tz-aware values would raise TypeError.
    pickup_utc = pd.to_datetime(processed['pickup_datetime'], utc=True)
    processed['pickup_datetime'] = pickup_utc.dt.tz_convert('America/New_York')

    # downcast float precision
    numerical_cols = ['fare_amount', 'pickup_longitude',
                      'pickup_latitude', 'dropoff_longitude',
                      'dropoff_latitude']
    processed[numerical_cols] = processed[numerical_cols].apply(pd.to_numeric, downcast='float')

    return processed


# Draw the sample from the raw CSV and clean it in one step;
# the trailing expression previews the first rows of the result.
df = process_df(read_sample(file_name=FILE_NAME, sample_size=SAMPLE_SIZE))
df.head()
    

In [None]:
# Summarise columns and dtypes; memory_usage='deep' asks pandas to measure
# object columns by their actual contents rather than an estimate.
df.info(memory_usage='deep')

In [None]:
# save as pickle to keep format

# with open('data/sample_100000.csv.pickle', 'wb') as f:
#     pickle.dump(df, f)

# save as csv
# `to_csv` does not create missing directories, so make sure `data/` exists
# before writing; otherwise the cell fails after the long sampling step.
os.makedirs('data', exist_ok=True)
df.to_csv(f'data/sample_{SAMPLE_SIZE}.csv', index=False)


In [6]:
# load from pickle to keep format

# with open('data/sample_100000.csv.pickle', 'rb') as f:
#     df = pickle.load(f)