In [1]:
import pandas as pd
import numpy as np
import pickle


# Read, process and save a sample for testing purposes

In [2]:
# constants

# Source CSV that the sample is drawn from
FILE_NAME = 'train.csv'
# Number of data rows to keep in the sample
SAMPLE_SIZE = 10 ** 5

In [3]:
# `read_sample` takes ~2 minutes

def read_sample(file_name, sample_size=100000):
    """Read a reproducible random sample of data rows from a CSV file.

    The file is scanned once to count its records, then every row that is
    not part of the sample is handed to ``pd.read_csv`` via ``skiprows`` so
    only the sampled rows are parsed.

    Parameters
    ----------
    file_name : str
        Path of the CSV file; its first line is treated as the header.
    sample_size : int, default 100000
        Number of data rows to keep. If the file holds fewer rows than
        requested, all rows are returned.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in file order.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    # Count the records (header excluded); `with` guarantees the handle is closed.
    with open(file_name) as f:
        n_rows = max(sum(1 for _ in f) - 1, 0)

    n_skip = n_rows - sample_size
    if n_skip <= 0:
        # Nothing to skip: the file has no more rows than requested.
        return pd.read_csv(file_name)

    # A private RandomState seeded with 42 draws the same rows as seeding the
    # global generator would, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(42)
    # Candidate line numbers start at 1, so the 0-indexed header is never skipped.
    ix = np.arange(1, n_rows + 1)
    skip = np.sort(rng.choice(a=ix, size=n_skip, replace=False))
    return pd.read_csv(file_name, skiprows=skip)
    
def process_df(df):
    """Return a cleaned copy of a raw sample; the input frame is not modified.

    Steps:
      * drop the ``key`` column, which is not used downstream;
      * parse ``pickup_datetime`` as UTC and convert it to New York local time;
      * downcast the fare and coordinate columns to the smallest float dtype
        that ``pd.to_numeric`` selects.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``key``, ``pickup_datetime``, ``fare_amount`` and the
        four pickup/dropoff longitude/latitude columns.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    processed = df.copy()

    # drop not useful key
    processed = processed.drop('key', axis=1)

    # convert timestamp from UTC to localtime New York.
    # utc=True yields a UTC-aware column whether the raw strings are naive or
    # already carry a "UTC" suffix / offset; calling tz_localize on values that
    # were parsed as tz-aware would raise TypeError.
    pickup_utc = pd.to_datetime(processed['pickup_datetime'], utc=True)
    processed['pickup_datetime'] = pickup_utc.dt.tz_convert('America/New_York')

    # downcast float precision
    numerical_cols = ['fare_amount', 'pickup_longitude',
                      'pickup_latitude', 'dropoff_longitude',
                      'dropoff_latitude']
    processed[numerical_cols] = processed[numerical_cols].apply(pd.to_numeric, downcast='float')

    return processed


# Draw the random sample from the raw CSV and clean it in one step,
# then preview the first rows of the result.
df = process_df(read_sample(FILE_NAME, SAMPLE_SIZE))
df.head()
    

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,8.1,2009-01-23 02:28:00-05:00,-73.996071,40.732605,-73.980675,40.761864,1
1,9.0,2014-01-04 16:11:00-05:00,-73.977638,40.752346,-73.97039,40.768867,2
2,6.5,2015-06-21 21:54:24-04:00,-74.000076,40.728611,-73.988731,40.722172,6
3,4.5,2010-11-24 07:46:52-05:00,-73.962685,40.775646,-73.957436,40.777767,1
4,4.5,2011-10-31 22:19:34-04:00,-73.9963,40.753601,-73.998802,40.761101,1


In [4]:
# Summarise column dtypes, non-null counts and memory footprint of the processed sample
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
fare_amount          100000 non-null float32
pickup_datetime      100000 non-null datetime64[ns, America/New_York]
pickup_longitude     100000 non-null float32
pickup_latitude      100000 non-null float32
dropoff_longitude    99997 non-null float32
dropoff_latitude     99997 non-null float32
passenger_count      100000 non-null int64
dtypes: datetime64[ns, America/New_York](1), float32(5), int64(1)
memory usage: 3.4 MB


In [9]:
# List the working directory (presumably to confirm that `train.csv` and the `data/` output folder exist)
%ls

AMLD- Baseline Model.ipynb
Helper Notebook AMLD 2020.ipynb
Isochrone Maps.ipynb
List of Exercises for getting started at some geospatial data analysis.ipynb
ORSM Test.ipynb
README.md
Traces Visualisation.ipynb
[1m[36mdata[m[m/
[1m[36mrouter[m[m/
[31mtrain.csv[m[m*


In [12]:
# Persist the processed sample.
# NOTE: CSV stores plain text only, so the float32 columns and the tz-aware
# timestamp must be re-parsed on load; use the pickle variant to keep the format.

# with open('data/sample_100000.csv.pickle', 'wb') as f:
#     pickle.dump(df, f)

# save as csv; the file name is derived from SAMPLE_SIZE so it always matches
# the size of the sample that was actually drawn
df.to_csv('data/sample_{}.csv'.format(SAMPLE_SIZE), index=False)


In [6]:
# load the sample back from pickle to restore the saved format

# with open('data/sample_100000.csv.pickle', 'rb') as f:
#     df = pickle.load(f)