In [2]:
import numpy as np
import pandas as pd
# from fastai.tabular import *

from pathlib import Path

In [3]:
from geopy import distance

In [4]:
data_pth = Path('./data')

In [6]:
nrows=2000

In [5]:
def get_date_stuff(dt):
    return pd.Series(
        data = {
            'Year': dt.year - 2009,
            'Month': dt.month,
            'Day': dt.day,
            'Hour': dt.hour,
            'Quarter': dt.quarter
        }, 
        dtype = 'int')

In [7]:
dtypes = {
    'fare_amount': 'float32',
    'pickup_datetime': 'str',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'passenger_count': 'int16'
}
use_cols = [
    'fare_amount',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude', 
    'dropoff_longitude', 
    'dropoff_latitude',
    'passenger_count'
]
date_cols = ['pickup_datetime']
df = pd.read_csv(data_pth/'train.csv', nrows=nrows, 
                 dtype=dtypes, usecols=use_cols, 
                 parse_dates=date_cols, infer_datetime_format=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   fare_amount        2000 non-null   float32       
 1   pickup_datetime    2000 non-null   datetime64[ns]
 2   pickup_longitude   2000 non-null   float32       
 3   pickup_latitude    2000 non-null   float32       
 4   dropoff_longitude  2000 non-null   float32       
 5   dropoff_latitude   2000 non-null   float32       
 6   passenger_count    2000 non-null   int16         
dtypes: datetime64[ns](1), float32(5), int16(1)
memory usage: 58.7 KB


In [9]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21,-73.844315,40.721317,-73.841614,40.712276,1
1,16.9,2010-01-05 16:52:16,-74.016045,40.711304,-73.979271,40.782005,1
2,5.7,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2
3,7.7,2012-04-21 04:30:42,-73.987129,40.733143,-73.99157,40.758091,1
4,5.3,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1


In [10]:
df = df.dropna(how='any')

In [11]:
df = df.reset_index(drop=True)

In [12]:
df = df.sort_values('pickup_datetime', ascending=True, ignore_index=True)

In [13]:
lat_long_cols = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude'
]

In [14]:
df = df.merge(df.pickup_datetime.apply(get_date_stuff), left_index=True, right_index=True)

In [15]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Year,Month,Day,Hour,Quarter
0,14.5,2009-01-02 10:13:46,-73.947044,40.780106,-73.989037,40.751587,2,0,1,2,10,1
1,3.7,2009-01-03 13:27:00,-73.963821,40.774162,-73.957649,40.782627,1,0,1,3,13,1
2,3.7,2009-01-06 19:06:00,-73.981186,40.759338,-73.985878,40.758652,2,0,1,6,19,1
3,4.5,2009-01-06 22:30:00,-73.998962,40.738312,-73.997162,40.747028,2,0,1,6,22,1
4,4.9,2009-01-09 14:41:24,-73.95826,40.768902,-73.954742,40.77935,1,0,1,9,14,1


In [19]:
df['crow_distance'] = df[lat_long_cols].apply(lambda d: distance.distance(
                                    (d['pickup_latitude'], d['pickup_longitude']),
                                    (d['dropoff_latitude'], d['dropoff_longitude'])).km, axis=1)

In [20]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Year,Month,Day,Hour,Quarter,crow_distance
0,14.5,2009-01-02 10:13:46,-73.947044,40.780106,-73.989037,40.751587,2,0,1,2,10,1,4.753977
1,3.7,2009-01-03 13:27:00,-73.963821,40.774162,-73.957649,40.782627,1,0,1,3,13,1,1.074757
2,3.7,2009-01-06 19:06:00,-73.981186,40.759338,-73.985878,40.758652,2,0,1,6,19,1,0.403474
3,4.5,2009-01-06 22:30:00,-73.998962,40.738312,-73.997162,40.747028,2,0,1,6,22,1,0.979841
4,4.9,2009-01-09 14:41:24,-73.95826,40.768902,-73.954742,40.77935,1,0,1,9,14,1,1.197686


In [20]:
lat_cols = [
    'pickup_latitude',
    'dropoff_latitude'
]
long_cols = [
    'pickup_longitude',
    'dropoff_longitude'
]
def check_latlongs(d):
    if np.abs(d[lat_cols]) > 90.:
        return True
    elif np.abs(d[long_cols]) > 180.:
        return True
    else:
        return False

In [21]:
df['lat_long_wrong'] = False

In [23]:
df['lat_long_wrong'] = df[lat_long_cols].apply(check_latlongs, axis=1)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
df.head()

In [None]:
df.to_feather(data_pth/'checkpoints/train_data_no_nans.feather')

In [6]:
df = pd.read_feather(data_pth/'checkpoints/train_data_no_nans.feather')

In [7]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,30.200001,2009-01-01 00:00:27,-73.782104,40.644882,-73.963562,40.67635,1
1,15.0,2009-01-01 00:00:46,-73.953735,40.806763,-73.989426,40.769543,1
2,4.2,2009-01-01 00:00:49,-73.993187,40.72784,-73.990784,40.730007,1
3,5.8,2009-01-01 00:01:04,-73.995132,40.734112,-73.99823,40.722874,2
4,14.6,2009-01-01 00:01:04,-73.972481,40.742744,-73.918938,40.764496,1


In [None]:
df = df.drop(['pickup_latitude', 'pickup_longitude'], axis=1)