## Define a function to get the real distance between to lat/long points
- Manhattan distance should be useful, but I think we can do better with real distance
- Here we compare a manual calculation to the geopy library

In [5]:
from math import sin, cos, sqrt, atan2, radians
import geopy.distance

def geo_manhattan_distance(lat1, lat2, long1, long2):
    """
    returns the manhattan distance between two geo points
    """
    return abs(lat2 - lat1) + abs(long2 - long1)

def geopy_dist(coord1, coord2):
    try:
        return geopy.distance.distance(coord1, coord2).kilometers
    except:
        return -1

def haversine(lat1, lon1, lat2, lon2, km_const=6371.0):
    lat1, lon1, lat2, lon2 = map(abs, [lat1, lon1, lat2, lon2])
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = km_const * c
    return mi

## Import our dataset
- our dataset consists of several files:
    - train.csv: our training data
    - test.csv: our testing data
    - sample_submissions.csv: A sample submission file in the correct format (columns key and fare_amount). This dummy file 'predicts' fare_amount to be $11.35 for all rows, which is the mean fare_amount from the training set.

In [6]:
import pandas as pd
import os
import sys
import numpy as np

TOTAL_ROWS = 55423855

DATA_FILES_PATH = 'projectDataFiles/'

# training data types
TRAINING_TYPES = {
    'fare_amount': 'float32',
    'pickup_datetime': 'str',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'passenger_count': 'uint8'
}

COLUMNS = list(TRAINING_TYPES.keys()) + ['real_dist']

FEATURES = [item for item in COLUMNS if item != 'fare_amount']

LABEL = 'fare_amount'

def import_training_dataset_limit(file_path, row_limit=100000):
    """
    function to import the dataset into a pandas dataframe.

    Takes a row limit to limit the number of rows read.
    """
    if row_limit:
        return pd.read_csv(file_path, nrows=row_limit)
    else:
        return pd.read_csv(file_path)


def get_df_list(file_path, chunksize=1000000):
    df_list = []
    pd.set_option('use_inf_as_na', True)
    for df_chunk in pd.read_csv(file_path, chunksize=chunksize, dtype=TRAINING_TYPES):
        df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0, 16)
        df_chunk['pickup_datetime'] = pd.to_datetime(df_chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
        df_chunk['real_dist'] = haversine(df_chunk['pickup_latitude'], df_chunk['pickup_longitude'], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude'])
        df_chunk['hour'] = df_chunk['pickup_datetime'].dt.hour
        add_col = pd.get_dummies(df_chunk['hour'], prefix='hour')
        df_chunk = pd.concat([df_chunk, add_col], axis=1)
        df_chunk.drop(['hour', 'key', 'pickup_datetime'], axis=1, inplace=True)
        df_chunk.dropna(axis=1, how='any', inplace=True)
        df_list.append(df_chunk)
    return df_list
        

def read_feathered_data(file_path):
    return pd.read_feather(file_path)

def feather_dataset(dataframe, file_out):
    dataframe.to_feather(file_out)

# import the dataset as a list of chunks, from here we can do our processing at a chunk level
print('Importing Datasets...')
TRAINING_LIST = get_df_list(f'{DATA_FILES_PATH}train.csv')
TEST = pd.concat(get_df_list(f'{DATA_FILES_PATH}test.csv'))

TRAINING_LIST[0].head()
    

Importing Datasets...


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,passenger_count,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,4.5,-73.844315,40.721317,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,16.9,-74.016045,40.711304,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,5.7,-73.982735,40.761269,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7.7,-73.987129,40.733143,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5.3,-73.968094,40.768009,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
TEST.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,real_dist,hour_0,hour_1,hour_2,hour_3,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,-73.97332,40.763805,-73.98143,40.743835,1,2.323358,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-73.986862,40.719383,-73.998886,40.739201,1,2.425299,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-73.982521,40.751259,-73.979652,40.74614,1,0.618403,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-73.981163,40.767807,-73.990448,40.751637,1,1.960912,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-73.966049,40.789776,-73.988564,40.744427,1,5.387211,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Perform a SGD partial fit
- SGD stands for stochastic gradient descent
- Here we are feeding our chunks into the partial fit

In [8]:
from sklearn.linear_model import SGDRegressor

def fit_training_chunks(chunk_list):
    my_sgd_regressor = SGDRegressor()
    for chunk in chunk_list:
        print(chunk.columns.difference(['pickup_datetime', LABEL]))
        my_sgd_regressor.partial_fit(chunk[chunk.columns.difference(['pickup_datetime', LABEL])], chunk[LABEL])
    y_predict = my_sgd_regressor.predict(TEST[TEST.columns.difference(['pickup_datetime', LABEL])])
    return y_predict


fit_training_chunks(TRAINING_LIST)

Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')
Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')
Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7'

Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')
Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')
Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7'

Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')
Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')
Index(['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7'

ValueError: shapes (9914,30) and (27,) not aligned: 30 (dim 1) != 27 (dim 0)