## Import our dataset
- our dataset consists of several files:
    - train.csv: our training data
    - test.csv: our testing data
    - sample_submissions.csv: A sample submission file in the correct format (columns key and fare_amount). This dummy file 'predicts' fare_amount to be $11.35 for all rows, which is the mean fare_amount from the training set.

In [9]:
import pandas as pd
import os

DATA_FILES_PATH = 'projectDataFiles/'

def import_training_dataset(file_path, row_limit=None):
    """
    function to import the dataset into a pandas dataframe.

    Parameters:
        file_path: path of the CSV file, passed straight to ``pd.read_csv``.
        row_limit: maximum number of data rows to read. Defaults to ``None``,
            which reads the whole file.

    Returns:
        A pandas DataFrame holding at most ``row_limit`` rows of the file.
    """
    # nrows=None is pandas' own "no limit" value, so the default reads every row.
    return pd.read_csv(file_path, nrows=row_limit)

# Load the training data into the TRAIN DataFrame. For now only the first
# 1,000,000 rows are read (possibly chunk and feather to reduce loading time).
TRAIN = import_training_dataset(f'{DATA_FILES_PATH}train.csv', 1000000)

# show the head of the dataset to see its columns
TRAIN.head()
    

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


## Define a function to get the manhattan distance between two lat/long points
- Manhattan distance should be fairly realistic for New York because of its grid-like street layout, but we might want to use the real travel distance between locations somehow

In [10]:
def geo_manhattan_distance(lat1, lat2, long1, long2):
    """
    returns the manhattan (L1) distance between two geo points

    The result is the absolute latitude difference plus the absolute
    longitude difference, expressed in the same units as the inputs.
    """
    lat_delta = abs(lat2 - lat1)
    long_delta = abs(long2 - long1)
    return lat_delta + long_delta


# try it on the first trip of the training data
geo_manhattan_distance(
    TRAIN['pickup_latitude'].iloc[0],
    TRAIN['dropoff_latitude'].iloc[0],
    TRAIN['pickup_longitude'].iloc[0],
    TRAIN['dropoff_longitude'].iloc[0],
)



0.011741999999998143

## Define a function to get the real distance between two lat/long points
- Manhattan distance should be useful, but I think we can do better with real distance
- Here we compare a manual calculation to the geopy library

In [11]:
from math import sin, cos, sqrt, atan2, radians
import geopy.distance

def real_distance(lat1, lat2, long1, long2):
    """
    returns the great-circle distance in km between two geo points

    Uses the haversine formula on a sphere of radius R.

    Parameters:
        lat1, lat2: latitudes of the two points, in signed degrees.
        long1, long2: longitudes of the two points, in signed degrees.

    Returns:
        The distance between the points in kilometers.
    """
    R = 6373.0  # approximate radius of earth in km
    # Convert the signed values as they are: taking abs() first would mirror
    # points across the equator / prime meridian and understate the distance
    # whenever the two points lie on opposite sides of either.
    rad_lat1, rad_lat2, rad_long1, rad_long2 = (
        radians(meas) for meas in (lat1, lat2, long1, long2)
    )
    long_dist = rad_long2 - rad_long1
    lat_dist = rad_lat2 - rad_lat1
    a = sin(lat_dist / 2)**2 + cos(rad_lat1) * cos(rad_lat2) * sin(long_dist / 2)**2
    # Guard against rounding pushing a just above 1 (near-antipodal points),
    # which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

def geopy_dist(coord1, coord2):
    """
    returns the distance in km between two points as computed by geopy

    coord1 and coord2 are handed unchanged to ``geopy.distance.distance``.
    NOTE(review): presumably (latitude, longitude) pairs - confirm against callers.
    """
    separation = geopy.distance.distance(coord1, coord2)
    return separation.kilometers

# compare the manual calculation with geopy on the first trip of the training data
dist_test_1 = real_distance(
    TRAIN['pickup_latitude'].iloc[0],
    TRAIN['dropoff_latitude'].iloc[0],
    TRAIN['pickup_longitude'].iloc[0],
    TRAIN['dropoff_longitude'].iloc[0],
)

dist_test_2 = geopy_dist(
    (TRAIN['pickup_latitude'].iloc[0], TRAIN['pickup_longitude'].iloc[0]),
    (TRAIN['dropoff_latitude'].iloc[0], TRAIN['dropoff_longitude'].iloc[0]),
)

print(f'Manual: {dist_test_1} km\nGeopy: {dist_test_2} km')

Manual: 1.0310875150001755km
Geopy: 1.0296007434994463
