In [1]:
import pandas as pd
import numpy as np
import time
from math import sin, cos, sqrt, atan2, radians

In [2]:
# Load the three raw CSV inputs from the working directory, in the same
# order as before: NYC taxi data, its companion test file, and the cab rides.
nyc_taxi, nyc_taxi_test, others = (
    pd.read_csv(csv_path)
    for csv_path in ('nyc_taxi.csv', 'nyc_taxi_test.csv', 'cab_rides.csv')
)

In [3]:
# Inspect the column dtypes of the cab-rides frame. Note that time_stamp is
# still a raw int64 at this point (it is converted to a string further down).
others.dtypes

distance            float64
cab_type             object
time_stamp            int64
destination          object
source               object
price               float64
surge_multiplier    float64
id                   object
product_id           object
name                 object
dtype: object

In [4]:
def convertEpochToRegular(epoch):
    """Format an epoch timestamp given in milliseconds as 'YYYY-MM-DD HH:MM:SS'.

    The value is divided by 1000 to obtain seconds and then broken down with
    ``time.localtime``, so the resulting string is expressed in the time zone
    of the machine running the notebook (it is not normalised to UTC).
    Sub-second precision is not part of the output format.

    Parameters
    ----------
    epoch : int or float
        Milliseconds since the Unix epoch.

    Returns
    -------
    str
        The formatted local date-time string.
    """
    epoch_seconds = epoch / 1000
    local_parts = time.localtime(epoch_seconds)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_parts)

# Replace the millisecond epoch integers in time_stamp with formatted strings,
# one element at a time. This overwrites the column in place, so the cell is
# not re-runnable: a second run would feed strings into convertEpochToRegular,
# whose division by 1000 then fails. Reload the CSV before running it again.
others['time_stamp'] = others['time_stamp'].apply(convertEpochToRegular)

In [5]:
# Count how many rows share each formatted timestamp string
# (most frequent first, as value_counts sorts by count).
others['time_stamp'].value_counts()

2018-11-27 03:21:14    156
2018-11-28 22:11:08    156
2018-11-29 06:47:08    156
2018-11-29 06:29:08    156
2018-11-27 03:39:14    156
                      ... 
2018-12-02 16:03:04      6
2018-12-16 16:55:06      4
2018-11-28 13:32:17      3
2018-11-30 17:17:58      1
2018-12-13 19:10:13      1
Name: time_stamp, Length: 31350, dtype: int64

In [6]:
# Persist the cab-rides frame, now carrying the string time_stamp column.
# index=False keeps the DataFrame's row index out of the CSV.
others.to_csv('others_processed.csv', index=False)

In [7]:
def getDistance(arr, radius_km=6373.0):
    """Return the great-circle (haversine) distance between two points, in km.

    Parameters
    ----------
    arr : indexable of four numbers
        ``[lat1, lon1, lat2, lon2]`` in decimal degrees. Only positions 0-3
        are read.
    radius_km : float, optional
        Radius of the sphere used to scale the central angle. Defaults to
        6373.0, the approximate radius of the earth in km.

    Returns
    -------
    float
        The distance rounded to 2 decimal places. If any coordinate is NaN
        the result is NaN.
    """
    lat1, lon1, lat2, lon2 = (radians(arr[i]) for i in range(4))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine term; mathematically it always lies in [0, 1].
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Floating-point rounding can push `a` marginally outside [0, 1]
    # (near-antipodal points, or latitudes beyond +/-90 degrees), which would
    # make one of the sqrt() calls below raise "math domain error".
    # Plain comparisons are used instead of min()/max() on purpose: they are
    # all False for NaN, so a NaN coordinate still yields NaN rather than
    # being silently turned into a distance of 0.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    # Central angle between the two points, in radians.
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return round(radius_km * c, 2)



In [8]:
# Inspect the column dtypes of the nyc_taxi frame; the four pickup/dropoff
# coordinate columns used for the distance computation are float64.
nyc_taxi.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [9]:
# Pack each row's coordinates into one list, ordered
# [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon] -- the positional layout
# getDistance reads. The same column is added to both taxi frames.
coordinate_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
for taxi_frame in (nyc_taxi, nyc_taxi_test):
    taxi_frame['combined'] = taxi_frame[coordinate_columns].values.tolist()

In [10]:
# Derive the trip distance for every row of both taxi frames by passing each
# packed coordinate list in 'combined' through getDistance.
for taxi_frame in (nyc_taxi, nyc_taxi_test):
    taxi_frame['distance'] = taxi_frame['combined'].map(getDistance)

In [11]:
# Display the freshly computed distance column (getDistance output,
# already rounded to 2 decimals) as a quick sanity check.
nyc_taxi['distance']

0          1.03
1          8.45
2          1.39
3          2.80
4          2.00
          ...  
693065     3.23
693066    12.73
693067     0.32
693068     2.27
693069     1.50
Name: distance, Length: 693070, dtype: float64

In [12]:
# Write the nyc_taxi frame, including the new distance column, to CSV without
# the row index.
# NOTE(review): the helper 'combined' list column is still on the frame here,
# so it is written out as well -- confirm that is intended.
nyc_taxi.to_csv('nyc_taxi_processed.csv', index=False)

In [13]:
# Write the nyc_taxi_test frame, including the new distance column, to CSV
# without the row index.
# NOTE(review): the helper 'combined' list column is still on the frame here,
# so it is written out as well -- confirm that is intended.
nyc_taxi_test.to_csv('nyc_taxi_test_processed.csv', index=False)