In [1]:
import pandas as pd
import numpy as np
import time
from math import sin, cos, sqrt, atan2, radians

In [2]:
nyc_taxi = pd.read_csv('nyc_taxi.csv')
others = pd.read_csv('cab_rides.csv')

In [3]:
others.dtypes

distance            float64
cab_type             object
time_stamp            int64
destination          object
source               object
price               float64
surge_multiplier    float64
id                   object
product_id           object
name                 object
dtype: object

In [4]:
def convertEpochToRegular(epoch):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

others['time_stamp'] = others['time_stamp'].map(convertEpochToRegular)

In [10]:
others['time_stamp'].value_counts()

50874-04-17 16:09:11    56
50878-08-07 13:33:10    51
50873-08-08 14:07:58    42
50872-11-29 12:09:39    42
50874-04-15 14:13:58    42
                        ..
50921-02-19 01:24:43     1
50880-05-14 09:36:24     1
50921-03-25 17:14:37     1
50889-03-16 18:01:50     1
50879-05-22 01:32:15     1
Name: time_stamp, Length: 114335, dtype: int64

In [6]:
others.to_csv('others_processed.csv', index=False)

In [43]:
def getDistance(arr):
    # approximate radius of earth in km
    R = 6373.0

    lat1 = radians(arr[0])
    lon1 = radians(arr[1])
    lat2 = radians(arr[2])
    lon2 = radians(arr[3])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    return round(distance,2)



In [44]:
nyc_taxi.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
combined              object
distance             float64
dtype: object

In [45]:
nyc_taxi['combined'] = nyc_taxi[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']].values.tolist()

In [46]:
nyc_taxi['distance'] = nyc_taxi['combined'].map(getDistance)

In [47]:
nyc_taxi['distance']

0          1.03
1          8.45
2          1.39
3          2.80
4          2.00
          ...  
693065     3.23
693066    12.73
693067     0.32
693068     2.27
693069     1.50
Name: distance, Length: 693070, dtype: float64

In [48]:
nyc_taxi.to_csv('nyc_taxi_processed.csv', index=False)