In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw ride sample; the relative path resolves against the current working directory.
uber = pd.read_csv('uber_rides_sample.csv')

In [3]:
# Inspect the column names of the loaded frame.
uber.columns

Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [4]:
def get_distance(lon1, lat1, lon2, lat2, radius=6373):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6373, roughly the Earth's radius in kilometres.

    Returns
    -------
    float
        Distance along the sphere's surface, in the unit of ``radius``.
        NaN inputs propagate to a NaN result.
    """
    from math import sin, cos, sqrt, atan2, radians

    lon1, lat1, lon2, lat2 = (radians(v) for v in (lon1, lat1, lon2, lat2))

    dlon = lon1 - lon2
    dlat = lat1 - lat2
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make sqrt(1 - a) raise ValueError.
    # An explicit comparison (rather than min/max) keeps NaN untouched.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius * c

In [5]:
def add_distance(df):
    """Attach a ``distance`` column with the pickup-to-dropoff distance per row.

    Each row's pickup and dropoff coordinates are handed to ``get_distance``.
    The frame is modified in place and nothing is returned.
    """
    pickup_lon, dropoff_lon = df['pickup_longitude'], df['dropoff_longitude']
    pickup_lat, dropoff_lat = df['pickup_latitude'], df['dropoff_latitude']

    # get_distance expects (lon1, lat1, lon2, lat2), so pair each longitude
    # with its matching latitude before unpacking.
    df['distance'] = [
        get_distance(*point)
        for point in zip(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)
    ]

In [6]:
def create_uber(df, out_path='uber.csv'):
    """Clean the raw ride data, add a trip distance, and write the result to CSV.

    Steps:
      1. Strip surrounding whitespace from column names.
      2. Drop the ``Unnamed: 0``, ``key`` and ``passenger_count`` columns
         (silently skipped when absent).
      3. Rename ``pickup_datetime`` -> ``pickup_time`` and
         ``fare_amount`` -> ``charge``.
      4. Drop rows containing any missing value.
      5. Parse ``pickup_time`` into timezone-naive datetimes.
      6. Add a ``distance`` column via ``add_distance``.
      7. Write the frame to ``out_path`` without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ride data. It is not modified; all work happens on a copy.
    out_path : str or path-like, optional
        Destination of the cleaned CSV. Defaults to ``'uber.csv'``.

    Returns
    -------
    None
    """
    drop_cols = ["Unnamed: 0", "key", "passenger_count"]
    renames = {"pickup_datetime": "pickup_time", "fare_amount": "charge"}

    cleaned = (
        df.rename(columns=lambda name: name.strip())
        .drop(columns=drop_cols, errors="ignore")
        .rename(columns=renames)
        .dropna()
        # astype(np.datetime64) uses a unit-less dtype, which pandas >= 2.0
        # rejects with TypeError. Parsing as UTC and then dropping the
        # timezone yields naive timestamps whether or not the input strings
        # carry an offset.
        .assign(
            pickup_time=lambda d: pd.to_datetime(
                d["pickup_time"], utc=True
            ).dt.tz_localize(None)
        )
    )

    # add_distance mutates the frame in place.
    add_distance(cleaned)
    cleaned.to_csv(out_path, index=False)

In [7]:
# Run the cleaning pipeline on the loaded frame; as a side effect it writes uber.csv.
create_uber(uber)

In [8]:
# Read the exported file back to check the written result.
pd.read_csv('uber.csv')

Unnamed: 0,charge,pickup_time,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance
0,7.5,2015-05-07 19:52:06,-73.999817,40.738354,-73.999512,40.723217,1.683851
1,7.7,2009-07-17 20:04:56,-73.994355,40.728225,-73.994710,40.750325,2.458361
2,12.9,2009-08-24 21:45:00,-74.005043,40.740770,-73.962565,40.772647,5.037958
3,5.3,2009-06-26 08:22:21,-73.976124,40.790844,-73.965316,40.803349,1.662205
4,16.0,2014-08-28 17:47:00,-73.925023,40.744085,-73.973082,40.761247,4.476855
...,...,...,...,...,...,...,...
199994,3.0,2012-10-28 10:49:00,-73.987042,40.739367,-73.986525,40.740297,0.112245
199995,7.5,2014-03-14 01:09:00,-73.984722,40.736837,-74.006672,40.739620,1.875639
199996,30.9,2009-06-29 00:42:00,-73.986017,40.756487,-73.858957,40.692588,12.854353
199997,14.5,2015-05-20 14:56:25,-73.997124,40.725452,-73.983215,40.695415,3.540827
