In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

Get Distance Function

In [2]:
def get_distance(lon1, lat1, lon2, lat2, radius_km=6373):
    """Great-circle (haversine) distance between two points, in kilometres.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in degrees.
    radius_km : float, optional
        Sphere radius used to scale the central angle. Defaults to 6373,
        the value this notebook has always used.

    Returns
    -------
    float or numpy.ndarray
        A plain ``float`` when all inputs are scalars; otherwise an array
        following NumPy broadcasting of the inputs.
    """
    # NumPy ufuncs (instead of the scalar-only math module) let the same code
    # handle single values and whole columns.
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lon1, lat1, lon2, lat2)
    )

    dlon = lon1 - lon2
    dlat = lat1 - lat2
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt(1 - a) invalid; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    distance = radius_km * c
    # Keep returning a built-in float for scalar calls, as before.
    return float(distance) if np.ndim(distance) == 0 else distance
    

Get all Yellow Taxi CSV links

In [3]:
def get_csv_links():
    """Collect the yellow-taxi CSV links published on the TLC trip-record page.

    Requests https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page,
    reads the ``href`` of every anchor tag, and keeps only those that end in
    ``yellow_tripdata_<year>-<two digits>.csv`` for years 2009 through 2015.

    Returns
    -------
    list of str
        Matching hrefs, in the order they appear on the page.

    Raises
    ------
    requests.HTTPError
        If the page responds with an error status.
    requests.RequestException
        If the request fails or times out.
    """
    url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    page = requests.get(url, timeout=30)
    # Fail loudly on an error page instead of silently returning no links.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Only yellow-taxi CSVs from 2009 - 2015 are wanted.
    pattern = re.compile(r'.yellow_tripdata_(200[9]|201[0-5])-\d\d\.csv$')

    # href=True skips anchors that have no href attribute; for those,
    # a.get('href') is None and re.search would raise TypeError.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if pattern.search(anchor['href'])
    ]
    

Remove trips that start or end outside the bounding box

In [4]:
# Bounding box in decimal degrees; rows with a coordinate outside it are removed.
westlimit = -74.242330
eastlimit = -73.717047
southlimit = 40.560445
northlimit = 40.908524

def fix_longitude(input_longitude, west=None, east=None):
    """Return the longitude as a float, or NaN if it is unusable.

    Parameters
    ----------
    input_longitude : object
        Value to convert with ``float()``.
    west, east : float, optional
        Inclusive lower / upper bounds. When omitted, the module-level
        ``westlimit`` / ``eastlimit`` are used.

    Returns
    -------
    float
        The converted value when it lies within ``[west, east]``; ``nan``
        when the value cannot be converted or falls outside that range.
    """
    try:
        longitude = float(input_longitude)
    # Catch only conversion failures; a bare except would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    lower = westlimit if west is None else west
    upper = eastlimit if east is None else east
    if longitude < lower or longitude > upper:
        return np.nan
    return longitude


def fix_latitude(input_latitude, south=None, north=None):
    """Return the latitude as a float, or NaN if it is unusable.

    Parameters
    ----------
    input_latitude : object
        Value to convert with ``float()``.
    south, north : float, optional
        Inclusive lower / upper bounds. When omitted, the module-level
        ``southlimit`` / ``northlimit`` are used.

    Returns
    -------
    float
        The converted value when it lies within ``[south, north]``; ``nan``
        when the value cannot be converted or falls outside that range.
    """
    try:
        latitude = float(input_latitude)
    # Catch only conversion failures; a bare except would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    lower = southlimit if south is None else south
    upper = northlimit if north is None else north
    if latitude < lower or latitude > upper:
        return np.nan
    return latitude


def fix_df(df):
    """Clean the four coordinate columns of ``df`` in place.

    Each pickup/dropoff longitude and latitude value is passed through
    ``fix_longitude`` / ``fix_latitude``; afterwards every row that contains
    a missing value in any column is dropped. The frame is modified in place
    and nothing is returned.
    """
    column_fixers = (
        ('pickup_longitude', fix_longitude),
        ('dropoff_longitude', fix_longitude),
        ('pickup_latitude', fix_latitude),
        ('dropoff_latitude', fix_latitude),
    )
    for column, fixer in column_fixers:
        df[column] = df[column].apply(fixer)

    # Rows whose coordinates were replaced by NaN above disappear here.
    df.dropna(inplace=True)

Add a distance column to the dataframe (computed row by row, because the math module doesn't support Series)

In [5]:
def add_distance(df):
    """Add a ``distance`` column to ``df`` in place.

    For every row, ``get_distance`` is called with the pickup and dropoff
    coordinates and the result is stored in the new column. Nothing is
    returned.
    """
    pickup_lon = df['pickup_longitude']
    dropoff_lon = df['dropoff_longitude']
    pickup_lat = df['pickup_latitude']
    dropoff_lat = df['dropoff_latitude']

    # get_distance takes (lon1, lat1, lon2, lat2), hence the zip order.
    df['distance'] = [
        get_distance(start_lon, start_lat, end_lon, end_lat)
        for start_lon, start_lat, end_lon, end_lat
        in zip(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)
    ]

Download a CSV link into a pandas dataframe, clean it, then write the result to a clean CSV file

In [6]:
def create_csv(csv_link, sample_size=3000, random_state=None):
    """Download one trip-record CSV, clean it, sample it, and write it out.

    Steps: read the CSV at ``csv_link``; strip whitespace from column names;
    drop the columns listed in ``to_drop``; rename the remaining columns to a
    common schema; drop rows with missing values; parse the pickup/dropoff
    timestamps; remove trips outside the bounding box (``fix_df``); sample
    rows; add the ``distance`` column (``add_distance``); write the result to
    a CSV named after the ``yellow...`` part of the link.

    Parameters
    ----------
    csv_link : str
        URL or path of the source CSV. Must contain ``yellow`` followed by
        at least one character; that suffix becomes the output file name.
    sample_size : int, optional
        Maximum number of rows to keep (default 3000). If fewer rows survive
        cleaning, all of them are kept.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible sampling.
        The default ``None`` keeps the sampling unseeded.

    Raises
    ------
    ValueError
        If no output file name can be derived from ``csv_link``.
    """
    # Derive the output name first so a bad link fails before the download.
    m = re.search(r'(yellow.+)', csv_link)
    if not m:
        raise ValueError(
            "cannot derive an output file name from link: {!r}".format(csv_link)
        )
    title = m.group(1)

    #read the data into a dataframe
    df = pd.read_csv(csv_link,on_bad_lines='skip')
    df = df.rename(columns = lambda x: x.strip())

    #drop and rename column
    to_drop = [
        "Unnamed: 0",
        "vendor_name",
        "vendor_id",
        "Vendor_id",
        'VendorID',
        "Trip_distance",
        "Trip_Distance",
        "trip_distance",
        "Rate_Code",
        "store_and_forward",
        "store_and_fwd_flag",
        "Payment_Type",
        "Fare_Amt",
        "surcharge",
        "mta_tax",
        "Tolls_Amt",
        "rate_code",
        "RatecodeID",
        "RateCodeID",
        "payment_type",
        "fare_amount",
        "extra",
        "tolls_amount",
        "improvement_surcharge",
        "Passenger_Count",
        "passenger_count"
    ]

    mapper = {
        "Trip_Pickup_DateTime" : "pickup_time",
        "tpep_pickup_datetime" : "pickup_time",
        "pickup_datetime": "pickup_time",
        "dropoff_datetime" : "dropoff_time",
        "Trip_Dropoff_DateTime" : "dropoff_time",
        "tpep_dropoff_datetime" : "dropoff_time",
        "Start_Lon" : "pickup_longitude",
        "Start_Lat" : "pickup_latitude",
        "End_Lon" : "dropoff_longitude",
        "End_Lat" : "dropoff_latitude",
        "Tip_Amt" : 'tip',
        "tip_amount" : "tip",
        "Total_Amt" : "charge",
        "total_amount" : "charge"
    }
    df = df.drop(to_drop, axis = 1,errors = "ignore")
    df = df.rename(mapper, axis = 1)
    df.dropna(inplace = True)

    #modify datatype
    # pd.to_datetime instead of astype(np.datetime64): casting to the
    # unit-less np.datetime64 dtype is rejected by newer pandas releases.
    for time_column in ("pickup_time", "dropoff_time"):
        df[time_column] = pd.to_datetime(df[time_column])

    #make sure the trip is within(40.560445, -74.242330) and (40.908524, -73.717047)
    fix_df(df)

    #sample up to `sample_size` rows
    # Cap n at the number of remaining rows; sample() raises ValueError when
    # asked for more rows than exist (without replacement).
    df = df.sample(n=min(sample_size, len(df)), random_state=random_state)

    #add distance
    add_distance(df)
    df.reset_index(inplace = True, drop = True)

    df.to_csv(title,index= False)

Run to get all yellow taxi data

In [7]:
from tqdm import tqdm
# Collect every matching CSV link, then clean and export each file in turn.
links = get_csv_links()
for link in tqdm(links):
    create_csv(link)
    

  create_csv(links[i])
100%|███████████████████████████████████████████████████████████████████████████████| 23/23 [1:14:43<00:00, 194.92s/it]


In [None]:
# NOTE(review): looks like a leftover scratch expression. DF and DROP are not
# defined anywhere in the visible notebook, so running this cell would
# presumably raise NameError. Confirm and remove.
DF,DROP([],)