In [2]:
import pandas as pd
import numpy as np
import re
import math
from scipy import spatial
import time
from ast import literal_eval
import itertools

# Filtering Data

Using the randomly picked 350,000 taxi trips from each month, we want to find which ones:
- Start and end near subway stations 
- Require transfers

In [4]:
# Load the inputs for one monthly sample: taxi trips, subway stations, and
# the table of line-to-line transfers.
i=1  # sample number; substituted into the input and output file names
tripColumns = ['pickup_datetime','dropoff_datetime','passenger_count',
               'trip_time_in_secs','trip_distance','pickup_longitude',
               'pickup_latitude','dropoff_longitude','dropoff_latitude']
tripData = pd.read_csv('../Data/trip_data_sample%d.csv'%i, usecols=tripColumns)

stationColumns = ['stop_id','stop_name','stop_lat','stop_lon','location_type',
                  'parent_station','lines']
stationData = pd.read_csv('../Data/stops_updated.txt', usecols=stationColumns)
# Keep only rows with location_type == 1 (presumably the GTFS "station"
# entries rather than individual platforms -- TODO confirm).
# The explicit .copy() makes the result an independent frame, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
stationData = stationData[stationData['location_type']==1].copy()
# 'lines' is stored in the file as the text of a Python literal; parse it
# back into a Python object for each station.
stationData['lines'] = stationData['lines'].apply(literal_eval)

# NOTE(review): unlike stationData['lines'], start_lines / end_lines are left
# exactly as read from the CSV (not passed through literal_eval). Confirm the
# file stores them in a form that is meant to be iterated directly.
transfers = pd.read_csv('../Data/transfers_updated.txt',usecols = ['start_lines','end_lines'])

We calculate the distance from each subway station to the starting and ending points of the taxi ride to pick out the closest station and then merge the details of the closest station to the taxi trip data.

In [182]:
# Build station-by-trip distance matrices used only to pick the nearest
# station for every pickup and dropoff point.
#
# A degree of longitude covers less ground than a degree of latitude by a
# factor of cos(latitude). Measuring Euclidean distance on raw (lat, lon)
# degrees therefore over-weights east-west separation and can select the
# wrong "nearest" station. Scaling longitudes by cos(reference latitude)
# (equirectangular approximation) makes both axes comparable over a
# city-sized area. The resulting values are in "degrees of latitude", not
# miles; they are only meaningful for ranking stations.
lonScale = np.cos(np.radians(stationData['stop_lat'].mean()))

stationPoints = np.column_stack((stationData['stop_lat'], stationData['stop_lon']*lonScale))
pickupPoints = np.column_stack((tripData['pickup_latitude'], tripData['pickup_longitude']*lonScale))
dropoffPoints = np.column_stack((tripData['dropoff_latitude'], tripData['dropoff_longitude']*lonScale))

# Rows are labelled by stop_id; columns are the positional trip numbers.
# NOTE: each matrix is dense (stations x trips), so memory use grows with
# the product of the two table sizes.
startDistances = pd.DataFrame(spatial.distance.cdist(stationPoints, pickupPoints), index=stationData['stop_id'])
endDistances = pd.DataFrame(spatial.distance.cdist(stationPoints, dropoffPoints), index=stationData['stop_id'])

In [183]:
# For every trip (one column of the distance matrix), take the row label
# -- a stop_id -- at which the distance is smallest.
nearestStartId = startDistances.idxmin(axis=0)
nearestEndId = endDistances.idxmin(axis=0)

# The results are indexed by column number, which lines up with tripData's
# row index on assignment.
tripData['starting_station'] = nearestStartId
tripData['ending_station'] = nearestEndId

In [184]:
# Station attributes that get attached to every trip, once for the pickup
# side and once for the dropoff side.
stationDetails = stationData[['stop_id','stop_name','stop_lat','stop_lon','lines']]

# Pickup side: no column names clash yet, so the station columns are added
# under their plain names.
tripData = tripData.merge(stationDetails, left_on='starting_station', right_on='stop_id')

# Dropoff side: the station columns now clash with those added above, so the
# earlier ones receive the '_start' suffix and the new ones '_end'.
tripData = tripData.merge(stationDetails, left_on='ending_station', right_on='stop_id',
                          suffixes=('_start','_end'))

# The helper key columns are no longer needed once the details are joined.
tripData = tripData.drop(['starting_station', 'ending_station'], axis=1)

Using the haversine formula, we can calculate the distance in miles between each pickup/dropoff point and its closest subway station.

In [185]:
R = 3958.754641 #radius of earth in miles

def _haversine_miles(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two sets of points.

    Implements the haversine formula with the module-level radius ``R``.

    Parameters
    ----------
    lat1, lon1 : coordinates of the first point(s), in degrees.
    lat2, lon2 : coordinates of the second point(s), in degrees.
        Scalars, numpy arrays or pandas Series are accepted; array-like
        inputs are combined element-wise.

    Returns
    -------
    Distance(s) in miles, with the same shape as the inputs.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    # "a" is the haversine of the central angle; it is computed once and
    # reused for both arguments of arctan2.
    a = np.sin((lat2-lat1)/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin((lon2-lon1)/2)**2
    return 2*R*np.arctan2(np.sqrt(a), np.sqrt(1-a))

# Distance from the pickup point to its matched starting station.
tripData['dist_start'] = _haversine_miles(tripData['pickup_latitude'], tripData['pickup_longitude'],
                                          tripData['stop_lat_start'], tripData['stop_lon_start'])

# Distance from the dropoff point to its matched ending station.
tripData['dist_end'] = _haversine_miles(tripData['dropoff_latitude'], tripData['dropoff_longitude'],
                                        tripData['stop_lat_end'], tripData['stop_lon_end'])

We can then check whether the starting and ending stations have any subway lines in common. If they do, the trip is direct; if not, we check whether it can be made with a single transfer or whether multiple transfers are required.

In [186]:
# Collect every distinct (start_line, end_line) pair that appears in the
# transfers table, in first-seen order.
#
# Each row contributes the Cartesian product of its start_lines and
# end_lines entries. Rows are walked by zipping the two columns, which does
# not depend on the frame's index labels and runs on both Python 2 and 3
# (the previous xrange loop is a NameError on Python 3).
# NOTE(review): start_lines / end_lines are iterated exactly as loaded;
# confirm they hold iterables of line names rather than the text of a list.
transfer_list = []
seen_pairs = set()  # O(1) membership test instead of rescanning the list
for start_lines, end_lines in zip(transfers['start_lines'], transfers['end_lines']):
    for pair in itertools.product(start_lines, end_lines):
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            transfer_list.append(pair)

In [187]:
def num_transfer(row):
    """Classify a trip by how its start and end stations are connected.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) providing 'lines_start' and
        'lines_end', each an iterable of line identifiers.

    Returns
    -------
    str
        'direct' if the two stations share at least one line;
        'single transfer' if some (start line, end line) pair appears in the
        module-level ``transfer_list``;
        'multiple transfers' otherwise.
    """
    start_lines = row['lines_start']
    end_lines = row['lines_end']

    # Any line serving both stations means no transfer is needed.
    shared_lines = set(start_lines).intersection(end_lines)
    if shared_lines:
        return 'direct'

    # Otherwise look for a start/end line pair with a known transfer.
    line_pairs = itertools.product(start_lines, end_lines)
    if set(transfer_list).isdisjoint(line_pairs):
        return 'multiple transfers'
    return 'single transfer'

# Classify every trip row by row and store the label alongside the trip.
transferLabels = tripData.apply(num_transfer, axis=1)
tripData['num_transfers'] = transferLabels

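Finally, we save the full annotated data, then filter out any trips that started or ended more than 0.25 miles (~5 minute walk) from the closest subway station, keeping only the trips that require a single transfer (trips with direct subway routes or multiple transfers are dropped), to get our final data set.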

In [None]:
# Save the full annotated table (before any filtering) for this sample.
analyzedPath = '../Data/Analyzed/trip_data_analyzed%d.csv'%i
tripData.to_csv(analyzedPath)

In [190]:
# Keep only trips whose pickup and dropoff are both under 0.25 miles from
# their matched station and that are labelled as needing a single transfer.
nearStartStation = tripData['dist_start'] < 0.25
nearEndStation = tripData['dist_end'] < 0.25
needsOneTransfer = tripData['num_transfers'] == 'single transfer'
tripData = tripData[nearStartStation & nearEndStation & needsOneTransfer]

In [192]:
# Save the filtered trips for this sample.
filteredPath = '../Data/Filtered/trip_data_filtered%d.csv'%i
tripData.to_csv(filteredPath)