# Process RTS Data

In [1]:
# Data processing and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import csv
from geopy.distance import vincenty

# Common DGLIM utilities
import os, sys
# Put the parent of sys.path[0] on the module search path (at position 1, right
# after the first entry) before importing dglim.
# NOTE(review): presumably the dglim module lives in that parent directory -- confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import dglim
# Tell dglim where the project root is, relative to this notebook.
# NOTE(review): assumed to drive dglim.datasets_path / loadData / saveData -- confirm in dglim.
dglim.setProjectPath('../../')

%matplotlib inline

### Load datasets

In [2]:
# Load the master dataset through the project's dglim helper. Later cells read
# its 'DGLIM ID', 'Latitude' and 'Longitude' columns.
master_df = dglim.loadData('Master Dataset')

## Get Distances to Nearest Bus Stop

***Get Bus Stop Data***

In [3]:
# Read the RTS bus stop table. The nearest-stop search uses its 'stop_lat' and
# 'stop_lon' columns.
# NOTE(review): the path is built by plain concatenation, so dglim.datasets_path
# is assumed to end with a path separator -- confirm.
stops_df = pd.read_csv(dglim.datasets_path + 'RTS Data/stops.csv')
# Preview the first few rows
stops_df.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url
0,1,1,Rosa Parks Downtown Station,Northbound SE 3rd ST @ Nearside SE 3rd ST,29.645567,-82.322697,,
1,2,2,Hampton Inn Hotel,Westbound SE 2nd AVE @ Nearside SE 1st ST,29.650346,-82.323804,,
2,3,3,Westbound SW 2nd Avenue @ SW 2nd Street,Westbound SW 2nd AVE @ Nearside SW 2nd ST,29.650397,-82.326459,,
3,4,4,The Continuum,Westbound SW 2nd AVE @ Nearside SW 6th ST,29.650325,-82.329674,,
4,6,6,Ayers Medical Plaza,Westbound SW 2nd AVE @ Nearside SW 8th ST,29.650335,-82.33279,,


***Find Distance to Closest Stop for each Business***

In [4]:
def findNearestBusStop(business_location, stops=None, distance_fn=None):
    """Find the bus stop closest to a business by brute-force search.

    Every stop is compared against ``business_location`` and the one with the
    smallest distance wins; on a tie the earliest stop is kept.

    Parameters
    ----------
    business_location : tuple
        ``(latitude, longitude)`` of the business.
    stops : pandas.DataFrame, optional
        Table with ``stop_lat`` and ``stop_lon`` columns. Defaults to the
        notebook-level ``stops_df``.
    distance_fn : callable, optional
        ``distance_fn(point_a, point_b)`` returning a value that supports
        ``<`` comparison. Defaults to the ``vincenty`` function imported at
        the top of the notebook.

    Returns
    -------
    tuple
        ``(closest_stop, closest_distance)`` where ``closest_stop`` is the row
        position of the nearest stop in ``stops`` and ``closest_distance`` is
        whatever ``distance_fn`` returned for it. When ``stops`` has no rows
        the result is ``(-1, np.inf)``.
    """
    if stops is None:
        stops = stops_df
    if distance_fn is None:
        distance_fn = vincenty

    closest_distance = np.inf
    closest_stop = -1

    # Pull the coordinate columns out once instead of re-indexing the frame on
    # every iteration. Positional access also keeps this correct when the
    # frame's index is not the default 0..n-1 range.
    latitudes = stops['stop_lat'].values
    longitudes = stops['stop_lon'].values

    # Visit *every* stop: the earlier bound of len(stops) - 1 combined with
    # range()'s exclusive stop silently skipped the last row.
    for i, (lat, lon) in enumerate(zip(latitudes, longitudes)):
        stop_location = (float(lat), float(lon))
        distance = distance_fn(business_location, stop_location)
        if distance < closest_distance:
            closest_distance = distance
            closest_stop = i
    return (closest_stop, closest_distance)

**WARNING:** This brute-force search compares every business against every bus stop, so it may take a while to run!

In [5]:
# A full progress bar is 50 dots, i.e. one dot per 2% of the job
progress_dots = 50

# Set this to limit the number of rows processed (set to -1 for no limit)
sample_size = -1

# Apply the limit up front so the progress bar reflects the work actually done.
rows_to_process = master_df if sample_size < 0 else master_df.head(sample_size)
total_rows = len(rows_to_process)

print("Beginning search")

# Sample progress bar to gauge actual progress
sys.stdout.write("[ " + ". " * progress_dots + "]\n")

# Find the distance between each business and the closest bus stop.
# Maps DGLIM ID -> (closest stop, distance) as returned by findNearestBusStop.
distancesDict = {}

sys.stdout.write("[ ")
dots_printed = 0
# Count rows with enumerate(): iterrows() yields index *labels*, which are not
# guaranteed to be consecutive row numbers, so they cannot drive the progress
# bar or the sample limit.
for rows_done, (_, act_bus) in enumerate(rows_to_process.iterrows(), 1):
    bus_loc = (float(act_bus['Latitude']), float(act_bus['Longitude']))
    # Businesses without coordinates are skipped and get no dictionary entry
    if not (np.isnan(bus_loc[0]) or np.isnan(bus_loc[1])):
        distancesDict[act_bus['DGLIM ID']] = findNearestBusStop(bus_loc)

    # Emit however many dots are now due. Integer arithmetic yields exactly
    # `progress_dots` dots in total and never divides by zero, because
    # total_rows >= 1 whenever the loop body runs.
    dots_due = (rows_done * progress_dots) // total_rows
    if dots_due > dots_printed:
        sys.stdout.write(". " * (dots_due - dots_printed))
        sys.stdout.flush()
        dots_printed = dots_due
sys.stdout.write("]\n")
print("Done!")

Beginning search
[ . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ]
[ . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ]
Done!


In [6]:
def _distance_in_meters(dglim_id):
    """Return a business's nearest-stop distance in meters, or NaN if none was recorded."""
    match = distancesDict.get(dglim_id)
    if match is None:
        return np.nan
    # match is (closest stop, distance); the distance object exposes .meters
    return match[1].meters

# Build the result table: one row per business, keyed by DGLIM ID
closest_stop_df = pd.DataFrame({'DGLIM ID': master_df['DGLIM ID']})
closest_stop_df['Distance to Bus Stop'] = closest_stop_df['DGLIM ID'].apply(_distance_in_meters)

closest_stop_df.head()

Unnamed: 0_level_0,DGLIM ID,Distance to Bus Stop
DGLIM ID,Unnamed: 1_level_1,Unnamed: 2_level_1
16,16,110.521226
17,17,20.692666
24,24,209.539654
25,25,87.251811
38,38,60.741727


***Save data***

In [7]:
# Persist the per-business distances under the name 'Bus Stop Distances'.
# NOTE(review): storage location and format are decided inside dglim.saveData -- confirm there.
dglim.saveData(closest_stop_df, 'Bus Stop Distances')