# Project 1: Mapping tech hubs and combining stations data with turnstile data to plot traffic on a map of NYC

## Importing all the things

In [2]:
import pickle
from geopy.distance import great_circle
import pandas as pd
import numpy as np
import re

## Using geopy to make a column for tech hubs in stations dataset (coordinates found via google maps)

In [2]:
midtownCenter = (40.754925, -73.984063)
fidiCenter = (40.709761, -74.006453)
flatCenter = (40.741568, -73.989095)
dumboCenter = (40.701602, -73.985842)
chelseaCenter = (40.746531, -73.997195)
sohoCenter = (40.724173, -74.000683)
boroughDict = {}
boroughDict["midtown"] = midtownCenter
boroughDict["fidi"] = fidiCenter
boroughDict["flatiron"] = flatCenter
boroughDict["dumbo"] = dumboCenter
boroughDict["chelsea"] = chelseaCenter
boroughDict['soho'] = sohoCenter

#This function returns the string representation of the likely borough, given a set of latitude/longitude coordinates
#If the distance to the borough center is too far away from the closest borough, we assume that the location
#is outside of a tech hub
def get_closest_borough(latitude,longitude,max_dist = 2):
    global boroughDict
    borough_distances = {borough:great_circle(boroughDict[borough],(latitude,longitude)).miles for borough in boroughDict}
    min_borough = min(borough_distances, key=borough_distances.get)
    if borough_distances[min_borough] < max_dist:
        return min_borough 
    else:
        return "outside_tech_hub"

In [3]:
# Checking function
get_closest_borough(40.754925, -73.984063)

'midtown'

In [921]:
station_df.columns

Index(['Station ID', 'Division', 'Stop Name', 'Borough', 'GTFS Latitude',
       'GTFS Longitude'],
      dtype='object')

Mapping tech hubs onto stations dateset and grabbing tech stations

In [922]:
latitude = station_df['GTFS Latitude']
longitude = station_df['GTFS Longitude']

In [923]:
station_df['tech_borough'] = list(map(get_closest_borough, latitude, longitude))

In [924]:
tech_stations = station_df.loc[station_df['tech_borough']!= 'outside_tech_hub', 'Stop Name'].unique()

Stations dataset now holds coordinates and a column for which tech hub (or outside of tech hub) these coordinates are located in

## Commence effort to combine stations dataset with turnstile dataset

In [919]:
with open('data/clean_df.pickle','rb') as read_file:
    clean_df = pickle.load(read_file)

In [920]:
with open('data/station_df.pickle','rb') as read_file:
    station_df = pickle.load(read_file)

In [925]:
clean_df.head()

Unnamed: 0,station,entries,exits,date_time,weekday,hour,year,turnstile_id,exit_counts,entry_counts,total_traffic
0,59 ST,6470223,2190140,2018-01-06 07:00:00,5,7,2018,0,17.0,7.0,24.0
1,59 ST,6470256,2190229,2018-01-06 11:00:00,5,11,2018,0,89.0,33.0,122.0
2,59 ST,6470379,2190299,2018-01-06 15:00:00,5,15,2018,0,70.0,123.0,193.0
3,59 ST,6470665,2190366,2018-01-06 19:00:00,5,19,2018,0,67.0,286.0,353.0
4,59 ST,6470809,2190398,2018-01-06 23:00:00,5,23,2018,0,32.0,144.0,176.0


## Column cleaning: Strategy 1
#### The only column which could be used to merge these datasets is a text column. And it is messy. 
#### Strategy 1 involves regex in attempts to bring these columns closer together. Trying to remove -, /, spaces, capitalize, etc.

In [931]:
stop_name = station_df['Stop Name']
station_name = clean_df['station']

In [934]:
stop_name = [x.upper() for x in stop_name]
station_name = [x.upper() for x in station_name]

In [935]:
stop_name = [re.sub(r'AVE','AV', x) for x in stop_name]
station_name = [re.sub(r'AVE','AV',x) for x in station_name]

In [936]:
stop_name = [re.sub(r'AVS','AV', x) for x in stop_name]
station_name = [re.sub(r'AVS','AV',x) for x in station_name]

In [937]:
stop_name = [re.sub(r'/',' ',x) for x in stop_name]
station_name = [re.sub('/',' ',x) for x in station_name]

In [938]:
stop_name = [re.sub(r'-', '',x) for x in stop_name]
station_name = [re.sub('-', '',x) for x in station_name]

In [939]:
stop_name = [x.strip() for x in stop_name]
station_name = [x.strip() for x in station_name]

In [940]:
stop_name = [re.sub(r' +',' ', x) for x in stop_name]
station_name = [re.sub(r' +',' ', x) for x in station_name]

In [941]:
# Strips all spaces. Not super practical. Not readable.
#tech_stations = sorted([re.sub(r"\s+", "", x) for x in tech_stations])
#unique_stations_turn = sorted([re.sub(r"\s+","",x) for x in unique_stations_turn])

Checking how many columns line up after cleaning

In [979]:
combined_stations = []
for station in station_name:
    if station in stop_name:
        combined_stations.append(station)
len(combined_stations)

1342325

Ok. Columns are better. We now capture greater than 50% in common, but still not enough. Pickling to not have to redo these cleaning steps

In [945]:
with open('data/cleaner_df.pickle', 'wb') as to_write:
    pickle.dump(clean_df, to_write)

In [3]:
with open('data/cleaner_df.pickle','rb') as read_file:
    clean_df = pickle.load(read_file)

In [947]:
with open('data/clean_station_df.pickle', 'wb') as to_write:
    pickle.dump(station_df, to_write)

In [4]:
with open('data/clean_station_df.pickle','rb') as read_file:
    station_df = pickle.load(read_file)

## Strategy 2: Brittany's Insanity
#### Brittany's Insanity: Going through and manually filtering column data to merge stations with turnstiles

In [6]:
## Deleting 14th st from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == '14TH STREET'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == '14TH STREET'].index, axis = 0)

In [7]:
## Deleting 168 Washington from turnstile
clean_df = clean_df.drop(clean_df[clean_df['station'] == '34 STHUDSON YD'].index, axis = 0)
## Deleting 34 Hudson from station
station_df = station_df.drop(station_df[station_df['Stop Name'] == '168 ST WASHINGTON HTS'].index, axis = 0)

In [8]:
# '46 ST BLISS ST' from turnstile
clean_df = clean_df.drop(clean_df[clean_df['station'] == '46 ST BLISS ST'].index, axis = 0)

In [9]:
# 4AV9 ST from turnstile
clean_df = clean_df.drop(clean_df[clean_df['station'] == '4AV9 ST'].index, axis = 0)

In [10]:
# '72 ST2 AV' from turnstile
clean_df = clean_df.drop(clean_df[clean_df['station'] == '72 ST2 AV'].index, axis = 0)

In [11]:
# '86 ST2 AV','96 ST2 AV','9TH STREET' from turnstile '9 ST' from station
clean_df = clean_df.drop(clean_df[clean_df['station'] == '86 ST2 AV'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == '96 ST2 AV'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == '9TH STREET'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == '9 ST'].index, axis = 0)

In [12]:
#'ANNADALE' AND 'ARTHUR KILL' from station
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'ANNADALE'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'ARTHUR KILL'].index, axis = 0)

In [13]:
#"B'WAYLAFAYETTE" and 'BAY TERRACE'
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BAY TERRACE'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == "B'WAYLAFAYETTE"].index, axis = 0)

In [14]:
#BAY RIDGE AV': 'BAY RIDGE 95 ST',
 #'BAY RIDGE95 ST': 'BAY RIDGE AV',
 #'BEDFORD PK BLVD': 'BEDFORD NOSTRAND AV',
 #'BEDFORDNOSTRAN': 'BEDFORD PARK BLVD',
    #'BEDFORD PARK BLVD LEHMAN COLLEGE' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BAY RIDGE AV'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'BAY RIDGE AV'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BAY RIDGE 95 ST'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'BAY RIDGE95 ST'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BEDFORD NOSTRAND AV'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'BEDFORD PK BLVD'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BEDFORD PARK BLVD'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'BEDFORDNOSTRAN'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BEDFORD PARK BLVD LEHMAN COLLEGE'].index, axis = 0)

In [15]:
# 'BROADWAYLAFAYETTE ST' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'BROADWAYLAFAYETTE ST'].index, axis = 0)

In [16]:
# 'CATHEDRAL PKWY (110 ST)' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'CATHEDRAL PKWY (110 ST)'].index, axis = 0)

In [17]:
# 'CITY BUS' from turnstile 'CLIFTON' from stations
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'CITY BUS'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'CLIFTON'].index, axis = 0)

In [18]:
# 'DONGAN HILLS' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'DONGAN HILLS'].index, axis = 0)

In [19]:
# 'ELTINGVILLE' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'ELTINGVILLE'].index, axis = 0)

In [20]:
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'GROVE STREET'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'GRANT CITY'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'GRASMERE'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'GREAT KILLS'].index, axis = 0)

In [21]:
# 'HARRISON' from turnstil
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'HARRISON'].index, axis = 0)

In [22]:
# 'HOYT SCHERMERHORN STS' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'HOYT SCHERMERHORN STS'].index, axis = 0)

In [23]:
# 'JACKSON HTS ROOSEVELT AV' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'JACKSON HTS ROOSEVELT AV'].index, axis = 0)

In [24]:
# 'JOURNAL SQUARE' 'JFK JAMAICA CT1' from turnstiles
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'JOURNAL SQUARE'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'JFK JAMAICA CT1'].index, axis = 0)

In [25]:
# 'LACKAWANNA' AND 'LEXINGTON AV 53 ST' from turnstile and station
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'LACKAWANNA'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'LEXINGTON AV 53 ST'].index, axis = 0)

In [26]:
# 'MIDDLE VILLAGE METROPOLITAN AV' from station
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'MIDDLE VILLAGE METROPOLITAN AV'].index, axis = 0)

In [27]:
#NEWARK BM BW','NEWARK C', 'NEWARK HM HE', 'NEWARK HW BMEBE' in turnstiles
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'NEWARK BM BW'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'NEWARK C'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'NEWARK HM HE'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'NEWARK HW BMEBE'].index, axis = 0)

In [28]:
# 'OAKWOOD HEIGHTS' from stations
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'OAKWOOD HEIGHTS'].index, axis = 0)

In [29]:
# 'PATH NEW WTC', 'PATH WTC 2', 'PAVONIA NEWPORT' from turnstiles 'PLEASANT PLAINS' from stations
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'PATH NEW WTC'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'PATH WTC 2'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'PAVONIA NEWPORT'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'PLEASANT PLAINS'].index, axis = 0)

In [30]:
# "PRINCE'S BAY" from stations, 'RITMANHATTAN' FROM TURNSTILE
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'RITMANHATTAN'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == "PRINCE'S BAY"].index, axis = 0)

In [31]:
# STAPLETON' from stations and 'SUTPHINARCHER' from turnstile
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'SUTPHINARCHER'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == "STAPLETON"].index, axis = 0)

In [32]:
# 'SUTPHIN BLVD ARCHER AV JFK AIRPORT' from station THIRTY ST','THIRTY THIRD ST' from turnstiles
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'THIRTY ST'].index, axis = 0)
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'THIRTY THIRD ST'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'SUTPHIN BLVD ARCHER AV JFK AIRPORT'].index, axis = 0)

In [33]:
# TOTTENVILLE','UNION SQ 14 ST' from station, 'TWENTY THIRD ST' from turnstiles
clean_df = clean_df.drop(clean_df[clean_df['station'] == 'TWENTY THIRD ST'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'TOTTENVILLE'].index, axis = 0)
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'UNION SQ 14 ST'].index, axis = 0)

In [34]:
# 'WOODSIDE 61 ST' from station
station_df = station_df.drop(station_df[station_df['Stop Name'] == 'WOODSIDE 61 ST'].index, axis = 0)

Checking how many columns now line up

In [979]:
combined_stations = []
for station in station_name:
    if station in stop_name:
        combined_stations.append(station)
len(combined_stations)

1342325

## Manual Check Steps Below

I print the length of each new column, compare them side by side, and fix issues above (by deleting column items) and below (by deleting entire rows from my new dictionary)

This means that I do not capture absolutely **all** of the dataset, but I get far more of it by pulling out errors than I would by manually trying to find every GPS coordinate for 3xx stations

In [35]:
unique_stations_turn = sorted(list(clean_df['station'].unique()))
unique_stations_station = sorted(list(station_df['Stop Name'].unique()))

In [36]:
len(unique_stations_turn)

347

In [37]:
len(unique_stations_station)

347

In [38]:
df_combined = {'Station': unique_stations_station, 'Turnstile': unique_stations_turn}
df_combined = pd.DataFrame(data = df_combined)

I want to make a dictionary mapping station names to turnstile names

In [39]:
df_combined = df_combined.set_index('Turnstile')

In [40]:
df_combined

Unnamed: 0_level_0,Station
Turnstile,Unnamed: 1_level_1
1 AV,1 AV
103 ST,103 ST
103 STCORONA,103 ST CORONA PLAZA
104 ST,104 ST
110 ST,110 ST
...,...
WOODLAWN,WOODLAWN
WORLD TRADE CTR,WORLD TRADE CENTER
WTCCORTLANDT,WTC CORTLANDT
YORK ST,YORK ST


Creating a dictionary of unequal items (items which are the same station but spelled differently or abbreviated differently). This will be used when I need to map these items from one dataset to the other in order to make them both equal

In [41]:
all_unequal = {}

In [42]:
for i,v in enumerate(df_combined['Station']):
     if df_combined.index[i] != v:
            all_unequal[df_combined.index[i]] = v

In [43]:
del all_unequal["E 143 ST MARY'S"]
del all_unequal['E 149 ST']
del all_unequal['E 180 ST']
del all_unequal['EAST 105 ST']
del all_unequal['DELANCEY ESSEX']
del all_unequal['EXCHANGE PLACE']
del all_unequal['EUCLID AV']
del all_unequal['GRAND ST']
del all_unequal['GRANT AV']
del all_unequal['GRANDNEWTOWN']
all_unequal['GRD CNTRL42 ST'] = 'GRAND CENTRAL 42 ST'
del all_unequal['JEFFERSON ST']
del all_unequal['JKSN HTROOSVLT']
del all_unequal['LEXINGTON AV 53']
del all_unequal['NEW LOTS']
del all_unequal['RITROOSEVELT']
del all_unequal['ST LAWRENCE AV']
del all_unequal['ST. GEORGE']

In [44]:
all_unequal

{'103 STCORONA': '103 ST CORONA PLAZA',
 '116 STCOLUMBIA': '116 ST COLUMBIA UNIVERSITY',
 '137 ST CITY COL': '137 ST CITY COLLEGE',
 '138 GRAND CONC': '138 ST GRAND CONCOURSE',
 '14 STUNION SQ': '14 ST UNION SQ',
 '149 GRAND CONC': '149 ST GRAND CONCOURSE',
 '15 STPROSPECT': '15 ST PROSPECT PARK',
 '161 YANKEE STAD': '161 ST YANKEE STADIUM',
 '163 STAMSTERDM': '163 ST AMSTERDAM AV',
 '21 STQNSBRIDGE': '21 ST QUEENSBRIDGE',
 '3 AV149 ST': '3 AV 149 ST',
 '33 STRAWSON ST': '34 ST 11 AV',
 '34 STHERALD SQ': '34 ST HERALD SQ',
 '34 STPENN STA': '34 ST PENN STATION',
 '4 AV9 ST': '4 AV',
 '40 ST LOWERY ST': '40 ST',
 '42 STBRYANT PK': '42 ST BRYANT PK',
 '42 STPORT AUTH': '42 ST PORT AUTHORITY BUS TERMINAL',
 '4750 STS ROCK': '4750 STS ROCKEFELLER CTR',
 '57 ST7 AV': '57 ST 7 AV',
 '59 ST COLUMBUS': '59 ST COLUMBUS CIRCLE',
 '61 ST WOODSIDE': '62 ST',
 '63 DRREGO PARK': '63 DR REGO PARK',
 '66 STLINCOLN': '66 ST LINCOLN CENTER',
 '68STHUNTER CO': '68 ST HUNTER COLLEGE',
 '74 STBROADWAY': '7

## Replace Columns:
#### Finally....a dictionary to map

In [46]:
clean_df['station'] = clean_df['station'].replace(all_unequal)

In [76]:
# Creating copies, because merging is hard and I want to be able to go back to my clean dfs
station_df_copy = station_df.copy()

In [49]:
clean_df_copy = clean_df.copy()

In [78]:
# In order to do a .join() the column names need to be equal
station_df_copy = station_df_copy.rename(columns = {'Stop Name':'station'})

In [83]:
# I discovered that there were duplicate stations in the stations_df causing the merge to add a million rows
station_df_copy = station_df_copy.drop_duplicates('station')

In [85]:
# Trying to remove more problems from the merge by dropping this useless column
station_df_copy = station_df_copy.drop('Station ID', axis = 1)

## Troubleshooting:

#### Figuring out that there were duplicates in the stop name column, dropping nas, etc.

In [67]:
combined = combined.reset_index().dropna()

# missing = []
# for column in combined.columns:
#     bad_indices = combined[combined[column].isna()].index.values
#     missing.extend(bad_indices)
# missing = set(missing)

In [90]:
combined.shape

(2067574, 15)

In [63]:
# stations = combined['station'].unique()
 
# for station in stations:
 #   print(combined[combined['station'] == station]['Station ID'].unique())

[119.]
[156. 309. 395.]
[450.]
[ 82. 193.]
[394.]
[ 81. 194. 449.]
[154. 393. 440.]
[307.]
[80.]
[153. 306. 392. 439.]
[152. 438.]
[305.]
[391.]
[166. 229. 322.]
[ 15. 406.]
[151. 304. 437.]
[390. 435.]
[241.]
[150. 220.]
[303.]
[219. 389.]
[149.]
[218. 388.]
[148.]
[255.]
[217. 387.]
[428.]
[216.]
[147.]
[385.]
[ 66.  74. 245.]
[321.]
[146. 301.]
[214.]
[383.]
[145.]
[300.]
[232.]
[67. 75.]
[298.]
[282.]
[221.]
[297.]
[420.]
[419.]
[ 14. 165. 228. 320. 405.]
[295.]
[418.]
[294.]
[69.]
[31.]
[ 13. 319. 404.]
[118.]
[377.]
[434.]
[3.]
[403. 460.]
[471.]
[ 12. 227.]
[164. 318.]
[5.]
[ 32. 272.]
[6.]
[239.]
[459.]
[226.]
[163.]
[33.]
[270. 458.]
[225.]
[10.]
[466.]
[276.]
[8.]
[ 61. 162. 316.]
[401.]
[457.]
[34.]
[62.]
[224.]
[9.]
[ 35. 400.]
[161. 315.]
[116.]
[63.]
[263.]
[268.]
[314.]
[262.]
[399.]
[455.]
[ 41. 240. 277.]
[64.]
[160. 313. 477.]
[454.]
[260.]
[85.]
[ 37. 398.]
[ 65. 312.]
[ 71. 115.]
[16.]
[190.]
[159.]
[453.]
[84.]
[ 38.  79. 158. 311. 397. 476.]
[191.]
[59.]
[452.]
[1

In [1007]:
# clean_df_copy.join(station_df_copy, on = 'station', how = 'right')


Unnamed: 0,station,entries,exits,date_time,weekday,hour,year,turnstile_id,exit_counts,entry_counts,total_traffic
0,59 ST,6470223,2190140,2018-01-06 07:00:00,5,7,2018,0,17.0,7.0,24.0
1,59 ST,6470256,2190229,2018-01-06 11:00:00,5,11,2018,0,89.0,33.0,122.0
2,59 ST,6470379,2190299,2018-01-06 15:00:00,5,15,2018,0,70.0,123.0,193.0
3,59 ST,6470665,2190366,2018-01-06 19:00:00,5,19,2018,0,67.0,286.0,353.0
4,59 ST,6470809,2190398,2018-01-06 23:00:00,5,23,2018,0,32.0,144.0,176.0
...,...,...,...,...,...,...,...,...,...,...,...
2284152,RITROOSEVELT,5554,379,2019-06-05 17:00:00,2,17,2019,9686,0.0,0.0,0.0
2284153,RITROOSEVELT,5554,379,2019-06-05 21:00:00,2,21,2019,9686,0.0,0.0,0.0
2284154,RITROOSEVELT,5554,379,2019-06-06 01:00:00,3,1,2019,9686,0.0,0.0,0.0
2284155,RITROOSEVELT,5554,379,2019-06-06 05:00:00,3,5,2019,9686,0.0,0.0,0.0


## Merging:
#### It Works! A combined df of stations, turnstiles, gps coordinates, and tech hubs

In [88]:
combined = clean_df_copy.set_index('station').join(station_df_copy.set_index('station'), how = 'left')

In [89]:
combined

Unnamed: 0_level_0,entries,exits,date_time,weekday,hour,year,turnstile_id,exit_counts,entry_counts,total_traffic,Division,Borough,GTFS Latitude,GTFS Longitude,tech_borough
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1 AV,13080575,14585697,2018-01-06 07:00:00,5,7,2018,1491,120.0,31.0,151.0,BMT,M,40.730953,-73.981628,flatiron
1 AV,13080870,14586195,2018-01-06 11:00:00,5,11,2018,1491,498.0,295.0,793.0,BMT,M,40.730953,-73.981628,flatiron
1 AV,13081558,14587007,2018-01-06 15:00:00,5,15,2018,1491,812.0,688.0,1500.0,BMT,M,40.730953,-73.981628,flatiron
1 AV,13082789,14588521,2018-01-06 23:00:00,5,23,2018,1491,591.0,501.0,1092.0,BMT,M,40.730953,-73.981628,flatiron
1 AV,13083004,14588781,2018-01-07 03:00:00,6,3,2018,1491,260.0,215.0,475.0,BMT,M,40.730953,-73.981628,flatiron
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEREGA AV,39,148,2019-06-07 05:00:00,4,5,2019,8530,0.0,0.0,0.0,IRT,Bx,40.836488,-73.847036,outside_tech_hub
ZEREGA AV,39,148,2019-06-07 09:00:00,4,9,2019,8530,0.0,0.0,0.0,IRT,Bx,40.836488,-73.847036,outside_tech_hub
ZEREGA AV,39,148,2019-06-07 13:00:00,4,13,2019,8530,0.0,0.0,0.0,IRT,Bx,40.836488,-73.847036,outside_tech_hub
ZEREGA AV,39,148,2019-06-07 17:00:00,4,17,2019,8530,0.0,0.0,0.0,IRT,Bx,40.836488,-73.847036,outside_tech_hub


## Pickling the final combined df

In [91]:
with open('data/combined_df.pickle', 'wb') as to_write:
    pickle.dump(combined, to_write)