In [1]:
from __future__ import print_function, division

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
%matplotlib inline

In [5]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    """Download weekly MTA turnstile files and stack them into one DataFrame.

    Parameters
    ----------
    week_nums : iterable
        Identifiers substituted into the turnstile file URL, one per weekly
        file (for example ``190907``).

    Returns
    -------
    pandas.DataFrame
        The weekly files concatenated row-wise in the order given. Each
        file's own row index is kept, so index labels repeat across weeks.
    """
    url_template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    weekly_frames = [pd.read_csv(url_template.format(week)) for week in week_nums]
    return pd.concat(weekly_frames)
        
# Thirteen weekly file identifiers, each matching the suffix of one turnstile file name.
week_nums = [
    190907, 190914, 190921, 190928,
    191005, 191012, 191019, 191026,
    191102, 191109, 191116, 191123, 191130,
]
turnstiles_df = get_data(week_nums)

In [6]:
# Trim surrounding whitespace from every column header so columns can be
# addressed by their plain names.
turnstiles_df.columns = turnstiles_df.columns.str.strip()

In [7]:
# Join the DATE and TIME text columns and parse the result into one timestamp column.
timestamp_text = turnstiles_df["DATE"] + " " + turnstiles_df["TIME"]
turnstiles_df["DATE_TIME"] = pd.to_datetime(timestamp_text, format="%m/%d/%Y %H:%M:%S")

In [8]:
# Remove repeated readings: order rows newest-first per turnstile, then keep
# only the first row for each (turnstile, timestamp) combination.
turnstile_reading_key = ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]
turnstiles_df = (
    turnstiles_df
    .sort_values(turnstile_reading_key, ascending=False)
    .drop_duplicates(subset=turnstile_reading_key)
)

In [9]:
# Sanity check: every ("C/A", "UNIT", "SCP", "STATION", "DATE_TIME") combination
# should now occur once, so the largest counts shown below should all be 1.
(
    turnstiles_df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])["ENTRIES"]
    .count()
    .reset_index()
    .sort_values("ENTRIES", ascending=False)
    .head(5)
)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-08-31 00:00:00,1
1784129,R138,R293,00-02-05,34 ST-PENN STA,2019-09-11 06:00:00,1
1784111,R138,R293,00-02-05,34 ST-PENN STA,2019-09-08 06:00:00,1
1784112,R138,R293,00-02-05,34 ST-PENN STA,2019-09-08 10:00:00,1
1784113,R138,R293,00-02-05,34 ST-PENN STA,2019-09-08 14:00:00,1


In [10]:
# Drop the EXITS and DESC columns.
# errors="ignore" keeps this cell re-runnable once the columns are already gone.
turnstiles_df = turnstiles_df.drop(columns=["EXITS", "DESC"], errors="ignore")

In [11]:
# One row per turnstile per calendar day, holding the first non-null ENTRIES
# value in each group. Given the newest-first DATE_TIME sort applied earlier,
# that is the day's latest reading.
daily_key = ["C/A", "UNIT", "SCP", "STATION", "LINENAME", "DATE"]
turnstiles_daily = turnstiles_df.groupby(daily_key, as_index=False)["ENTRIES"].first()

In [12]:
# Preview the first five daily rows.
turnstiles_daily.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATE,ENTRIES
0,A002,R051,02-00-00,59 ST,NQR456W,08/31/2019,7183842
1,A002,R051,02-00-00,59 ST,NQR456W,09/01/2019,7184559
2,A002,R051,02-00-00,59 ST,NQR456W,09/02/2019,7185132
3,A002,R051,02-00-00,59 ST,NQR456W,09/03/2019,7186355
4,A002,R051,02-00-00,59 ST,NQR456W,09/04/2019,7187672


In [13]:
# For every turnstile, pair each day's row with the previous row's DATE and
# ENTRIES so a day-over-day difference can be computed.
# GroupBy.shift keeps the original row index, so the result aligns directly with
# turnstiles_daily; the first row of each turnstile gets NaN (no previous day).
# The columns are selected with a list ([["DATE", "ENTRIES"]]) because selecting
# several columns with a bare tuple is deprecated and rejected by pandas 2.x.
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES"]] = (
    turnstiles_daily
    .groupby(["C/A", "UNIT", "SCP", "STATION", "LINENAME"])[["DATE", "ENTRIES"]]
    .shift(1)
)

  


In [14]:
# Rows without a previous-day reading (the first day of each turnstile) cannot
# yield a daily difference, so discard them.
turnstiles_daily = turnstiles_daily.dropna(subset=["PREV_DATE"])

In [15]:
# Peek at rows where the cumulative counter is lower than on the previous day.
decreasing_counter = turnstiles_daily["ENTRIES"] < turnstiles_daily["PREV_ENTRIES"]
turnstiles_daily.loc[decreasing_counter].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES
3530,A011,R080,01-03-00,57 ST-7 AV,NQRW,09/01/2019,885822693,08/31/2019,885823072.0
3531,A011,R080,01-03-00,57 ST-7 AV,NQRW,09/02/2019,885822365,09/01/2019,885822693.0
3532,A011,R080,01-03-00,57 ST-7 AV,NQRW,09/03/2019,885821627,09/02/2019,885822365.0
3533,A011,R080,01-03-00,57 ST-7 AV,NQRW,09/04/2019,885820824,09/03/2019,885821627.0
3534,A011,R080,01-03-00,57 ST-7 AV,NQRW,09/05/2019,885819992,09/04/2019,885820824.0


In [16]:
def get_daily_counts(row, max_counter):
    """Turn two consecutive cumulative readings into a plausible daily count.

    Parameters
    ----------
    row : mapping
        Must provide ``"ENTRIES"`` (current reading) and ``"PREV_ENTRIES"``
        (previous reading), e.g. a DataFrame row.
    max_counter : number
        Largest difference accepted as a genuine daily count.

    Returns
    -------
    number
        The absolute difference between the readings when it does not exceed
        ``max_counter``. Otherwise the smaller of the two readings (treating
        the jump as a counter reset), or 0 if that is also above the limit.
    """
    current, previous = row["ENTRIES"], row["PREV_ENTRIES"]

    # A negative difference may mean the counter runs in reverse; use its magnitude.
    delta = abs(current - previous)

    # Written as "not (>)" so that a NaN difference is returned unchanged.
    if not (delta > max_counter):
        return delta

    # Difference is implausibly large: maybe the counter was reset to 0.
    print(current, previous)
    after_reset = min(current, previous)

    # Still too big after the reset guess: give up and report 0.
    return 0 if after_reset > max_counter else after_reset

# A day-over-day difference above 1 million suggests the counter was reset.
# get_daily_counts then falls back to the smaller of the two readings, or to 0
# when even that exceeds the limit (different counters have different cycle limits).
# A limit significantly smaller than 1 million would probably be a better choice.
max_daily_entries = 1000000
turnstiles_daily["DAILY_ENTRIES"] = turnstiles_daily.apply(
    get_daily_counts, axis=1, max_counter=max_daily_entries
)

12 6170034.0
263 3846402.0
29 3367836.0
26 1083967.0
864 2152170.0
111 1599317.0
460501 6804909.0
73 6697765.0
875479599 775219651.0
590597 9156558.0
88 2728627.0
94 4283110.0
1041 2210879.0
98 1478936.0
116 6197971.0
66 1121336148.0
82 2025847506.0
1451 4711879.0
973 6844908.0
126 6053205.0
196612 50345125.0
589824 50331648.0
5421 10392294.0
2157 6026406.0
458752 117440512.0
107 1068192.0
2567 152966689.0
2204 19783118.0
458752 117440512.0
24 2781046.0
3728 1842366.0
2069 4443445.0
178 6625781.0
4982302 877264.0
1421 2254274.0
721441460 2880424.0
516 3275592.0
610 2460400.0
14 4584512.0
26 153536031.0
18 118621565.0
67122955 103584.0
218 67122955.0
458846 2297063.0
672 5491989.0
235398290 9934.0
83886083 117440614.0
10 5679392.0
1145 4459356.0
6817688 3603909.0
401 3071806.0
123785342 117604971.0
636 2209128.0
31260681 48404.0
1507856 4609.0
2053 3512072.0
483 4777140.0
102459350 210849.0
248 3782823.0
248 9852706.0
1 134218317.0
2319 22554564.0
190 1449873.0
3480 5391681.0
845 172975

In [17]:
# Display the daily table, now including the DAILY_ENTRIES column.
turnstiles_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES,DAILY_ENTRIES
1,A002,R051,02-00-00,59 ST,NQR456W,09/01/2019,7184559,08/31/2019,7183842.0,717.0
2,A002,R051,02-00-00,59 ST,NQR456W,09/02/2019,7185132,09/01/2019,7184559.0,573.0
3,A002,R051,02-00-00,59 ST,NQR456W,09/03/2019,7186355,09/02/2019,7185132.0,1223.0
4,A002,R051,02-00-00,59 ST,NQR456W,09/04/2019,7187672,09/03/2019,7186355.0,1317.0
5,A002,R051,02-00-00,59 ST,NQR456W,09/05/2019,7189025,09/04/2019,7187672.0,1353.0
...,...,...,...,...,...,...,...,...,...,...
444786,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,11/25/2019,5554,11/24/2019,5554.0,0.0
444787,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,11/26/2019,5554,11/25/2019,5554.0,0.0
444788,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,11/27/2019,5554,11/26/2019,5554.0,0.0
444789,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,11/28/2019,5554,11/27/2019,5554.0,0.0


In [18]:
# Total the per-turnstile daily rows for every (STATION, LINENAME) pair and rank
# the pairs from busiest to quietest by DAILY_ENTRIES.
# numeric_only=True limits the sum to the numeric columns (ENTRIES, PREV_ENTRIES,
# DAILY_ENTRIES); without it, pandas 2.x also "sums" the text columns such as
# C/A, SCP and DATE by concatenating their strings.
# NOTE(review): ENTRIES and PREV_ENTRIES are cumulative readings, so their sums
# are not meaningful totals; they are kept only so the output columns stay the same.
stations_total = (
    turnstiles_daily
    .groupby(["STATION", "LINENAME"], as_index=False)
    .sum(numeric_only=True)
    .sort_values("DAILY_ENTRIES", ascending=False)
    .reset_index()  # the pre-sort row position is preserved in an "index" column
)

In [19]:
# Display the station/line totals, ordered by DAILY_ENTRIES (largest first).
stations_total

Unnamed: 0,index,STATION,LINENAME,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES
0,311,GRD CNTRL-42 ST,4567S,226865326108,2.268583e+11,12458229.0
1,89,34 ST-HERALD SQ,BDFMNQRW,416207405402,4.161986e+11,9979861.0
2,102,42 ST-PORT AUTH,ACENQRS1237W,667469074088,6.674638e+11,7175885.0
3,403,PATH NEW WTC,1,414635697,4.136185e+08,6878013.0
4,93,34 ST-PENN STA,ACE,150330038840,1.503246e+11,6548407.0
...,...,...,...,...,...,...
472,449,TOMPKINSVILLE,1,216784555,2.167347e+08,49882.0
473,203,BEACH 98 ST,AS,452779551,4.527428e+08,36755.0
474,220,BROAD CHANNEL,AS,130029958,1.300058e+08,24181.0
475,196,BEACH 105 ST,AS,10704291634,1.070427e+10,18738.0


In [20]:
# Distinct station names present in the totals table.
set(stations_total["STATION"])

{'1 AV',
 '103 ST',
 '103 ST-CORONA',
 '104 ST',
 '110 ST',
 '111 ST',
 '116 ST',
 '116 ST-COLUMBIA',
 '121 ST',
 '125 ST',
 '135 ST',
 '137 ST CITY COL',
 '138/GRAND CONC',
 '14 ST',
 '14 ST-UNION SQ',
 '145 ST',
 '149/GRAND CONC',
 '14TH STREET',
 '15 ST-PROSPECT',
 '155 ST',
 '157 ST',
 '161/YANKEE STAD',
 '163 ST-AMSTERDM',
 '167 ST',
 '168 ST',
 '169 ST',
 '170 ST',
 '174 ST',
 '174-175 STS',
 '175 ST',
 '176 ST',
 '18 AV',
 '18 ST',
 '181 ST',
 '182-183 STS',
 '183 ST',
 '190 ST',
 '191 ST',
 '2 AV',
 '20 AV',
 '207 ST',
 '21 ST',
 '21 ST-QNSBRIDGE',
 '215 ST',
 '219 ST',
 '225 ST',
 '23 ST',
 '231 ST',
 '233 ST',
 '238 ST',
 '25 AV',
 '25 ST',
 '28 ST',
 '3 AV',
 '3 AV 138 ST',
 '3 AV-149 ST',
 '30 AV',
 '33 ST',
 '33 ST-RAWSON ST',
 '34 ST-HERALD SQ',
 '34 ST-HUDSON YD',
 '34 ST-PENN STA',
 '36 AV',
 '36 ST',
 '39 AV',
 '4 AV-9 ST',
 '40 ST LOWERY ST',
 '42 ST-BRYANT PK',
 '42 ST-PORT AUTH',
 '45 ST',
 '46 ST',
 '46 ST BLISS ST',
 '47-50 STS ROCK',
 '49 ST',
 '4AV-9 ST',
 '5 AV

In [21]:
# Load the list of stations without elevators from a local CSV file.
no_elevators = pd.read_csv("Stations With No Elevator_updated.csv")

In [22]:
# Collect every stations_total row whose STATION equals an upper-cased
# "Stop Name" from no_elevators and whose LINENAME contains that row's
# "Daytime Routes" string. A totals row is appended once per matching
# no_elevators row, so the result can contain repeated rows.
lst = []
for _, elevator_row in no_elevators.iterrows():
    for _, total_row in stations_total.iterrows():
        same_station = elevator_row["Stop Name"].upper() == total_row["STATION"]
        if same_station and elevator_row['Daytime Routes'] in total_row['LINENAME']:
            lst.append(total_row)

count = len(lst)
print(count)
dat = pd.DataFrame(lst)
dat.head()

118


Unnamed: 0,index,STATION,LINENAME,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES
117,0,1 AV,L,108210509971,108209200000.0,1298171.0
347,3,103 ST,BC,2747657700,2747271000.0,386482.0
171,1,103 ST,1,5901511168,5900530000.0,981372.0
160,2,103 ST,6,4361693836,4360641000.0,1052643.0
192,7,110 ST,6,3525627318,3524766000.0,861425.0


In [23]:
#dat.to_csv("dat.csv")

In [24]:
# Rows of `dat` that exactly repeat an earlier row.
duplicateDFRow = dat.loc[dat.duplicated()]
print(duplicateDFRow)

     index          STATION      LINENAME       ENTRIES  PREV_ENTRIES  \
55      31           145 ST          ABCD   10532589724  1.053063e+10   
24     269   DELANCEY/ESSEX          FJMZ    6832262457  6.830297e+09   
8      301        FULTON ST      2345ACJZ  141641492745  1.416370e+11   
37     302        FULTON ST      ACJZ2345  285343602498  2.853413e+11   
8      301        FULTON ST      2345ACJZ  141641492745  1.416370e+11   
37     302        FULTON ST      ACJZ2345  285343602498  2.853413e+11   
0      311  GRD CNTRL-42 ST         4567S  226865326108  2.268583e+11   
0      311  GRD CNTRL-42 ST         4567S  226865326108  2.268583e+11   
9      447   TIMES SQ-42 ST  1237ACENQRSW  506048173890  5.060434e+11   
71     446   TIMES SQ-42 ST   1237ACENQRS   10503475931  1.050172e+10   
137    448   TIMES SQ-42 ST  ACENQRS1237W    2493827302  2.492658e+09   
9      447   TIMES SQ-42 ST  1237ACENQRSW  506048173890  5.060434e+11   
71     446   TIMES SQ-42 ST   1237ACENQRS   1050347

In [25]:
# Totals for the stations without elevators, read from a local CSV.
# NOTE(review): this file is not written anywhere in the visible notebook;
# presumably it was exported from `dat` and cleaned by hand. Confirm its provenance.
no_elevator_stations_total = pd.read_csv("no_elevators_station_total.csv")

# Station-to-ZIP-code lookup table.
stations_zips = pd.read_csv("sub_st_zip.csv")

In [26]:
# Use the same column names as the turnstile tables.
stations_zips = stations_zips.rename(
    columns={"Stop Name": "STATION", "Zip Code": "ZipCode"}
)

In [27]:
# Display the station/ZIP lookup table after the column rename.
stations_zips

Unnamed: 0,STATION,Borough,GTFS Latitude,GTFS Longitude,ZipCode
0,59 ST,M,40.762660,-73.967258,10065
1,5 AV/59 ST,M,40.764811,-73.973347,10065
2,57 ST-7 AV,M,40.764664,-73.980658,10106
3,49 St,M,40.759901,-73.984139,10019
4,Times Sq - 42 St,M,40.754672,-73.986754,10018
...,...,...,...,...,...
148,Grand Central - 42 St,M,40.752769,-73.979189,10017
149,34 ST-HUDSON YD,M,40.755882,-74.001910,10001
150,96 St,M,40.784318,-73.947152,10029
151,86 St,M,40.777891,-73.951787,10028


In [28]:
# Upper-case the station names in the lookup table.
stations_zips["STATION"] = stations_zips["STATION"].str.upper()

In [29]:
# Plain Python list of the station names in the lookup table.
manh_stations = stations_zips["STATION"].tolist()

In [30]:
# Number of station names in the list (repeated names are counted each time).
len(manh_stations)

153

In [33]:
# Station name -> ZIP code lookup. When a station name appears more than once,
# the last occurrence wins.
zip_dict = stations_zips.set_index("STATION")["ZipCode"].to_dict()

In [34]:
# Look up each station's ZIP code by name; names missing from zip_dict become NaN.
station_zip_codes = no_elevator_stations_total["STATION"].map(zip_dict)
no_elevator_stations_total["ZipCode"] = station_zip_codes

In [35]:
# Keep only the columns needed for the neediness score.
# The explicit .copy() makes this an independent DataFrame, so adding columns to
# it later neither triggers SettingWithCopyWarning nor risks writing to a
# temporary slice of no_elevator_stations_total.
manh_stations_total = no_elevator_stations_total[
    ["STATION", "LINENAME", "ENTRIES", "PREV_ENTRIES", "DAILY_ENTRIES", "ZipCode"]
].copy()

In [36]:
# Display the trimmed station table with its ZipCode column.
manh_stations_total

Unnamed: 0,STATION,LINENAME,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES,ZipCode
0,1 AV,L,108210509971,108209211800,1298171,10009
1,103 ST,BC,2747657700,2747271218,386482,10025
2,103 ST,1,5901511168,5900529796,981372,10025
3,103 ST,6,4361693836,4360641193,1052643,10025
4,110 ST,6,3525627318,3524765893,861425,10029
...,...,...,...,...,...,...
98,TIMES SQ-42 ST,ACENQRS1237W,2493827302,2492658181,1169121,10018
99,14 ST-UNION SQ,LNQR456W,17715382997,17713652332,5577067,10003
100,WALL ST,23,10504197476,10502446154,1751322,10038
101,WALL ST,45,42717492240,42716505558,1849328,10038


In [50]:
# Load the per-ZIP-code table; later cells read its ZipCode, aged and disabled columns.
age_disability = pd.read_csv("zip_code_age_disability_df.csv")

In [51]:
# Display the per-ZIP-code aged/disabled table.
age_disability

Unnamed: 0,ZipCode,aged,disabled
0,10001,5297.0,3475.0
1,10002,23825.0,24260.0
2,10003,11020.0,5624.0
3,10004,540.0,169.0
4,10005,669.0,154.0
5,10006,377.0,0.0
6,10007,1231.0,170.0
7,10009,13667.0,13695.0
8,10010,7523.0,4206.0
9,10011,13260.0,7709.0


In [52]:
# ZIP code -> value of the "aged" column.
aged_dict = age_disability.set_index("ZipCode")["aged"].to_dict()

In [53]:
# Attach the "aged" value of each station's ZIP code; ZIP codes that are not
# keys of aged_dict produce NaN.
manh_stations_total["aged"] = manh_stations_total["ZipCode"].map(aged_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
# ZIP code -> value of the "disabled" column.
disabled_dict = age_disability.set_index("ZipCode")["disabled"].to_dict()

In [55]:
# Attach the "disabled" value of each station's ZIP code; ZIP codes that are not
# keys of disabled_dict produce NaN.
manh_stations_total["disabled"] = manh_stations_total["ZipCode"].map(disabled_dict)

In [56]:
# Scale "aged" to a 0-1 range by dividing by the column maximum.
# Series.max() skips NaN values; the builtin max() compares element by element
# and returns NaN whenever the first value is NaN, which would turn the whole
# scaled column into NaN.
manh_stations_total["aged_scaled"] = (
    manh_stations_total["aged"] / manh_stations_total["aged"].max()
)

In [57]:
# Scale "disabled" to a 0-1 range by dividing by the column maximum.
# Series.max() skips NaN values; the builtin max() compares element by element
# and returns NaN whenever the first value is NaN, which would turn the whole
# scaled column into NaN.
manh_stations_total["disabled_scaled"] = (
    manh_stations_total["disabled"] / manh_stations_total["disabled"].max()
)

In [58]:
# Scale DAILY_ENTRIES to a 0-1 range by dividing by the column maximum.
# Series.max() is vectorised and skips NaN values, unlike the builtin max(),
# which iterates in Python and returns NaN if the first value is NaN.
manh_stations_total["entries_scaled"] = (
    manh_stations_total["DAILY_ENTRIES"] / manh_stations_total["DAILY_ENTRIES"].max()
)

In [59]:
# Neediness score: the unweighted mean of the three scaled components.
# A row with any missing component gets a NaN score.
manh_stations_total["Neediness"] = (
    manh_stations_total["aged_scaled"]
    + manh_stations_total["disabled_scaled"]
    + manh_stations_total["entries_scaled"]
) / 3

In [60]:
# Display the table with the scaled components and the Neediness score.
manh_stations_total

Unnamed: 0,STATION,LINENAME,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES,ZipCode,aged,disabled,aged_scaled,disabled_scaled,entries_scaled,Neediness
0,1 AV,L,108210509971,108209211800,1298171,10009,13667.0,13695.0,0.472400,0.564509,0.104202,0.380370
1,103 ST,BC,2747657700,2747271218,386482,10025,28931.0,17545.0,1.000000,0.723207,0.031022,0.584743
2,103 ST,1,5901511168,5900529796,981372,10025,28931.0,17545.0,1.000000,0.723207,0.078773,0.600660
3,103 ST,6,4361693836,4360641193,1052643,10025,28931.0,17545.0,1.000000,0.723207,0.084494,0.602567
4,110 ST,6,3525627318,3524765893,861425,10029,18966.0,24241.0,0.655560,0.999217,0.069145,0.574641
...,...,...,...,...,...,...,...,...,...,...,...,...
98,TIMES SQ-42 ST,ACENQRS1237W,2493827302,2492658181,1169121,10018,1339.0,921.0,0.046283,0.037964,0.093843,0.059363
99,14 ST-UNION SQ,LNQR456W,17715382997,17713652332,5577067,10003,11020.0,5624.0,0.380906,0.231822,0.447661,0.353463
100,WALL ST,23,10504197476,10502446154,1751322,10038,6151.0,4893.0,0.212609,0.201690,0.140576,0.184958
101,WALL ST,45,42717492240,42716505558,1849328,10038,6151.0,4893.0,0.212609,0.201690,0.148442,0.187581


In [61]:
# Rank stations from most to least needy, then discard rows that have a missing
# value in any column.
manh_stations_total = (
    manh_stations_total
    .sort_values(by="Neediness", ascending=False)
    .dropna()
)

In [62]:
# Display the ranked table (highest Neediness first).
manh_stations_total

Unnamed: 0,STATION,LINENAME,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES,ZipCode,aged,disabled,aged_scaled,disabled_scaled,entries_scaled,Neediness
29,2 AV,F,12302761442,12301342040,1419402,10002,23825.0,24260.0,0.823511,1.000000,0.113933,0.645815
81,EAST BROADWAY,F,13818349792,13817246360,1103432,10002,23825.0,24260.0,0.823511,1.000000,0.088571,0.637361
67,BOWERY,JZ,1541058079,1540687340,370739,10002,23825.0,24260.0,0.823511,1.000000,0.029759,0.617757
3,103 ST,6,4361693836,4360641193,1052643,10025,28931.0,17545.0,1.000000,0.723207,0.084494,0.602567
2,103 ST,1,5901511168,5900529796,981372,10025,28931.0,17545.0,1.000000,0.723207,0.078773,0.600660
...,...,...,...,...,...,...,...,...,...,...,...,...
20,155 ST,BD,2015951261,2015653704,297557,10018,1339.0,921.0,0.046283,0.037964,0.023884,0.036044
19,155 ST,C,1479960900,1479730260,230640,10018,1339.0,921.0,0.046283,0.037964,0.018513,0.034253
31,215 ST,1,703569282,703415343,153939,10018,1339.0,921.0,0.046283,0.037964,0.012356,0.032201
68,BROAD ST,JZ,1645466543,1644864939,601604,10005,669.0,154.0,0.023124,0.006348,0.048290,0.025921


In [63]:
# Write the ranked table to CSV. With the default settings the DataFrame index
# is written as the first, unnamed column.
manh_stations_total.to_csv("Neediness For Stations Without Elevators.csv")