In [2]:
from __future__ import print_function, division

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
%matplotlib inline

In [4]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)
        
week_nums = [190907, 190914, 190921, 190928, 191005, 191012, 191019, 191026, 191102, 191109, 191116, 191123, 191130]
turnstiles_df = get_data(week_nums)

In [5]:
turnstiles_df.columns = [column.strip() for column in turnstiles_df.columns]

In [6]:
turnstiles_df["DATE_TIME"] = pd.to_datetime(turnstiles_df.DATE + " " + turnstiles_df.TIME, 
                                            format="%m/%d/%Y %H:%M:%S")

In [7]:
# Get rid of the duplicate entries
turnstiles_df.sort_values(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], 
                          inplace=True, ascending=False)
turnstiles_df.drop_duplicates(subset=["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], inplace=True)

In [8]:
# Sanity Check to verify that "C/A", "UNIT", "SCP", "STATION", "DATE_TIME" is unique
(turnstiles_df
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-08-31 00:00:00,1
1784129,R138,R293,00-02-05,34 ST-PENN STA,2019-09-11 06:00:00,1
1784111,R138,R293,00-02-05,34 ST-PENN STA,2019-09-08 06:00:00,1
1784112,R138,R293,00-02-05,34 ST-PENN STA,2019-09-08 10:00:00,1
1784113,R138,R293,00-02-05,34 ST-PENN STA,2019-09-08 14:00:00,1


In [9]:
# Drop Exits and Desc Column.  To prevent errors in multiple run of cell, errors on drop is ignored
turnstiles_df = turnstiles_df.drop(["EXITS", "DESC"], axis=1, errors="ignore")

In [10]:
turnstiles_daily = (turnstiles_df
                        .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"],as_index=False)
                        .ENTRIES.first())

In [11]:
turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES
0,A002,R051,02-00-00,59 ST,08/31/2019,7183842
1,A002,R051,02-00-00,59 ST,09/01/2019,7184559
2,A002,R051,02-00-00,59 ST,09/02/2019,7185132
3,A002,R051,02-00-00,59 ST,09/03/2019,7186355
4,A002,R051,02-00-00,59 ST,09/04/2019,7187672


In [12]:
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES"]] = (turnstiles_daily
                                                       .groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "ENTRIES"]
                                                       .apply(lambda grp: grp.shift(1)))

  


In [13]:
turnstiles_daily.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

In [14]:
turnstiles_daily[turnstiles_daily["ENTRIES"] < turnstiles_daily["PREV_ENTRIES"]].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES
3530,A011,R080,01-03-00,57 ST-7 AV,09/01/2019,885822693,08/31/2019,885823072.0
3531,A011,R080,01-03-00,57 ST-7 AV,09/02/2019,885822365,09/01/2019,885822693.0
3532,A011,R080,01-03-00,57 ST-7 AV,09/03/2019,885821627,09/02/2019,885822365.0
3533,A011,R080,01-03-00,57 ST-7 AV,09/04/2019,885820824,09/03/2019,885821627.0
3534,A011,R080,01-03-00,57 ST-7 AV,09/05/2019,885819992,09/04/2019,885820824.0


In [15]:
def get_daily_counts(row, max_counter):
    counter = row["ENTRIES"] - row["PREV_ENTRIES"]
    if counter < 0:
        counter = -counter
    if counter > max_counter:
        print(row["ENTRIES"], row["PREV_ENTRIES"])
        return 0
    return counter

# If counter is > 1Million, then the counter might have been reset.  
# Just set it to zero as different counters have different cycle limits
_ = turnstiles_daily.apply(get_daily_counts, axis=1, max_counter=1000000)

12 6170034.0
263 3846402.0
29 3367836.0
26 1083967.0
864 2152170.0
111 1599317.0
460501 6804909.0
73 6697765.0
875479599 775219651.0
590597 9156558.0
88 2728627.0
94 4283110.0
1041 2210879.0
98 1478936.0
116 6197971.0
66 1121336148.0
82 2025847506.0
1451 4711879.0
973 6844908.0
126 6053205.0
196612 50345125.0
589824 50331648.0
5421 10392294.0
2157 6026406.0
458752 117440512.0
107 1068192.0
2567 152966689.0
2204 19783118.0
458752 117440512.0
24 2781046.0
3728 1842366.0
2069 4443445.0
178 6625781.0
4982302 877264.0
1421 2254274.0
721441460 2880424.0
516 3275592.0
610 2460400.0
14 4584512.0
26 153536031.0
18 118621565.0
67122955 103584.0
218 67122955.0
458846 2297063.0
672 5491989.0
235398290 9934.0
83886083 117440614.0
10 5679392.0
1145 4459356.0
6817688 3603909.0
401 3071806.0
123785342 117604971.0
636 2209128.0
31260681 48404.0
1507856 4609.0
2053 3512072.0
483 4777140.0
102459350 210849.0
248 3782823.0
248 9852706.0
1 134218317.0
2319 22554564.0
190 1449873.0
3480 5391681.0
845 172975

In [16]:
def get_daily_counts(row, max_counter):
    counter = row["ENTRIES"] - row["PREV_ENTRIES"]
    if counter < 0:
        # Maybe counter is reversed?
        counter = -counter
    if counter > max_counter:
        # Maybe counter was reset to 0? 
        print(row["ENTRIES"], row["PREV_ENTRIES"])
        counter = min(row["ENTRIES"], row["PREV_ENTRIES"])
    if counter > max_counter:
        # Check it again to make sure we're not still giving a counter that's too big
        return 0
    return counter

# If counter is > 1Million, then the counter might have been reset.  
# Just set it to zero as different counters have different cycle limits
# It'd probably be a good idea to use a number even significantly smaller than 1 million as the limit!
turnstiles_daily["DAILY_ENTRIES"] = turnstiles_daily.apply(get_daily_counts, axis=1, max_counter=1000000)

12 6170034.0
263 3846402.0
29 3367836.0
26 1083967.0
864 2152170.0
111 1599317.0
460501 6804909.0
73 6697765.0
875479599 775219651.0
590597 9156558.0
88 2728627.0
94 4283110.0
1041 2210879.0
98 1478936.0
116 6197971.0
66 1121336148.0
82 2025847506.0
1451 4711879.0
973 6844908.0
126 6053205.0
196612 50345125.0
589824 50331648.0
5421 10392294.0
2157 6026406.0
458752 117440512.0
107 1068192.0
2567 152966689.0
2204 19783118.0
458752 117440512.0
24 2781046.0
3728 1842366.0
2069 4443445.0
178 6625781.0
4982302 877264.0
1421 2254274.0
721441460 2880424.0
516 3275592.0
610 2460400.0
14 4584512.0
26 153536031.0
18 118621565.0
67122955 103584.0
218 67122955.0
458846 2297063.0
672 5491989.0
235398290 9934.0
83886083 117440614.0
10 5679392.0
1145 4459356.0
6817688 3603909.0
401 3071806.0
123785342 117604971.0
636 2209128.0
31260681 48404.0
1507856 4609.0
2053 3512072.0
483 4777140.0
102459350 210849.0
248 3782823.0
248 9852706.0
1 134218317.0
2319 22554564.0
190 1449873.0
3480 5391681.0
845 172975

In [17]:
turnstiles_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES,DAILY_ENTRIES
1,A002,R051,02-00-00,59 ST,09/01/2019,7184559,08/31/2019,7183842.0,717.0
2,A002,R051,02-00-00,59 ST,09/02/2019,7185132,09/01/2019,7184559.0,573.0
3,A002,R051,02-00-00,59 ST,09/03/2019,7186355,09/02/2019,7185132.0,1223.0
4,A002,R051,02-00-00,59 ST,09/04/2019,7187672,09/03/2019,7186355.0,1317.0
5,A002,R051,02-00-00,59 ST,09/05/2019,7189025,09/04/2019,7187672.0,1353.0
...,...,...,...,...,...,...,...,...,...
444786,TRAM2,R469,00-05-01,RIT-ROOSEVELT,11/25/2019,5554,11/24/2019,5554.0,0.0
444787,TRAM2,R469,00-05-01,RIT-ROOSEVELT,11/26/2019,5554,11/25/2019,5554.0,0.0
444788,TRAM2,R469,00-05-01,RIT-ROOSEVELT,11/27/2019,5554,11/26/2019,5554.0,0.0
444789,TRAM2,R469,00-05-01,RIT-ROOSEVELT,11/28/2019,5554,11/27/2019,5554.0,0.0


In [92]:
turnstiles_total = (turnstiles_daily
                        .groupby(["C/A","STATION","UNIT","SCP","DATE"],as_index=False).agg("sum")).sort_values("DAILY_ENTRIES", ascending = False).reset_index()
                        #.DAILY_ENTRIES.sum())

In [24]:
#challenge6
stations_total = (turnstiles_daily
                        .groupby(["STATION","DATE"],as_index=False).agg("sum")).sort_values("DAILY_ENTRIES", ascending = False).reset_index()
                        #.DAILY_ENTRIES.sum())

In [25]:
turnstiles_total.head()

Unnamed: 0,index,C/A,UNIT,SCP,DATE,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES
0,25209,A069,R044,01-00-02,09/30/2019,28,957437.0,957409.0
1,148006,N139,R355,00-00-01,11/29/2019,4982302,877264.0,877264.0
2,39999,B029,R172,00-00-01,10/23/2019,672,862572.0,861900.0
3,346075,R243,R049,00-03-00,09/12/2019,569320809,779039.0,779039.0
4,92089,K026,R100,00-00-04,09/19/2019,560,771974.0,771414.0


In [27]:
stations_total

Unnamed: 0,index,STATION,DATE,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES
0,16138,CHAMBERS ST,09/30/2019,4160425726,4.165794e+09,998192.0
1,539,111 ST,11/29/2019,133604126,1.294875e+08,888814.0
2,14451,BRIGHTON BEACH,10/23/2019,78307418,7.915694e+07,874281.0
3,7767,59 ST COLUMBUS,09/28/2019,3193068458,3.292907e+09,838393.0
4,7121,51 ST,09/12/2019,674205092,1.056411e+08,801226.0
...,...,...,...,...,...,...
33947,27470,NORWOOD AV,09/22/2019,13104530,1.310453e+07,0.0
33948,25317,MORGAN AV,09/29/2019,7092606,7.092606e+06,0.0
33949,5795,39 AV,10/06/2019,8355091,8.355091e+06,0.0
33950,27491,NORWOOD AV,10/13/2019,13161775,1.316178e+07,0.0


In [38]:
stations_zips = pd.read_csv("sub_st_zip.csv")

In [39]:
stations_zips

Unnamed: 0,Stop Name,Borough,GTFS Latitude,GTFS Longitude,Zip Code
0,Lexington Av/59 St,M,40.762660,-73.967258,10065
1,5 Av/59 St,M,40.764811,-73.973347,10065
2,57 St - 7 Av,M,40.764664,-73.980658,10106
3,49 St,M,40.759901,-73.984139,10019
4,Times Sq - 42 St,M,40.754672,-73.986754,10018
...,...,...,...,...,...
148,Grand Central - 42 St,M,40.752769,-73.979189,10017
149,34 St - 11 Av,M,40.755882,-74.001910,10001
150,96 St,M,40.784318,-73.947152,10029
151,86 St,M,40.777891,-73.951787,10028


In [50]:
stations_zips.rename(columns={'Stop Name':'STATION', "Zip Code":"ZipCode"}, inplace = True)

In [82]:
stations_zips.STATION = stations_zips.STATION.str.upper()

In [86]:
manh_stations=stations_zips.STATION.to_list()

In [87]:
stations_zips

Unnamed: 0,STATION,Borough,GTFS Latitude,GTFS Longitude,ZipCode
0,LEXINGTON AV/59 ST,M,40.762660,-73.967258,10065
1,5 AV/59 ST,M,40.764811,-73.973347,10065
2,57 ST - 7 AV,M,40.764664,-73.980658,10106
3,49 ST,M,40.759901,-73.984139,10019
4,TIMES SQ - 42 ST,M,40.754672,-73.986754,10018
...,...,...,...,...,...
148,GRAND CENTRAL - 42 ST,M,40.752769,-73.979189,10017
149,34 ST - 11 AV,M,40.755882,-74.001910,10001
150,96 ST,M,40.784318,-73.947152,10029
151,86 ST,M,40.777891,-73.951787,10028


In [88]:
type(manh_stations)

list

In [107]:
zip_dict  = pd.Series(stations_zips.ZipCode.values,index=stations_zips.STATION).to_dict()

In [110]:
manh_stations_total = stations_total[stations_total["STATION"].isin(manh_stations)]

In [93]:
manh_turnstiles_total = turnstiles_total[turnstiles_total["STATION"].isin(manh_stations)]

In [111]:
manh_stations_total

Unnamed: 0,index,STATION,DATE,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES
0,16138,CHAMBERS ST,09/30/2019,4160425726,4.165794e+09,998192.0
4,7121,51 ST,09/12/2019,674205092,1.056411e+08,801226.0
8,15550,CANAL ST,11/11/2019,4042695892,4.043223e+09,640864.0
11,14235,BOWLING GREEN,09/17/2019,2368999214,2.368442e+09,556916.0
13,30606,SOUTH FERRY,11/14/2019,162973461,1.624356e+08,537899.0
...,...,...,...,...,...,...
33708,6834,5 AV/53 ST,11/24/2019,106521297,1.065213e+08,2.0
33713,3282,190 ST,10/13/2019,1453890668,1.453891e+09,1.0
33714,3614,207 ST,09/15/2019,159828754,1.598288e+08,1.0
33770,3344,191 ST,09/15/2019,6286044,6.286044e+06,0.0


In [104]:
stations_elev = pd.read_csv("Station_Elevator.csv")

In [105]:
stations_elev

Unnamed: 0,STATION,Elevator
0,14 St/8 Av (A/C/E/L),1
1,14 St-Union Sq (L/N/Q/R/W only; 4/5/6 is not a...,1
2,23 St (6),1
3,28 St (6 downtown only),1
4,34 St-Herald Sq (B/D/F/M/N/Q/R/W),1
5,34 St-Penn Station (1/2/3/A/C/E),1
6,34 St-Hudson Yards (7),1
7,42 St-Port Authority Bus Terminal (A/C/E),1
8,47-50 Sts-Rockefeller Ctr (B/D/F/M),1
9,49 St (N/R/W uptown only),1


In [108]:
elev_dict  = pd.Series(stations_elev.Elevator.values,index=stations_elev.STATION).to_dict()

In [119]:
elev_dict

{'14 St/8 Av (A/C/E/L)': 1,
 '14 St-Union Sq (L/N/Q/R/W\xa0only; 4/5/6 is not accessible)': 1,
 '23 St (6)': 1,
 '28 St (6 downtown only)': 1,
 '34 St-Herald Sq (B/D/F/M/N/Q/R/W)': 1,
 '34 St-Penn Station (1/2/3/A/C/E)': 1,
 '34 St-Hudson Yards (7)': 1,
 '42 St-Port Authority Bus Terminal (A/C/E)': 1,
 '47-50 Sts-Rockefeller Ctr (B/D/F/M)': 1,
 '49 St (N/R/W uptown only)': 1,
 '50 St (C/E downtown only)': 1,
 '51 St (6)': 1,
 '59 St-Columbus Circle (A/C/B/D/1)': 1,
 '66 St-Lincoln Center (1)': 1,
 '72 St (1/2/3)': 1,
 '72 St (Q)': 1,
 '86 St (6 uptown only)': 1,
 '86 St (Q)': 1,
 '96 St (1/2/3)': 1,
 '96 St (Q)': 1,
 '125 St (4/5/6)': 1,
 '125 St (A/C/B/D)': 1,
 '135 St (2/3)': 1,
 '168 St (A/C only; 1 is not accessible)': 1,
 '175 St (A)': 1,
 'Bleecker St (6)': 1,
 'Bowling Green (4/5)': 1,
 'Broadway-Lafayette St\xa0(B/D/F/M)': 1,
 'Brooklyn Bridge-City Hall (4/5/6)': 1,
 'Canal St (6 only; J/N/Q/R/W/Z are not accessible)': 1,
 'Chambers St (1/2/3)': 1,
 'Cortlandt St (R/W)': 1,
 'D

In [112]:
manh_stations_total["ZipCode"] = manh_stations_total["STATION"].map(zip_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [114]:
manh_turnstiles_total["ZipCode"] = manh_turnstiles_total["STATION"].map(zip_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [113]:
manh_stations_total

Unnamed: 0,index,STATION,DATE,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES,ZipCode
0,16138,CHAMBERS ST,09/30/2019,4160425726,4.165794e+09,998192.0,10013
4,7121,51 ST,09/12/2019,674205092,1.056411e+08,801226.0,10022
8,15550,CANAL ST,11/11/2019,4042695892,4.043223e+09,640864.0,10013
11,14235,BOWLING GREEN,09/17/2019,2368999214,2.368442e+09,556916.0,10004
13,30606,SOUTH FERRY,11/14/2019,162973461,1.624356e+08,537899.0,10004
...,...,...,...,...,...,...,...
33708,6834,5 AV/53 ST,11/24/2019,106521297,1.065213e+08,2.0,10022
33713,3282,190 ST,10/13/2019,1453890668,1.453891e+09,1.0,10040
33714,3614,207 ST,09/15/2019,159828754,1.598288e+08,1.0,10034
33770,3344,191 ST,09/15/2019,6286044,6.286044e+06,0.0,10040


In [116]:
manh_stations_total["Elev"] = manh_stations_total["STATION"].map(elev_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [117]:
manh_turnstiles_total["Elev"] = manh_turnstiles_total["STATION"].map(elev_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [118]:
manh_stations_total

Unnamed: 0,index,STATION,DATE,ENTRIES,PREV_ENTRIES,DAILY_ENTRIES,ZipCode,Elev
0,16138,CHAMBERS ST,09/30/2019,4160425726,4.165794e+09,998192.0,10013,
4,7121,51 ST,09/12/2019,674205092,1.056411e+08,801226.0,10022,
8,15550,CANAL ST,11/11/2019,4042695892,4.043223e+09,640864.0,10013,
11,14235,BOWLING GREEN,09/17/2019,2368999214,2.368442e+09,556916.0,10004,
13,30606,SOUTH FERRY,11/14/2019,162973461,1.624356e+08,537899.0,10004,
...,...,...,...,...,...,...,...,...
33708,6834,5 AV/53 ST,11/24/2019,106521297,1.065213e+08,2.0,10022,
33713,3282,190 ST,10/13/2019,1453890668,1.453891e+09,1.0,10040,
33714,3614,207 ST,09/15/2019,159828754,1.598288e+08,1.0,10034,
33770,3344,191 ST,09/15/2019,6286044,6.286044e+06,0.0,10040,


In [None]:
#Map disability and age #s 

In [None]:
#Use minmaxscaler for the scaling

In [None]:
#Create total metric

In [None]:
#Sort by total metric and elevation possession status