**Importing packages and data**
- data of the relevant months are imported from the official MTA website 
- the `EXITS` column name, which carries trailing whitespace in the raw files, was renamed to plain `EXITS`


In [None]:
import pandas as pd
import numpy as np


def get_data(week_nums):
    """Download the weekly MTA turnstile files and stack them into one frame.

    Each entry of ``week_nums`` is substituted into the turnstile URL
    template; every resulting file is read with ``pd.read_csv`` and the
    frames are concatenated in the order given.
    """
    template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    weekly_frames = [pd.read_csv(template.format(week)) for week in week_nums]
    return pd.concat(weekly_frames)
        
# Weekly file identifiers handed to get_data (one turnstile file per entry).
week_nums = [
    200307, 200314, 200321, 200328,
    200404, 200411, 200418, 200425,
    200502, 200509, 200516, 200523, 200530,
    200606, 200613, 200620, 200627,
]
turnstiles_df = get_data(week_nums)
# The raw header pads 'EXITS' with trailing spaces. Strip whitespace from
# every column name instead of matching one exact run of blanks, so the
# clean-up keeps working if the amount of padding ever differs.
turnstiles_df.columns = turnstiles_df.columns.str.strip()



**Preliminary data visualization**
- a preliminary plot was created to show the effect of the lockdown on one turnstile of one `C/A`, `UNIT`, `SCP`, and `STATION` combination




In [None]:
"""
plot
"""
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(10,3))
filtered=turnstiles_df.loc[(turnstiles_df.STATION=='59 ST')&(turnstiles_df.SCP=='02-00-00') & (turnstiles_df['C/A']=='A002') & (turnstiles_df.UNIT=='R051')];
filtered.groupby(["DATE"])[['DATE','ENTRIES']].apply(lambda x:x.ENTRIES.max()-x.ENTRIES.min()).plot()


plt.ylabel('Entries per day');



**More data acquisition**
- obtained the data from the beginning of March up until the present, to be divided into lockdown vs. reopen periods



In [None]:
def get_data(week_nums):
    """Fetch and combine the MTA turnstile files for the given weeks.

    The URL of each file is printed before it is read, so download progress
    is visible. Returns the concatenation of all weekly frames, in the order
    of ``week_nums``.
    """
    url_template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    frames = []
    for week_url in map(url_template.format, week_nums):
        print(week_url)
        frames.append(pd.read_csv(week_url))
    return pd.concat(frames)
        
# Weekly files 200328 through 200620 are loaded as the lockdown data set.
week_nums = [
    200328,
    200404, 200411, 200418, 200425,
    200502, 200509, 200516, 200523, 200530,
    200606, 200613, 200620,
]
turnstiles_lockdown_df = get_data(week_nums)



In [None]:
def get_data(week_nums):
    """Read one MTA turnstile file per week and return them as a single frame.

    For every entry of ``week_nums`` the file URL is built, printed, and
    loaded with ``pd.read_csv``; the loaded frames are then concatenated.
    """
    base = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"

    def _load_week(week_num):
        """Print the week's URL, then read it into a DataFrame."""
        week_url = base.format(week_num)
        print(week_url)
        return pd.read_csv(week_url)

    return pd.concat([_load_week(week_num) for week_num in week_nums])
        
# A single weekly file identifier is passed, so the "reopen" frame is built
# from one turnstile file only.
week_nums = [200627]
turnstiles_reopen_df = get_data(week_nums)


**uber data preliminary  analysis**
- downloaded and imported official data from Uber used in a study from approximately the same time period, but in 2015.
- downloaded and imported the official New York taxi zone data from the same study. The taxi zone lookup allows me to link ridership by neighborhood via `locationID`.
- filtered the data so that only Manhattan data is selected

In [None]:
# The raw Uber trip file is too large to be uploaded to GitHub. Download
# uber-raw-data-janjune-15.csv.zip from
# https://github.com/andywzz/uber-tlc-foil-response/blob/master/uber-trip-data/uber-raw-data-janjune-15.csv.zip
# and place it next to this notebook before running this cell.
# (The original call had its only argument commented out, which left the
# parenthesis unclosed and made the cell a syntax error.)
uber_data = pd.read_csv("uber-raw-data-janjune-15.csv.zip")
uber_location_data = pd.read_csv("taxi-zone-lookup.csv")
# Give the lookup key the same name as in the trip data so both frames can
# be merged on it.
uber_location_data.rename(columns={'LocationID': 'locationID'}, inplace=True)
uber_location_merge = pd.merge(uber_data, uber_location_data, on='locationID')

# NOTE(review): the original cell filtered `uber_data_marjun`, which is not
# defined in any visible cell. It is rebuilt here as the March-June slice of
# the merged trips, which is what the name suggests -- confirm that this is
# the intended period.
pickup_month = pd.to_datetime(uber_location_merge['Pickup_date']).dt.month
uber_data_marjun = uber_location_merge[pickup_month.between(3, 6)]
uber_data_manhattan = uber_data_marjun[uber_data_marjun['Borough'] == 'Manhattan']

# Preview: the first 20 rows of every Pickup_date group.
uber_data_manhattan.groupby('Pickup_date').head(20)



**cleaned MTA data preliminary manipulation**
- grouped the lockdown period data (03/22/20-06/08/20) for Manhattan
- added a new column called `time_period` where lockdown and reopen data would be labeled

In [None]:
# Cleaned daily station data; replace "yourpath" with the directory that
# holds the CSV. (The original call had its argument commented out, which
# left the parenthesis unclosed and made the cell a syntax error.)
maraprmayjune_2020_data = pd.read_csv("yourpath/Data/manhattan_daily_stations_2020.csv")

# Compare parsed timestamps rather than 'MM/DD/YYYY' strings: string order
# only matches calendar order while every date is zero-padded and falls in
# the same year. The 'date' column itself is left untouched.
# NOTE(review): assumes the column is month-first, as the original string
# literals were -- confirm against the CSV.
observation_dates = pd.to_datetime(maraprmayjune_2020_data['date'])
lockdown_start = pd.Timestamp('2020-03-22')
reopen_start = pd.Timestamp('2020-06-08')

in_lockdown = (observation_dates >= lockdown_start) & (observation_dates < reopen_start)
in_reopen = observation_dates >= reopen_start

# Built from the same mask as the 'lockdown' label below. The original
# subset started one day later (after 03/22) than the label (from 03/22)
# and relied on chained boolean indexing.
lockdown_data = maraprmayjune_2020_data[in_lockdown]

conditions = [in_lockdown, in_reopen]
choices = ['lockdown', 'reopen']
# Rows before the lockdown start match neither condition.
maraprmayjune_2020_data['time_period'] = np.select(conditions, choices, default='neither')



**cleaned MTA data manipulation**
- used various groupbys to obtain the percentage increase of average ridership from before the lockdown vs after the lockdown

In [None]:
# Summed entries per period, neighbourhood and date.
time_period_data = (
    maraprmayjune_2020_data
    .groupby(['time_period', 'GeogName', 'date'], as_index=False)[['new_entry']]
    .sum()
)
# Number of distinct dates observed in each period.
lockdowndays = time_period_data.loc[time_period_data['time_period'] == 'lockdown', 'date'].nunique()
reopendays = time_period_data.loc[time_period_data['time_period'] == 'reopen', 'date'].nunique()

# Mean of `new_entry` per neighbourhood and period. Only the measure column
# is averaged: including 'date' makes .mean() raise on pandas 2.x when that
# column holds text.
time_period_average = (
    maraprmayjune_2020_data
    .groupby(['GeogName', 'time_period'], as_index=False)[['new_entry']]
    .mean()
)

# Take an explicit, ordered copy so the column assignment below does not
# write into a slice of time_period_average. pct_change compares each row
# with the previous row of its group, so 'lockdown' must precede 'reopen'.
time_period_df = (
    time_period_average.loc[time_period_average.time_period != 'neither']
    .sort_values(['GeogName', 'time_period'])
    .copy()
)
# Select a Series (not a one-column frame) so a plain column is assigned.
time_period_df['percent_change'] = time_period_df.groupby('GeogName')['new_entry'].pct_change() * 100
time_period_df = time_period_df.rename(columns={'new_entry': 'average_entries'})
time_period_df.sort_values('percent_change', ascending=False)

**Uber data cleaning and visualization**
- summed the number of pickups by location in the manhattan area and created a bar graph to visualize that data

In [None]:
# Number of pickups per taxi zone. size() counts every trip row, whereas
# count() on 'Affiliated_base_num' skips rows where that field is missing
# and would under-report ridership for those zones.
uber_total = (
    uber_data_manhattan
    .groupby(['Zone', 'locationID'])
    .size()
    .reset_index(name='total_ridership')
    .sort_values('total_ridership', ascending=False)
)

# Bar chart of the ten zones with the most pickups.
busiest_zones = uber_total.head(10)
y = busiest_zones['total_ridership']
x = busiest_zones['Zone']
plt.figure(figsize=(20, 5))
plt.bar(x, y)



**MTA data cleaning and visualization continued**
- graphed the percentage change in ridership by neighborhood name

In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns

# Drop every row that has a missing value in any column.
percent_change_df = time_period_df.dropna(how='any')
# Keep the sorted frame: the original call discarded the result of
# sort_values, so the bars were never ordered by percent change.
percent_change_df = percent_change_df.sort_values('percent_change', ascending=False)

x_val = percent_change_df['GeogName']
y_val = percent_change_df['percent_change']

plt.figure(figsize=(20, 5))
plt.bar(x_val, y_val)

In [None]:
# Bar chart of the average_entries value of every row in time_period_df,
# positioned by neighbourhood name.
x_val, y_val = time_period_df['GeogName'], time_period_df['average_entries']
plt.figure(figsize=(20, 5))
plt.bar(x_val, y_val)

**MTA data cleaning and visualization continued**
- collected the top ten neighborhoods with the biggest percentage increase in average ridership after the lockdown and plotted them

In [None]:

# Neighbourhoods kept for the top-ten comparison.
top10_names = [
    'WEST VILLAGE',
    'SOHO-TRIBECA-CIVIC CENTER-LITTLE ITALY',
    'HUDSON YARDS-CHELSEA-FLATIRON-UNION SQUARE',
    'TURTLE BAY-EAST MIDTOWN',
    'LINCOLN SQUARE',
    'MIDTOWN-MIDTOWN SOUTH',
    'UPPER EAST SIDE-CARNEGIE HILL',
    'LENOX HILL-ROOSEVELT ISLAND',
    'EAST VILLAGE',
    'LOWER EAST SIDE',
]
time_period_top10 = time_period_df.loc[time_period_df['GeogName'].isin(top10_names)]
# Same rows with the old index kept as a column, ordered by ascending
# percent_change.
top_10_sorted = time_period_top10.reset_index().sort_values('percent_change')
# Grouped bars: one bar per time_period for each neighbourhood.
sns.barplot(x="GeogName", hue="time_period", y="average_entries", data=time_period_top10)

**MTA data cleaning and visualization continued**
- collected the top 7 neighborhoods with the biggest percentage increase in average ridership after the lockdown

In [None]:
# NOTE(review): `time_period_top5` is not defined in any visible cell; unless
# it is created elsewhere this line raises NameError -- confirm which frame
# is meant to be exported.
time_period_top5.to_csv('time_period_top5.csv')

In [None]:
# Reload the exported table and keep the five neighbourhoods to be plotted.
top_7 = pd.read_csv('time_period_top5.csv')
top5_names = [
    'HUDSON YARDS-CHELSEA-FLATIRON-UNION SQUARE',
    'MIDTOWN-MIDTOWN SOUTH',
    'SOHO-TRIBECA-CIVIC CENTER-LITTLE ITALY',
    'WEST VILLAGE',
    'UPPER EAST SIDE-CARNEGIE HILL',
]
top_5_df = top_7[top_7['GeogName'].isin(top5_names)]
top_5_df

**MTA data cleaning and visualization continued**
- collected the top 5 neighborhoods with the biggest percentage increase in average ridership after the lockdown and plotted them in a double-bar format: average ridership during the lockdown next to average ridership after the lockdown

In [None]:
# Grouped bar chart of percent_of_total for the five neighbourhoods in
# top_5_df, one bar per time_period, saved as a PNG.
colors = ['#6f40a8', '#ED7C43']
ticks_x = np.linspace(0, 4, num=5)
# Hand-written tick labels with line breaks; they replace the labels that
# seaborn derives from GeogName.
x_ticks = [
    'HUDSON YARDS-CHELSEA-FLATIRON \n UNION SQUARE',
    'MIDTOWN-MIDTOWN \n SOUTH',
    'SOHO-TRIBECA-CIVIC \n CENTER-LITTLE ITALY',
    'UPPER EAST \n SIDE-CARNEGIE HILL',
    'WEST VILLAGE',
]

plt.figure(figsize=(20, 10))
ax = sns.barplot(
    x="GeogName",
    hue="time_period",
    y="percent_of_total",
    data=top_5_df,
    palette=colors,
)
plt.xticks(ticks_x, x_ticks)
plt.xlabel('Neighborhood', fontsize=20, labelpad=20)
plt.ylabel('Percentage of total ridership', fontsize=20, labelpad=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Hard-coded annotations as (x position, y position, text).
bar_notes = [
    (.05, 70, '+109.4%'),
    (1.05, 70, '+109.1%'),
    (2.05, 70, '+117.3%'),
    (3.05, 70, '+102.9%'),
    (4.05, 71.5, '+144.7%'),
]
for x_pos, y_pos, note in bar_notes:
    ax.text(x_pos, y_pos, note, fontsize=15)

legend = ax.get_legend()
plt.setp(legend.get_texts(), fontsize=13)
plt.setp(legend.get_title(), fontsize=13)
plt.title('Percent increase of MTA ridership after lockdown vs during lockdown', fontsize=25);
plt.savefig('percent_increase_doublebar.png')


**MTA data cleaning and visualization continued**
- collected the top 10 neighborhoods with the biggest percentage increase in average ridership after the lockdown and plotted them in a single-bar format, using a special font to fit the style of the other figures.

In [None]:
from pathlib import Path

from matplotlib import font_manager as fm

# Custom font file used for the figure text. The path is specific to one
# machine, so fall back to Matplotlib's default font (same size) when the
# file is not present instead of failing later while the figure is drawn.
josefin_ttf = '/Users/andywu98/Downloads/JosefinSans-Regular.ttf'
if Path(josefin_ttf).is_file():
    font_jose = fm.FontProperties(fname=josefin_ttf, size=30)
else:
    font_jose = fm.FontProperties(size=30)

In [None]:
# Bar chart of percent_change for the rows of top_10_sorted, with
# hand-written tick labels and a per-bar colour list, saved as a PNG.
ticks_x = np.linspace(0, 9, num=10)
x_ticks = [
    'LOWER EAST \n SIDE',
    'EAST VILLAGE',
    'LINCOLN \n SQUARE',
    'LENOX HILL, \nROOSEVELT \n ISLAND',
    'TURTLE BAY, \n EAST MIDTOWN',
    'UPPER EAST SIDE, \n CARNEGIE HILL',
    'MIDTOWN, \n MIDTOWN \n SOUTH',
    'HUDSON YARDS, \n CHELSEA, \n FLATIRON, \n UNION SQUARE',
    'SOHO-TRIBECA, \n CIVIC CENTER, \n LITTLE ITALY',
    'WEST VILLAGE',
]

# One colour per bar, in plotting order.
_orange, _purple = '#ED7C43', '#6f40a8'
bar_palette = [
    _orange, _orange, _purple, _orange, _purple,
    _orange, _orange, _purple, _purple, _purple,
]

# Font settings shared by the axis labels and by the tick labels.
axis_label_font = {'fontproperties': font_jose, 'fontsize': 20}
tick_font = {'fontproperties': font_jose, 'fontsize': 12}

plt.figure(figsize=(20, 10))
sns.barplot(x="GeogName", y="percent_change", data=top_10_sorted, palette=bar_palette)
plt.xticks(ticks_x, x_ticks)
plt.xlabel('Neighborhood', labelpad=15, **axis_label_font)
plt.ylabel('% increase in average ridership after lockdown', labelpad=20, **axis_label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)
plt.title('Percent increase of MTA ridership after lockdown', fontproperties=font_jose, fontsize=25);
plt.savefig('percent_increase.png')
