In [None]:
import seaborn as sns
import pickle 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Let pandas print up to 200 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 200

In [None]:
# Turnstile export files to load; each name is 'turnstile_<tag>.txt'
# (the tags look like YYMMDD dates -- confirm against the data source).
year19_path_list = [
    'turnstile_' + date_tag + '.txt'
    for date_tag in ('190504', '190511', '190518', '190525')
]

In [None]:
def load_data(list_of_datasets):
    """Read every CSV in ``list_of_datasets`` and stack them into one frame.

    Each file's ``DATE`` and ``TIME`` columns are merged into a single
    datetime column named ``DATE_TIME``; that column is placed first and
    the two source columns are removed (the same layout the former
    ``parse_dates=[['DATE', 'TIME']]`` call produced).

    Parameters
    ----------
    list_of_datasets : iterable
        Paths (or anything ``pd.read_csv`` accepts) of the files to read.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in input order. Each file keeps its own
        0..n-1 index, so index labels repeat across files.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``list_of_datasets`` is empty.
    KeyError
        When a file has no ``DATE`` or ``TIME`` column.
    """
    def _read_one(path):
        # Combining columns through a nested ``parse_dates`` list is
        # deprecated in recent pandas, so build the timestamp explicitly.
        frame = pd.read_csv(path)
        stamps = pd.to_datetime(
            frame['DATE'].astype(str) + ' ' + frame['TIME'].astype(str)
        )
        frame = frame.drop(columns=['DATE', 'TIME'])
        frame.insert(0, 'DATE_TIME', stamps)
        return frame

    return pd.concat([_read_one(path) for path in list_of_datasets])

In [None]:
def preprocessing(dataframe, after_month=4):
    """Return a cleaned, filtered and sorted copy of the raw turnstile frame.

    Steps, in order:
      1. renumber the rows 0..n-1;
      2. drop the ``C/A``, ``LINENAME``, ``DIVISION`` and ``DESC`` columns;
      3. add ``UNIT/SCP`` (``UNIT`` concatenated with ``SCP``) as an
         identifier column;
      4. keep only rows whose ``DATE_TIME`` calendar month is greater than
         ``after_month`` (the year is not considered);
      5. sort by ``STATION``, ``UNIT/SCP``, ``DATE_TIME`` ascending.

    The input frame is left untouched, so the function can be re-run on
    the same object (the previous in-place version dropped columns from
    the caller's frame and raised ``KeyError`` on a second call).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns named above; ``DATE_TIME`` must be a
        datetime column.
    after_month : int, default 4
        Rows with ``DATE_TIME.dt.month <= after_month`` are discarded.

    Returns
    -------
    pandas.DataFrame
        New frame; its index holds the row positions assigned in step 1,
        in sorted order.
    """
    prepared = (
        dataframe
        .reset_index(drop=True)
        .drop(columns=['C/A', 'LINENAME', 'DIVISION', 'DESC'])
        .assign(**{'UNIT/SCP': lambda frame: frame['UNIT'] + frame['SCP']})
    )
    in_range = prepared['DATE_TIME'].dt.month > after_month
    return prepared[in_range].sort_values(
        ['STATION', 'UNIT/SCP', 'DATE_TIME'], ascending=True
    )
    

In [None]:
def calculate_entries_column(dataframe):
    """Add ``ENTRIES_DIFF``: the rise in ``ENTRIES`` since the previous row.

    A row gets ``ENTRIES - previous ENTRIES`` only when the row directly
    above it has the same ``UNIT/SCP`` and the same ``STATION`` and the
    counter went up; every other row (the first row, a change of
    turnstile/station, a flat or decreasing counter) gets 0. Rows are
    compared by adjacency, so the frame is expected to be sorted already.

    The comparison is vectorised. The former row loop wrote through
    chained indexing (``dataframe['ENTRIES_DIFF'][index] = ...``), which
    pandas warns about and which stops updating the frame under
    Copy-on-Write.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``STATION``, ``UNIT/SCP`` and ``ENTRIES`` columns. It is
        modified in place: the index is reset to 0..n-1 and the new
        column is added.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    dataframe.reset_index(drop=True, inplace=True)

    # True where the row above belongs to the same turnstile at the same station.
    same_turnstile = (
        dataframe['UNIT/SCP'].eq(dataframe['UNIT/SCP'].shift(1))
        & dataframe['STATION'].eq(dataframe['STATION'].shift(1))
    )
    # fill_value=0 keeps the integer dtype; row 0 is masked out anyway
    # because it never matches a (missing) previous row.
    increase = dataframe['ENTRIES'] - dataframe['ENTRIES'].shift(1, fill_value=0)

    dataframe['ENTRIES_DIFF'] = increase.where(same_turnstile & (increase > 0), 0)
    return dataframe

In [None]:
def calculate_exits_column(dataframe):
    """Add ``EXITS_DIFF``: the rise in ``EXITS`` since the previous row.

    Column labels are stripped of surrounding whitespace first
    (presumably because the raw ``EXITS`` header is padded -- verify
    against the source files). A row then gets ``EXITS - previous EXITS``
    only when the row directly above it has the same ``UNIT/SCP`` and the
    same ``STATION`` and the counter went up; every other row gets 0.
    Rows are compared by adjacency, so the frame is expected to be sorted
    already.

    The comparison is vectorised. The former row loop wrote through
    chained indexing (``dataframe['EXITS_DIFF'][index] = ...``), which
    pandas warns about and which stops updating the frame under
    Copy-on-Write.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``STATION``, ``UNIT/SCP`` and ``EXITS`` columns (after
        stripping). It is modified in place: labels are stripped, the
        index is reset to 0..n-1 and the new column is added.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    dataframe.columns = dataframe.columns.str.strip()
    dataframe.reset_index(drop=True, inplace=True)

    # True where the row above belongs to the same turnstile at the same station.
    same_turnstile = (
        dataframe['UNIT/SCP'].eq(dataframe['UNIT/SCP'].shift(1))
        & dataframe['STATION'].eq(dataframe['STATION'].shift(1))
    )
    # fill_value=0 keeps the integer dtype; row 0 is masked out anyway
    # because it never matches a (missing) previous row.
    increase = dataframe['EXITS'] - dataframe['EXITS'].shift(1, fill_value=0)

    dataframe['EXITS_DIFF'] = increase.where(same_turnstile & (increase > 0), 0)
    return dataframe

In [None]:
def calculate_total_traffic(dataframe):
    """Add ``TOTAL_TRAFFIC`` as ``ENTRIES_DIFF + EXITS_DIFF``, row by row.

    The column is written onto the frame that was passed in, and that
    same frame is returned.
    """
    entries_delta = dataframe['ENTRIES_DIFF']
    exits_delta = dataframe['EXITS_DIFF']
    dataframe['TOTAL_TRAFFIC'] = entries_delta.add(exits_delta)
    return dataframe

In [None]:
def generate_stat(dataframe):
    """Total ``TOTAL_TRAFFIC`` per ``STATION``, largest total first.

    Returns a one-column DataFrame (``TOTAL_TRAFFIC``) indexed by
    ``STATION``.
    """
    per_station = dataframe.groupby('STATION')['TOTAL_TRAFFIC'].sum().to_frame()
    return per_station.sort_values(by=['TOTAL_TRAFFIC'], ascending=False)

In [None]:
def pickle_df(dataframe,filename):
    """Pickle ``dataframe`` into the file ``filename + ".pickle"``.

    ``filename`` is the path without extension; an existing file at the
    target path is overwritten. Returns ``None``.
    """
    target_path = filename + ".pickle"
    with open(target_path, "wb") as sink:
        pickle.dump(dataframe, sink)

In [None]:
def read_pickle(file_name):
    """Load and return the object pickled in ``file_name``.

    ``file_name`` is used as given (no extension is appended).
    """
    # NOTE: unpickling can run arbitrary code -- only open files you trust.
    with open(file_name, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

In [None]:
def init_dataframe(years_list):
    """Load the files in ``years_list`` and run them through the pipeline.

    Pipeline: ``load_data`` -> ``preprocessing`` ->
    ``calculate_entries_column`` -> ``calculate_exits_column``.
    ``calculate_total_traffic`` is not part of this pipeline, so the
    returned frame has no ``TOTAL_TRAFFIC`` column yet.
    """
    frame = load_data(years_list)
    for stage in (preprocessing, calculate_entries_column, calculate_exits_column):
        frame = stage(frame)
    return frame

In [None]:
def graph_stats(stats):
    """Draw a horizontal bar chart of the first 20 rows of ``stats``.

    ``stats`` is expected to be indexed by ``STATION`` with a
    ``TOTAL_TRAFFIC`` column; the index is turned into a column so both
    can be handed to seaborn. The chart is shown, nothing is returned.
    """
    first_twenty = stats.head(20).reset_index()
    axes = sns.barplot(x=first_twenty['TOTAL_TRAFFIC'], y=first_twenty['STATION'])
    axes.set_title('Top 20 Crowded Stations')
    plt.show()

In [None]:
def graph_days_per_month(month):
    """Bar chart of ``TOTAL_TRAFFIC`` against calendar day for ``month``.

    Parameters
    ----------
    month : pandas.DataFrame
        Rows to plot; needs ``DATE_TIME`` and ``TOTAL_TRAFFIC`` columns.
        The frame is not modified (the previous version added a ``Dates``
        column to it, which warned when the caller passed a filtered
        slice).

    Notes
    -----
    The 6x4 inch / 300 dpi figure is created *before* plotting so the
    bars are drawn into it. Previously ``plt.figure`` was called after
    the plot, which left the chart at the default size and displayed an
    extra empty figure.

    NOTE(review): each day appears on many rows, and seaborn's ``barplot``
    collapses repeated labels with its default estimator (the mean), so a
    bar is the mean ``TOTAL_TRAFFIC`` per row of that day rather than the
    day's sum -- confirm that this is the intended statistic.
    """
    day_labels = pd.to_datetime(month['DATE_TIME']).dt.strftime("%m-%d-%Y").tolist()

    plt.figure(figsize=(6, 4), dpi=300)
    axes = sns.barplot(x=month['TOTAL_TRAFFIC'], y=day_labels)
    axes.set_title('Days per month -May- traffic')
    plt.show()

In [None]:
# Reload the previously saved frame instead of rebuilding it from the raw files.
year19_pickle = read_pickle(file_name='MTA_19.pickle')

In [None]:
# Per-station median of every numeric column.
# numeric_only=True is needed on pandas >= 2.0, where the default (False)
# raises TypeError as soon as the frame holds a non-numeric column
# (assuming the pickled frame still carries text columns such as UNIT/SCP).
median = year19_pickle.groupby('STATION').median(numeric_only=True)
Mediandf = pd.DataFrame(median)

In [None]:
# Discard the per-counter medians; only TOTAL_TRAFFIC is used further down.
Mediandf.drop(columns=['ENTRIES', 'EXITS', 'ENTRIES_DIFF', 'EXITS_DIFF'], inplace=True)


In [None]:
# Display the per-station median traffic values, numbered 0..n-1.
Mediandf['TOTAL_TRAFFIC'].reset_index(drop=True)

In [None]:
# Attach each station's median traffic to every row of that station.
# The merge suffixes the clashing TOTAL_TRAFFIC columns with _x (row value)
# and _y (station median); rename them to their final names in one step.
year19 = (
    year19_pickle
    .merge(Mediandf.reset_index()[['TOTAL_TRAFFIC', 'STATION']], on='STATION')
    .rename(columns={
        'TOTAL_TRAFFIC_y': 'TOTAL_TRAFFIC_MED',
        'TOTAL_TRAFFIC_x': 'TOTAL_TRAFFIC',
    })
)

In [None]:
# Per-station first quartile (25th percentile) of every numeric column.
# numeric_only=True is needed on pandas >= 2.0, where quantile() raises
# TypeError on non-numeric columns instead of silently skipping them
# (assuming the pickled frame still carries text columns such as UNIT/SCP).
Q1 = year19_pickle.groupby('STATION').quantile(q=0.25, numeric_only=True)
q1df = pd.DataFrame(Q1)

In [None]:
# Discard the per-counter quartiles; only TOTAL_TRAFFIC is used further down.
q1df.drop(columns=['ENTRIES', 'EXITS', 'ENTRIES_DIFF', 'EXITS_DIFF'], inplace=True)


In [None]:
# Per-station third quartile (75th percentile) of every numeric column.
# numeric_only=True is needed on pandas >= 2.0, where quantile() raises
# TypeError on non-numeric columns instead of silently skipping them
# (assuming the pickled frame still carries text columns such as UNIT/SCP).
Q3 = year19_pickle.groupby('STATION').quantile(q=0.75, numeric_only=True)
q3df = pd.DataFrame(Q3)

In [None]:
# Discard the per-counter quartiles; only TOTAL_TRAFFIC is used further down.
q3df.drop(columns=['ENTRIES', 'EXITS', 'ENTRIES_DIFF', 'EXITS_DIFF'], inplace=True)

In [None]:
# Interquartile range of TOTAL_TRAFFIC per station (Q3 - Q1), as a one-column frame.
IQR = q3df['TOTAL_TRAFFIC'].sub(q1df['TOTAL_TRAFFIC']).to_frame()

In [None]:
# Upper outlier threshold per station: median + 1.5 * IQR.
# NOTE(review): the usual Tukey fence is Q3 + 1.5 * IQR; this one is anchored
# on the median, which gives a lower threshold -- confirm that is intended.
upper = Mediandf['TOTAL_TRAFFIC'].add(IQR['TOTAL_TRAFFIC'].mul(1.5)).to_frame()

In [None]:
# Lower outlier threshold per station: median - 1.5 * IQR.
# NOTE(review): not referenced by any later cell visible in this notebook.
lower = Mediandf['TOTAL_TRAFFIC'].sub(IQR['TOTAL_TRAFFIC'].mul(1.5)).to_frame()

In [None]:
# Attach each station's upper threshold to every row of that station.
# The merge suffixes the clashing TOTAL_TRAFFIC columns with _x (row value)
# and _y (threshold); rename them to their final names in one step.
year19_up = (
    year19
    .merge(upper.reset_index()[['TOTAL_TRAFFIC', 'STATION']], on='STATION')
    .rename(columns={
        'TOTAL_TRAFFIC_x': 'TOTAL_TRAFFIC',
        'TOTAL_TRAFFIC_y': 'TOTAL_TRAFFIC_UPPER',
    })
)


In [None]:
# Replace every reading above its station's upper threshold with that
# station's median traffic.
# NOTE(review): if TOTAL_TRAFFIC is an integer column and a median is
# fractional, this assignment upcasts the column (newer pandas warns about
# that) -- consider casting TOTAL_TRAFFIC to float beforehand.
year19_up.loc[
    year19_up['TOTAL_TRAFFIC'] > year19_up['TOTAL_TRAFFIC_UPPER'],
    'TOTAL_TRAFFIC',
] = year19_up.loc[
    year19_up['TOTAL_TRAFFIC'] > year19_up['TOTAL_TRAFFIC_UPPER'],
    'TOTAL_TRAFFIC_MED',
]

In [None]:
# Station totals computed from the outlier-capped traffic values.
stats = generate_stat(dataframe=year19_up)

In [None]:
# Plot the stations with the highest totals.
graph_stats(stats=stats)

In [None]:
# Move STATION from the index into a regular column so rows can be looked up by position.
stats = stats.reset_index(drop=False)

In [None]:
# Daily chart for the first station listed in `stats` (the one with the largest total).
graph_days_per_month(
    month=year19_up.loc[year19_up['STATION'].eq(stats['STATION'][0])]
)