In [1]:
import seaborn as sns
import pickle 


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Let pandas render up to 200 rows of a displayed frame before truncating it.
pd.options.display.max_rows = 200

In [4]:
# Turnstile export files to load, four per year, one list per year.
year19_path_list = [
    'turnstile_190504.txt',
    'turnstile_190511.txt',
    'turnstile_190518.txt',
    'turnstile_190525.txt',
]
year18_path_list = [
    'turnstile_180505.txt',
    'turnstile_180512.txt',
    'turnstile_180519.txt',
    'turnstile_180526.txt',
]
year17_path_list = [
    'turnstile_170506.txt',
    'turnstile_170513.txt',
    'turnstile_170520.txt',
    'turnstile_170527.txt',
]

In [5]:
def load_data(list_of_datasets):
    """Read several turnstile CSV files and stack them into one DataFrame.

    Every file must have ``DATE`` and ``TIME`` columns. They are merged into a
    single datetime column named ``DATE_TIME``, inserted as the first column,
    and the two source columns are removed.

    The merge is done explicitly with ``pd.to_datetime`` rather than
    ``read_csv(parse_dates=[['DATE', 'TIME']])`` because combining columns
    through a nested ``parse_dates`` list is deprecated in recent pandas.

    Parameters
    ----------
    list_of_datasets : iterable of str or path-like
        Paths (or anything ``pd.read_csv`` accepts) of the files to load.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in the order given. Each file keeps its own row
        index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``list_of_datasets`` is empty.
    """
    frames = []
    for path in list_of_datasets:
        frame = pd.read_csv(path)
        # "<DATE> <TIME>" is the same text the nested parse_dates form parsed.
        stamp_text = frame.pop('DATE').astype(str) + ' ' + frame.pop('TIME').astype(str)
        frame.insert(0, 'DATE_TIME', pd.to_datetime(stamp_text))
        frames.append(frame)

    if not frames:
        raise ValueError('load_data needs at least one dataset path')
    return pd.concat(frames)

In [6]:
def preprocessing(dataframe, after_month=4):
    """Trim, tag and sort a raw turnstile frame without modifying the input.

    Steps, in order:
      1. renumber the rows 0..n-1,
      2. drop the ``C/A``, ``LINENAME``, ``DIVISION`` and ``DESC`` columns,
      3. add ``UNIT/SCP`` (``UNIT`` concatenated with ``SCP``) as an
         identifier column,
      4. keep only rows whose ``DATE_TIME`` month is greater than
         ``after_month``,
      5. sort by ``STATION``, ``UNIT/SCP`` and ``DATE_TIME`` ascending.

    The work is done on a new frame, so the caller's ``dataframe`` is left
    untouched and the function can be called again on the same input.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the four dropped columns plus ``UNIT``, ``SCP``,
        ``STATION`` and a datetime ``DATE_TIME`` column.
    after_month : int, default 4
        Rows with ``DATE_TIME.dt.month <= after_month`` are discarded.

    Returns
    -------
    pandas.DataFrame
        The filtered, sorted frame. Its index holds the surviving row numbers
        from step 1 in sorted order (it is not renumbered again).

    Raises
    ------
    KeyError
        If one of the columns to drop is missing.
    """
    trimmed = dataframe.reset_index(drop=True).drop(
        columns=['C/A', 'LINENAME', 'DIVISION', 'DESC']
    )
    # Identifier column built from the UNIT and SCP values of each row.
    trimmed['UNIT/SCP'] = trimmed['UNIT'] + trimmed['SCP']
    in_range = trimmed[trimmed['DATE_TIME'].dt.month > after_month]
    return in_range.sort_values(['STATION', 'UNIT/SCP', 'DATE_TIME'], ascending=True)
    

In [7]:
def calculate_entries_column(dataframe):
    """Add ``ENTRIES_DIFF``: the increase of ``ENTRIES`` over the previous row.

    For every row the value is ``ENTRIES - previous row's ENTRIES`` when the
    previous row has the same ``UNIT/SCP`` and the same ``STATION`` and the
    counter went up; otherwise it is 0. The first row is always 0. Rows are
    compared in their current order, so the frame should already be sorted by
    turnstile and time.

    The calculation is vectorized. The previous row-by-row version wrote with
    chained indexing (``df[col][i] = value``), which triggers
    ``SettingWithCopyWarning`` and does not update the frame when pandas
    copy-on-write is active.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``UNIT/SCP``, ``STATION`` and ``ENTRIES`` columns. It is modified
        in place: the index is renumbered 0..n-1 and the new column is added.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    dataframe.reset_index(drop=True, inplace=True)

    readings = dataframe['ENTRIES']
    # fill_value keeps the integer dtype; the first row is zeroed by the mask below.
    increase = readings - readings.shift(fill_value=0)

    # A comparison against the shifted-in missing value is False, so row 0 never matches.
    same_turnstile = (
        dataframe['UNIT/SCP'].eq(dataframe['UNIT/SCP'].shift())
        & dataframe['STATION'].eq(dataframe['STATION'].shift())
    )

    dataframe['ENTRIES_DIFF'] = increase.where(same_turnstile & (increase > 0), 0)
    return dataframe

In [8]:
def calculate_exits_column(dataframe):
    """Add ``EXITS_DIFF``: the increase of ``EXITS`` over the previous row.

    Column names are stripped of surrounding whitespace first, so a header
    such as ``'EXITS   '`` becomes ``'EXITS'``. For every row the value is
    ``EXITS - previous row's EXITS`` when the previous row has the same
    ``UNIT/SCP`` and the same ``STATION`` and the counter went up; otherwise
    it is 0. The first row is always 0. Rows are compared in their current
    order, so the frame should already be sorted by turnstile and time.

    The calculation is vectorized. The previous row-by-row version wrote with
    chained indexing (``df[col][i] = value``), which triggers
    ``SettingWithCopyWarning`` and does not update the frame when pandas
    copy-on-write is active.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``UNIT/SCP``, ``STATION`` and ``EXITS`` columns (after
        stripping). It is modified in place: column names are stripped, the
        index is renumbered 0..n-1 and the new column is added.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    dataframe.columns = dataframe.columns.str.strip()
    dataframe.reset_index(drop=True, inplace=True)

    readings = dataframe['EXITS']
    # fill_value keeps the integer dtype; the first row is zeroed by the mask below.
    increase = readings - readings.shift(fill_value=0)

    # A comparison against the shifted-in missing value is False, so row 0 never matches.
    same_turnstile = (
        dataframe['UNIT/SCP'].eq(dataframe['UNIT/SCP'].shift())
        & dataframe['STATION'].eq(dataframe['STATION'].shift())
    )

    dataframe['EXITS_DIFF'] = increase.where(same_turnstile & (increase > 0), 0)
    return dataframe

In [9]:
def calculate_total_traffic(dataframe):
    """Add ``TOTAL_TRAFFIC`` as ``ENTRIES_DIFF + EXITS_DIFF``.

    The column is written onto ``dataframe`` itself, and that same object is
    returned.
    """
    entries_delta = dataframe['ENTRIES_DIFF']
    exits_delta = dataframe['EXITS_DIFF']
    dataframe['TOTAL_TRAFFIC'] = entries_delta.add(exits_delta)
    return dataframe

In [10]:
def generate_stat(dataframe):
    """Sum ``TOTAL_TRAFFIC`` per ``STATION`` and rank stations by that sum.

    Returns a one-column DataFrame (``TOTAL_TRAFFIC``) indexed by ``STATION``,
    ordered from the largest total to the smallest.
    """
    station_totals = dataframe.groupby('STATION')['TOTAL_TRAFFIC'].sum().to_frame()
    return station_totals.sort_values(by=['TOTAL_TRAFFIC'], ascending=False)

In [11]:
def pickle_df(dataframe, filename):
    """Pickle ``dataframe`` to ``<filename>.pickle``.

    The ``.pickle`` suffix is appended here, so pass the name without it.
    """
    target_path = filename + ".pickle"
    with open(target_path, "wb") as sink:
        pickle.dump(dataframe, sink)

In [12]:
def read_pickle(file_name):
    """Load and return the object pickled in ``file_name``.

    ``file_name`` is used exactly as given; no suffix is appended.
    """
    # NOTE: unpickling can execute arbitrary code, so only open trusted files.
    with open(file_name, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

In [13]:
def init_dataframe(years_list):
    """Build the per-interval frame for one list of data files.

    Runs ``load_data``, ``preprocessing``, ``calculate_entries_column`` and
    ``calculate_exits_column`` in that order and returns the result.
    ``calculate_total_traffic`` is not called here, so the returned frame has
    no ``TOTAL_TRAFFIC`` column.
    """
    raw = load_data(years_list)
    cleaned = preprocessing(raw)
    with_entries = calculate_entries_column(cleaned)
    return calculate_exits_column(with_entries)

In [14]:
def graph_stats(stats, top_n=5):
    """Draw a horizontal bar chart of the first ``top_n`` rows of ``stats``.

    Parameters
    ----------
    stats : pandas.DataFrame
        Indexed by ``STATION`` with a ``TOTAL_TRAFFIC`` column, e.g. the
        output of ``generate_stat``. Rows are plotted in the order given.
    top_n : int, default 5
        Number of leading rows to plot.

    Returns
    -------
    The Axes object returned by ``sns.barplot``, so the caller can add a
    title or adjust labels. End the calling line with ``;`` in a notebook to
    hide its repr.
    """
    # reset_index turns the STATION index into a regular column for plotting.
    leading = stats.head(top_n).reset_index()
    return sns.barplot(x=leading['TOTAL_TRAFFIC'], y=leading['STATION'])

In [23]:
# Load the cached frame stored in MTA_19.pickle instead of rebuilding it.
year19_pickle = read_pickle('MTA_19.pickle')

In [24]:
# Display the loaded frame as the cell output.
# NOTE(review): this renders the full frame (subject to display.max_rows);
# consider year19_pickle.head() to keep the output small.
year19_pickle

Unnamed: 0,DATE_TIME,UNIT,SCP,STATION,ENTRIES,EXITS,UNIT/SCP,ENTRIES_DIFF,EXITS_DIFF,TOTAL_TRAFFIC
0,2019-05-01 00:00:00,R248,00-00-00,1 AV,14654538,16381430,R24800-00-00,0,0,0
1,2019-05-01 04:00:00,R248,00-00-00,1 AV,14654538,16381436,R24800-00-00,0,6,6
2,2019-05-01 08:00:00,R248,00-00-00,1 AV,14654889,16382236,R24800-00-00,351,800,1151
3,2019-05-01 12:00:00,R248,00-00-00,1 AV,14656102,16383435,R24800-00-00,1213,1199,2412
4,2019-05-01 16:00:00,R248,00-00-00,1 AV,14656924,16384214,R24800-00-00,822,779,1601
5,2019-05-01 20:00:00,R248,00-00-00,1 AV,14657978,16385015,R24800-00-00,1054,801,1855
6,2019-05-02 00:00:00,R248,00-00-00,1 AV,14658165,16385220,R24800-00-00,187,205,392
7,2019-05-02 04:00:00,R248,00-00-00,1 AV,14658165,16385220,R24800-00-00,0,0,0
8,2019-05-02 08:00:00,R248,00-00-00,1 AV,14658505,16386045,R24800-00-00,340,825,1165
9,2019-05-02 09:45:01,R248,00-00-00,1 AV,14659386,16386724,R24800-00-00,881,679,1560


In [25]:
# Per-station median of every numeric column. The numeric columns are picked
# explicitly: recent pandas raises a TypeError when a groupby aggregation
# meets string columns instead of silently dropping them.
numeric_columns = year19_pickle.select_dtypes(include='number').columns.tolist()
median = year19_pickle.groupby('STATION')[numeric_columns].median()
# Second name for the same table; the outlier-replacement cell reads it.
Mediandf = pd.DataFrame(median)

In [26]:
# Display the per-station medians computed in the previous cell.
median

Unnamed: 0_level_0,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,TOTAL_TRAFFIC
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1 AV,6.362651e+06,15745363.5,93.5,108.0,347.0
103 ST,1.144130e+07,5605833.0,292.0,154.0,527.0
103 ST-CORONA,5.630244e+06,7767753.0,255.0,168.5,582.5
104 ST,5.604605e+06,4925973.0,9.0,4.0,22.0
110 ST,5.395527e+06,7464547.0,160.0,180.0,436.0
111 ST,3.247829e+06,2394708.0,55.0,30.0,105.0
116 ST,2.610494e+06,726379.0,180.0,98.0,329.0
116 ST-COLUMBIA,8.801784e+06,5920915.5,234.5,108.0,426.5
121 ST,2.028452e+06,1146686.0,38.0,27.0,97.0
125 ST,6.380224e+06,5965325.0,189.0,162.0,377.5


In [27]:
# Per-station lower quartile (q = 0.25) of every numeric column. The numeric
# columns are picked explicitly: recent pandas raises a TypeError when
# groupby().quantile() meets string columns instead of dropping them.
numeric_columns = year19_pickle.select_dtypes(include='number').columns.tolist()
lower = year19_pickle.groupby('STATION')[numeric_columns].quantile(q=0.25)
# Second name for the same table; the outlier-replacement cell reads it.
lowerdf = pd.DataFrame(lower)


In [28]:
# Display the per-station lower quartiles computed in the previous cell.
lower

0.25,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,TOTAL_TRAFFIC
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1 AV,5.565480e+05,1045804.50,7.00,14.00,61.75
103 ST,4.323436e+06,3559117.00,86.00,62.00,188.00
103 ST-CORONA,3.987503e+06,1644733.50,89.00,66.00,279.50
104 ST,3.057604e+06,1034438.00,0.00,0.00,1.00
110 ST,4.661174e+06,1666844.00,67.00,75.50,156.50
111 ST,7.967990e+05,287326.00,8.00,4.00,16.50
116 ST,1.829575e+05,116362.75,60.00,43.00,111.00
116 ST-COLUMBIA,5.321377e+06,2630559.00,88.00,38.25,169.50
121 ST,8.271988e+05,299128.50,3.00,4.00,9.00
125 ST,3.459971e+06,2540464.75,28.00,47.00,94.75


In [29]:
# Per-station upper quartile (q = 0.75) of every numeric column. The numeric
# columns are picked explicitly: recent pandas raises a TypeError when
# groupby().quantile() meets string columns instead of dropping them.
numeric_columns = year19_pickle.select_dtypes(include='number').columns.tolist()
upper = year19_pickle.groupby('STATION')[numeric_columns].quantile(q=0.75)
# Second name for the same table; the outlier-replacement cell reads it.
upperdf = pd.DataFrame(upper)

In [30]:
# Display the per-station upper quartiles computed in the previous cell.
upper

0.75,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,TOTAL_TRAFFIC
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1 AV,6.022462e+07,3.776328e+07,298.25,398.00,817.00
103 ST,1.468688e+07,1.071977e+07,490.00,306.00,796.00
103 ST-CORONA,1.252663e+07,1.049995e+07,518.00,311.75,794.00
104 ST,1.121290e+09,6.220627e+08,88.00,22.00,134.00
110 ST,5.451973e+06,8.502448e+06,415.00,311.50,719.50
111 ST,1.563217e+07,7.560456e+06,310.50,144.00,502.50
116 ST,6.340620e+06,3.691612e+06,353.00,190.00,534.00
116 ST-COLUMBIA,2.479919e+07,8.053408e+06,558.75,250.00,785.75
121 ST,3.377262e+06,3.746433e+06,94.00,60.00,155.25
125 ST,1.383854e+07,2.080145e+07,408.00,325.00,717.00


In [None]:
# Replace every TOTAL_TRAFFIC value that lies outside its station's
# [lower quartile, upper quartile] range with that station's median.
# Done column-wise: the earlier row loop wrote through chained indexing
# (df[col][i] = value), which raises SettingWithCopyWarning and does not
# update the frame when pandas copy-on-write is active.
# NOTE(review): values outside the inter-quartile range are by definition
# about half of each station's rows; confirm this cut-off is intended (a
# 1.5 * IQR fence is the more usual outlier rule).
station_of_row = year19_pickle['STATION']
traffic = year19_pickle['TOTAL_TRAFFIC']

# Look up each row's station in the per-station summary tables.
station_lower = station_of_row.map(lowerdf['TOTAL_TRAFFIC'])
station_upper = station_of_row.map(upperdf['TOTAL_TRAFFIC'])
station_median = station_of_row.map(Mediandf['TOTAL_TRAFFIC'])

outside_range = (traffic < station_lower) | (traffic > station_upper)
# Assigning the whole column lets it become float when a median is fractional.
year19_pickle['TOTAL_TRAFFIC'] = traffic.mask(outside_range, station_median)
        


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Display the frame again after the TOTAL_TRAFFIC replacement in the previous cell.
year19_pickle

In [None]:
# Rank stations by summed TOTAL_TRAFFIC using the generate_stat helper.
stats = generate_stat(year19_pickle)

In [104]:
# Plot the leading stations. Needs the cell that defines `stats` to have run
# first; the recorded output below is a NameError from running it before that.
graph_stats(stats)

NameError: name 'stats' is not defined

In [105]:
# Disabled: rebuilds the 2018 frame from the files in year18_path_list.
# year18 = init_dataframe(year18_path_list)

In [106]:
# Disabled: writes the rebuilt 2018 frame to MTA_18.pickle (pickle_df appends the suffix).
# pickle_df(year18,'MTA_18' )

NameError: name 'year18' is not defined

In [107]:
# Load the cached frame stored in MTA_18.pickle instead of rebuilding it.
year18_pickle = read_pickle('MTA_18.pickle')

In [None]:
# Disabled: rebuilds the 2017 frame from the files in year17_path_list.
# year17 = init_dataframe(year17_path_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
# Disabled: writes the rebuilt 2017 frame to MTA_17.pickle (pickle_df appends the suffix).
# pickle_df(year17,'MTA_17')

In [None]:
# Load the cached frame stored in MTA_17.pickle instead of rebuilding it.
year17_pickle = read_pickle('MTA_17.pickle')

In [None]:
# Display the loaded frame as the cell output.
year17_pickle