In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Pivot subway data from long to wide format

In [3]:
#access subway data
#number of days in month * 143 unique stations = one month of entries
url_sc = 'https://data.cityofchicago.org/resource/5neh-572f.json?$where=date%20%3E%272021-05-15%27&$limit=4433'
subway_df_closed = pd.read_json(url_sc)

url_so = 'https://data.cityofchicago.org/resource/5neh-572f.json?$where=date%20%3E%272021-04-15%27&$limit=4290'
subway_df_open = pd.read_json(url_so)

In [4]:
subway_df_closed

# reshape from long to wide in pandas python

subway_df_closed_wide = subway_df_closed.pivot(index = 'date', columns='stationname', values='rides').reset_index()
subway_df_open_wide = subway_df_open.pivot(index = 'date', columns='stationname', values='rides').reset_index()

## Iteratively add SRI segements to dataframe

In [2]:
from datetime import date, timedelta
def dates(startyear, startmon, startday, endyear, endmon, endday): #in (year, month, day) format
    start_date = date(startyear, startmon, startday) 
    end_date = date(endyear, endmon, endday)    # perhaps date.now()

    delta = end_date - start_date   # returns timedelta
    days = []
    for i in range(delta.days + 1):
        day = start_date + timedelta(days=i)
        days.append(str(day))
    return days

In [3]:
days = dates(2018,3,5,2021,3,4)

In [5]:
#test behavior with one sample csv
files = ['/Users/amandakhoo/code/data4all-team6/data/SRI/1061_SRI.csv', '/Users/amandakhoo/code/data4all-team6/data/SRI/1062_SRI.csv']
columns = []
for file in files:
    df = pd.read_csv(file).dropna()
    maxtest = []
    for day in days:
        maxtest.append(df.loc[df['time'].str.contains(day), 'SRI'].max(skipna = True))
        #print(len(maxtest))
    columns.append(maxtest)
    
testdf = pd.DataFrame()
testdf['date'] = days

final_testdf = pd.concat([testdf,pd.DataFrame(columns).T], axis = 1)
final_testdf

Unnamed: 0,date,0,1
0,2018-03-05,-0.317582,0.011064
1,2018-03-06,2.906682,0.035418
2,2018-03-07,0.038480,0.035418
3,2018-03-08,0.044862,0.035418
4,2018-03-09,0.044862,0.209375
...,...,...,...
1091,2021-02-28,0.044862,0.035418
1092,2021-03-01,0.044862,0.035418
1093,2021-03-02,0.044862,0.035418
1094,2021-03-03,0.044862,0.008630


In [None]:
#run for all csvs in SRI folder (~over 1000 segments)
import os
import glob
  
# use glob to get all the csv files 
# in the folder
path = os.getcwd() + '/data/SRI/'

#print(path) #this should be the same as the path to where the csvs are
#print('/Users/amandakhoo/code/data4all-team6/data/SRI/')

csv_files = glob.glob(os.path.join(path, "*.csv"))
  
days = dates(2018,3,5,2021,3,4)
SRI = pd.DataFrame()
SRI['date'] = days
column_names = []
column_data = []
counter = 0  

# loop over the list of csv files
for f in csv_files:
    # read the csv file

    df = pd.read_csv(f).dropna()
    daily_max_SRIs = []
    
    #iterate over the days of interest
    for day in days:
        daily_max_SRI = df.loc[df['time'].str.contains(day), 'SRI'].max(skipna = True)
        daily_max_SRIs.append(daily_max_SRI)
    
    #add date column to the SRI df
    
    # print the location and filename
    #print('Location:', f)
    #print('File Name:', f.split("\\")[-1])
    
    #add new columns by segment ID with max SRI per day
    file_name = f.split("\\")[-1]
    segment = file_name.split('/')[-1].split('.')[0].split('_')[0] #get the segement number from the filename
    
    column_names.append('segment_' + segment + '_max_SRI')
    column_data.append(daily_max_SRIs)
    
    counter += 1
    print('completed file' + str(counter))
    

completed file1
completed file2
completed file3
completed file4
completed file5
completed file6
completed file7
completed file8
completed file9
completed file10
completed file11
completed file12
completed file13
completed file14
completed file15
completed file16
completed file17
completed file18
completed file19
completed file20
completed file21
completed file22
completed file23
completed file24
completed file25
completed file26
completed file27
completed file28
completed file29
completed file30
completed file31
completed file32
completed file33
completed file34
completed file35
completed file36
completed file37
completed file38
completed file39
completed file40
completed file41
completed file42
completed file43
completed file44
completed file45
completed file46
completed file47
completed file48
completed file49
completed file50
completed file51
completed file52
completed file53
completed file54
completed file55
completed file56
completed file57
completed file58
completed file59
comple

In [None]:
#pd.concat([SRI, pd.DataFrame(column_data).T], axis = 1)
daily_SRI_df = pd.DataFrame(column_data).T

#print(column_names)

daily_SRI_df.columns = column_names
daily_SRI_df['date'] = days
daily_SRI_df

## Combine SRI and Subway data
where the dates match

In [137]:
url = 'https://data.cityofchicago.org/resource/5neh-572f.json?$where=date%20%3E%272018-04-08%27&$limit=4433'
subway_df = pd.read_json(url)
subway_df_wide = subway_df.pivot(index = 'date', columns='stationname', values='rides').reset_index()

#change all date types to object
subway_df_wide['date'] = subway_df_wide['date'].dt.date.astype(str).values

In [139]:
dates = subway_df_wide['date'].values
dates

array(['2018-04-09', '2018-04-10', '2018-04-11', '2018-04-12',
       '2018-04-13', '2018-04-14', '2018-04-15', '2018-04-16',
       '2018-04-17', '2018-04-18', '2018-04-19', '2018-04-20',
       '2018-04-21', '2018-04-22', '2018-04-23', '2018-04-24',
       '2018-04-25', '2018-04-26', '2018-04-27', '2018-04-28',
       '2018-04-29', '2018-04-30', '2018-05-01', '2018-05-02',
       '2018-05-03', '2018-05-04', '2018-05-05', '2018-05-06',
       '2018-05-07', '2018-05-08', '2018-05-09'], dtype=object)

In [140]:
SRI_subway_subset = SRI[SRI['date'].isin(dates)]

In [141]:
print(SRI_subway_subset.dtypes)
print(subway_df_wide.dtypes)

date               object
segment_37_SRI    float64
segment_27_SRI    float64
segment_55_SRI    float64
segment_45_SRI    float64
                   ...   
segment_14_SRI    float64
segment_22_SRI    float64
segment_32_SRI    float64
segment_40_SRI    float64
segment_50_SRI    float64
Length: 91, dtype: object
stationname
date                    object
18th                   float64
35-Bronzeville-IIT     float64
35th/Archer            float64
43rd                   float64
                        ...   
Western-Cermak         float64
Western-Forest Park    float64
Western-Orange         float64
Western/Milwaukee      float64
Wilson                 float64
Length: 145, dtype: object


In [143]:
#merge
merged_subway_SRI_df = SRI_subway_subset.merge(subway_df_wide, on = 'date')

In [144]:
merged_subway_SRI_df.to_csv('data/SAMPLE_subway_SRI_data.csv')