In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Pivot subway data from long to wide format

In [133]:
#access subway data
#number of days in month * 143 unique stations = one month of entries
url_sc = 'https://data.cityofchicago.org/resource/5neh-572f.json?$where=date%20%3E%272021-05-15%27&$limit=4433'
subway_df_closed = pd.read_json(url_sc)

url_so = 'https://data.cityofchicago.org/resource/5neh-572f.json?$where=date%20%3E%272021-04-15%27&$limit=4290'
subway_df_open = pd.read_json(url_so)

In [134]:
subway_df_closed

# reshape from long to wide in pandas python

subway_df_closed_wide = subway_df_closed.pivot(index = 'date', columns='stationname', values='rides').reset_index()
subway_df_open_wide = subway_df_open.pivot(index = 'date', columns='stationname', values='rides').reset_index()

## Iteratively add SRI segements to dataframe

In [135]:
import os
import glob
  
# use glob to get all the csv files 
# in the folder
path = os.getcwd() + '/SRI/'
#print(path) #this should be the same as the path to where the csvs are
#print('/Users/amandakhoo/code/data4all-team6/SRI')
csv_files = glob.glob(os.path.join(path, "*.csv"))
  

SRI = pd.DataFrame()
    
# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f)
    SRI['date'] = df['time']  
    
    # print the location and filename
    #print('Location:', f)
    #print('File Name:', f.split("\\")[-1])
    
    file_name = f.split("\\")[-1]
    segment = file_name.split('/')[-1].split('.')[0].split('_')[0] #get the segement number from the filename
    column_name = 'segment_' + segment + '_SRI'
    SRI[column_name] = df['SRI']
    

In [136]:
SRI

Unnamed: 0,date,segment_37_SRI,segment_27_SRI,segment_55_SRI,segment_45_SRI,segment_2_SRI,segment_73_SRI,segment_63_SRI,segment_11_SRI,segment_87_SRI,...,segment_82_SRI,segment_92_SRI,segment_66_SRI,segment_7_SRI,segment_76_SRI,segment_14_SRI,segment_22_SRI,segment_32_SRI,segment_40_SRI,segment_50_SRI
0,2018-03-05,,,,,,,,,,...,,,,,,,,,,
1,2018-03-06,,,,,,,,,,...,,,,,,,,,,
2,2018-03-07,,,,,,,,,,...,,,,,,,,,,
3,2018-03-08,0.359163,3.302960,2.582116,3.160523,0.341544,4.379226,0.378816,0.519244,1.179201,...,0.222898,0.203624,2.929182,0.421873,2.604357,0.455804,2.476669,1.729207,0.093905,0.987008
4,2018-03-09,-0.105586,0.463564,2.115062,1.902928,1.811669,-0.223624,0.254831,3.155534,0.345993,...,0.570915,0.052478,1.266277,0.463568,-0.071756,1.984037,2.500895,0.223244,0.523121,0.813927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,2021-02-28,0.024605,4.534855,2.606105,2.732924,0.811055,0.200367,0.152889,2.716049,1.390528,...,2.539752,-0.169314,2.967447,-0.788657,-0.601449,2.058824,0.526998,1.123142,2.638513,1.046801
1092,2021-03-01,1.415876,1.951532,2.146062,2.230041,-0.868141,4.689024,2.023675,0.321212,1.453211,...,3.114520,0.880594,4.447861,-1.315875,4.453685,1.041337,6.891892,1.319443,1.635354,2.771144
1093,2021-03-02,,,,,,,,,,...,,,,,,,,,,
1094,2021-03-03,,,,,,,,,,...,,,,,,,,,,


## Combine SRI and Subway data
where the dates match

In [137]:
url = 'https://data.cityofchicago.org/resource/5neh-572f.json?$where=date%20%3E%272018-04-08%27&$limit=4433'
subway_df = pd.read_json(url)
subway_df_wide = subway_df.pivot(index = 'date', columns='stationname', values='rides').reset_index()

#change all date types to object
subway_df_wide['date'] = subway_df_wide['date'].dt.date.astype(str).values

In [139]:
dates = subway_df_wide['date'].values
dates

array(['2018-04-09', '2018-04-10', '2018-04-11', '2018-04-12',
       '2018-04-13', '2018-04-14', '2018-04-15', '2018-04-16',
       '2018-04-17', '2018-04-18', '2018-04-19', '2018-04-20',
       '2018-04-21', '2018-04-22', '2018-04-23', '2018-04-24',
       '2018-04-25', '2018-04-26', '2018-04-27', '2018-04-28',
       '2018-04-29', '2018-04-30', '2018-05-01', '2018-05-02',
       '2018-05-03', '2018-05-04', '2018-05-05', '2018-05-06',
       '2018-05-07', '2018-05-08', '2018-05-09'], dtype=object)

In [140]:
SRI_subway_subset = SRI[SRI['date'].isin(dates)]

In [141]:
print(SRI_subway_subset.dtypes)
print(subway_df_wide.dtypes)

date               object
segment_37_SRI    float64
segment_27_SRI    float64
segment_55_SRI    float64
segment_45_SRI    float64
                   ...   
segment_14_SRI    float64
segment_22_SRI    float64
segment_32_SRI    float64
segment_40_SRI    float64
segment_50_SRI    float64
Length: 91, dtype: object
stationname
date                    object
18th                   float64
35-Bronzeville-IIT     float64
35th/Archer            float64
43rd                   float64
                        ...   
Western-Cermak         float64
Western-Forest Park    float64
Western-Orange         float64
Western/Milwaukee      float64
Wilson                 float64
Length: 145, dtype: object


In [143]:
#merge
merged_subway_SRI_df = SRI_subway_subset.merge(subway_df_wide, on = 'date')

In [144]:
merged_subway_SRI_df.to_csv('data/SAMPLE_subway_SRI_data.csv')