In [9]:
# This notebook reads all OpenWeather air-pollution JSON files and transforms them into a single CSV file

In [10]:
# Function to convert a UTC Unix timestamp into [year, month, day, hour]
def utc2time(ts):
    """Split a Unix timestamp into its UTC calendar components.

    Parameters
    ----------
    ts : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    list of int
        ``[year, month, day, hour]`` of ``ts`` expressed in UTC.
    """
    import datetime

    # Timezone-aware replacement for datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12. Reading the fields directly also avoids
    # the strftime -> split -> int round trip.
    moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
    return [moment.year, moment.month, moment.day, moment.hour]

In [11]:
def params(data,hour,index,lon,lat):
    """Flatten one entry of an air-pollution JSON payload into a flat row.

    ``data['list'][hour]`` supplies the timestamp (``dt``), the air-quality
    index (``main.aqi``) and the pollutant readings (``components``);
    ``index``, ``lon`` and ``lat`` are copied into the row unchanged.

    Returns a list ordered as
    [year, month, day, hour, index, lon, lat,
     aqi, co, no, no2, o3, so2, nh3, pm2_5, pm10].
    """
    record = data['list'][hour]
    timestamp = record['dt']
    aqi = record['main']['aqi']

    # Pollutant readings all live under the same sub-dictionary.
    readings = record['components']
    co, no, no2 = readings['co'], readings['no'], readings['no2']
    o3, so2 = readings['o3'], readings['so2']
    pm2_5, pm10 = readings['pm2_5'], readings['pm10']
    nh3 = readings['nh3']

    # utc2time yields [year, month, day, hour] for the record's timestamp.
    when = utc2time(timestamp)

    return [
        when[0], when[1], when[2], when[3],
        index, lon, lat,
        aqi, co, no, no2, o3, so2, nh3, pm2_5, pm10,
    ]

In [12]:
# function returns dataframe with temperature and dewpoint for longitude and latitude
def get_meteo(longitude,latitude):
    """Fetch hourly 2 m temperature and dew point for 2021 from Open-Meteo.

    Queries the Open-Meteo ERA5 archive endpoint for the period 2021-01-01 to
    2021-12-31 at the given coordinates.

    Parameters
    ----------
    longitude, latitude : number
        Coordinates of the location, inserted into the request URL as-is.

    Returns
    -------
    pandas.DataFrame
        Columns ``temperature`` and ``dewpoint``, one row per hourly value,
        with rows 624-648 removed and the index renumbered from 0.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    import pandas as pd
    import requests

    url = (
        'https://archive-api.open-meteo.com/v1/era5'
        f'?latitude={latitude}&longitude={longitude}'
        '&start_date=2021-01-01&end_date=2021-12-31'
        '&hourly=temperature_2m,dewpoint_2m'
    )
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of tripping over a missing
    # 'hourly' key further down.
    response.raise_for_status()
    hourly = response.json()['hourly']

    df = pd.DataFrame(hourly['temperature_2m'], columns=['temperature'])
    df['dewpoint'] = hourly['dewpoint_2m']

    # DataFrame.drop returns a new frame, so the result has to be re-bound for
    # the rows to actually disappear. For an hourly series starting at
    # 2021-01-01 00:00, rows 624-648 are 27 Jan 00:00 through 28 Jan 00:00.
    # NOTE(review): presumably this is the span missing from the air-pollution
    # files -- confirm against the data.
    # The index is renumbered so that index-aligned column assignment in the
    # caller pairs the remaining rows positionally.
    df = df.drop(index=list(range(624, 649))).reset_index(drop=True)
    return df

In [13]:
# this function reads indexed file, transforms and returns a dataframe

def read_file(index):
    """Load ``air_pollution{index}.json`` and return its content as a DataFrame.

    Every entry of the file's ``list`` becomes one row (built by ``params``),
    tagged with ``index`` as the zone and with the file's coordinates. The
    ``temperature`` and ``dewpoint`` columns that ``get_meteo`` returns for
    those coordinates are then attached to the frame.
    """
    import pandas as pd
    import json

    column_names = [
        'year', 'month', 'day', 'hour',
        'zone', 'lon', 'lat',
        'aqi', 'co', 'no', 'no2', 'o3', 'so2', 'nh3', 'pm2_5', 'pm10',
    ]

    # Parse the JSON file for this zone.
    with open(f'air_pollution{index}.json') as handle:
        payload = json.load(handle)

    # Coordinates are stored once per file.
    lon = payload['coord']['lon']
    lat = payload['coord']['lat']

    # One flat row per entry in the payload's list.
    rows = [
        params(payload, position, index, lon, lat)
        for position in range(len(payload['list']))
    ]
    frame = pd.DataFrame(rows, columns=column_names)

    # Attach the meteorological columns (aligned on the row index).
    meteo = get_meteo(lon, lat)
    for column in ('temperature', 'dewpoint'):
        frame[column] = meteo[column]
    return frame

In [14]:
# One frame per zone file (indices 1 through 90). From each frame, row label 624
# (the 27 Jan record) and row label 8734 (the last record) are dropped.
arr = [
    read_file(zone).drop(labels=[624, 8734], axis=0)
    for zone in range(1, 91)
]

In [15]:
# Combine the per-zone dataframes into a single dataframe and write it to data.csv.
import pandas as pd

(
    pd.concat(arr, axis=0)
    # 'mergesort' is the documented name of the stable sort: rows sharing a
    # row label keep the order in which their frames were concatenated.
    .sort_index(kind='mergesort')
    # Default write mode ('w'): re-running this cell replaces data.csv instead
    # of appending another full copy, header row included, to the existing file.
    .to_csv('data.csv', index=False)
)