In [11]:
import pandas as pd
import json
import requests
import datetime

In [12]:
def refactoring(data):
    new_data = {}
    for key, value in data.items():
        if isinstance(value, str) and '<' in value:
            value = int(value.split('<')[1])
        if value != '':
            new_data[key] = value
    return new_data


In [13]:
def fetch_data(src1):
    url = "https://arsoxmlwrapper.app.grega.xyz/api/air/archive"
    response = requests.get(url)
    if response.status_code == 200:
        print("Fetched main datset")
        data = json.loads(response.content)
        with open(src1, "w") as f:
            json.dump(data, f)
    else:
        print("Failed to retrieve JSON data")

In [14]:
def fetch_data_two(src2):
    # Get the current date and time
    current_date = datetime.datetime.now()

    # Get the date one month ago
    one_month_ago = current_date - datetime.timedelta(days=60)

    # Convert dates to Unix time
    current_unix_time = int(current_date.timestamp())
    one_month_ago_unix_time = int(one_month_ago.timestamp())

    latitude = '46.05'
    longitude = '14.51'

    start_date = datetime.datetime.utcfromtimestamp(
        one_month_ago_unix_time).strftime('%Y-%m-%d')
    end_date = datetime.datetime.utcfromtimestamp(
        current_unix_time).strftime('%Y-%m-%d')    

    url = f'https://archive-api.open-meteo.com/v1/archive?latitude={latitude}&longitude={longitude}&start_date={start_date}&end_date={end_date}&hourly=temperature_2m,relativehumidity_2m,precipitation,windspeed_10m'
    response = requests.get(url)
    if response.status_code == 200:
        print("Fetched weather history")
        data = json.loads(response.content)
        with open(src2, "w") as f:
            json.dump(data, f)
    else:
        print("Failed to retrieve JSON data")

In [15]:
fetch_data('../data/raw/data.json')
fetch_data_two('../data/raw/weather/data.json')

Fetched main datset
Fetched weather history


In [16]:
src1 = '../data/raw/data.json'
src2 = '../data/raw/weather/data.json'
dist = '../data/processed/data.csv'

In [17]:
f = open(src1, 'r', encoding='utf-8')
raw = json.load(f)
f.close()

df = pd.DataFrame()

print('Transforming json to pandas dataframe...')
# prilagodimo json dataframe-u
for i in range(len(raw)):
    jdata = json.loads(raw[i]['json'])
    station = jdata['arsopodatki']['postaja']
    for i in range(len(station)):
        if station[i]['merilno_mesto'] == 'LJ Bežigrad':
            data = station[i]
            data = refactoring(data)
            df = pd.concat([df, pd.json_normalize(data)])

print('Connecting data...')

df = df[['datum_od', 'pm10']]
df['pm10'].fillna((df['pm10'].mean()), inplace=True)
df['datum_od'] = pd.to_datetime(df['datum_od'])
df = df.sort_values(by='datum_od')
df = df.drop_duplicates(subset='datum_od', keep='first')

Transforming json to pandas dataframe...
Connecting data...


In [18]:
f = open(src2, 'r', encoding='utf-8')
raw = json.load(f)
f.close()

df1 = pd.DataFrame()
df1['date'] = raw['hourly']['time']
df1['date'] = pd.to_datetime(df1['date'])

df1['temp'] = raw['hourly']['temperature_2m']
df1['temp'].fillna(df1['temp'].mean(), inplace=True)

df1['hum'] = raw['hourly']['relativehumidity_2m']
df1['hum'].fillna(df1['hum'].mean(), inplace=True)

df1['percp'] = raw['hourly']['precipitation']
df1['percp'].fillna(df1['percp'].mean(), inplace=True)

df1['wspeed'] = raw['hourly']['windspeed_10m']
df1['wspeed'].fillna(df1['wspeed'].mean(), inplace=True)

In [19]:
df1

Unnamed: 0,date,temp,hum,percp,wspeed
0,2023-01-20 00:00:00,-2.900000,99.000000,0.000000,11.300000
1,2023-01-20 01:00:00,-3.100000,99.000000,0.000000,11.400000
2,2023-01-20 02:00:00,-3.200000,98.000000,0.000000,10.400000
3,2023-01-20 03:00:00,-3.700000,98.000000,0.000000,8.700000
4,2023-01-20 04:00:00,-3.800000,97.000000,0.000000,8.400000
...,...,...,...,...,...
1459,2023-03-21 19:00:00,2.141435,81.233796,0.071142,8.926466
1460,2023-03-21 20:00:00,2.141435,81.233796,0.071142,8.926466
1461,2023-03-21 21:00:00,2.141435,81.233796,0.071142,8.926466
1462,2023-03-21 22:00:00,2.141435,81.233796,0.071142,8.926466


In [20]:
start = df['datum_od'].iloc[0]
end = df['datum_od'].iloc[-1]

start_index = df1.loc[df1['date'] == start].index[0]
end_index = df1.loc[df1['date'] == end].index[0]

print(start_index, end_index)

df1 = df1.iloc[start_index:end_index]

df1

643 1458


Unnamed: 0,date,temp,hum,percp,wspeed
643,2023-02-15 19:00:00,3.200000,78.000000,0.000000,3.000000
644,2023-02-15 20:00:00,2.300000,81.000000,0.000000,2.500000
645,2023-02-15 21:00:00,1.600000,83.000000,0.000000,2.100000
646,2023-02-15 22:00:00,0.500000,86.000000,0.000000,2.400000
647,2023-02-15 23:00:00,-0.500000,88.000000,0.000000,2.400000
...,...,...,...,...,...
1453,2023-03-21 13:00:00,2.141435,81.233796,0.071142,8.926466
1454,2023-03-21 14:00:00,2.141435,81.233796,0.071142,8.926466
1455,2023-03-21 15:00:00,2.141435,81.233796,0.071142,8.926466
1456,2023-03-21 16:00:00,2.141435,81.233796,0.071142,8.926466


In [21]:
df1 = df1.reset_index(drop=True)
df = df.reset_index(drop=True)
df1

Unnamed: 0,date,temp,hum,percp,wspeed
0,2023-02-15 19:00:00,3.200000,78.000000,0.000000,3.000000
1,2023-02-15 20:00:00,2.300000,81.000000,0.000000,2.500000
2,2023-02-15 21:00:00,1.600000,83.000000,0.000000,2.100000
3,2023-02-15 22:00:00,0.500000,86.000000,0.000000,2.400000
4,2023-02-15 23:00:00,-0.500000,88.000000,0.000000,2.400000
...,...,...,...,...,...
810,2023-03-21 13:00:00,2.141435,81.233796,0.071142,8.926466
811,2023-03-21 14:00:00,2.141435,81.233796,0.071142,8.926466
812,2023-03-21 15:00:00,2.141435,81.233796,0.071142,8.926466
813,2023-03-21 16:00:00,2.141435,81.233796,0.071142,8.926466


In [22]:
df1['pm10'] = df.loc[:, 'pm10']
df1

Unnamed: 0,date,temp,hum,percp,wspeed,pm10
0,2023-02-15 19:00:00,3.200000,78.000000,0.000000,3.000000,67.0
1,2023-02-15 20:00:00,2.300000,81.000000,0.000000,2.500000,60.0
2,2023-02-15 21:00:00,1.600000,83.000000,0.000000,2.100000,63.0
3,2023-02-15 22:00:00,0.500000,86.000000,0.000000,2.400000,68.0
4,2023-02-15 23:00:00,-0.500000,88.000000,0.000000,2.400000,71.0
...,...,...,...,...,...,...
810,2023-03-21 13:00:00,2.141435,81.233796,0.071142,8.926466,12.0
811,2023-03-21 14:00:00,2.141435,81.233796,0.071142,8.926466,9.0
812,2023-03-21 15:00:00,2.141435,81.233796,0.071142,8.926466,11.0
813,2023-03-21 16:00:00,2.141435,81.233796,0.071142,8.926466,16.0


In [23]:
print('Saving processed data...')
df1.to_csv(dist, index=False)

print('Finished!')

Saving processed data...
Finished!
