In [1]:
import pandas as pd
import json
import requests
import datetime

In [2]:
def refactoring(data):
    new_data = {}
    for key, value in data.items():
        if isinstance(value, str) and '<' in value:
            value = int(value.split('<')[1])
        if value != '':
            new_data[key] = value
    return new_data


In [3]:
def fetch_data(src1):
    url = "https://arsoxmlwrapper.app.grega.xyz/api/air/archive"
    response = requests.get(url)
    if response.status_code == 200:
        print("Fetched main datset")
        data = json.loads(response.content)
        with open(src1, "w") as f:
            json.dump(data, f)
    else:
        print("Failed to retrieve JSON data")

In [4]:
def fetch_data_two(src2):
    # Get the current date and time
    current_date = datetime.datetime.now()

    # Get the date one month ago
    one_month_ago = current_date - datetime.timedelta(days=30)

    # Convert dates to Unix time
    current_unix_time = int(current_date.timestamp())
    one_month_ago_unix_time = int(one_month_ago.timestamp())

    latitude = '46.05'
    longitude = '14.51'

    start_date = datetime.datetime.utcfromtimestamp(
        one_month_ago_unix_time).strftime('%Y-%m-%d')
    end_date = datetime.datetime.utcfromtimestamp(
        current_unix_time).strftime('%Y-%m-%d')    

    url = f'https://archive-api.open-meteo.com/v1/archive?latitude={latitude}&longitude={longitude}&start_date={start_date}&end_date={end_date}&hourly=temperature_2m,relativehumidity_2m,precipitation,windspeed_10m'
    response = requests.get(url)
    if response.status_code == 200:
        print("Fetched weather history")
        data = json.loads(response.content)
        with open(src2, "w") as f:
            json.dump(data, f)
    else:
        print("Failed to retrieve JSON data")

In [5]:
fetch_data('../data/raw/data.json')
fetch_data_two('../data/raw/weather/data.json')

Fetched main datset
Fetched weather history


In [6]:
src1 = '../data/raw/data.json'
src2 = '../data/raw/weather/data.json'
dist = '../data/processed/data.csv'

In [7]:
f = open(src1, 'r', encoding='utf-8')
raw = json.load(f)
f.close()

df = pd.DataFrame()

print('Transforming json to pandas dataframe...')
# prilagodimo json dataframe-u
for i in range(len(raw)):
    jdata = json.loads(raw[i]['json'])
    station = jdata['arsopodatki']['postaja']
    for i in range(len(station)):
        if station[i]['merilno_mesto'] == 'LJ Bežigrad':
            data = station[i]
            data = refactoring(data)
            df = pd.concat([df, pd.json_normalize(data)])

print('Connecting data...')

df = df[['datum_od', 'pm10']]
df['pm10'].fillna((df['pm10'].mean()), inplace=True)
df['datum_od'] = pd.to_datetime(df['datum_od'])
df = df.sort_values(by='datum_od')
df = df.drop_duplicates(subset='datum_od', keep='first')

Transforming json to pandas dataframe...
Connecting data...


In [8]:
f = open(src2, 'r', encoding='utf-8')
raw = json.load(f)
f.close()

df1 = pd.DataFrame()
df1['date'] = raw['hourly']['time']
df1['date'] = pd.to_datetime(df1['date'])

df1['temp'] = raw['hourly']['temperature_2m']
df1['temp'].fillna(df1['temp'].mean(), inplace=True)

df1['hum'] = raw['hourly']['relativehumidity_2m']
df1['hum'].fillna(df1['hum'].mean(), inplace=True)

df1['percp'] = raw['hourly']['precipitation']
df1['percp'].fillna(df1['percp'].mean(), inplace=True)

df1['wspeed'] = raw['hourly']['windspeed_10m']
df1['wspeed'].fillna(df1['wspeed'].mean(), inplace=True)

In [9]:
df1

Unnamed: 0,date,temp,hum,percp,wspeed
0,2023-02-13 00:00:00,2.800000,79.000000,0.000000,6.800000
1,2023-02-13 01:00:00,1.400000,82.000000,0.000000,6.000000
2,2023-02-13 02:00:00,1.200000,83.000000,0.000000,5.400000
3,2023-02-13 03:00:00,0.900000,85.000000,0.000000,3.700000
4,2023-02-13 04:00:00,0.500000,86.000000,0.000000,3.100000
...,...,...,...,...,...
739,2023-03-15 19:00:00,3.947222,84.284722,0.077083,9.319097
740,2023-03-15 20:00:00,3.947222,84.284722,0.077083,9.319097
741,2023-03-15 21:00:00,3.947222,84.284722,0.077083,9.319097
742,2023-03-15 22:00:00,3.947222,84.284722,0.077083,9.319097


In [10]:
start = df['datum_od'].iloc[0]
end = df['datum_od'].iloc[-1]

start_index = df1.loc[df1['date'] == start].index[0]
end_index = df1.loc[df1['date'] == end].index[0]

print(start_index, end_index)

df1 = df1.iloc[start_index:end_index]

df1

67 740


Unnamed: 0,date,temp,hum,percp,wspeed
67,2023-02-15 19:00:00,3.200000,78.000000,0.000000,3.000000
68,2023-02-15 20:00:00,2.300000,81.000000,0.000000,2.500000
69,2023-02-15 21:00:00,1.600000,83.000000,0.000000,2.100000
70,2023-02-15 22:00:00,0.500000,86.000000,0.000000,2.400000
71,2023-02-15 23:00:00,-0.500000,88.000000,0.000000,2.400000
...,...,...,...,...,...
735,2023-03-15 15:00:00,3.947222,84.284722,0.077083,9.319097
736,2023-03-15 16:00:00,3.947222,84.284722,0.077083,9.319097
737,2023-03-15 17:00:00,3.947222,84.284722,0.077083,9.319097
738,2023-03-15 18:00:00,3.947222,84.284722,0.077083,9.319097


In [11]:
df1 = df1.reset_index(drop=True)
df = df.reset_index(drop=True)
df1

Unnamed: 0,date,temp,hum,percp,wspeed
0,2023-02-15 19:00:00,3.200000,78.000000,0.000000,3.000000
1,2023-02-15 20:00:00,2.300000,81.000000,0.000000,2.500000
2,2023-02-15 21:00:00,1.600000,83.000000,0.000000,2.100000
3,2023-02-15 22:00:00,0.500000,86.000000,0.000000,2.400000
4,2023-02-15 23:00:00,-0.500000,88.000000,0.000000,2.400000
...,...,...,...,...,...
668,2023-03-15 15:00:00,3.947222,84.284722,0.077083,9.319097
669,2023-03-15 16:00:00,3.947222,84.284722,0.077083,9.319097
670,2023-03-15 17:00:00,3.947222,84.284722,0.077083,9.319097
671,2023-03-15 18:00:00,3.947222,84.284722,0.077083,9.319097


In [12]:
df1['pm10'] = df.loc[:, 'pm10']
df1

Unnamed: 0,date,temp,hum,percp,wspeed,pm10
0,2023-02-15 19:00:00,3.200000,78.000000,0.000000,3.000000,67.0
1,2023-02-15 20:00:00,2.300000,81.000000,0.000000,2.500000,60.0
2,2023-02-15 21:00:00,1.600000,83.000000,0.000000,2.100000,63.0
3,2023-02-15 22:00:00,0.500000,86.000000,0.000000,2.400000,68.0
4,2023-02-15 23:00:00,-0.500000,88.000000,0.000000,2.400000,71.0
...,...,...,...,...,...,...
668,2023-03-15 15:00:00,3.947222,84.284722,0.077083,9.319097,3.0
669,2023-03-15 16:00:00,3.947222,84.284722,0.077083,9.319097,2.0
670,2023-03-15 17:00:00,3.947222,84.284722,0.077083,9.319097,3.0
671,2023-03-15 18:00:00,3.947222,84.284722,0.077083,9.319097,4.0


In [13]:
print('Saving processed data...')
df1.to_csv(dist, index=False)

print('Finished!')

Saving processed data...
Finished!
