# Comparison of all the models

Finally, I will compare all the models (2 ARIMA, 2 simple, VAR, VAR sinus) for each weather parameter. For every model I will plot a histogram of its absolute error with respect to the actual data.

In [15]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dateutil import parser
import os
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.api import VAR

In [7]:
# data: load the station measurements and build a gap-free hourly table

station_id = 5001
# directory holding the compressed ELES-MAS export (machine-specific path)
data_dir = r'D:\FMF magisterij\Matematika z računalnikom'
file_path = os.path.join(data_dir, f'ELES-MAS-{station_id}-2020-2023.csv.gz')

# read the .csv file into a DataFrame
data = pd.read_csv(file_path, compression='gzip')

# keep only rows with data_validity <= 32 and drop the location id column;
# chaining returns a new frame instead of dropping in place on a filtered slice
data = data[data['data_validity'] <= 32].drop(columns='measurement_location_inst_id')

# parse the acquisition time into timezone-aware (UTC) timestamps
df = data.assign(timestamp=pd.to_datetime(data['acquisition_timestamp'], utc=True))

# rearrange the data: one row per timestamp, one column per parameter type.
# The pivot index is already a UTC DatetimeIndex, so no reset/convert/set
# round trip is needed.
new_data = df.pivot_table(index='timestamp', columns='parameter_type', values='value')

# within each column, fill NaN values with the next available non-NaN value
new_data = new_data.bfill()

# count the NaN values that remain per column (originally observed: none)
nan_count = new_data.isna().sum()

# check for missing measurements (skipped times) on the regular 5-minute grid
complete_range = pd.date_range(start=new_data.index.min(), end=new_data.index.max(), freq='5min')
missing_timestamps = complete_range.difference(new_data.index)
df_complete_range = pd.DataFrame(index=complete_range)

# we have 271508 measurements, there are 5112 missing measurements out of
# 276620 time steps in the whole time range

# merge all the measurements with all the time steps; missing measurements
# are NaN rows for now
new_data_incomplete = df_complete_range.merge(new_data, left_index=True, right_index=True, how='left')

# missing measurements are filled with the next available measurement.
# A back-fill cannot fill a gap at the very end of a column, so a forward-fill
# covers that edge case (it changes nothing when no such gap exists).
new_data_all = new_data_incomplete.bfill().ffill()

# resample the data hourly with the mean
new_data_all_resampled = new_data_all.resample('1h').mean()
new_data_all_resampled.head()

Unnamed: 0,AIR_PRESSURE,AMBIENT_TEMPERATURE,RAIN_INTENSITY,RELATIVE_HUMIDITY,SOLAR_RADIATION_INTENSITY,WIND_DIRECTION,WIND_SPEED
2020-10-13 10:00:00+00:00,952.7625,7.6875,0.0,61.675,300.8375,293.625,2.3575
2020-10-13 11:00:00+00:00,952.366667,8.958333,0.0,56.825,258.341667,312.666667,1.675
2020-10-13 12:00:00+00:00,951.783333,10.183333,0.0,48.391667,260.816667,270.833333,0.826667
2020-10-13 13:00:00+00:00,951.691667,10.716667,0.0,43.625,161.95,291.083333,0.460833
2020-10-13 14:00:00+00:00,951.508333,10.833333,0.0,44.483333,155.1,280.416667,0.4775


In [12]:
# simple model 1 (persistence): the forecast for each hour is the value
# observed 24 hours earlier
simple_1 = new_data_all_resampled.shift(24)
# no earlier day exists for the first 24 hours, so they keep the actual values
simple_1.iloc[:24] = new_data_all_resampled.iloc[:24].values


In [13]:
# simple model 2: the forecast for each hour is the mean of the previous
# 72 hours (3 days)

# shift(1) moves every rolling mean one step forward, so the window used for
# hour t covers t-72 .. t-1. Without the shift the window ends at hour t and
# the "forecast" would contain the very value it is supposed to predict.
rolling_avg = new_data_all_resampled.rolling(window=72, min_periods=1).mean().shift(1)

simple_2 = new_data_all_resampled.copy()
# the first 72 hours have no complete 3-day history and keep the actual values
simple_2.iloc[72:] = rolling_avg.iloc[72:].values

# The timestamp index is kept on purpose (no reset_index): simple_2 then stays
# aligned with new_data_all_resampled and simple_1 when errors are computed.

In [17]:
# VAR for all: fit one vector autoregression on all hourly weather parameters
model = VAR(endog=new_data_all_resampled)
# lag order chosen by AIC, with 15 as the largest order considered
results = model.fit(maxlags=15, ic="aic")
# estimated parameters and in-sample fitted values of the fitted model
params, var = results.params, results.fittedvalues

## Ambient temperature

In [14]:
def _fit_arima(series, order):
    """Build an ARIMA model of the given order on `series` and fit it.

    Returns the (model, fit result) pair.
    """
    arima = ARIMA(series, order=order)
    return arima, arima.fit()


# actual data
temp_actual = new_data_all_resampled['AMBIENT_TEMPERATURE']

# arima 1: order (2, 1, 5), in-sample fitted values
model_215, fit_215 = _fit_arima(temp_actual, (2, 1, 5))
temp_arima_215 = fit_215.fittedvalues

# arima 2: order (4, 1, 2), in-sample fitted values
model_412, fit_412 = _fit_arima(temp_actual, (4, 1, 2))
temp_arima_412 = fit_412.fittedvalues

# simple models 1 and 2
temp_sim_1 = simple_1['AMBIENT_TEMPERATURE']
temp_sim_2 = simple_2['AMBIENT_TEMPERATURE']

# TODO: the VAR and VAR sinus temperature series are not extracted yet
#temp_var =
#temp_var_sin =

## Air pressure

## Rain intensity

## Relative humidity

## Solar radiation intensity

## Wind direction

## Wind speed