In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats
from IPython.display import display
import ipywidgets as widgets

# Setup: run configuration read (as globals) by the functions defined in this notebook.
area = 'DK1'                                 # area code, part of the input file names
file = 'consumption'                         # data set name, part of the input file name
fileName = '{}_{}.csv'.format(file, area)
window = 728                                 # number of rows in the calibration window
start_date = False                           # falsy: the window is placed relative to the end of the data
save_files = True                            # truthy: forecast() stores its result on disk
dir_forecasts = 'forecasts/'                 # prefix of the saved forecast CSV files
file_forecasts = 'forecasts/files.txt'       # index file listing the saved forecasts
variables_list = [
    'get_variables_neg_day1',
    'get_variables_neg_day2',
    'get_variables_neg_day7',
    'consumption_prognosis',
]
# NOTE(review): set_other_settings is defined further down in this file, so this
# call only works after that definition has been executed - confirm cell order.
set_other_settings()

# Keys accepted in variables_list: get_variables_neg_day1, get_variables_neg_day2, get_variables_neg_day7, consumption_prognosis

In [10]:
# Load the consumption prognosis for the configured area.
consumption_prognosis = pd.read_csv(
    "../../data/data_sanitized/consumption_prognosis_{}.csv".format(area)
)

In [79]:
def set_other_settings():
    """Load the input data and rebuild the globals derived from the settings.

    Reads ``fileName`` from the sanitized-data directory, parses the ``date``
    column, splits off the ``holiday`` column, computes the window bounds and
    builds one explanatory variable per key in ``variables_list``.

    Globals (re)assigned: ``data``, ``dayofweek``, ``holidays``,
    ``data_shape`` and ``variables``.

    Raises:
        ValueError: if ``variables_list`` contains a key that is not known
            here. Previously such a key either raised ``UnboundLocalError``
            or silently appended the previous variable a second time.
    """
    # ``median`` and ``mad`` are declared for backward compatibility only; they
    # are not assigned in this function.
    # NOTE(review): the global ``data_shape`` shares its name with the
    # ``data_shape(data)`` helper defined in this file; whichever is executed
    # last wins.
    global data, data_shape, dayofweek, holidays, median, mad, variables

    # Builders are lazy so that only the requested variables are computed.
    builders = {
        'get_variables_neg_day1': lambda: get_variables_neg_day(1),
        'get_variables_neg_day2': lambda: get_variables_neg_day(2),
        'get_variables_neg_day7': lambda: get_variables_neg_day(7),
        'consumption_prognosis': lambda: get_variables_prognosis(consumption_prognosis),
    }
    # Validate before touching any global so a typo fails fast.
    unknown_keys = [key for key in variables_list if key not in builders]
    if unknown_keys:
        raise ValueError(
            "Unknown entries in variables_list: {}. Known keys: {}".format(
                unknown_keys, sorted(builders)))

    data = pd.read_csv("../../data/data_sanitized/"+fileName)
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
    dayofweek = data['date'].dt.dayofweek
    holidays = data['holiday']
    data = data.drop(columns=['holiday'])

    data_shape = get_data_shape()

    # The builders read the freshly loaded global ``data``.
    variables = [builders[key]() for key in variables_list]
        
def get_data_shape():
    """Return the ``(start_index, last_index)`` window bounds for the global settings."""
    window_bounds = count_frame_indeces(data, window, start_date)
    return window_bounds

In [72]:
def print_settings():
    """Return the current global settings as one '|'-separated string."""
    text_fields = [file, area]
    text_fields.extend(str(value) for value in (window, start_date, variables_list))
    return '|'.join(text_fields)

# print(print_settings())

In [42]:
def count_frame_indeces(data_f,window_f,start_date_f):
    """Compute the first and last row index of the calibration window.

    Args:
        data_f: DataFrame; its row count is always used, and its ``date``
            column is searched when ``start_date_f`` is truthy.
        window_f: number of rows in the window.
        start_date_f: falsy to derive the window from the end of the data,
            otherwise a value compared for equality against ``date``.

    Returns:
        ``(start_index, last_index)``.
        Without a start date: ``(n - 2*window_f, n - window_f - 1)``.
        With a start date at row ``s``: ``(s, s + window_f - 1)``.

    Raises:
        IndexError: if ``start_date_f`` does not occur in the ``date`` column.
        ValueError: if the start date lies after row ``n - 2*window_f``
            (message starts with 'Start date is too big').
    """
    n_rows = data_f.shape[0]
    latest_start = n_rows - 2*window_f

    if not start_date_f:
        return (latest_start, n_rows - window_f - 1)

    matching_rows = data_f.index[data_f['date'] == start_date_f]
    if len(matching_rows) == 0:
        # Same exception type as the bare ``.index[0]`` lookup raised before,
        # but with a message that names the actual problem.
        raise IndexError(
            "Start date {!r} not found in the 'date' column".format(start_date_f))

    start_index = int(matching_rows[0])
    if start_index > latest_start:
        # The limit used to be printed separately; it is part of the message now.
        raise ValueError(
            'Start date is too big: row {} is after the latest allowed start row {}'.format(
                start_index, latest_start))
    return (start_index, start_index + window_f - 1)

In [29]:
def count_median_mad(data):
    """Return the per-column median and scaled median absolute deviation.

    Only the hour columns selected by ``filter_hours`` are used.

    ``scipy.stats.median_absolute_deviation`` (used here before) has been
    removed from SciPy, so the MAD is computed directly with NumPy. The factor
    1.4826 is the default ``scale`` of that removed function, which keeps the
    result on the same scale as before.

    Args:
        data: DataFrame containing the hour columns.

    Returns:
        ``(median, mad)``: two 1-D arrays with one entry per hour column.
    """
    hours = np.asarray(filter_hours(data), dtype=float)
    median = np.median(hours, axis=0)
    mad = 1.4826 * np.median(np.abs(hours - median), axis=0)
    return (median, mad)

In [57]:
#standardizing data with the median and MAD
def standarize(data,median,mad):
    """Centre ``data`` on ``median``, divide by ``mad`` and apply ``arcsinh``.

    ``median`` and ``mad`` are repeated once per row of ``data`` before the
    element-wise arithmetic.
    """
    n_rows = data.shape[0]
    centre = np.tile(median, (n_rows, 1))
    spread = np.tile(mad, (n_rows, 1))
    scaled = (data - centre) / spread
    return np.arcsinh(scaled)

In [49]:
def unstandarize(data,median,mad):
    """Invert ``standarize``: apply ``sinh``, multiply by ``mad`` and add ``median``."""
    return np.sinh(data) * mad + median

In [258]:
def data_shape(data):
    """Print the row count, the column count and the first/last value of column 0.

    NOTE(review): ``set_other_settings`` assigns a global that is also called
    ``data_shape``; whichever is executed last replaces the other.
    """
    print("Length (days): {!s}".format(data.shape[0]))
    print("Width: {!s}".format(data.shape[1]))
    print("First date: {!s}".format(data.iloc[0, 0]))
    print("Last date: {!s}".format(data.iloc[-1, 0]))
# data_shape(data)

In [28]:
def filter_hours(data):
    """Return all rows of the columns from label '0' through label '23' (inclusive)."""
    first_hour, last_hour = '0', '23'
    return data.loc[:, first_hour:last_hour]

In [23]:
#get variables lagged by n days

#types of variables
#each_hour - one value for each hour of each day (1,24)
#each_day - one value for each day (1,1)
def get_variables_neg_day(n=1):
    """Return the hour columns of the global ``data`` lagged by ``n`` rows.

    Row ``i`` of the result holds row ``i - n`` of the hour columns; the first
    ``n`` rows are filled with zeros. The result has as many rows as ``data``
    and a fresh 0..len-1 index.

    ``DataFrame.append`` (used here before) was removed in pandas 2.0, so the
    zero padding is attached with ``pd.concat``.

    Args:
        n: lag in rows.

    Returns:
        ``("each_hour", lagged_frame)``.
    """
    data_copy = filter_hours(data)
    zero_padding = pd.DataFrame(
        np.zeros((n, len(data_copy.columns))), columns=data_copy.columns)
    lagged = pd.concat([zero_padding, data_copy], ignore_index=True)
    # Keep the leading len(data) rows, i.e. drop the trailing n rows. A
    # positive stop is used because ``iloc[:-n]`` would return nothing for n == 0.
    lagged = lagged.iloc[:len(data_copy)]
    return ("each_hour", lagged)

In [24]:
def get_variables_holidays(holidays):
    """Tag ``holidays`` as an "each_day" variable and return ``(tag, holidays)``."""
    variable_kind = "each_day"
    return (variable_kind, holidays)

In [25]:
def get_variables_dayofweek(dayofweek):
    """Build a 0/1 indicator matrix with one column per weekday value 0..6.

    Returns ``("each_day", matrix)`` where ``matrix`` is a float array with one
    row per entry of ``dayofweek`` and column ``d`` equal to ``dayofweek == d``.
    """
    n_days = dayofweek.shape[0]
    indicator_columns = [
        np.asarray(dayofweek == weekday, dtype=float) for weekday in range(7)
    ]
    var_dayofweek = np.column_stack(indicator_columns).reshape(n_days, 7)
    return ("each_day", var_dayofweek)

In [26]:
def get_variables_prognosis(prognosis):
    """Return the hour columns of ``prognosis`` tagged as an "each_hour" variable."""
    return ("each_hour", filter_hours(prognosis))

def get_variables_consumption_prognosis(prognosis):
    """Wrap the result of ``get_variables_prognosis`` in an "each_day" tuple.

    NOTE(review): the second element is itself an ``("each_hour", frame)``
    tuple, not a frame, and ``forecast_row`` reads ``.shape`` on "each_day"
    values - confirm whether this nesting is intended.
    """
    hourly_variable = get_variables_prognosis(prognosis)
    return ("each_day", hourly_variable)

def get_variables_wind_power_prognosis(prognosis):
    """Wrap the result of ``get_variables_prognosis`` in an "each_day" tuple.

    NOTE(review): the second element is itself an ``("each_hour", frame)``
    tuple, not a frame, and ``forecast_row`` reads ``.shape`` on "each_day"
    values - confirm whether this nesting is intended.
    """
    hourly_variable = get_variables_prognosis(prognosis)
    return ("each_day", hourly_variable)

In [45]:
#data: full dataframe with all variables
#window: window for forecasting, integer, e.g. 730
#start_date: first date of the window (if False, takes last date - window), e.g. '2016-05-05'
#variables: list of all variables []

def forecast(data_f,window_f,data_shape_f,variables):
    """Forecast every row after the calibration window, one row at a time.

    Args:
        data_f: DataFrame with a ``date`` column and hour columns '0'..'23'.
        window_f: number of preceding rows passed to ``forecast_row`` for
            fitting each row.
        data_shape_f: ``(start_index, last_index)``; all rows after
            ``last_index`` are forecast.
        variables: list of explanatory variables passed to ``forecast_row``.

    Returns:
        ``(real_data, pred_data)``: the rows of ``data_f`` after
        ``last_index`` (including ``date``), and the same rows without
        ``date`` whose hour columns hold the forecasts.

    Side effects:
        Displays a progress widget. When the global ``save_files`` is truthy,
        appends a ``<no>.<settings>`` line to ``file_forecasts`` and writes
        the forecasts to ``dir_forecasts + <no> + ".csv"``.
    """
    import os  # local import: only needed by the optional save step

    n_rows = data_f.shape[0]
    _, last_index = data_shape_f
    first_forecast_index = last_index + 1

    out = widgets.HTML()
    display(out)

    pred_data = data_f.drop(columns=['date'])
    # Untouched copy of the observations: each row is fitted on real history,
    # never on forecasts produced earlier in this loop.
    test_data = pred_data.copy()
    pred_data.loc[first_forecast_index:,'0':'23'] = 0.0
    for i in pred_data.loc[first_forecast_index:].index:
        out.value="Forecasting row " + str(i) + ", left: " + str(n_rows-i-1)
        pred_data.loc[i,'0':'23'] = forecast_row(i,window_f,test_data,variables)

    pred_data = pred_data.loc[first_forecast_index:,:]
    real_data = data_f.loc[first_forecast_index:,:]

    if(save_files):
        index_dir = os.path.dirname(file_forecasts)
        if index_dir:
            os.makedirs(index_dir, exist_ok=True)
        # 'a+' creates the index file on first use (the previous 'r+' raised
        # FileNotFoundError only after all forecasts had been computed) and
        # always writes at the end of the file.
        with open(file_forecasts, 'a+') as f:
            f.seek(0)
            content = f.read()
            # Blank lines are ignored so a trailing empty line cannot break
            # the numbering.
            entries = [line for line in content.splitlines() if line.strip()]
            if not entries:
                no = str(0)
            else:
                no = entries[-1].split('.', 1)[0]
            no = str(int(no)+1)
            if content and not content.endswith("\n"):
                # Do not glue the new entry onto an unterminated last line.
                f.write("\n")
            f.write(no + "." + print_settings()+"\n")

        csv_path = dir_forecasts + no + ".csv"
        csv_dir = os.path.dirname(csv_path)
        if csv_dir:
            os.makedirs(csv_dir, exist_ok=True)
        pred_data.to_csv(csv_path, index=True)

    return (real_data, pred_data)

def f():
    """Run ``forecast`` with the notebook's current global settings."""
    current_bounds = get_data_shape()
    return forecast(data, window, current_bounds, variables)

In [82]:
# Set the global window to 10 rows, then run the forecast; f() recomputes the
# window bounds from the current globals before forecasting.
window=10
f()

HTML(value='')

(           date       0       1       2       3       4       5       6  \
 1584 2020-05-03  1701.0  1676.0  1656.0  1656.0  1658.0  1655.0  1678.0   
 1585 2020-05-04  1848.0  1795.0  1791.0  1805.0  1859.0  1973.0  2234.0   
 1586 2020-05-05  1968.0  1925.0  1900.0  1872.0  1915.0  2019.0  2285.0   
 1587 2020-05-06  1894.0  1876.0  1854.0  1889.0  1943.0  2031.0  2310.0   
 1588 2020-05-07  1931.0  1858.0  1826.0  1873.0  1909.0  2020.0  2272.0   
 1589 2020-05-08  1839.0  1795.0  1765.0  1754.0  1779.0  1780.0  1861.0   
 1590 2020-05-09  1752.0  1691.0  1665.0  1654.0  1662.0  1672.0  1735.0   
 1591 2020-05-10  1721.0  1658.0  1635.0  1617.0  1620.0  1615.0  1691.0   
 1592 2020-05-11  1893.0  1827.0  1818.0  1872.0  1888.0  2027.0  2293.0   
 1593 2020-05-12  2002.0  1945.0  1936.0  1948.0  1989.0  2085.0  2345.0   
 
            7       8  ...      14      15      16      17      18      19  \
 1584  1797.0  1983.0  ...  1988.0  1953.0  2026.0  2237.0  2260.0  2155.0   
 1585 

In [69]:
#index for which forecast should be calculated
#window - number of rows taken for window
#data - filtered data - only with index and hour columns
#array of variables - definition above
def _each_day_columns(values):
    """Split an 'each_day' variable into a list of 1-D float columns."""
    if isinstance(values, pd.Series):
        return [values.astype(float)]
    if isinstance(values, pd.DataFrame):
        return [values.iloc[:, k].astype(float) for k in range(values.shape[1])]
    array = np.asarray(values, dtype=float)
    if array.ndim == 1:
        array = array.reshape(-1, 1)
    return [array[:, k] for k in range(array.shape[1])]


def forecast_row(index, window, data, variables):
    """Forecast the 24 hourly values of one row, one least-squares fit per hour.

    The data is standardized with the median and MAD of the ``window`` rows
    before ``index``. For every hour a design matrix is built from
    ``variables`` plus a constant column, fitted on those rows, evaluated at
    ``index`` and transformed back with ``unstandarize``.

    Args:
        index: label of the row to forecast.
        window: number of rows before ``index`` used for fitting.
        data: DataFrame holding only the hour columns '0'..'23'.
        variables: list of ``(kind, values)`` tuples. For 'each_hour',
            ``values`` is indexed by the hour label and gives one column per
            hour. For 'each_day', ``values`` is a Series, DataFrame or array
            whose columns are used for every hour. Other kinds are ignored.

    Returns:
        Array of shape (1, 24) with the forecast on the original scale.
    """
    row = np.zeros((1,24))

    # Median and MAD come from the calibration window only.
    median, mad = count_median_mad(data.loc[index-window:index-1])
    data = standarize(data,median,mad)

    # One entry per design-matrix column, in the order the variables were
    # given. Every 'each_day' column gets its own slot: previously all of
    # them were written to the same slot (and ndarray rows were read instead
    # of columns), which left duplicate constant columns in the matrix.
    column_plan = []
    for v in variables:
        if v[0]=='each_hour':
            column_plan.append(('each_hour', v[1]))
        elif v[0]=='each_day':
            column_plan.extend(('each_day', column) for column in _each_day_columns(v[1]))

    # The extra column stays at 1.0 and acts as the intercept.
    sizeB = len(column_plan) + 1
    variablesMatrix = pd.DataFrame(np.ones((data.shape[0],sizeB)))

    # 'each_day' columns are the same for every hour, so they are filled once.
    for i, (kind, values) in enumerate(column_plan):
        if kind == 'each_day':
            variablesMatrix[i] = values

    for h in range(row.shape[1]):
        for i, (kind, values) in enumerate(column_plan):
            if kind == 'each_hour':
                variablesMatrix[i] = values[str(h)]
        row[0,h] = getParametersVector(index,h, window,data, variablesMatrix) @ getVariablesVector(index,variablesMatrix)
    row = unstandarize(row,median,mad)

    return row

def getParametersVector(index,h,window,data,variablesMatrix):
    """Fit ordinary least squares for hour ``h`` on the calibration window.

    Rows ``index - window`` .. ``index - 1`` (label slice, both ends
    inclusive) of ``variablesMatrix`` form X, and the same rows of column
    ``str(h)`` of ``data`` form Y.

    The normal equations ``(X'X) b = X'Y`` are solved directly instead of
    forming the inverse of ``X'X``, which is faster and numerically more
    accurate. A singular ``X'X`` still raises ``numpy.linalg.LinAlgError``.

    Returns:
        1-D array of coefficients, one per column of ``variablesMatrix``.
    """
    Y = np.asarray(data.loc[index-window:index-1,str(h)], dtype=float)
    X = np.asarray(variablesMatrix.loc[index-window:index-1,:], dtype=float)

    return np.linalg.solve(X.T @ X, X.T @ Y)

def getVariablesVector(index,variables):
    """Return the row labelled ``index`` of ``variables``, across all columns."""
    row_selector = (index, slice(None))
    return variables.loc[row_selector]

# forecast_row(874,730,data,variables)

In [70]:
# Run the forecast with the current global settings.
f()

HTML(value='')

NameError: name 'print_settings' is not defined

(1104, 1348)

In [508]:
def get_data_shape():
    """Return the ``(start_index, last_index)`` window bounds for the global settings.

    This definition used to repeat the body of ``count_frame_indeces`` line by
    line on the globals ``data``, ``window`` and ``start_date``. It now calls
    that helper, so the window arithmetic lives in one place.
    """
    return count_frame_indeces(data, window, start_date)

2427