In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from urllib import request, parse
from datetime import date, timedelta
import numpy as np
from dateutil.relativedelta import relativedelta
import re
from matplotlib import pyplot as plt


The data was collected from http://idojarasbudapest.hu/archivalt-idojaras by sending one HTTP POST request for every month of the requested date range.

In [None]:
class DataPrepare:
    '''
    Downloads, cleans, stores and reshapes the daily temperature records
    served by the archive endpoint given to the constructor.
    '''
    def __init__(self, url, interval_start, interval_end):
        ''' url - the endpoint the monthly queries are POSTed to
            interval_start - first day of the interval, datetime.date
            interval_end - last day of the interval, datetime.date
        '''
        # Kept from the original: seeds NumPy's global random generator.
        np.random.seed(1)
        self.url = url
        self.dstart = interval_start
        self.dend = interval_end

    def htmltable_to_df(self, html):
        '''Turn the first <table> of an HTML document into a DataFrame.

            html - HTML text containing a <table> whose first row holds the
                   <th> column names and whose other rows hold <td> cells.
            Returns a DataFrame with one row per data row; cell values are
            kept as the (unstripped) text of each <td>.
            Raises ValueError when the document has no table or no rows.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find("table")
        if table is None:
            raise ValueError('the response contains no <table> element')
        rows = table.find_all("tr")
        if not rows:
            raise ValueError('the <table> element contains no rows')

        # The first tr contains the field names.
        headings = [th.get_text().strip() for th in rows[0].find_all("th")]
        datasets = [
            dict(zip(headings, (td.get_text() for td in row.find_all("td"))))
            for row in rows[1:]
        ]
        return pd.DataFrame(datasets)

    def post_query(self, mydate):
        '''POST a query for the year and month of `mydate` to self.url.

            The encoded form body is also kept on self.data.
            Returns the decoded response body.
        '''
        self.data = parse.urlencode({'ev': mydate.year,
                                     'ho': str(mydate.month).zfill(2),
                                     'button': 'Mehet'}).encode()
        req = request.Request(self.url, data=self.data)  # a request with data is sent as POST
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(req) as resp:
            return resp.read().decode()

    def collect_data(self):
        ''' Download every month between the start and the end date and
            store the concatenated rows in self.dataframe.

            Returns self.dataframe (empty when the start is after the end).
        '''
        month = relativedelta(months=+1)
        frames = []
        if self.dstart <= self.dend:
            # Only year and month are sent to the server, so step from the
            # first of the month: a start day above 28 can then neither drift
            # nor cause the final month to be skipped.
            mydate = self.dstart.replace(day=1)
            while mydate <= self.dend:
                frames.append(self.htmltable_to_df(self.post_query(mydate)))
                mydate += month

        # One concat instead of the removed DataFrame.append inside the loop.
        self.dataframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        return self.dataframe

    def clean_data(self):
        ''' Reduce the downloaded table to a single 'Tmean' column.

            Columns are addressed by position: column 1 is treated as the
            date, column 3 as the daily maximum and column 4 as the daily
            minimum. 'Tmean' is recomputed as the average of maximum and
            minimum; the first column, the date and both extremes are dropped.
            Returns the cleaned self.dataframe.
        '''
        regex = re.compile("([a-z]+)")
        frame = self.dataframe.copy()
        # Keep only the text that precedes the first run of lowercase letters.
        frame['Nap'] = frame['Nap'].apply(lambda x: regex.split(x)[0])

        columns = frame.columns
        first_column = columns[0]
        frame = frame.rename(columns={columns[1]: 'Date',
                                      columns[2]: 'Tmean',
                                      columns[3]: 'Tmax',
                                      columns[4]: 'Tmin'})

        tmax = frame['Tmax'].astype(float)
        tmin = frame['Tmin'].astype(float)
        # Parenthesised on purpose: `Tmax + Tmin / 2` would halve only Tmin.
        frame['Tmean'] = (tmax + tmin) / 2

        self.dataframe = frame.drop(columns=[first_column, 'Tmax', 'Tmin', 'Date'])
        return self.dataframe

    def save_data(self, filename = 'weather_datas.csv'):
        '''Write self.dataframe to `filename` as CSV without the index.'''
        self.dataframe.to_csv(filename, index = False)

    def load_data(self, filename = 'weather_datas.csv'):
        '''Read `filename` as CSV into self.dataframe.'''
        self.dataframe = pd.read_csv(filename)

    # convert series to supervised learning
    def series_to_deeplr_dataset(self, data, n_in=1, n_out=1, dropnan=True):
        '''
            Transform a series of this form
            | date | temperature |
            | 2018 | 1.2         |

            into sliding windows of this form

            | index | var1(t-1) | var1(t) | var1(t+1) | ... |
            | 1     | 11        | 12      | 13        | ... |
            | 2     | 12        | 13      | 14        | ... |

            data    - anything pd.DataFrame() accepts; one column per variable
            n_in    - number of lagged columns: t-n_in, ..., t-1
            n_out   - number of target columns: t, t+1, ..., t+n_out-1
            dropnan - drop the rows that contain NaN after shifting
            Returns the DataFrame of windows.
        '''
        df = pd.DataFrame(data)
        # Derived from the input instead of hard-coded to 1, so the column
        # names also line up when several variables are passed.
        n_vars = df.shape[1]
        cols, names = [], []
        # input sequence (t-n, ... t-1)
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
        # forecast sequence (t, t+1, ... t+n)
        for i in range(0, n_out):
            cols.append(df.shift(-i))
            if i == 0:
                names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
            else:
                names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
        # put it all together
        agg = pd.concat(cols, axis=1)
        agg.columns = names
        # drop rows with NaN values
        if dropnan:
            agg = agg.dropna()
        return agg
        

In [None]:
startDate = date(2012,1,1)
endDate = date(2018,10,1)
url = 'http://idojarasbudapest.hu/archivalt-idojaras'
dp = DataPrepare(url, startDate, endDate)

# Reuse the cached CSV when it is present; otherwise download, clean and cache
# the data so the notebook also runs where the file does not exist yet.
try:
    dp.load_data()
except FileNotFoundError:
    dp.collect_data()
    dp.clean_data()
    dp.save_data()

In [None]:
# Preview the windowed layout: two lagged input columns and two target columns.
dp.series_to_deeplr_dataset(dp.dataframe, n_in=2, n_out=2)

In [None]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import regularizers

class TrainingClasses:
    '''
        Aggregated class for the models.
        There are three convolutional networks; they predict the mean
        temperature 1, 7 and 31 days ahead.

        For every predictor name (daily, weekly, monthly) the train_* method
        stores the following attributes and the matching test_* method reads
        them back:
            X_<name>_test, Y_<name>_test - held-out input windows and targets
            <name>_model                 - the Keras model that was fitted
            <name>_test_datestart        - date of the first held-out target
            <name>_test_dateends         - datestart + number of held-out windows
    '''

    # First day of the series, used when the data holder exposes no `dstart`.
    DEFAULT_SERIES_START = date(2012, 1, 1)
    # Dates marked with red lines by the test_* methods unless others are passed.
    DEFAULT_MARKER_DATES = ('2018-10-26', '2018-11-01', '2018-11-29')

    def __init__(self):
        # Nothing to initialise; the original body was a lone `;`, which is a
        # SyntaxError and prevented the class from being defined.
        pass

    # ------------------------------------------------------------------ training

    def train_daily_predictor(self, dp, day_count_before = 7, day_count_after = 1, test_split = 1/6, valid_split = 1/6,
                             ep = 1000, batch_siz = 16):
        '''Fit the daily model; see _train_predictor for the parameters.

            Returns the History object produced by model.fit.
        '''
        return self._train_predictor('daily', self.create_daily_predictor, dp, day_count_before,
                                     day_count_after, test_split, valid_split, ep, batch_siz)

    def train_weekly_predictor(self, dp, day_count_before = 31, day_count_after = 7, test_split = 1/6, valid_split = 1/6,
                             ep = 1000, batch_siz = 512):
        '''Fit the weekly model; see _train_predictor for the parameters.

            Returns the History object produced by model.fit.
        '''
        return self._train_predictor('weekly', self.create_weekly_predictor, dp, day_count_before,
                                     day_count_after, test_split, valid_split, ep, batch_siz)

    def train_monthly_predictor(self, dp, day_count_before = 31, day_count_after = 31, test_split = 1/6, valid_split = 1/6,
                             ep = 1000, batch_siz = 512):
        '''Fit the monthly model; see _train_predictor for the parameters.

            Returns the History object produced by model.fit.
        '''
        return self._train_predictor('monthly', self.create_monthly_predictor, dp, day_count_before,
                                     day_count_after, test_split, valid_split, ep, batch_siz)

    def _train_predictor(self, name, build_model, dp, day_count_before, day_count_after,
                         test_split, valid_split, ep, batch_siz):
        '''Shared training routine behind the three train_* methods.

            name             - 'daily', 'weekly' or 'monthly'; selects the
                               attribute names and the checkpoint file
            build_model      - the create_*_predictor method to call
            dp               - object offering series_to_deeplr_dataset() and
                               a `dataframe` attribute
            day_count_before - length of the input window
            day_count_after  - number of days predicted per window
            test_split       - fraction of the windows held out for testing
            valid_split      - fraction of the windows used for validation
            ep, batch_siz    - epochs and batch size passed to model.fit
            Returns the History object produced by model.fit.
        '''
        dataset = dp.series_to_deeplr_dataset(dp.dataframe, n_in=day_count_before, n_out=day_count_after).values

        X = dataset[:, :day_count_before]
        Y = dataset[:, day_count_before:]
        X = X.reshape((X.shape[0], X.shape[1], 1))

        # Chronological split: train | validation | test.
        v_index = int(X.shape[0] * (1 - valid_split - test_split))
        t_index = int(X.shape[0] * (1 - test_split))
        X_test, Y_test = X[t_index:], Y[t_index:]
        X_valid, Y_valid = X[v_index:t_index], Y[v_index:t_index]
        X_train, Y_train = X[:v_index], Y[:v_index]

        # The first target of window k lies day_count_before days after the
        # k-th day of the series.
        series_start = getattr(dp, 'dstart', None) or self.DEFAULT_SERIES_START
        test_start = series_start + timedelta(days=t_index + day_count_before)

        setattr(self, 'X_%s_test' % name, X_test)
        setattr(self, 'Y_%s_test' % name, Y_test)
        setattr(self, '%s_test_datestart' % name, test_start)
        setattr(self, '%s_test_dateends' % name, test_start + timedelta(days=X_test.shape[0]))

        early_stopping = EarlyStopping(patience=40, verbose=1)
        # One checkpoint file per predictor; the monthly model used to
        # overwrite the weekly file.
        checkpointer = ModelCheckpoint(filepath='weights_%s.hdf5' % name, save_best_only=True, verbose=1)

        model = build_model(day_count_before, day_count_after)
        setattr(self, '%s_model' % name, model)
        model.summary()  # prints by itself; wrapping it in print() only added "None"

        # fit model
        history = model.fit(X_train, Y_train, epochs=ep, verbose=2, batch_size=batch_siz,
                            validation_data=(X_valid, Y_valid),
                            callbacks=[checkpointer, early_stopping])
        return history

    # ------------------------------------------------------------ architectures

    def create_daily_predictor(self, day_count_before, day_count_after):
        '''Build and compile the daily network; also stored as self.daily_model.'''
        model = Sequential()
        # The input has exactly one channel (the temperature); the channel
        # count used to be day_count_after, which only worked because it was 1.
        model.add(Conv1D(filters=32, kernel_size=max(1, day_count_before - 1), strides=4, padding='same',
                         activation='relu', input_shape=(day_count_before, 1)))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Conv1D(filters=4, kernel_size=3, padding='same', strides=2, activation='relu'))

        model.add(Flatten())
        model.add(Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(day_count_after))
        # Regression task: report mean absolute error, accuracy is not meaningful here.
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        self.daily_model = model
        return model

    def create_weekly_predictor(self, day_count_before, day_count_after):
        '''Build and compile the weekly network; also stored as self.weekly_model.'''
        model = Sequential()
        model.add(Conv1D(filters=32, kernel_size=31, strides=21, padding='same', activation='relu',
                         input_shape=(day_count_before, 1)))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Conv1D(filters=8, kernel_size=7, strides=3, padding='same', activation='relu'))
        model.add(Flatten())
        model.add(Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(day_count_after))
        # Regression task: report mean absolute error, accuracy is not meaningful here.
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        self.weekly_model = model
        return self.weekly_model

    def create_monthly_predictor(self, day_count_before, day_count_after):
        '''Build and compile the monthly network; also stored as self.monthly_model.'''
        model = Sequential()
        model.add(Conv1D(filters=64, kernel_size=31, padding='same', activation='relu',
                         input_shape=(day_count_before, 1)))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Conv1D(filters=16, kernel_size=7, activation='relu'))
        model.add(Flatten())
        model.add(Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(day_count_after))
        # Regression task: report mean absolute error, accuracy is not meaningful here.
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        self.monthly_model = model
        return self.monthly_model

    # ---------------------------------------------------------------- evaluation

    def test_daily_predictor(self, prediction_count = 10, marker_dates = None):
        '''Plot the daily model's test-set predictions followed by a forecast.

            prediction_count - number of times the model is fed its own output
            marker_dates     - dates to mark and print; defaults to
                               DEFAULT_MARKER_DATES
        '''
        frame, lastdate = self._forecast_frame('daily', prediction_count)
        self._plot_forecast(frame, lastdate,
                            'Daily predictor, predcition starts from the green line.', 12, marker_dates)

    def test_weekly_predictor(self, prediction_count = 3, marker_dates = None):
        '''Plot the weekly model's test-set predictions followed by a forecast.

            prediction_count - number of times the model is fed its own output
            marker_dates     - dates to mark and print; defaults to
                               DEFAULT_MARKER_DATES
        '''
        frame, lastdate = self._forecast_frame('weekly', prediction_count)
        self._plot_forecast(frame, lastdate,
                            'Weekly predictor, predcition starts from the green line.', 8, marker_dates)

    def test_monthly_predictor(self, prediction_count = 3, marker_dates = None):
        '''Plot the held-out monthly targets followed by a forecast.

            Unlike the other two, the part left of the green line shows the
            stored test targets (Y_monthly_test), not model predictions.
            prediction_count - number of times the model is fed its own output
            marker_dates     - dates to mark and print; defaults to
                               DEFAULT_MARKER_DATES
        '''
        frame, lastdate = self._forecast_frame('monthly', prediction_count, use_observed=True)
        self._plot_forecast(frame, lastdate,
                            'Monthly predictor, predcition from the predicted elements starts from the green line.',
                            8, marker_dates)

    @staticmethod
    def _rollout(model, history_values, window, prediction_count):
        '''Feed the model its own output and return only the new values.

            Each step uses the latest `window` values (history plus everything
            predicted so far) as input. Raises ValueError when the history is
            shorter than the window.
        '''
        buffer = list(history_values)
        first_new = len(buffer)
        if prediction_count > 0 and first_new < window:
            raise ValueError('%d values are available but the model needs a window of %d'
                             % (first_new, window))
        for _ in range(prediction_count):
            x_input = np.array(buffer[-window:], dtype=float).reshape((1, window, 1))
            yhat = model.predict(x_input, verbose=0)
            buffer.extend(np.asarray(yhat, dtype=float).flatten().tolist())
        # The seed window is history, not forecast, so it is not returned.
        return buffer[first_new:]

    def _forecast_frame(self, name, prediction_count, use_observed=False):
        '''Build the dated series plotted by the test_* methods.

            name             - 'daily', 'weekly' or 'monthly'
            prediction_count - number of roll-out steps after the test set
            use_observed     - take the part before the forecast from the
                               stored targets instead of model predictions
            Returns (frame, lastdate): a DataFrame with a 'temp' column
            indexed by date, and the date of the last value before the
            forecast. Raises ValueError when the test split is empty.
        '''
        X_test = getattr(self, 'X_%s_test' % name)
        Y_test = getattr(self, 'Y_%s_test' % name)
        model = getattr(self, '%s_model' % name)
        start = getattr(self, '%s_test_datestart' % name)

        if X_test.shape[0] == 0:
            raise ValueError('the %s test split is empty' % name)

        window = X_test.shape[1]
        # Consecutive windows overlap; taking every stride-th one yields
        # back-to-back blocks of day_count_after days.
        stride = Y_test.shape[1]
        if use_observed:
            blocks = Y_test[::stride]
        else:
            blocks = model.predict(X_test[::stride], verbose=0)
        history = np.asarray(blocks, dtype=float).flatten().tolist()

        lastdate = start + timedelta(days=len(history) - 1)
        forecast = self._rollout(model, history, window, prediction_count)

        history_index = pd.to_datetime([start + timedelta(days=d) for d in range(len(history))])
        frames = [pd.DataFrame({'temp': history}, index=history_index)]
        if forecast:
            forecast_index = pd.to_datetime(
                [lastdate + timedelta(days=d) for d in range(1, len(forecast) + 1)])
            frames.append(pd.DataFrame({'temp': forecast}, index=forecast_index))
        return pd.concat(frames), lastdate

    def _plot_forecast(self, frame, lastdate, title, fontsize, marker_dates=None):
        '''Plot `frame`, a green line at `lastdate` and red lines at the markers.

            The value at every marker date is printed; markers outside the
            plotted range are reported instead of raising KeyError.
        '''
        if marker_dates is None:
            marker_dates = self.DEFAULT_MARKER_DATES
        xposition = [pd.to_datetime(marker) for marker in marker_dates]

        fig, ax = plt.subplots()
        fig.subplots_adjust(bottom=0.3)
        ax.tick_params(axis='x', labelrotation=90)

        ax.axvline(x=pd.to_datetime(lastdate), color='g', linestyle='--')

        for xc in xposition:
            if xc in frame.index:
                print(xc, ' mean temperature prediction: ', frame.at[xc, 'temp'], ' °C')
            else:
                print(xc, ' is outside the plotted range, no prediction available')
            ax.axvline(x=xc, color='r', linestyle='--')

        fig.suptitle(title, fontsize=fontsize, fontweight='bold')
        ax.plot(frame.index, frame['temp'])


In [None]:
def plot_history_losses(history):
    '''Draw the training and validation loss of a fit run on a new figure.

        history - object whose `history` mapping holds the 'loss' and
                  'val_loss' sequences, one value per epoch.
    '''
    plt.figure()
    # Training curve first, validation second, matching the legend order below.
    for curve_key in ('loss', 'val_loss'):
        plt.plot(history.history[curve_key])
    plt.title('model train vs validation loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()

In [None]:

# One holder object; its train_* methods store the fitted models and test splits on it.
model_holder = TrainingClasses()



In [None]:
# Train the weekly model: 62-day input windows, 7 predicted days per window.
history = model_holder.train_weekly_predictor(
    dp,
    day_count_before=62,
    day_count_after=7,
    test_split=1/5,
    valid_split=0.1,
    ep=100,
    batch_siz=512,
)

In [None]:
# Plot the weekly model's output with four roll-out steps, then its loss curves.
model_holder.test_weekly_predictor(prediction_count=4)
print('Prediction starts from: ', model_holder.weekly_test_dateends)
plot_history_losses(history=history)

In [None]:
# Train the monthly model: 93-day input windows, 31 predicted days per window.
history = model_holder.train_monthly_predictor(
    dp,
    day_count_before=93,
    day_count_after=31,
    test_split=1/6,
    valid_split=1/6,
    ep=1000,
    batch_siz=512,
)

In [None]:
# Plot the monthly model's output with three roll-out steps, then its loss curves.
model_holder.test_monthly_predictor(prediction_count=3)
print('Prediction starts from: ', model_holder.monthly_test_dateends)
plot_history_losses(history=history)

In [None]:
# Train the daily model: 7-day input windows, 1 predicted day per window.
history = model_holder.train_daily_predictor(
    dp,
    day_count_before=7,
    day_count_after=1,
    test_split=1/6,
    valid_split=1/6,
    ep=1000,
    batch_siz=512,
)

In [None]:
# Plot the daily model's output with fifty roll-out steps, then its loss curves.
model_holder.test_daily_predictor(prediction_count=50)
print('Prediction starts from: ', model_holder.daily_test_dateends)
plot_history_losses(history=history)