In [None]:
# DA PROVARE

import pandas as pd

def get_train_and_test_from_timeseries(dataframe, train_size, outputs_col, verbose=0):
    '''
        Splits a time-series dataset into its train and test parts.
        The split is positional (first rows -> train, remaining rows -> test),
        so the original ordering of the rows is preserved.

        Attributes:
            - dataframe      (pandas.DataFrame) : the dataset to split
            - train_size     (float)            : the fraction of the dataframe to keep in the training set (0 < train_size <= 1)
            - outputs_col    ([string])         : the output (target) columns
            - verbose        (int)              : when equal to 1 the shapes of the four resulting sets are printed

        Returns:
            - (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame): train_X, train_Y, test_X, test_Y

        Raises:
            - AssertionError : if dataframe is not a DataFrame, train_size is out of range or outputs_col is empty
    '''

    assert isinstance(dataframe, pd.DataFrame)
    assert train_size > 0 and train_size <= 1, 'The percentage of the train_size need to be greather than 0 and less or equal to 1.'
    assert outputs_col is not None and len(outputs_col) > 0

    # Number of leading rows that go into the training set
    nrow = round(train_size * dataframe.shape[0])

    train = dataframe.iloc[:nrow, :]
    test = dataframe.iloc[nrow:, :]

    # Inputs are every column that is not an output column
    train_X = train.drop(columns=outputs_col)
    test_X = test.drop(columns=outputs_col)

    train_Y = train[outputs_col]
    test_Y = test[outputs_col]

    if verbose == 1:
        print('Training set shape for X (inputs):')
        print(train_X.shape)
        print('Training set shape for Y (output):')
        print(train_Y.shape)
        print('Test set shape for X (inputs):')
        print(test_X.shape)
        print('Test set shape for Y (output):')
        print(test_Y.shape)

    return train_X, train_Y, test_X, test_Y

In [None]:
# DA PROVARE

import pandas as pd

def create_window_multivariate(data, window_size_backward=1, window_size_forward=None, full_forward=True):
    '''
        Starting from a pandas.DataFrame it creates a windowed version,
         with values from the past and (optionally) from the future.

        For every original column the result holds the column itself, one
        column per past step and, when requested, one column per future step.
        Rows for which any past/future value is missing are dropped.

        Attributes:
            - data                  : (pandas.DataFrame)   original dataframe
            - window_size_backward  : (int)                window size (in the past), must be > 0
            - window_size_forward   : (int, default None)  window size (in the future). If None (or not positive), only past values are aggregated
            - full_forward          : (bool, default True) when a forward window is used, True keeps every future step
                                                           from +1 to +window_size_forward, False keeps only the last one

        Return:
            - pandas.DataFrame      : the windowed version of the original dataframe:
                                        - the columns in the past have the same name of the originals followed by -1, -2, ...
                                        - the columns in the future have the same name of the originals followed by +1, +2, ...

        Raises:
            - AssertionError : if window_size_backward is not a positive int
    '''

    # it has to be greater than 0
    assert type(window_size_backward) == int and window_size_backward > 0

    # The forward window is optional: it is used only when a positive size is given
    use_forward = window_size_forward is not None and window_size_forward > 0

    res = []
    for c in data.columns:

        base = data[[c]]
        pieces = [base]
        col_name = [c]

        # Past values: the column suffixed with -i is the series shifted by i steps
        for i in range(1, window_size_backward + 1):
            col_name.append('{}-{}'.format(c, i))
            pieces.append(base.shift(i))

        # Always defined, so the cleanup below is safe without a forward window
        column_to_delete = []
        if use_forward:
            for i in range(1, window_size_forward + 1):
                name = '{}+{}'.format(c, i)

                # With full_forward disabled only the furthest future step is kept
                if not full_forward and i != window_size_forward:
                    column_to_delete.append(name)

                col_name.append(name)
                pieces.append(base.shift(-i))

        # Single concat per column instead of growing the frame step by step
        sub_df = pd.concat(pieces, axis=1)
        sub_df.columns = col_name

        # Drop incomplete rows before removing the intermediate future columns,
        # so the kept rows are the same whatever full_forward is
        sub_df = sub_df.dropna(axis=0)

        if column_to_delete:
            sub_df = sub_df.drop(columns=column_to_delete)

        res.append(sub_df)

    final_df = pd.concat(res, axis=1)

    return final_df

## Previsioni

In [None]:
# DA PROVARE

import matplotlib.pyplot as plt
import numpy as np

def plot_prediction_train_vs_test(window_size, df, feature, train_predict, test_predict):
    '''
        Plots the results of the predictions (on train and on test) of a model (UNIVARIATE) for time series.
        Assuming that the source data comes from a Pandas dataframe.

        The training predictions are drawn starting at position window_size,
        the test predictions right after the training ones; every other
        position is left empty (NaN) so the three lines share the same x axis.

        Attributes:
            - window_size: (int) dimension of the window used for prediction
            - df: (Pandas.DataFrame) original dataframe
            - feature: (string) column of the dataframe on which the model has been trained
            - train_predict: (array[float]) predictions on train set, shape (n,) or (n, 1)
            - test_predict: (array[float]) predictions on test set, shape (m,) or (m, 1)

        Raises:
            - ValueError : if the predictions do not fit in the positions reserved for them
    '''

    n_rows = df.shape[0]

    # Normalise predictions to column vectors so both (n,) and (n, 1) inputs fit the slices below
    train_values = np.asarray(train_predict, dtype=float)
    train_values = train_values.reshape(train_values.shape[0], -1)
    test_values = np.asarray(test_predict, dtype=float)
    test_values = test_values.reshape(test_values.shape[0], -1)

    train_start = window_size
    test_start = window_size + len(train_values)

    # Start with training predictions.
    # The buffers are explicitly float: a buffer inheriting an integer dtype
    # from the feature column could not hold the NaN placeholders.
    train_predict_plot = np.full((n_rows, 1), np.nan)
    train_predict_plot[train_start:test_start, :] = train_values

    # Add test predictions
    test_predict_plot = np.full((n_rows, 1), np.nan)
    test_predict_plot[test_start:n_rows, :] = test_values

    # Create the plot
    plt.plot(df[feature].values, label = 'True value', alpha=0.4)
    plt.plot(train_predict_plot, label = 'Training set prediction')
    plt.plot(test_predict_plot, label = 'Test set prediction')
    plt.xlabel('')
    plt.ylabel('Values')
    plt.title('Comparison true vs. predicted training / test')
    plt.legend()
    plt.show()

## Decomposizione stagionale

In [None]:
# DA PROVARE

import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

def plot_seasonal_decomposition(dataframe, feature, figsize=(30, 10)):
    '''
        Draws, on a single figure, the components produced by the additive
        seasonal decomposition of one column of a dataframe.

        Attributes:
            - dataframe: (pandas.DataFrame) the data holding the series
            - feature: (string) the column of the DF on which to perform decomposition
            - figsize: ((X_dim, Y_dim)) the dimension of the figure

    '''

    parts = seasonal_decompose(dataframe[feature], model='additive')

    # (attribute of the decomposition result, label shown in the legend), in drawing order
    curves = (
        ('trend', 'trend'),
        ('seasonal', 'seasonal'),
        ('resid', 'residual'),
        ('observed', 'observed'),
    )

    plt.figure(figsize=figsize)
    for attribute, legend_label in curves:
        plt.plot(getattr(parts, attribute), label=legend_label)
    plt.title('Seasonal decomposition for: ' + str(feature))
    plt.legend()
    plt.show()

## White Noise

In [None]:
# DA PROVARE

try:
    from pandas.plotting import autocorrelation_plot
except ImportError:
    # Fallback for old pandas releases that only expose the legacy module path
    from pandas.tools.plotting import autocorrelation_plot

def check_series_is_white_noise(df, feature):
    '''
        Draws the autocorrelation plot of a column of a dataframe, as a
        visual check of whether the series looks like white noise.
        Missing values are replaced with 0 before plotting.

        Attributes:
            - df: (pandas.DataFrame) the data holding the series
            - feature: (string) the column of the DF to inspect
    '''
    autocorrelation_plot(df[feature].fillna(0))

## Autocorrelazione

In [None]:
# DA PROVARE

import pandas as pd
import numpy as np

def autocorr(x):
    '''
        Autocorrelation check.

        Computes the autocorrelation coefficient of the series for the lags
        from 1 up to roughly half of its length, selects the lag whose
        coefficient has the largest absolute value and prints whether the
        series appears autocorrelated (|r| > 0.5).

        Attributes:
            - x (numpy.array) : input 1-D array (lists and pandas.Series are accepted too)

        Returns:
            - float : r, the autocorrelation coefficient at the selected lag
            - int   : lag, the selected lag (>= 1)

        Raises:
            - ValueError : if x has fewer than 3 samples (no lag can be evaluated)
    '''
    # Work on a plain ndarray: lists become usable and the variance below is
    # always the population variance, whatever container the caller passed
    x = np.asarray(x)
    n = x.size
    if n < 3:
        raise ValueError('autocorr needs at least 3 samples, got {}'.format(n))

    norm = (x - np.mean(x))
    result = np.correlate(norm, norm, mode='same')

    # In 'same' mode lag 0 sits at index n//2; the entries after it are the
    # lags 1, 2, ... Each one is a sum over (n - lag) overlapping samples,
    # hence the n-1, n-2, ... normalisation.
    acorr = result[n//2 + 1:] / (np.var(x) * np.arange(n-1, n//2, -1))

    lag = np.abs(acorr).argmax() + 1
    r = acorr[lag-1]
    if np.abs(r) > 0.5:
        print('Appears to be autocorrelated with r = {}, lag = {}'.format(r, lag))
    else:
        print('Appears to be not autocorrelated')
    return r, lag


In [None]:
# DA PROVARE 

def remove_correlated_columns(dataframe, threshold=0.95):
    '''
        Removes the columns of a dataframe that are correlated more than threshold.

        For every pair of columns whose absolute correlation exceeds the
        threshold, the one that comes later in the column order is dropped.
        Only numeric (and boolean) columns take part in the correlation;
        the other columns are left untouched.

        Attributes:
            - dataframe: (pandas.DataFrame) the original dataframe (not modified)
            - threshold: (float, default = 0.95) the lower threshold for the correlation

        Returns:
            - pandas.DataFrame : a copy of the original dataframe without the correlated columns
            - [string] : the columns dropped
    '''
    # Correlation is computed on numeric data only
    numeric = dataframe.select_dtypes(include=['number', 'bool'])

    # Create correlation matrix
    corr_matrix = numeric.corr().abs()

    # Select upper triangle of correlation matrix (k=1 skips the diagonal),
    # so each pair is inspected once and a column is never compared with itself
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Find the feature columns with correlation greater than threshold
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    # Drop features by label; drop() returns a new frame, the input is left as it was
    df = dataframe.drop(columns=to_drop)

    return df, to_drop