# Libraries

In [1]:
import mysql.connector
import pandas as pd
import numpy as np
from pandasql import sqldf

# Data
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

from pytictoc import TicToc
# Single TicToc instance created up front (presumably used to time later cells -- TODO confirm).
t = TicToc() #create instance of class

# Global parameters

In [2]:
# For privacy reasons, User Information is located in a .txt file in the same folder
# First row contains the user name
# Second row contains the password

try:
    login = pd.read_csv(r'/home/cesar/Python_NBs/HDL_Project/HDL_Project/mysql_login.txt', header=None)
    mysql_user = login[0][0]
    mysql_pwd = login[0][1]
    print('User information is ready!')
except Exception:
    # Report the problem and stop here. Without credentials the connection
    # below cannot be opened; carrying on would only surface an unrelated
    # NameError (or silently reuse stale values left over from an earlier run).
    # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt through.
    print('Login information is not available!!!')
    raise

# Open the connection to the local MySQL server and keep one cursor around
# for the query helpers defined further down.
mydb = mysql.connector.connect(
    host="localhost",
    user=mysql_user,
    password=mysql_pwd,
    database='HDL_Project',
)

mycursor = mydb.cursor()

User information is ready!


In [None]:
# Remove the credential variables from the namespace so they can no longer be read by name.
del mysql_user, mysql_pwd

# Global functions

In [3]:
#https://dev.mysql.com/doc/connector-python/en/connector-python-example-cursor-transaction.html

def qdata(sqlq):
    """
    Run a statement on the local MySQL Server Database and fetch all rows.

    Input:
    "sqlq": SQL text to execute (e.g. Select * from table)

    Output:
    The value returned by the cursor's `fetchall()` for that statement.
    """
    cursor = mycursor  # module-level cursor; must exist before this is called
    cursor.execute(sqlq)
    return cursor.fetchall()

In [4]:
class data_processing_particles(object):
    """
    Sequential processing of data to obtain time series.

    Activities:
    - initial_df: Read SQL dataset for specific station number.
    - samples_creation: Creation of samples array.
    """

    def __init__(self, station_number):
        """
        Input:
        * station_number: Database station number to process. It is used as
          the prefix of the table name (`<station_number>_ts`).
        """
        self.station_number = station_number

    def initial_df(self):
        """
        Read the `<station_number>_ts` table into a dataframe indexed by `datetime`.

        The `date`, `time` and `station` columns are dropped.

        NOTE: the table name is interpolated directly into the SQL text, so
        `station_number` must come from trusted code.
        """
        table = "{}_ts".format(self.station_number)

        # Read raw dataset components from SQL database
        sql_df = qdata("Select * from {}".format(table))
        col_names = [i[0] for i in qdata("show columns from {}".format(table))]

        # Create dataframe; passing the names at construction time keeps the
        # columns in place even when the table has no rows.
        df = pd.DataFrame(sql_df, columns=col_names)

        # Drop unnecessary columns
        df = df.drop(columns=["date", "time", "station"])

        # Set `datetime` column as dataframe index
        return df.set_index('datetime')

    def samples_creation(self, n_steps, target_name):
        """
        Transformation of Dataframe object into sliding-window samples (input, output)

        Input:
        * n_steps: Number of consecutive rows in every window (1 <= n_steps <= number of rows).
        * target_name: Column used as output; it is excluded from the inputs.

        Output:
        * X: list with one 2-D array per window (n_steps rows, every column except the target).
        * y: list with the target value found on the last row of each window.

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the number of rows.
        * KeyError: if `target_name` is not a column of the dataframe.
        """
        df = self.initial_df()

        # Rearranging dataset to place target as last column
        feature_cols = [col for col in df.columns if col != target_name]
        arr = df[feature_cols + [target_name]].to_numpy()

        n_rows = len(arr)
        if not 1 <= n_steps <= n_rows:
            raise ValueError(
                "n_steps must be between 1 and the number of rows ({}); got {}".format(n_rows, n_steps)
            )

        # Creating samples: window k covers rows [k, k + n_steps)
        X, y = [], []
        for start in range(n_rows - n_steps + 1):
            end_ix = start + n_steps
            X.append(arr[start:end_ix, :-1])
            y.append(arr[end_ix - 1, -1])

        return X, y

In [5]:
class univariate_samples(object):
    """
    Sequential processing of data to obtain univariate time series.
    This class is specific for SQL tables sima_* (e.g. sima_pm25)

    Activities:
    - initial_df: Read SQL dataset (datetime + one column) from the table.
    - samples_creation: Creation of samples array.
    """

    def __init__(self, table_name, column, where = ""):
        """
        Input:
        * table_name: SQL table to read from.
        * column: Column holding the series values.
        * where: Optional SQL clause appended after the table name
          (e.g. "where datetime >= '2020-01-01'"). Empty by default.

        NOTE: all three values are interpolated directly into the SQL text,
        so they must come from trusted code.
        """
        self.table_name = table_name
        self.column = column
        self.where = where

    def initial_df(self):
        """
        Read `datetime` and the selected column into a dataframe indexed by `datetime`.
        """
        # Read raw dataset components from SQL database
        query = "Select datetime, {} from {}".format(self.column, self.table_name)
        if self.where:
            # Honour the optional filter given to the constructor
            query = "{} {}".format(query, self.where)
        sql_df = qdata(query)

        # Create dataframe
        df = pd.DataFrame(sql_df, columns=["datetime", self.column])

        # Set `datetime` column as dataframe index
        return df.set_index('datetime')

    def samples_creation(self, n_steps):
        """
        Transformation of Dataframe object into numpy.ndarray objects (input, output)

        Input:
        * n_steps: Number of consecutive values in every input window
          (1 <= n_steps <= number of rows).

        Output:
        * X: array with one row per window (n_steps values each).
        * y: array with the value that immediately follows each window.
          Both arrays are empty when n_steps equals the number of rows.

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the number of rows.
        """
        # Work on a plain array so every lookup below is positional, whatever
        # index type the dataframe carries.
        values = self.initial_df()[self.column].to_numpy()

        n_rows = len(values)
        if not 1 <= n_steps <= n_rows:
            raise ValueError(
                "n_steps must be between 1 and the number of rows ({}); got {}".format(n_rows, n_steps)
            )

        # Creating samples: window k covers [k, k + n_steps) and predicts k + n_steps
        X, y = [], []
        for start in range(n_rows - n_steps):
            end_ix = start + n_steps
            X.append(values[start:end_ix])
            y.append(values[end_ix])

        return np.array(X), np.array(y)

In [6]:
class multivariate_samples(object):
    """
    Sequential processing of data to obtain time series.

    Activities:
    - initial_df: Read SQL dataset and min-max normalize every column except the target.
    - samples_creation: Creation of samples array.
    """

    def __init__(self, table_name, target, cols = '*', where = ""):
        """
        Input:
        * table_name: SQL table to read from.
        * target: Column that is kept un-normalized by `initial_df`.
        * cols: Comma-separated column list for the query, or '*' for every column.
          The selection must include `datetime`.
        * where: Optional SQL clause appended after the table name.

        NOTE: these values are interpolated directly into the SQL text, so
        they must come from trusted code.
        """
        self.table_name = table_name
        self.cols = cols
        self.where = where
        self.target = target

    def initial_df(self):
        """
        Read the table into a dataframe indexed and sorted by `datetime`.

        Every column is min-max scaled to [0, 1]; missing values (including
        the ones produced by constant columns) become 0. The target column is
        then restored to its original, unscaled values.
        """
        # Read raw dataset components from SQL database
        sql_df = qdata("Select {} from {} {}".format(self.cols, self.table_name, self.where))

        if self.cols == '*':
            col_names = [i[0] for i in qdata("show columns from {}".format(self.table_name))]
        else:
            # Tolerate any spacing around the commas ("a,b", "a, b", "a ,  b")
            col_names = [name.strip() for name in self.cols.split(',')]

        # Create dataframe
        df = pd.DataFrame(sql_df, columns=col_names)

        # Set `datetime` column as dataframe index
        df = df.set_index('datetime').sort_index()

        # Save temporary array with unmodified target information
        target_arr = df[self.target]

        # Data normalization
        df = (df - df.min()) / (df.max() - df.min())
        df = df.fillna(0)
        df[self.target] = target_arr

        return df

    def samples_creation(self, n_steps, target_name):
        """
        Transformation of Dataframe object into numpy.ndarray objects (input, output)

        Input:
        * n_steps: Number of consecutive rows in every window (1 <= n_steps <= number of rows).
        * target_name: Column used as output; it is excluded from the inputs.

        Output:
        * X: array with one entry per window, each holding n_steps rows of every
          column except the target.
        * y: array with the target value found on the last row of each window.

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the number of rows.
        * KeyError: if `target_name` is not a column of the dataframe.
        """
        df = self.initial_df()

        # Rearranging dataset to place target as last column
        feature_cols = [col for col in df.columns if col != target_name]
        arr = df[feature_cols + [target_name]].to_numpy()

        n_rows = len(arr)
        if not 1 <= n_steps <= n_rows:
            raise ValueError(
                "n_steps must be between 1 and the number of rows ({}); got {}".format(n_rows, n_steps)
            )

        # Creating samples: window k covers rows [k, k + n_steps)
        X, y = [], []
        for start in range(n_rows - n_steps + 1):
            end_ix = start + n_steps
            X.append(arr[start:end_ix, :-1])
            y.append(arr[end_ix - 1, -1])

        return np.array(X), np.array(y)

In [7]:
def train_val_plot(history):
    """
    Draw the training/validation curves side by side and print a short summary.

    Input:
    * history: object whose `history` attribute maps 'loss', 'val_loss', 'mae'
      and 'val_mae' to per-epoch sequences (presumably a Keras History -- TODO confirm).

    NOTE(review): `plt` is expected to be matplotlib.pyplot, but no import for it
    is visible alongside the other imports -- confirm it is imported before use.
    """
    log = history.history

    # (train key, validation key, title, x label or None, y label), left to right
    panels = [
        ('loss', 'val_loss', "Loss", 'Epochs', 'Loss (MSE)'),
        ('mae', 'val_mae', "MAE", None, 'mae'),
    ]

    # Figure setup
    plt.figure(figsize=(8,5))
    plt.subplots_adjust(bottom=0.1, top=1.4)
    plt.tight_layout()

    # One panel per metric
    for position, (train_key, val_key, title, x_label, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(log[train_key], label='train')
        plt.plot(log[val_key], label='validation')
        plt.title(title)
        if x_label is not None:
            plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(loc="upper right")

    plt.show()

    # Release the figure once it has been displayed
    plt.cla()
    plt.clf()
    plt.close('all')

    # Text summary: best and last value of every curve
    print("Min training Loss:", min(log["loss"]))
    print("Min validation Loss: ", min(log["val_loss"]))
    print("")
    print("Final training Loss:", log['loss'][-1])
    print("Final validation Loss:", log['val_loss'][-1])
    print("")
    print("Min training MAE:", min(log["mae"]))
    print("Min validation MAE: ", min(log["val_mae"]))
    print("")
    print("Final training MAE:", log['mae'][-1])
    print("Final validation MAE:", log['val_mae'][-1])

In [8]:
# Evaluate a single model
def testing_evaluation(X_test, y_test, model):
    """
    Predict on the test inputs and score the predictions with regression metrics.

    Input:
    * X_test: Inputs passed to `model.predict`.
    * y_test: True target values.
    * model: Any object exposing `predict(X_test)`.

    Output:
    * dict with the keys "RMSE", "MAE", "MAPE", "R^2" and "Max Error".
    """
    # make predictions
    y_prediction = model.predict(X_test)

    # RMSE is derived from the per-output MSE instead of `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the root per output and then averaging gives the same value.
    per_output_mse = mean_squared_error(y_test, y_prediction, multioutput="raw_values")

    # evaluate predictions (accuracy_score is left out, as in the original)
    metrics = dict()
    metrics["RMSE"] = np.mean(np.sqrt(per_output_mse))
    metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
    metrics["MAPE"] = mean_absolute_percentage_error(y_test, y_prediction)
    metrics["R^2"] = r2_score(y_test, y_prediction)
    metrics["Max Error"] = max_error(y_test, y_prediction)

    return metrics

# Sources
* https://scikit-learn.org/stable/modules/model_evaluation.html