# Libraries

In [1]:
# pip install fancyimpute

In [2]:
import mysql.connector
import pandas as pd
import numpy as np
from pandasql import sqldf

# Data
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

from pytictoc import TicToc
t = TicToc() #create instance of class

# Global parameters

In [3]:
# For privacy reasons, User Information is located in a .txt file in the same folder
# First row contains the user name
# Second row contains the password

# Read the MySQL credentials (row 0: user name, row 1: password) and open the
# connection and cursor that the query helpers in this notebook rely on.
try:
    login = pd.read_csv(r'/home/cesar/Python_NBs/HDL_Project/Mini HDL/Baseline_ML_Pollution_Concentration_MMA/mysql_login.txt', header=None)
    mysql_user = login[0][0]
    mysql_pwd = login[0][1]
except Exception:
    # Without credentials the connection below cannot be opened. Report the
    # problem and surface the real cause instead of failing a few lines later
    # with an unrelated NameError on `mysql_user`.
    print('Login information is not available!!!')
    raise
else:
    print('User information is ready!')

mydb = mysql.connector.connect(
    host="localhost",
    user=mysql_user,
    password=mysql_pwd,
    database='HDL_Project',
)

mycursor = mydb.cursor()

User information is ready!


In [4]:
# Remove the credential variables from the namespace; the connection `mydb`
# and its cursor were already created from them in the previous cell.
del mysql_user, mysql_pwd

# Global functions

In [5]:
#https://dev.mysql.com/doc/connector-python/en/connector-python-example-cursor-transaction.html

def qdata(sqlq):
    """
    Run a SQL statement on the module-level MySQL cursor and return every
    fetched row exactly as the connector delivers it.

    Input:
    "sqlq": Query text (e.g. Select * from table)

    Returns:
    Whatever `mycursor.fetchall()` yields for the executed statement.
    """
    mycursor.execute(sqlq)
    return mycursor.fetchall()

In [6]:
def qdata2(sql_table, sqlq):
    """
    Query the MySQL database and return the result as a labelled DataFrame.

    Column labels are taken from ``show columns from <sql_table>`` and applied
    to the rows returned by `sqlq`, so the query has to return as many columns
    as the table has (e.g. a ``Select * ...`` query).

    Input:
    * `sql_table`: Table name
    * `sqlq`: Complete query (e.g. Select * from table where col1 = "val1")

    Returns:
    * pandas DataFrame with one row per fetched record. A query that matches
      no rows yields an empty DataFrame that still carries the column names.
    """
    col_names = [col[0] for col in qdata("show columns from {}".format(sql_table))]

    # Echo the statement so the notebook output shows what was executed.
    print(sqlq)

    # Passing the labels to the constructor (instead of assigning `.columns`
    # afterwards) keeps an empty result set from raising a length mismatch.
    return pd.DataFrame(qdata(sqlq), columns=col_names)

In [7]:
def mice_imputer(sqlq, max_iter_vals=100, sql_table=None):
    """
    Query a monitoring station's data and fill its missing values with
    Multivariate Imputation by Chained Equations (MICE), run through
    `IterativeImputer`.

    The `datetime` and `rainf` columns are set aside before imputation and put
    back at their original positions afterwards. In every other column a value
    of 0 is treated as missing.

    NOTE(review): no import of `IterativeImputer` is visible in this part of
    the file (the notebook mentions fancyimpute) - confirm it is in scope.

    Input:
    * sqlq: SQL query for data querying (e.g. Select * from sima_station_CE where datetime > \'2020-04-20\')
    * max_iter_vals: Value passed to the imputer as `max_iter`. Default = 100
    * sql_table: Table whose column names label the query result. When omitted,
      the module-level variable `sql_table` is used, as in earlier versions.

    Returns:
    * data_mice: DataFrame with imputed values and the original column labels.
    * missing_df: DataFrame with the percentage of missing values per imputed
      column (computed before imputation).
    """
    if sql_table is None:
        # Backward compatibility: the function used to read a global named
        # `sql_table`; keep that behaviour (and its NameError) when no table
        # is passed explicitly.
        try:
            sql_table = globals()["sql_table"]
        except KeyError:
            raise NameError("name 'sql_table' is not defined") from None

    data = qdata2(sql_table, sqlq)
    df_cols = data.columns

    # Columns the imputer must not touch, remembered with their original
    # position and sorted by it: re-inserting from left to right keeps every
    # insert position valid whichever of the two comes first in the table.
    held_out = sorted(
        (
            (data.columns.get_loc(name), name, data[name])
            for name in ("datetime", "rainf")
        ),
        key=lambda item: item[0],
    )

    # Subset
    data = data.loc[:, ~data.columns.isin(['datetime', 'rainf'])]

    # Replacing zeros with NA values (for MICE algorithm)
    data = data.replace(0, np.nan)
    missing_df = pd.DataFrame({"Missing Values (%)": data.isna().sum() / data.shape[0] * 100})

    # Imputing the missing values
    imputer = IterativeImputer(max_iter=max_iter_vals)
    data_mice = pd.DataFrame(imputer.fit_transform(data))

    # Reinserting the columns that were set aside
    for position, name, column in held_out:
        data_mice.insert(position, name, column)

    # Restoring the original column labels
    data_mice.columns = df_cols

    return data_mice, missing_df

In [8]:
def initial_data(sql_table, s_where):
    """
    Load a table into a DataFrame, adding `year_month`, `year` and `month`
    columns derived from `datetime` right after the `datetime` column.

    Input:
    * sql_table: Table name
    * s_where: Text appended after the table name in the query (e.g. a
      where clause)

    Returns:
    * DataFrame without the "NE3" column and with its columns renamed through
      the module-level `stations` mapping.
      NOTE(review): `stations` is not defined in this part of the file.
    """
    # Every column of the table, as a plain comma separated list.
    # To query specific columns instead, hard-code the list here,
    # e.g. "datetime, SE".
    table_columns = [row[0] for row in qdata("show columns from {}".format(sql_table))]
    plain_list = str(table_columns)[1:-1].replace('\'', '')

    # Select list: `datetime` is followed by its derived date parts.
    select_list = plain_list.replace("datetime", "datetime, concat(year(datetime), '-',  DATE_FORMAT(datetime, '%b')) as `year_month`, year(datetime) as `year`, month(datetime) as `month`")

    # DataFrame labels: the same list with each SQL expression reduced to its alias.
    label_list = select_list
    for sql_expr, alias in (
        ("concat(year(datetime), '-',  DATE_FORMAT(datetime, '%b')) as `year_month`", "year_month"),
        ("year(datetime) as `year`", "year"),
        ("month(datetime) as `month`", "month"),
    ):
        label_list = label_list.replace(sql_expr, alias)
    col_names = label_list.split(", ")

    query = "Select {} from {} {}".format(select_list, sql_table, s_where)
    data = pd.DataFrame(qdata(query))
    data.columns = col_names

    # Station NE3 is known beforehand not to exist, so it is dropped from the dataset.
    data = data.drop("NE3", axis=1)
    return data.rename(columns=dict(stations))

In [9]:
class data_processing_particles(object):
    """
    Sequential processing of data to obtain time series.

    Activities:
    - initial_df: Read the SQL table `<station_number>_ts` into a DataFrame.
    - samples_creation: Creation of sliding-window samples.
    """

    def __init__(self, station_number):
        """
        Input:
        * station_number: Database station number to process
        """
        self.station_number = station_number

    def initial_df(self):
        """
        Read `<station_number>_ts`, drop the `date`, `time` and `station`
        columns and return the rest indexed by `datetime`.
        """
        table = "{}_ts".format(self.station_number)

        # Read raw dataset components from SQL database
        rows = qdata("Select * from {}".format(table))
        col_names = [col[0] for col in qdata("show columns from {}".format(table))]

        # Labels go to the constructor so an empty table still yields a
        # correctly labelled (empty) frame.
        df = pd.DataFrame(rows, columns=col_names)

        # Drop unnecessary columns
        df = df.drop(columns=["date", "time", "station"])

        # Set `datetime` column as dataframe index
        return df.set_index('datetime')

    def samples_creation(self, n_steps, target_name):
        """
        Cut the station data into overlapping windows of `n_steps` rows.

        Input:
        * n_steps: Number of consecutive rows per sample
        * target_name: Column used as output; every other column is an input

        Returns:
        * X: list with one array per window holding the input columns
        * y: list with the target value of the last row of each window

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the
          number of rows.
        """
        df = self.initial_df()

        # Rearranging dataset to place target as last column
        feature_cols = [col for col in df.columns if col != target_name]
        arr = df[feature_cols + [target_name]].to_numpy()

        if n_steps < 1 or n_steps > len(arr):
            raise ValueError(
                "n_steps must be between 1 and the number of rows ({}); got {}".format(
                    len(arr), n_steps
                )
            )

        X, y = [], []
        for start in range(len(arr) - n_steps + 1):
            stop = start + n_steps
            # Inputs: all but the last column of the window.
            # Output: target value on the window's last row.
            X.append(arr[start:stop, :-1])
            y.append(arr[stop - 1, -1])

        return X, y

In [10]:
class univariate_samples(object):
    """
    Sequential processing of data to obtain univariate time series.
    This class is specific for SQL tables sima_* (e.g. sima_pm25)

    Activities:
    - initial_df: Read `datetime` and one data column from the SQL table.
    - samples_creation: Creation of sliding-window samples.
    """

    def __init__(self, table_name, column, where=""):
        """
        Input:
        * table_name: SQL table to read
        * column: Name of the data column to turn into a time series
        * where: Optional text appended to the query (e.g. a where clause)
        """
        self.table_name = table_name
        self.column = column
        self.where = where

    def initial_df(self):
        """
        Read `datetime` and `column` from the table, restricted by `where`,
        and return them as a DataFrame indexed by `datetime`.
        """
        # `where` used to be stored but never applied; append it to the query.
        # rstrip keeps the statement unchanged when no filter is given.
        query = "Select datetime, {} from {} {}".format(
            self.column, self.table_name, self.where
        ).rstrip()

        df = pd.DataFrame(qdata(query), columns=["datetime", self.column])

        # Set `datetime` column as dataframe index
        return df.set_index('datetime')

    def samples_creation(self, n_steps):
        """
        Cut the series into windows of `n_steps` values, each paired with the
        value that follows the window.

        Input:
        * n_steps: Number of consecutive values per sample

        Returns:
        * X: numpy array with one row per window
        * y: numpy array with the value right after each window

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the
          number of rows.
        """
        # Work on the raw values so every lookup below is positional,
        # whatever the type of the DataFrame index.
        values = self.initial_df()[self.column].to_numpy()

        if n_steps < 1 or n_steps > len(values):
            raise ValueError(
                "n_steps must be between 1 and the number of rows ({}); got {}".format(
                    len(values), n_steps
                )
            )

        # The last usable window ends one step before the end, because each
        # window needs a following value as its output.
        n_samples = len(values) - n_steps
        X = [values[start:start + n_steps] for start in range(n_samples)]
        y = [values[start + n_steps] for start in range(n_samples)]

        return np.array(X), np.array(y)

In [11]:
class multivariate_samples(object):
    """
    Sequential processing of data to obtain time series.

    Activities:
    - initial_df: Read the SQL table and min-max scale every column except
      the target.
    - samples_creation: Creation of sliding-window samples.
    """

    def __init__(self, table_name, target, cols='*', where=""):
        """
        Input:
        * table_name: SQL table to read
        * target: Column that is kept unscaled by `initial_df`
        * cols: '*' or a comma separated list of columns to select; the
          selection must include `datetime`, which becomes the index
        * where: Optional text appended to the query (e.g. a where clause)
        """
        self.table_name = table_name
        self.cols = cols
        self.where = where
        self.target = target

    def initial_df(self):
        """
        Read the selected columns, index and sort them by `datetime`, and
        min-max scale every column but the target.

        Scaled values that come out as NaN (missing inputs, or a column whose
        minimum equals its maximum) are set to 0. The target column is
        restored with its original values afterwards.
        """
        # Read raw dataset components from SQL database
        rows = qdata("Select {} from {} {}".format(self.cols, self.table_name, self.where))

        if self.cols == '*':
            col_names = [col[0] for col in qdata("show columns from {}".format(self.table_name))]
        else:
            # Accept "a, b" as well as "a,b".
            col_names = [name.strip() for name in self.cols.split(',')]

        df = pd.DataFrame(rows, columns=col_names)

        # Set `datetime` column as dataframe index
        df = df.set_index('datetime').sort_index()

        # Save temporary array with unmodified target information
        target_arr = df[self.target]

        # Data normalization
        df = (df - df.min()) / (df.max() - df.min())
        df = df.fillna(0)
        df[self.target] = target_arr

        return df

    def samples_creation(self, n_steps, target_name):
        """
        Cut the data into overlapping windows of `n_steps` rows.

        Input:
        * n_steps: Number of consecutive rows per sample
        * target_name: Column used as output; every other column is an input

        Returns:
        * X: numpy array of windows holding the input columns
        * y: numpy array with the target value of the last row of each window

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the
          number of rows.
        """
        df = self.initial_df()

        # Rearranging dataset to place target as last column
        feature_cols = [col for col in df.columns if col != target_name]
        arr = df[feature_cols + [target_name]].to_numpy()

        if n_steps < 1 or n_steps > len(arr):
            raise ValueError(
                "n_steps must be between 1 and the number of rows ({}); got {}".format(
                    len(arr), n_steps
                )
            )

        X, y = [], []
        for start in range(len(arr) - n_steps + 1):
            stop = start + n_steps
            # Inputs: all but the last column of the window.
            # Output: target value on the window's last row.
            X.append(arr[start:stop, :-1])
            y.append(arr[stop - 1, -1])

        return np.array(X), np.array(y)

In [12]:
def train_val_plot(history):
    """
    Plot the training and validation curves stored in `history.history` and
    print their minimum and final values.

    Input:
    * history: Object whose `history` attribute maps 'loss', 'val_loss',
      'mae' and 'val_mae' to per-epoch sequences.

    NOTE(review): `plt` is not imported in the visible part of the file -
    presumably matplotlib.pyplot; confirm it is in scope.
    """
    curves = history.history

    # Figure layout
    plt.figure(figsize=(8,5))
    plt.subplots_adjust(bottom=0.1, top=1.4)
    plt.tight_layout()

    # Left panel: loss per epoch
    plt.subplot(1, 2, 1)
    plt.plot(curves['loss'], label='train')
    plt.plot(curves['val_loss'], label='validation')
    plt.title("Loss")
    plt.xlabel('Epochs')
    plt.ylabel('Loss (MSE)')
    plt.legend(loc="upper right")

    # Right panel: mean absolute error per epoch
    plt.subplot(1, 2, 2)
    plt.plot(curves['mae'], label='train')
    plt.plot(curves['val_mae'], label='validation')
    plt.title("MAE")
    plt.ylabel('mae')
    plt.legend(loc="upper right")

    plt.show()

    # Release the figure once it has been displayed
    plt.cla()
    plt.clf()
    plt.close('all')

    # Text summary: loss first, then MAE
    print("Min training Loss:", min(curves["loss"]))
    print("Min validation Loss: ", min(curves["val_loss"]))
    print("")
    print("Final training Loss:", curves['loss'][-1])
    print("Final validation Loss:", curves['val_loss'][-1])
    print("")
    print("Min training MAE:", min(curves["mae"]))
    print("Min validation MAE: ", min(curves["val_mae"]))
    print("")
    print("Final training MAE:", curves['mae'][-1])
    print("Final validation MAE:", curves['val_mae'][-1])

In [13]:
# Evaluate a single model
def testing_evaluation(X_test, y_test, model):
    """
    Predict on the test inputs with `model` and score the predictions.

    Input:
    * X_test: Inputs passed to `model.predict`
    * y_test: Expected outputs
    * model: Fitted object exposing `predict`

    Returns:
    * dict with the keys "RMSE", "MAE", "MAPE", "R^2" and "Max Error"
    """
    # make predictions
    y_prediction = model.predict(X_test)

    # RMSE is derived from the per-output MSE instead of passing
    # `squared=False`, which newer scikit-learn releases no longer accept.
    # Averaging the per-output roots gives the same value as that flag did.
    per_output_mse = mean_squared_error(y_test, y_prediction, multioutput="raw_values")

    metrics = dict()
    metrics["RMSE"] = np.mean(np.sqrt(per_output_mse))
    metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
    metrics["MAPE"] = mean_absolute_percentage_error(y_test, y_prediction)
    metrics["R^2"] = r2_score(y_test, y_prediction)
    metrics["Max Error"] = max_error(y_test, y_prediction)

    return metrics

# Sources
* https://scikit-learn.org/stable/modules/model_evaluation.html