# About
Creating train/test samples from the station time-series tables stored in the MySQL database.

In [1]:
%run "main_global.ipynb"

Connection with MySQL database is ready!


# Main

In [None]:
class sample_creation():
    """
    Build fixed-size sample groups (weekly by default: 24h x 7 days) from the result of a SQL query.

    The query result is trimmed to whole weeks, split chronologically into a train and a
    test partition, and each partition is reshaped into groups of `group_size` rows.
    """
    def __init__(self, sqlq, test_frac_split, group_size = 24 * 7, cols_to_drop = None):
        """

        Input:
        * sqlq: Simple sql query selecting the rows to build samples from.
        * test_frac_split: Fraction [0,1] of the complete groups assigned to the TRAIN partition;
          the remaining groups form the test partition.
        * group_size: Number of consecutive rows per sample (default 24h x 7 days).
        * cols_to_drop: Optional list of columns removed from the trimmed dataset.
          Defaults to ["nox", "rainf"].
        """

        # Initial parameters
        self.sqlq = sqlq
        self.test_frac_split = test_frac_split
        self.group_size = group_size

        # Define columns to drop. Default rationale:
        # * nox has great correlation to no2, and no
        # * rainf barely has any information.
        if cols_to_drop is None:
            cols_to_drop = ["nox", "rainf"]
        # Copy so that later changes to the caller's list do not affect this instance.
        self.cols_to_drop = list(cols_to_drop)

    def trimmed_df(self):
        """
        Query the table and keep only whole weeks: from the first Sunday at hour 0 up to
        the last Saturday at hour 23 that satisfy the WHERE clause of `self.sqlq`.

        Output:
        * DataFrame ordered by datetime, without the columns listed in `self.cols_to_drop`.
        """
        sqlq = self.sqlq
        where_sql, table_sql, cols_sql = where_from_sqlq(sqlq), tablename_from_sqlq(sqlq), cols_from_sqlq(sqlq)

        # NOTE(review): both boundary queries append "AND ..." to where_sql, so they presumably
        # require the original query to contain a WHERE clause -- confirm against where_from_sqlq.
        # NOTE(review): hour(datetime) is compared against the strings '00:00:00' / '23:00:00';
        # this presumably relies on MySQL's implicit string-to-number conversion -- confirm,
        # or compare against 0 / 23 instead.
        lower_aux_sqlq = "SELECT datetime FROM {} {} AND dayname(datetime) = 'Sunday' and hour(datetime) = '00:00:00' ORDER BY datetime ASC limit 1".format(table_sql, where_sql)
        upper_aux_sqlq = "SELECT datetime FROM {} {} AND dayname(datetime) = 'Saturday' and hour(datetime) = '23:00:00' ORDER BY datetime DESC limit 1".format(table_sql, where_sql)

        aux_sqlq = "Select {} from {} where datetime >= ({}) and datetime <= ({}) ORDER BY DATETIME ASC".format(", ".join(cols_sql), table_sql, lower_aux_sqlq, upper_aux_sqlq)

        df = DataFrame(aux_qdata(aux_sqlq), columns = cols_sql)

        # Ignore empty placeholders; an empty list simply means "drop nothing"
        # (indexing element 0 of an empty list used to raise IndexError).
        unwanted = [col for col in self.cols_to_drop if col != '']
        if unwanted:
            df = df.drop(unwanted, axis = 1)

        return df

    def trimmed_train_test_frac_split(self):
        """
        Split the trimmed dataset chronologically into train and test partitions whose
        lengths are both multiples of `group_size`.

        Rows beyond the last complete group are discarded so that `samples` can always
        reshape both partitions.

        Output:
        * (trim_train, trim_test): DataFrames indexed by "datetime".
        """

        df = self.trimmed_df()
        n_samples = df.shape[0] // self.group_size
        usable_rows = n_samples * self.group_size

        # int(...) keeps the slice bounds integral whatever `floor` implementation is in scope.
        train_shape = int(floor(n_samples * self.test_frac_split)) * self.group_size

        # Positional slicing: independent of the index labels of the queried frame.
        trim_train = df.iloc[:train_shape, :]
        trim_test = df.iloc[train_shape:usable_rows, :]

        return trim_train.set_index("datetime"), trim_test.set_index("datetime")

    def _as_groups(self, frame):
        """Reshape a 2D frame into an array of shape (n_groups, group_size, n_columns)."""
        values = frame.values
        n_groups, leftover = divmod(len(values), self.group_size)
        if leftover:
            raise ValueError("array split does not result in an equal division")
        return values.reshape(n_groups, self.group_size, values.shape[1])

    def samples(self):
        """
        Return the train and test partitions as 3D arrays of shape
        (n_groups, group_size, n_columns). An empty partition yields zero groups.
        """
        trim_train, trim_test = self.trimmed_train_test_frac_split()

        train = self._as_groups(trim_train)
        test = self._as_groups(trim_test)

        return train, test

In [None]:
# Weekly-sample pipeline: select every column of table sima_station_MVI_MICE_CE
# from the given start datetime onwards.
sqlq = "Select * from sima_station_MVI_MICE_CE WHERE datetime >=\'2020-04-17 23:00:00\'"
# Despite its name, sample_creation multiplies the number of groups by this value
# to size the TRAIN partition.
test_frac_split = 0.75
# NOTE(review): total_groups is not referenced anywhere else in the visible notebook.
total_groups = 35
# init_samples is reused by a later cell that calls init_samples.samples().
init_samples = sample_creation(sqlq, test_frac_split)
trim_train, trim_test = init_samples.trimmed_train_test_frac_split()

# Parameters

In [2]:
# mvi_method and station are interpolated into the table name of the SQL query below.
mvi_method = "MVI_MICE"
station = "CE"

# Column to predict.
target = "pm25"
# Rows per input window (24 * 5) and target values per output window (24 * 2).
look_back = 24 * 5
look_forward = 24 * 2
# Fraction of the observations located before the train/test split point.
test_frac_split = 0.75

# Columns removed from the final dataset (the name keeps the original spelling
# because later cells pass it on under this name).
colums_to_drop = ["nox", "rainf"]
# Number of batches; the number of queried rows is truncated to a multiple of it.
n_batches = 40

# NOTE(review): leading_times is not referenced anywhere else in the visible notebook.
leading_times = [1, 2, 3, 4, 5, 6, 12, 24, 48]


In [3]:
# Query used by sample_creation_time_metadata: every column of the table
# sima_station_<mvi_method>_<station> from the given start datetime onwards.
sqlq = "Select * from sima_station_{}_{} WHERE datetime >=\'2020-04-17 23:00:00\'".format(mvi_method, station)

# UDF

In [28]:
class sample_creation_time_metadata():
    """
    Build sliding-window (X, y) samples from a SQL table enriched with month/hour metadata.

    Each method calls the previous step of the pipeline:
    updated_table_time_metadata -> encoding_df -> normalizing_df -> X_y_sets -> train_test_sets
    """
    def __init__(self, target, look_back, look_forward, test_frac_split, sqlq, colums_to_drop, n_batches):
        """
        Input:
        * target: Name of the column to predict; it is moved to the last position of the dataset.
        * look_back: Number of consecutive rows in each input window.
        * look_forward: Number of target values in each output window.
        * test_frac_split: Fraction [0,1] of the observations located before the train/test split point.
        * sqlq: Simple sql query.
        * colums_to_drop: Columns removed from the final dataset.
        * n_batches: Number of batches; the number of rows used is truncated to a multiple of it.
        """

        # Parameters
        self.target = target
        self.look_back = look_back
        self.look_forward = look_forward
        self.test_frac_split = test_frac_split

        # Columns to drop from final dataset
        self.colums_to_drop = colums_to_drop

        # Recognizing parameters from sql command
        self.sqlq = sqlq
        self.where_sql = where_from_sqlq(sqlq)
        self.table_sql = tablename_from_sqlq(sqlq)
        self.cols_sql = cols_from_sqlq(sqlq)

        # Number of batches
        self.n_batches = n_batches

        # Counting total number of observations from sql command
        count_obj = aux_qdata("select count(*) from ({}) s1".format(sqlq))[0][0]
        self.count_obj = count_obj

        # Number of observations per batch (integer division discards the remainder)
        self.n_observations = int(count_obj) // n_batches

    def updated_table_time_metadata(self):
        """
        Query the first n_batches * n_observations rows (ordered by datetime) and add
        the month name and the hour of each row as extra columns.

        Output:
        * DataFrame indexed by "datetime", without `colums_to_drop`, with the target as last column.
        """
        # Work on a copy: inserting into self.cols_sql directly would add "month"/"hour"
        # again on every call and break the DataFrame construction the second time.
        frame_cols = list(self.cols_sql)

        # Table with added time metadata
        time_cols = ", ".join(frame_cols).replace("datetime", "datetime, monthname(datetime), hour(datetime)")
        # The LIMIT uses the instance attributes, not a notebook-level global.
        time_sqlq = """ Select {}
        from {} 
        {}
        ORDER BY DATETIME ASC
        LIMIT {}
        """.format(time_cols, self.table_sql, self.where_sql, self.n_batches * self.n_observations)

        # Editing column names
        # NOTE(review): positions 1 and 2 assume "datetime" is the first selected column -- confirm.
        frame_cols.insert(1, "month")
        frame_cols.insert(2, "hour")
        time_df = DataFrame(aux_qdata(time_sqlq), columns = frame_cols)

        # Update output removing undesired columns
        time_df = time_df.loc[:, ~time_df.columns.isin(self.colums_to_drop)]

        # Rearrange target to be at the end of the dataset
        target_col = time_df.pop(self.target)
        time_df[self.target] = target_col

        return time_df.set_index("datetime")

    def encoding_df(self):
        """
        Encode the "month" and "hour" columns with a LeaveOneOutEncoder fitted against the target.
        """
        from category_encoders import LeaveOneOutEncoder

        time_df = self.updated_table_time_metadata()

        # NOTE(review): the encoder is fitted on the complete dataset, before the
        # train/test split performed in train_test_sets.
        encoder = LeaveOneOutEncoder(cols = ["month", "hour"])
        encoded_df = encoder.fit_transform(time_df, time_df[self.target])

        return encoded_df


    def normalizing_df(self):
        """
        Scale every column of the encoded dataset with RobustScaler(with_centering = False),
        keeping the original index and column names.
        """
        from sklearn.preprocessing import RobustScaler

        # Calling encoded dataset
        enc_df = self.encoding_df()

        # NOTE(review): the scaler is fitted on the complete dataset (target included),
        # before the train/test split performed in train_test_sets.
        transformer = RobustScaler(with_centering = False).fit(enc_df)
        norm_np = transformer.transform(enc_df)
        norm_df = DataFrame(norm_np, index = enc_df.index, columns = enc_df.columns)

        return norm_df

    def X_y_sets(self):
        """
        Creating samples for X, y datasets by splitting the multivariate dataset.

        For every start row i:
        * X window: rows [i, i + look_back) of all columns except the last one (the target).
        * y window: `look_forward` values of the last column, starting at the LAST row of
          the X window (row i + look_back - 1).

        Output:
        * X: numpy array (n_windows, look_back, n_columns - 1)
        * y: numpy array (n_windows, look_forward)
        """

        df_values = self.normalizing_df().values
        look_back = self.look_back
        look_forward = self.look_forward

        X, y = list(), list()

        limit = len(df_values)

        for i in range(limit):

            # Updated indices delimiting end of series.
            end_ix = i + look_back
            out_end_ix = end_ix + look_forward - 1

            # check if we are beyond the dataset
            if out_end_ix > limit:
                break

            # gather input and output parts of the pattern
            X.append(df_values[i:end_ix, :-1])
            y.append(df_values[end_ix - 1:out_end_ix, -1])

        return array(X), array(y)


    def train_test_sets(self):
        """
        Split the windows chronologically into train and test sets and print their shapes.

        The split point is test_frac_split times the number of queried observations
        (n_observations * n_batches), applied to the window index.

        Output:
        * train_X, train_y, test_X, test_y
        """

        X, y = self.X_y_sets()

        # int(...) keeps the slice bound integral whatever `floor` implementation is in scope.
        split_point = int(floor(self.n_observations * self.n_batches * self.test_frac_split))

        train_X = X[:split_point, :]
        train_y = y[:split_point, :]
        test_X = X[split_point:, :]
        test_y  = y[split_point:, :]

        print("train_X.shape", train_X.shape)
        print("train_y.shape", train_y.shape)
        print("test_X.shape", test_X.shape)
        print("test_y.shape", test_y.shape)

        return train_X , train_y, test_X , test_y

In [29]:
# Build the sliding-window datasets from the parameters defined above;
# train_test_sets() also prints the shape of each returned array.
init_sc = sample_creation_time_metadata(target, look_back, look_forward, test_frac_split, sqlq, colums_to_drop, n_batches)
train_X , train_y, test_X , test_y = init_sc.train_test_sets()

train_X.shape (13140, 120, 14)
train_y.shape (13140, 48)
test_X.shape (4214, 120, 14)
test_y.shape (4214, 48)


In [31]:
# Scratch calculation: 4214 is the number of test windows printed above.
# NOTE(review): the purpose of dividing by 10 is not evident from this notebook.
4214 / 10

421.4

In [None]:
# NOTE(review): samples() returns (train, test), so X holds the train groups and
# y the test groups -- they are not a features/labels pair.
X, y = init_samples.samples()

In [None]:
# Shapes of the two arrays returned by init_samples.samples().
print(X.shape)
print(y.shape)

In [None]:
# NOTE(review): bare literal left in a scratch cell; its meaning is not evident
# from this notebook -- remove or document.
480