# Prototype Code for Data Generation and Simulation

Importing libraries:

In [93]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_regression

### Data generation and sampling

We define a class named CustomData that generates the entire dataset and can return batches of data upon request during the simulation.

In [94]:
class CustomData:
    """
    Generates the entire synthetic regression data set up front and returns
    consecutive batches of it on request.

    The data is produced once in the constructor with
    sklearn.datasets.make_regression and kept in a DataFrame whose columns
    are "feature 1" ... "feature n_features" followed by "labels".
    """
    def __init__(self,n_samples=1000,n_features=20,n_informative=8):
        """
        Parameters
        ----------
        n_samples: number of samples to be generated
        n_features: number of all possible features
        n_informative: number of features the model is based on
        """
        self._n_samples = n_samples
        self._n_features = n_features
        self._n_informative = n_informative
        self._n_targets = 1
        self._bias = 1
        self._effective_rank = 20
        self._tail_strength = 0.7
        self._noise = 0.1
        self._shuffle = True
        self._coef = True
        self._random_state = 42
        # Pass everything by keyword: recent scikit-learn releases accept only
        # n_samples and n_features positionally and reject the rest.
        # With coef=True the third returned value holds the coefficients of
        # the underlying linear model.
        self._X, self._y, self._model = make_regression(
            n_samples=self._n_samples,
            n_features=self._n_features,
            n_informative=self._n_informative,
            n_targets=self._n_targets,
            bias=self._bias,
            effective_rank=self._effective_rank,
            tail_strength=self._tail_strength,
            noise=self._noise,
            shuffle=self._shuffle,
            coef=self._coef,
            random_state=self._random_state)
        # Number of rows that have already been handed out by collect().
        self._collected = 0
        index = range(1,self._n_samples+1)
        columns = ["feature {}".format(i) for i in range(1,self._n_features+1)] + ["labels"]
        self._complete_data = pd.DataFrame(np.c_[self._X,self._y], index=index, columns=columns)

    def collect(self,n_instances,feature_list=0):
        """
        Returns the next n_instances rows of the data set
        Parameters
        ----------
        n_instances: number of collected instances
        feature_list: a list (or array) of 1-based feature numbers that are to be
                      collected; 0 (the default) or None selects all features
        Returns
        -------
        X: X-matrix containing the values corresponding each instance and selected feature
        y: y-vector containing the labels of each instance
        If fewer than n_instances rows remain, a message is printed and
        empty arrays of shape (0, len(feature_list)) and (0, 1) are returned.
        """
        # Only a scalar 0 (or None) means "all features". Comparing an array
        # with == 0 would be evaluated element-wise and cannot be used in `if`.
        if feature_list is None or (np.isscalar(feature_list) and feature_list == 0):
            feature_list = range(1,self._n_features+1)
        else:
            feature_list = list(feature_list)
        start = self._collected
        stop = start + n_instances
        if stop > self._n_samples:
            print("Not enough samples to collect {} instances. Try a lower number of instances.".format(n_instances))
            print("Returning 0-by-|feature_list| X and 0-by-1 y.")
            return np.empty(shape=(0,len(feature_list))), np.empty(shape=(0,1))
        feature_columns = ["feature {}".format(i) for i in feature_list]
        # Rows are taken by position; the DataFrame index itself starts at 1.
        X = self._complete_data[feature_columns].iloc[start:stop].copy()
        y = self._complete_data[["labels"]].iloc[start:stop].copy()
        self._collected = stop
        return np.array(X), np.array(y)

### Computing correlations

The following functions compute correlation matrices and correlation vectors for the input data and labels.

In [95]:
def get_correlation_matrix(X,y):
    """
    Build the pairwise correlation matrix of all features and the labels
    Parameters
    ----------
    X: Input data X (one row per instance, one column per feature)
    y: Labels y
    Returns
    -------
    corr_matrix: DataFrame of correlations for every (feature,feature) and
                 (feature,label) pair; rows/columns are named "feature i" and "labels"
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    # Column names follow the 1-based "feature i" convention, labels come last.
    names = list(map("feature {}".format, range(1, n_cols+1)))
    names.append("labels")
    stacked = np.c_[X,y]
    frame = pd.DataFrame(stacked, index=range(1, n_rows+1), columns=names)
    return frame.corr()

In [96]:
def get_correlation_vec(X,y):
    """
    Correlation of every feature of X (and of the labels themselves) with the labels y
    Parameters
    ----------
    X: Input data X (one row per instance, one column per feature)
    y: Labels y
    Returns
    -------
    corr_vec: Series indexed by "feature i" / "labels", holding the correlation
              with the labels, sorted from largest to smallest value
    """
    feature_names = ["feature {}".format(k+1) for k in range(X.shape[1])]
    row_labels = range(1, X.shape[0]+1)
    table = pd.DataFrame(np.c_[X,y], index=row_labels, columns=feature_names + ["labels"])
    # Only the "labels" column of the full correlation matrix is needed.
    label_column = table.corr()["labels"]
    return label_column.sort_values(ascending=False)

### Simulation

The following function prototype performs feature selection: it returns a list of features chosen according to the input parameters. To compute the reward for each feature, we use a formulation inspired by Upper-Confidence-Bound action selection (Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. MIT press.): $reward_i = J_i + c\sqrt{\ln(step+1)/N_i}$. Here $J$ is the value of each feature, $c$ is the trade-off factor between exploitation and exploration, $step$ represents the time, $N$ is the number of samples collected for each feature, and $feature\_limit$ is the number of features that should be selected.

For features with $N=0$, reward is $\infty$, meaning that each feature will be selected and some data will be collected before any real computations are made.

$N$ also counts the samples whose features have been selected but whose values have not been collected yet. In this way the algorithm accounts for features that were selected in the previous step but are still waiting for data.


In [97]:
def feature_selection(J,c,step,N,feature_limit):
    """
    Selects and returns optimal features
    Parameters
    ----------
    J: Value vector for features (list or array; an (n,1) column is accepted)
    c: Trade-off factor (exploitation vs. exploration)
    step: Current time (t)
    N: Number of samples collected for each feature (including previously selected features that are to be collected)
    feature_limit: desired number of features to be selected
    Returns
    -------
    feature_list: a list of selected features (1-based feature numbers), ordered
                  from highest to lowest reward; contains at most len(J) entries
    Raises
    ------
    ValueError: if J and N do not describe the same number of features
    """
    # Flatten both inputs so lists, 1-D arrays and (n,1) columns behave alike
    # and every reward is a plain float.
    values = np.asarray(J, dtype=float).ravel()
    counts = np.asarray(N, dtype=float).ravel()
    if values.size != counts.size:
        raise ValueError("J and N must have the same number of entries")

    # Features without any sample get an infinite reward so they are tried first.
    reward = np.full(values.shape, np.inf)
    sampled = counts != 0
    reward[sampled] = values[sampled] + c*np.sqrt(np.log(step+1)/counts[sampled])

    # Never ask for more features than exist; otherwise the same feature
    # would be returned repeatedly once all rewards are exhausted.
    n_select = min(feature_limit, reward.size)
    feature_list = list()
    while len(feature_list) < n_select:
        best = int(np.argmax(reward))
        feature_list.append(best+1)
        # Exclude the chosen feature from the following picks.
        reward[best] = -np.inf

    return feature_list

The following function prototype adds the data collected in the current batch to the memory. Note that $X_{memo}$ has a shape of $n_{samples} \times n_{features}$. Therefore, when the sampled data is transferred to memory, only the collected features will have non-zero entries; the columns of all other features are filled with zeros.

In [98]:
def add_to_memory(X_memo, X_batch, y_memo, y_batch, selected_features):
    """
    Add collected batch of data to memory
    Parameters
    ----------
    X_memo: current X-matrix in memory (one column per possible feature)
    X_batch: collected data X (one column per entry of selected_features, in the same order)
    y_memo: current y-vector in memory
    y_batch: assigned labels y (same number of dimensions as y_memo)
    selected_features: features (1-based numbers) that the collected data X is based on
    Returns
    -------
    X_memo: updated X-matrix in memory
    y_memo: updated y-vector in memory
    """
    # Start from an all-zero block as wide as the memory; features that were
    # not collected in this batch keep the value 0.
    X_append = np.zeros((X_batch.shape[0],X_memo.shape[1]))
    for column, feature in enumerate(selected_features):
        X_append[:,feature-1] = X_batch[:,column]
    X_memo = np.concatenate((X_memo, X_append))
    # Labels are appended unchanged.
    y_memo = np.concatenate((y_memo, y_batch))
    return X_memo, y_memo

Following is a function prototype for computing the feature values given the data in memory.

Computation of $J$ is currently based on the correlation value between each feature and the label according to the entire data in memory.

In [99]:
def update_J(X_memo, y_memo):
    """
    Derive the feature values J from the data held in memory
    Parameters
    ----------
    X_memo: current X-matrix in memory
    y_memo: current y-vector in memory
    Returns
    -------
    J: list with one value per feature: the absolute correlation between the
       feature and the labels, or 0 where that correlation is NaN
    """
    label_corr = get_correlation_vec(X_memo, y_memo)
    n_features = X_memo.shape[1]
    # Look the features up by name so that J is in feature order, not in the
    # sorted order of the correlation vector.
    per_feature = (label_corr['feature {}'.format(k)] for k in range(1, n_features+1))
    return [0 if np.isnan(value) else abs(value) for value in per_feature]

Following is a function prototype for updating the number of samples collected for each feature.

In [100]:
def update_N(N, selected_features, n_instances):
    """
    Increase the sample counters of the selected features
    Parameters
    ----------
    N: current number of samples of each feature (modified in place)
    selected_features: selected features (1-based numbers) that require an update
    n_instances: collected number of samples that are to be added
    Returns
    -------
    N: the same container, with n_instances added for each selected feature
    """
    # Feature numbers are 1-based, positions in N are 0-based.
    positions = (number - 1 for number in selected_features)
    for position in positions:
        N[position] += n_instances
    return N

Following function prototype simulates a given step of feature selection and data collection procedures.

We initialize $X_{memo}$ and $y_{memo}$ as an empty matrix and an empty vector, and $J$ and $N$ are initialized to zeros. The trade-off factor is set to 2. The list of previous features is initialized as an empty list.

For the given number of steps, we select features depending on the feature values $J$, the trade-off factor $c$, the current step number $step$, the number of samples collected for each feature $N$, and the number of features to be selected $feature\_limit$ (5 by default). Then, we collect data using the previously selected features (we assume that there is a delay between the feature selection process and the collection of data, so we can only collect data using the previously selected features). However, the number of samples $N$ is updated using the most recently selected features. The collected data is added to memory, and the feature values are computed from the data in memory. Lastly, the selected features are stored as the previous features for the next step.

In [101]:
def simulate_procedure(dataset,n_steps=100,n_vehicles=100,feature_limit=5):
    """
    Simulates gradual data collection procedure
    Parameters
    ----------
    dataset: entire dataset including all future data
    n_steps: number of steps
    n_vehicles: number of vehicles to collect data from
    feature_limit: desired number of features to be selected
    Returns
    -------
    None; the features selected in every step are printed
    """
    X_memo = np.empty(shape=(0,dataset._n_features))
    y_memo = np.empty(shape=(0,1))
    # Keep J and N one-dimensional so that J[i] and N[i] are scalars. With
    # (n,1) columns the rewards become a mix of floats and length-1 arrays,
    # which recent NumPy versions refuse to turn into an array.
    J = np.zeros(dataset._n_features)
    N = np.zeros(dataset._n_features)
    c = 2
    prev_features = list()
    for step in range(n_steps):
        selected_features = feature_selection(J,c,step,N,feature_limit)
        print("Selected features in step {}:".format(step))
        print(selected_features)
        # Data arrives one step late: this batch contains the features chosen
        # in the previous step (none at all in the very first step).
        X_batch, y_batch = dataset.collect(n_vehicles, prev_features)
        # The counters already include the features just selected, although
        # their data will only be collected in the next step.
        N = update_N(N,selected_features, n_vehicles)
        X_memo, y_memo = add_to_memory(X_memo, X_batch, y_memo, y_batch, prev_features)
        J = update_J(X_memo, y_memo)
        prev_features = selected_features

Following code generates the data set and runs a simulation.

In [102]:
# Generate the complete data set once: 10000 samples, 20 features, 8 of them informative.
dataset = CustomData(n_samples=10000, n_features=20, n_informative=8)

In [103]:
# Run the simulation with its default arguments (n_steps=100, n_vehicles=100, feature_limit=5).
simulate_procedure(dataset)

Selected features in step 0:
[1, 2, 3, 4, 5]
Selected features in step 1:
[6, 7, 8, 9, 10]
Selected features in step 2:
[11, 12, 13, 14, 15]
Selected features in step 3:
[16, 17, 18, 19, 20]
Selected features in step 4:
[4, 3, 11, 9, 10]
Selected features in step 5:
[7, 19, 4, 17, 3]
Selected features in step 6:
[3, 9, 11, 10, 4]
Selected features in step 7:
[3, 4, 19, 9, 11]
Selected features in step 8:
[3, 9, 4, 8, 2]
Selected features in step 9:
[3, 9, 4, 11, 17]
Selected features in step 10:
[3, 9, 4, 11, 13]
Selected features in step 11:
[3, 9, 4, 11, 15]
Selected features in step 12:
[3, 9, 4, 11, 14]
Selected features in step 13:
[3, 9, 4, 11, 6]
Selected features in step 14:
[3, 9, 4, 11, 5]
Selected features in step 15:
[3, 9, 4, 11, 12]
Selected features in step 16:
[3, 9, 4, 11, 1]
Selected features in step 17:
[3, 9, 4, 11, 20]
Selected features in step 18:
[3, 9, 4, 11, 18]
Selected features in step 19:
[3, 9, 4, 11, 16]
Selected features in step 20:
[3, 9, 4, 11, 19]
Sele

Following code provides the correlation vector using the entire dataset, and the actual model that is used to generate the data.

In [104]:
# Correlation of every feature with the labels, computed on the complete
# data set rather than on the partially collected memory.
corr_vec = get_correlation_vec(dataset._X, dataset._y)
for header_line in ("Correlation vector obtained using entire dataset:",
                    "================================"):
    print(header_line)
print(corr_vec)

Correlation vector obtained using entire dataset:
labels        1.000000
feature 3     0.468577
feature 9     0.440805
feature 11    0.405514
feature 4     0.384567
feature 19    0.378801
feature 17    0.283558
feature 10    0.272381
feature 8     0.205915
feature 14    0.048127
feature 20    0.037011
feature 15    0.029555
feature 6     0.028081
feature 7     0.025649
feature 12    0.022933
feature 2     0.022606
feature 16    0.006147
feature 1     0.005914
feature 13   -0.001007
feature 18   -0.004236
feature 5    -0.024030
Name: labels, dtype: float64


In [105]:
# Print the linear model that generated the data as a readable equation,
# listing only the features with a non-zero coefficient.
actual_model = dataset._model
print("Actual model of the system:")
print("================================")
model_eq = "y(x) ~="
# Features are numbered from 1, matching the "feature i" column names.
for feature, coef in enumerate(actual_model, start=1):
    if coef != 0:
        model_eq += " {:.2f} x_{} +".format(coef,feature)
# Drop the trailing "+"; if no coefficient is non-zero there is none to drop,
# and the bare left-hand side is printed instead of raising a NameError.
actual_model_eq = model_eq[:-1] if model_eq.endswith("+") else model_eq
print(actual_model_eq)

Actual model of the system:
y(x) ~= 90.99 x_3 + 63.61 x_4 + 39.22 x_8 + 88.86 x_9 + 39.04 x_10 + 65.27 x_11 + 63.08 x_17 + 72.97 x_19 
