# Prototype Code for Data Generation and Simulation

Importing libraries:

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_regression

### Data generation and sampling

We define a class named CustomData that generates the entire dataset and can return batches of data upon request during the simulation.

In [8]:
class CustomData:
    """
    Synthetic regression data set that is generated once and handed out in batches.

    The complete data set is created in the constructor with
    sklearn.datasets.make_regression and stored in a DataFrame whose columns
    are "feature 1" ... "feature <n_features>" followed by "labels".
    Successive calls to collect() return consecutive, non-overlapping slices
    of rows.
    """
    def __init__(self, n_samples=1000, n_features=20, n_informative=8):
        """
        n_samples: number of samples to be generated
        n_features: number of all possible features
        n_informative: number of features the model is based on
        """
        self._n_samples = n_samples
        self._n_features = n_features
        self._n_informative = n_informative
        self._n_targets = 1
        self._bias = 1
        self._effective_rank = 20
        self._tail_strength = 0.7
        self._noise = 0.1
        self._shuffle = True
        self._coef = True
        self._random_state = 42
        # Pass every argument by keyword: recent scikit-learn releases accept
        # only n_samples and n_features positionally and raise TypeError for
        # the rest. Because coef=True, the call returns (X, y, coefficients).
        self._X, self._y, self._model = make_regression(
            n_samples=self._n_samples,
            n_features=self._n_features,
            n_informative=self._n_informative,
            n_targets=self._n_targets,
            bias=self._bias,
            effective_rank=self._effective_rank,
            tail_strength=self._tail_strength,
            noise=self._noise,
            shuffle=self._shuffle,
            coef=self._coef,
            random_state=self._random_state,
        )
        # Number of rows already handed out by collect().
        self._collected = 0
        index = range(1, self._n_samples + 1)
        columns = ["feature {}".format(i) for i in range(1, self._n_features + 1)] + ["labels"]
        self._complete_data = pd.DataFrame(np.c_[self._X, self._y],
                                           index=index,
                                           columns=columns)

    def collect(self, n_instances, feature_list):
        """
        Returns the next batch of data as a pair of numpy arrays (X, y)
        n_instances: number of collected instances
        feature_list: a list of features (1-based feature numbers) that are to be collected

        X holds the requested feature columns and y the "labels" column as a
        2-D array with one column. If fewer than n_instances rows remain, the
        returned arrays are correspondingly shorter; the internal cursor still
        advances by n_instances.
        """
        start = self._collected
        stop = start + n_instances
        feature_columns = ["feature {}".format(i) for i in feature_list]
        # iloc makes the row selection explicitly positional, independent of
        # the 1-based index labels of the stored DataFrame.
        batch = self._complete_data.iloc[start:stop]
        X = np.array(batch[feature_columns])
        y = np.array(batch[["labels"]])
        self._collected = stop
        return X, y

### Computing correlations

The following functions compute the full correlation matrix and the feature–label correlation vector for the given input data and labels.

In [4]:
def get_correlation_matrix(X, y):
    """
    Build the pairwise correlation matrix of all features in X and the outputs y

    The columns of X are named "feature 1", "feature 2", ... and y is appended
    as the last column, "labels". The result is the DataFrame returned by
    DataFrame.corr() on that table.
    """
    row_labels = range(1, X.shape[0] + 1)
    feature_names = [f"feature {k}" for k in range(1, X.shape[1] + 1)]
    # Features and labels side by side, one row per instance.
    table = pd.DataFrame(np.c_[X, y],
                         index=row_labels,
                         columns=feature_names + ["labels"])
    return table.corr()

In [6]:
def get_correlation_vec(X, y):
    """
    Correlation of every column with the labels y, sorted from highest to lowest

    The columns of X are named "feature 1", "feature 2", ... and y is appended
    as "labels". The returned Series is the "labels" column of the correlation
    matrix, so it also contains the entry for "labels" itself.
    """
    row_labels = range(1, X.shape[0] + 1)
    names = [f"feature {k}" for k in range(1, X.shape[1] + 1)]
    names.append("labels")
    table = pd.DataFrame(np.c_[X, y], index=row_labels, columns=names)
    label_correlations = table.corr()["labels"]
    return label_correlations.sort_values(ascending=False)

### Simulation

The following is a function prototype for feature selection; it returns a list of features chosen according to the input performance measure.

In [11]:
def feature_selection(measure):
    """
    Choose the features to collect, given the current performance measure

    Prototype: the measure is not used yet and a new empty list is returned.
    """
    return []

The following is a function prototype that adds the data collected in the current batch to the memory.

In [13]:
def add_to_memory(X_memo, x_batch, y_memo, y_batch):
    """
    Merge a newly collected batch into the stored data

    Prototype: all four arguments are currently ignored and a pair of
    placeholder arrays, each created with np.array(None), is returned.
    """
    # Placeholders standing in for the updated feature and label memories.
    return np.array(None), np.array(None)

The following is a function prototype for computing the performance measure from the history of collected data.

In [14]:
def compute_measure(X_memo, y_memo):
    """
    Evaluate the performance measure that drives the feature selection

    Prototype: the stored data is not inspected yet and the measure is always 0.
    """
    return 0

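The following function prototype simulates the procedure step by step: in each step, features are selected, a batch of data is collected, the batch is added to the memory, and the performance measure is recomputed.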

In [138]:
def simulate_procedure(dataset, n_steps=10, n_vehicles=100):
    """
    Run the step-wise data collection loop
    dataset: entire dataset including all future data; must provide collect()
    n_steps: number of steps
    n_vehicles: number of vehicles to collect data from, i.e. the number of
                instances requested from the dataset in every step

    Each step selects features from the current measure, collects a batch,
    adds it to the memory and recomputes the measure. Nothing is returned.
    """
    current_measure = 0
    # Memories start as placeholder arrays until the first batch is added.
    features_memory, labels_memory = np.array(None), np.array(None)
    for _ in range(n_steps):
        chosen_features = feature_selection(current_measure)
        X_new, y_new = dataset.collect(n_vehicles, chosen_features)
        features_memory, labels_memory = add_to_memory(
            features_memory, X_new, labels_memory, y_new
        )
        current_measure = compute_measure(features_memory, labels_memory)

The following code generates the data set and runs a simulation on it.

In [148]:
# Generate the complete data set: 10000 samples, 20 features, 8 of them informative.
dataset = CustomData(
    n_samples=10000,
    n_features=20,
    n_informative=8,
)

In [149]:
# Run the simulation with the default settings (10 steps, 100 vehicles per step).
simulate_procedure(dataset)