# STA 561 - HW3
### Nancy Huang, Camilla Yu, Yihan Ma, Eva Gao, Yaze Gao, Shuo Wang

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

## Function for Tuning Blackbox Regression Models

In [2]:
def blackbox(algo, X, Y, method = "Dropout", M = 100, K = 5, c = None, criteria = "MSE"):
    """
    This function uses different regularization methods to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param method(default value is 'Dropout'): Regularization methods 
    Can be 'Dropout', 'NoiseAddition' or 'Robust'
    @param M(default value is 100): The number of Monte Carlo replicates
    @param K(default value is 5): Number of CV folds for tuning hyperparameter
    @param c(only used when the method is 'Robust'): A kxp matrix (k: the number of hyperparameters users want to try, 
    p: the number of predictor variables in X). The column of c means the column bounds for X or the bound for each 
    predictor variable. The row of c means how many hyperparameters users want to tune to find the best one.
    The best hyperparameter here is a p-dimensional vector.
    @param criteria(default value is 'MSE'): A criterion to be used to evaluate the method
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    if method == "Dropout":
        return dropout(algo, X, Y, M, K, criteria)
    elif method == "NoiseAddition":
        return noiseAddition(algo, X, Y, M, K, criteria)
    elif method == "Robust":
        return Robust(algo, X, Y, K, c, criteria)
    else:
        raise ValueError('Method should be one of Dropout, NoiseAddition or Robust!')

`dropout` function searchs a list of $\phi$ values in the range of (0, 1) and finds the optimal $\phi$ that yields to the smallest value of a criterion of user's choice (either MSE or MAD) using the dropout method. 

For each candidate $\phi$, a K-fold cross validation is conducted. In each fold, the data is split into train and test set. The train set is first repeatedly expanded M times with itself for easier computation purpose. Then, an independent Bernoulli random variables Z is generated with probability $\phi$. The random dropout is created by multiplying each element of X and Z, and then dividing the product by $1-\phi$. It then becomes the new X train data and is fed into the learning algorithm together with the expanded Y train data. A prediction is made on the test X with the model returned by the learning algorithm and is used to calculate the criterion (MSE or MAD based on user's choice). After iterating through all folds, the average CV error is obtained and will be compared with that using different values of $\phi$.

Once all $\phi$ values are examined, the optimal $\phi$ with smallest CV errors is selected to train the final output model. The entire X and Y are expanded M times with themselves, independent Bernoulli variable Z is generated as before but with the optimal $\phi$ as its probability. Once X is modified, the learning algorithm is used to train the new X and Y data, and the returning function will be the final output of this function.

In [3]:
def dropout(algo, X, Y, M, K, criteria):
    """
    This function uses dropout method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param M(required): The number of Monte Carlo replicates
    @param K(required): Number of CV folds for tuning hyperparameter
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    # List of phi values to be tuned
    phi_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for phi in phi_list:
        # Create k folds for cross-validation
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            # Monte Carlo: expand original data M times to generate a new dataset 
            # (X will be modified later)
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)
            
            # Generate independent Bernoulli random variables Z
            z = np.random.binomial(1, phi, size=X_new_ori.shape)
            # Random Dropout for X
            X_new = X_new_ori*z/(1-phi)
            
            # Fit input learning algorithm with new data
            reg = algo(X_new, Y_new)
            # Predict Y using X test
            pred = reg.predict(X_test)
            
            # Calculate MSE/MAD using the model obtained above
            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            # Raise error if input is not MSE or MAD
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
    
    # Find optimal phi value that has the smallest CV error
    phi_opt = phi_list[np.argmin(cv_error)]
    
    # Use the optimal phi value to train the predicted model
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)
    
    z = np.random.binomial(1, phi_opt, size=X_new_ori.shape)
    X_new = X_new_ori*z/(1-phi_opt)

    reg = algo(X_new, Y_new)
    
    return reg

`noiseAddition` function searchs a list of $\lambda$ values in the range of (0, 1) and finds the optimal $\lambda$ that yields to the smallest value of a criterion of user's choice (either MSE or MAD) using the noise addition method. 

For each candidate $\lambda$, a K-fold cross validation is conducted. In each fold, the data is split into train and test set. The train set is first repeatedly expanded M times with itself for easier computation purpose. Then, a random noise Z which is an independent Normal random variables with mean 0 and variance $\lambda$ is added to X. It then becomes the new X train data and is fed into the learning algorithm together with the expanded Y train data. A prediction is made on the test X with the model returned by the learning algorithm and is used to calculate the criterion (MSE or MAD based on user's choice). After iterating through all folds, the average CV error is obtained and will be compared with that using different values of $\lambda$.

Once all $\lambda$ values are examined, the optimal $\lambda$ with smallest CV errors is selected to train the final output model. The entire X and Y are expanded M times with themselves, independent normal variable Z is added to X as before but with the variance being optimal $\lambda$. Once X is modified, the learning algorithm is used to train the new X and Y data, and the returning function will be the final output of this function.

In [4]:
def noiseAddition(algo, X, Y, M, K, criteria):
    """
    This function uses noiseaddition method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param M(required): The number of Monte Carlo replicates
    @param K(required): Number of CV folds for tuning hyperparameter
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    lambda_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for l in lambda_list:
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)
            
            z = np.random.normal(0, l, size=X_new_ori.shape)
            X_new = X_new_ori + z
            
            reg = algo(X_new, Y_new)
            pred = reg.predict(X_test)

            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
        
    l_opt = lambda_list[np.argmin(cv_error)]
    
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)
    
    z = np.random.normal(0, l_opt, size=X_new_ori.shape)
    X_new = X_new_ori + z

    reg = algo(X_new, Y_new)
    
    return reg

In [5]:
def Robust(algo, X, Y, K, c, criteria):
    """
    This function uses robust method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param K(required): Number of CV folds for tuning hyperparameter
    @param c(required): A kxp matrix (k: the number of hyperparameters users want to try, 
    p: the number of predictor variables in X). The column of c means the column bounds for X or the bound for each 
    predictor variable. The row of c means how many hyperparameters users want to tune to find the best one.
    The best hyperparameter here is a p-dimensional vector.
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    num_pre = X.shape[1]
    c_opt = [0]*num_pre
    cv_error = 100000
    
    for i in range(c.shape[1]):
        c_curr = c[:,i]
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            first = 0
            num_row = X_train.shape[0]
            for j in range(num_pre):
                first += 1
                # randomly sample a value k such that k <= c_j^2
                k = np.random.uniform(low=0.001, high=c_curr[j]**2, size=1)[0]
                delta = np.random.randint(1, 10, size=num_row)
                delta = delta/sum(delta) * k
                delta = np.sqrt(delta)
                
                if first == 1:
                    delta_matrix = delta
                else:
                    delta_matrix = np.column_stack((delta_matrix, delta))
                    
            X_new = X_train + delta_matrix
            
            reg = algo(X_new, Y_train)
            pred = reg.predict(X_test)

            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        if np.mean(error) < cv_error:
            cv_error = np.mean(error)
            c_opt = c_curr
    
    first = 0
    num_row = X.shape[0]
    for j in range(num_pre):
        first += 1
        # randomly sample a value k such that k <= c_j^2
        k = np.random.uniform(low=0.001, high=c_opt[j]**2, size=1)[0]
        delta = np.random.randint(1, 10, size=num_row)
        delta = delta/sum(delta) * k
        delta = np.sqrt(delta)

        if first == 1:
            delta_matrix = delta
        else:
            delta_matrix = np.column_stack((delta_matrix, delta))
            
    X_new = X + delta_matrix
    
    reg = algo(X_new, Y)
    
    return reg

## Example

In [6]:
data = pd.read_table('dataset.dat', sep='\s+')

In [7]:
X = data.values[:,:8]
Y = data['lpsa'].values

In [8]:
c1 = np.array([1,2,3,4,5,6,7,8])
c2 = np.array([0.5,1,2.5,4,3,5,1,3])
c_mx = np.column_stack((c1,c2))

### 1. Linear Algorithm

In [9]:
def linear_algo(X, Y):
    return LinearRegression().fit(X, Y)

In [10]:
mod1_mse = blackbox(linear_algo, X, Y, method = "Dropout", K = 5, criteria = "MSE")
mod1_mad = blackbox(linear_algo, X, Y, method = "Dropout", K = 5, criteria = "MAD")

In [11]:
print(mean_squared_error(Y, mod1_mse.predict(X)))
print(mean_absolute_error(Y, mod1_mad.predict(X)))

0.7188379660019427
0.6499431890327839


In [12]:
mod2_mse = blackbox(linear_algo, X, Y, method = "NoiseAddition", K = 5, criteria = "MSE")
mod2_mad = blackbox(linear_algo, X, Y, method = "NoiseAddition", K = 5, criteria = "MAD")

In [13]:
print(mean_squared_error(Y, mod2_mse.predict(X)))
print(mean_absolute_error(Y, mod2_mad.predict(X)))

0.44459013175048956
0.5220624578018508


In [14]:
mod3_mse = blackbox(linear_algo, X, Y, method = "Robust", K = 5, c = c_mx, criteria = "MSE")
mod3_mad = blackbox(linear_algo, X, Y, method = "Robust", K = 5, c = c_mx, criteria = "MAD")

In [15]:
print(mean_squared_error(Y, mod3_mse.predict(X)))
print(mean_absolute_error(Y, mod3_mad.predict(X)))

0.501669144616783
0.5214949677915285


### Machine Learning Algorithm (Non-linear)

In [16]:
def machine_learning_algo(X, Y):
    return GradientBoostingRegressor().fit(X, Y)

In [17]:
mod1_mse_m = blackbox(machine_learning_algo, X, Y, method = "Dropout", K = 5, criteria = "MSE")
mod1_mad_m = blackbox(machine_learning_algo, X, Y, method = "Dropout", K = 5, criteria = "MAD")

In [18]:
print(mean_squared_error(Y, mod1_mse_m.predict(X)))
print(mean_absolute_error(Y, mod1_mad_m.predict(X)))

1.053529836189304
0.7854389479651129


In [19]:
mod2_mse_m = blackbox(machine_learning_algo, X, Y, method = "NoiseAddition", K = 5, criteria = "MSE")
mod2_mad_m = blackbox(machine_learning_algo, X, Y, method = "NoiseAddition", K = 5, criteria = "MAD")

In [20]:
print(mean_squared_error(Y, mod2_mse_m.predict(X)))
print(mean_absolute_error(Y, mod2_mad_m.predict(X)))

0.0257613805074768
0.16235555954633077


In [21]:
mod3_mse_m = blackbox(machine_learning_algo, X, Y, method = "Robust", K = 5, c = c_mx, criteria = "MSE")
mod3_mad_m = blackbox(machine_learning_algo, X, Y, method = "Robust", K = 5, c = c_mx, criteria = "MAD")

In [22]:
print(mean_squared_error(Y, mod3_mse_m.predict(X)))
print(mean_absolute_error(Y, mod3_mad_m.predict(X)))

0.22536417367388287
0.3635020259918495
