In [36]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

In [53]:
def blackbox(algo, X, Y, method = "Dropout", M = 100, K = 5, criteria = "MSE"):
    if method == "Dropout":
        return dropout(algo, X, Y, M, K, criteria)
    elif method == "NoiseAddition":
        return noiseAddition_M(algo, X, Y, M, K, criteria)
    elif method == "Robust":
        return Robust(algo, X, Y, K, criteria)
    else:
        raise ValueError('Method should be one of Dropout, NoiseAddition or Robust!')

In [15]:
def dropout(algo, X, Y, M, K, criteria):
    
    phi_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for phi in phi_list:
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)

            z = np.random.binomial(1, phi, size=X_new_ori.shape)
            X_new = X_new_ori*z/(1-phi)
            #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

            reg = algo(X_new, Y_new)
            pred = reg.predict(X_test)

            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
        
    phi_opt = phi_list[np.argmin(cv_error)]
    
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)

    z = np.random.binomial(1, phi_opt, size=X_new_ori.shape)
    X_new = X_new_ori*z/(1-phi_opt)
    #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

    reg = algo(X_new, Y_new)
    
    return reg

In [64]:
def noiseAddition_M(algo, X, Y, M, K, criteria):
    lambda_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for l in lambda_list:
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)

            z = np.random.normal(0, l, size=X_new_ori.shape)
            X_new = X_new_ori + z
            #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)
            
            reg = algo(X_new, Y_new)
            pred = reg.predict(X_test)

            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
        
    l_opt = lambda_list[np.argmin(cv_error)]
    
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)

    z = np.random.normal(0, l_opt, size=X_new_ori.shape)
    X_new = X_new_ori + z
    #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

    reg = algo(X_new, Y_new)
    
    return reg

In [24]:
def noiseAddition(algo, X, Y, M, K, criteria):
    lambda_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for l in lambda_list:
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
    
            z = np.random.normal(0, l, size=X_train.shape)
            X_new = X_train + z
            #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

            reg = algo(X_new, Y_train)
            pred = reg.predict(X_test)

            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
        
    l_opt = lambda_list[np.argmin(cv_error)]
    
    z = np.random.normal(0, l_opt, size=X.shape)
    #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

    reg = algo(X+z, Y)
    
    return reg

In [49]:
def Robust(algo, X, Y, K, criteria):
    num_pre = X.shape[1]
    cj_sum_list = np.arange(0, 1, 0.1)
    c_opt = [0]*num_pre
    cv_error = 100000
    
    for c_sum in cj_sum_list:
        c = np.random.randint(1, 10, size=num_pre)
        c = c/sum(c) * c_sum
        
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            first = 0
            num_row = X_train.shape[0]
            for j in range(num_pre):
                first += 1
                # randomly sample a value k such that k <= c_j^2
                k = np.random.uniform(low=0.001, high=c[j]**2, size=1)[0]
                delta = np.random.randint(1, 10, size=num_row)
                delta = delta/sum(delta) * k
                delta = np.sqrt(delta)
                
                if first == 1:
                    delta_matrix = delta
                else:
                    delta_matrix = np.column_stack((delta_matrix, delta))
                    
            X_new = X_train + delta_matrix
            #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

            reg = algo(X_new, Y_train)
            pred = reg.predict(X_test)

            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        if np.mean(error) < cv_error:
            cv_error = np.mean(error)
            c_opt = c
    
    first = 0
    num_row = X.shape[0]
    for j in range(num_pre):
        first += 1
        # randomly sample a value k such that k <= c_j^2
        k = np.random.uniform(low=0.001, high=c_opt[j]**2, size=1)[0]
        delta = np.random.randint(1, 10, size=num_row)
        delta = delta/sum(delta) * k
        delta = np.sqrt(delta)

        if first == 1:
            delta_matrix = delta
        else:
            delta_matrix = np.column_stack((delta_matrix, delta))
            
    X_new = X + delta_matrix
    #X_new = (X_new - np.mean(X_new, axis=0))/np.std(X_new, axis=0)

    reg = algo(X_new, Y)
    
    return reg

In [38]:
data = pd.read_table('dataset.dat', sep='\s+')

In [39]:
def algorithm(X, Y):
    return LinearRegression().fit(X, Y)

In [40]:
X = data.values[:,:8]
Y = data['lpsa'].values

In [69]:
mod1_mse = blackbox(algorithm, X, Y, method = "Dropout", K = 5, criteria = "MSE")
mod1_mad = blackbox(algorithm, X, Y, method = "Dropout", K = 5, criteria = "MAD")

In [70]:
print(mean_squared_error(Y, mod1_mse.predict(X)))
print(mean_absolute_error(Y, mod1_mad.predict(X)))

0.676075654684088
0.6628653328647743


In [65]:
mod2_mse = blackbox(algorithm, X, Y, method = "NoiseAddition", K = 5, criteria = "MSE")
mod2_mad = blackbox(algorithm, X, Y, method = "NoiseAddition", K = 5, criteria = "MAD")

In [66]:
print(mean_squared_error(Y, mod2_mse.predict(X)))
print(mean_absolute_error(Y, mod2_mad.predict(X)))

0.4445726594315684
0.5083114292194318


In [67]:
mod3_mse = blackbox(algorithm, X, Y, method = "Robust", K = 5, criteria = "MSE")
mod3_mad = blackbox(algorithm, X, Y, method = "Robust", K = 5, criteria = "MAD")

In [68]:
print(mean_squared_error(Y, mod3_mse.predict(X)))
print(mean_absolute_error(Y, mod3_mad.predict(X)))

0.4440847303856871
0.5055265041483498
