In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.spatial import distance

In [2]:
df = pd.read_csv('data.csv')

In [None]:
z = np.random.normal(0, 1, size=X_new_ori.shape)
X_new = X_new_ori + z

In [None]:
df

In [3]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [54]:
train.describe()

Unnamed: 0,Bp,Sg,Al,Su,Rbc,Bu,Sc,Sod,Pot,Hemo,Wbcc,Rbcc,Htn,Class
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,75.95625,1.01775,0.99375,0.384375,0.89375,55.386563,2.873625,137.46475,4.680781,12.589656,8422.9375,4.719031,0.352313,0.61875
std,13.833242,0.005494,1.25694,1.019745,0.30864,48.875895,5.424476,9.778089,3.125634,2.66275,2563.656577,0.815552,0.476909,0.486454
min,50.0,1.005,0.0,0.0,0.0,10.0,0.4,4.5,2.5,3.1,2200.0,2.1,0.0,0.0
25%,70.0,1.015,0.0,0.0,1.0,27.0,0.9,135.0,4.0,10.9,6900.0,4.5,0.0,0.0
50%,76.0,1.02,0.5,0.0,1.0,42.0,1.35,137.53,4.63,12.53,8406.0,4.71,0.0,1.0
75%,80.0,1.02,2.0,0.0,1.0,57.0,2.925,140.25,4.8,14.7,9400.0,5.1,1.0,1.0
max,180.0,1.025,5.0,5.0,1.0,391.0,76.0,163.0,47.0,17.8,26400.0,8.0,1.0,1.0


In [4]:
y_train = train['Class']
X_train = train.loc[:, train.columns!='Class']
y_test = test['Class']
X_test = test.loc[:, test.columns!='Class']

In [91]:
X_new2 = pd.DataFrame(X_new)

In [92]:
lgt = LogisticRegression(random_state=42, penalty='none')
lgt.fit(X_new2, y_train)
np.mean(lgt.predict(X_test_new) == y_test)

0.8

In [5]:
def dropout(algo, X, Y, M, K, criteria):
    """
    This function uses dropout method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param M(required): The number of Monte Carlo replicates
    @param K(required): Number of CV folds for tuning hyperparameter
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    # List of phi values to be tuned
    phi_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for phi in phi_list:
        # Create k folds for cross-validation
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            # Monte Carlo: expand original data M times to generate a new dataset 
            # (X will be modified later)
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)
            
            # Generate independent Bernoulli random variables Z
            z = np.random.binomial(1, phi, size=X_new_ori.shape)
            # Random Dropout for X
            X_new = X_new_ori*z/(1-phi)
            
            # Fit input learning algorithm with new data
            reg = algo(X_new, Y_new)
            # Predict Y using X test
            pred = reg.predict(X_test)
            
            # Calculate MSE/MAD using the model obtained above
            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            # Raise error if input is not MSE or MAD
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
    
    # Find optimal phi value that has the smallest CV error
    phi_opt = phi_list[np.argmin(cv_error)]
    
    # Use the optimal phi value to train the predicted model
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)
    
    z = np.random.binomial(1, phi_opt, size=X_new_ori.shape)
    X_new = X_new_ori*z/(1-phi_opt)

    reg = algo(X_new, Y_new)
    
    return reg

In [6]:
def lgt_drop(X, Y):
    return LogisticRegression(random_state=42, penalty='none').fit(X, Y)

In [7]:
X = X_train.values
y = y_train.values

In [8]:
z = np.random.normal(0, 3, size=X.shape)
X_new = X + z

In [9]:
z2 = np.random.normal(0, 20, size=X_test.shape)
X_test_new = X_test + z2

In [10]:
lgt_drop_fit = dropout(lgt_drop, X_new, y, 100, 5, 'MSE')

In [11]:
np.mean(lgt_drop_fit.predict(X_test_new) == y_test)

0.6625

In [None]:
def dropout_updated(algo, X, Y, M, K, criteria):
    """
    This function uses dropout method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param M(required): The number of Monte Carlo replicates
    @param K(required): Number of CV folds for tuning hyperparameter
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    # List of phi values to be tuned
    cv_error = []
    # Create k folds for cross-validation
    kf = KFold(n_splits = K, shuffle=True)
    error = []
    p_up_list = np.arange(0.5, 1, 0.1)
    
    for p_up in p_up_list:
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]

            # Monte Carlo: expand original data M times to generate a new dataset 
            # (X will be modified later)
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)

            # Generate independent Bernoulli random variables Z
            center = np.mean(X_new_ori, axis=0)
            dist = [distance.euclidean(point, center) for point in X_new_ori]
            quant_list = np.quantile(dist, [0.25, 0.5, 0.75])
            dist_df = pd.DataFrame({'dist': dist})
            dist_df['category'] = pd.cut(dist_df['dist'], quant_list, 
                                         labels=['0-25%', '25-50%', '50-75%', '75-100%'])
            dist_df['prob'] = 0
            dist_df.loc[dist_df.category == '0-25%', 'prob'] = p_up
            dist_df.loc[dist_df.category == '25-50%', 'prob'] = p_up - 0.1
            dist_df.loc[dist_df.category == '50-75%', 'prob'] = p_up - 0.2
            dist_df.loc[dist_df.category == '75-100%', 'prob'] = p_up - 0.3
            phi_list = dist_df['prob']
            z_list = np.random.binomial(1, phi_list, size=X_new_ori.shape[0])

            # Random Dropout for X
            X_new = X_new_ori*z_list/(1-phi_list)

            # Fit input learning algorithm with new data
            reg = algo(X_new, Y_new)
            # Predict Y using X test
            pred = reg.predict(X_test)

            # Calculate MSE/MAD using the model obtained above
            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            # Raise error if input is not MSE or MAD
            else:
                raise ValueError('Please input either MSE or MAD!')

        cv_error.append(np.mean(error))
    
    # Find optimal p_up value that has the smallest CV error
    p_up_opt = p_up_list[np.argmin(cv_error)]
    
    # Use the optimal phi value to train the predicted model
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)
    
    # Generate independent Bernoulli random variables Z
    center = np.mean(X_new_ori, axis=0)
    dist = [distance.euclidean(point, center) for point in X_new_ori]
    quant_list = np.quantile(dist, [0.25, 0.5, 0.75])
    dist_df = pd.DataFrame({'dist': dist})
    dist_df['category'] = pd.cut(dist_df['dist'], quant_list, 
                                 labels=['0-25%', '25-50%', '50-75%', '75-100%'])
    dist_df['prob'] = 0
    dist_df.loc[dist_df.category == '0-25%', 'prob'] = p_up_opt
    dist_df.loc[dist_df.category == '25-50%', 'prob'] = p_up_opt - 0.1
    dist_df.loc[dist_df.category == '50-75%', 'prob'] = p_up_opt - 0.2
    dist_df.loc[dist_df.category == '75-100%', 'prob'] = p_up_opt - 0.3
    phi_list = dist_df['prob']
    z_list = np.random.binomial(1, phi_list, size=X_new_ori.shape[0])

    # Random Dropout for X
    X_new = X_new_ori*z_list/(1-phi_list)

    # Fit input learning algorithm with new data
    reg = algo(X_new, Y_new)
    
    return reg

In [32]:
center = np.mean(X_new, axis=0)
dist = [distance.euclidean(point, center) for point in X_new]
quant_list = np.quantile(dist, [0, 0.25, 0.5, 0.75, 1])
cat_four = pd.cut(dist, quant_list, 
                              labels=['0-25%', '25-50%', '50-75%', '75-100%'])

In [37]:
dist_df = pd.DataFrame({'dist': dist})
dist_df['category'] = pd.cut(dist_df['dist'], quant_list, labels=['0-25%', '25-50%', '50-75%', '75-100%'])

In [41]:
dist_df['prob'] = 0
dist_df.loc[dist_df.category == '0-25%', 'prob'] = 0.1
dist_df.loc[dist_df.category == '25-50%', 'prob'] = 0.2
dist_df.loc[dist_df.category == '50-75%', 'prob'] = 0.3
dist_df.loc[dist_df.category == '75-100%', 'prob'] = 0.4

In [59]:
phi_list = dist_df['prob']

In [56]:
X_new.shape[0]

320

In [60]:
z_list = np.random.binomial(1, phi_list, size=X_new.shape[0])

In [58]:
z_list

array([1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [33]:
cat_four

['50-75%', '75-100%', '0-25%', '50-75%', '0-25%', ..., '75-100%', '50-75%', '50-75%', '25-50%', '50-75%']
Length: 320
Categories (4, object): ['0-25%' < '25-50%' < '50-75%' < '75-100%']

In [20]:
quant_list

array([1.78149680e+01, 5.55483572e+01, 1.17874644e+03, 2.30720834e+03,
       1.79747725e+04])

In [24]:
cat_four = pd.cut(dist, quant_list)

In [31]:
cat_four[0]

Interval(1178.746, 2307.208, closed='right')

In [34]:
dic = {'x': X_new, 'category': cat_four}

In [35]:
dic

{'x': array([[ 7.13730500e+01, -2.10885796e+00,  8.01682114e+00, ...,
          6.70118972e+03,  7.39424088e+00,  1.79700687e+00],
        [ 1.03694526e+02,  5.97891267e-01,  2.44743996e-01, ...,
          1.14040797e+04, -2.91921745e+00,  1.00484074e+00],
        [ 5.87236062e+01,  2.43315179e+00,  8.21343060e+00, ...,
          8.40387538e+03,  1.35804866e+00,  1.92310447e+00],
        ...,
        [ 8.09615736e+01, -2.06199581e+00,  9.09108980e-01, ...,
          7.19887525e+03,  6.62937974e+00,  2.57408873e+00],
        [ 7.87718297e+01, -4.24170747e+00, -3.29828939e-01, ...,
          7.30053277e+03,  2.54083478e+00,  8.10541089e-01],
        [ 5.75904058e+01,  1.72009625e+00, -1.88743420e-01, ...,
          6.99172208e+03,  2.90377637e+00,  3.51554487e-01]]),
 'category': ['50-75%', '75-100%', '0-25%', '50-75%', '0-25%', ..., '75-100%', '50-75%', '50-75%', '25-50%', '50-75%']
 Length: 320
 Categories (4, object): ['0-25%' < '25-50%' < '50-75%' < '75-100%']}

In [36]:
data_new = pd.DataFrame(dic)

ValueError: Per-column arrays must each be 1-dimensional

In [16]:
quant_list

array([1.78149680e+01, 5.55483572e+01, 1.17874644e+03, 2.30720834e+03,
       1.79747725e+04])

In [108]:
center = np.mean(X_new, axis=0)
dist = [distance.euclidean(point, center) for point in X_new]
quant_list = np.quantile(dist, [0.25, 0.5, 0.75])

In [111]:
np.min(dist)

16.11635598696819

In [120]:
center = np.mean(X_new, axis=0)
dist = [distance.euclidean(point, center) for point in X_new]
dist_std = (dist-np.min(dist))/(np.max(dist)-np.min(dist))
dist_std = [d if d>0 else dfor d in dist_std]
# dist_scaled = dist_std * (np.max(dist)-np.min(dist)) + np.min(dist)
phi_list = 1/dist_std
z_list = np.random.binomial(1, phi_list)
# # Random Dropout for X
# X_new = X_new_ori*z_list/(1-phi_list)

ValueError: p < 0, p > 1 or p contains NaNs

In [123]:
center = np.mean(X_new, axis=0)
dist = [distance.euclidean(point, center) for point in X_new]
# dist_scaled = dist_std * (np.max(dist)-np.min(dist)) + np.min(dist)
phi_list = dist/np.sum(dist)
z_list = np.random.binomial(1, phi_list)
# # Random Dropout for X
# X_new = X_new_ori*z_list/(1-phi_list)

In [124]:
z_list

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

按照distance四等分（0-25，25-50...）,tune upper buond for probability (starting from 0.5 to 0.9), assign probability to 4 groups 