In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.spatial import distance

In [2]:
df = pd.read_csv('data.csv')

In [3]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [4]:
y_train = train['Class']
X_train = train.loc[:, train.columns!='Class']
y_test = test['Class']
X_test = test.loc[:, test.columns!='Class']

In [5]:
def dropout(algo, X, Y, M, K, criteria):
    """
    This function uses dropout method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param M(required): The number of Monte Carlo replicates
    @param K(required): Number of CV folds for tuning hyperparameter
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    # List of phi values to be tuned
    phi_list = np.arange(0, 1, 0.1)
    cv_error = []
    
    for phi in phi_list:
        # Create k folds for cross-validation
        kf = KFold(n_splits = K, shuffle=True)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            
            # Monte Carlo: expand original data M times to generate a new dataset 
            # (X will be modified later)
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)
            
            # Generate independent Bernoulli random variables Z
            z = np.random.binomial(1, phi, size=X_new_ori.shape)
            # Random Dropout for X
            X_new = X_new_ori*z/(1-phi)
            
            # Fit input learning algorithm with new data
            reg = algo(X_new, Y_new)
            # Predict Y using X test
            pred = reg.predict(X_test)
            
            # Calculate MSE/MAD using the model obtained above
            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            # Raise error if input is not MSE or MAD
            else:
                raise ValueError('Please input either MSE or MAD!')
        
        cv_error.append(np.mean(error))
    
    # Find optimal phi value that has the smallest CV error
    phi_opt = phi_list[np.argmin(cv_error)]
    
    # Use the optimal phi value to train the predicted model
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)
    
    z = np.random.binomial(1, phi_opt, size=X_new_ori.shape)
    X_new = X_new_ori*z/(1-phi_opt)

    reg = algo(X_new, Y_new)
    
    return reg

In [None]:
def dropout_updated(algo, X, Y, M, K, criteria):
    """
    This function uses dropout method to train a predicted model that optimizes the specific criterion.
    @param algo(required): A learning algorithm that takes X, Y as inputs and outputs a model 
    @param X(required): The matrix of indepdent/predictor variables
    @param Y(required): The outcome variable
    @param M(required): The number of Monte Carlo replicates
    @param K(required): Number of CV folds for tuning hyperparameter
    @param criteria(required): A criterion to be used to evaluate the method. 
    Can be either mean squared error (MSE) or mean absolute deviation (MAD)
    @return: A predictive model that optimizes the specific criterion
    """
    
    cv_error = []
    # Create k folds for cross-validation
    kf = KFold(n_splits = K, shuffle=True)
    error = []
    # a list of upper bounds for probability 
    p_up_list = np.arange(0.5, 1, 0.1)
    
    # tune the upper bound for probability
    for p_up in p_up_list:
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]

            # Monte Carlo: expand original data M times to generate a new dataset 
            # (X will be modified later)
            X_new_ori = np.repeat(X_train, M, axis=0)
            Y_new = np.repeat(Y_train, M, axis=0)

            # Calculate the centroid for the data 
            center = np.mean(X_new_ori, axis=0)
            # Euclidean distance (feature importance) for each observation
            dist = [distance.euclidean(point, center) for point in X_new_ori]
            # Seperate distances into 4 categories (ex: 25% quantile...)
            quant_list = np.quantile(dist, [0, 0.25, 0.5, 0.75, 1])
            dist_df = pd.DataFrame({'dist': dist})
            # Create another column for the category 
            dist_df['category'] = pd.cut(dist_df['dist'], quant_list, 
                                         labels=['0-25%', '25-50%', '50-75%', '75-100%'])
            dist_df['prob'] = 0
            # Assign probabilities for each category
            # The larger distance, the lower probability to be dropped 
            dist_df.loc[dist_df.category == '0-25%', 'prob'] = p_up
            dist_df.loc[dist_df.category == '25-50%', 'prob'] = p_up - 0.1
            dist_df.loc[dist_df.category == '50-75%', 'prob'] = p_up - 0.2
            dist_df.loc[dist_df.category == '75-100%', 'prob'] = p_up - 0.3
            phi_list = dist_df['prob'].values
            phi_list = phi_list.reshape(-1,1)
            # Generate independent Bernoulli random variables Z in terms of 
            # different probabilities for each observation 
            z_list = np.random.binomial(1, phi_list, size=X_new_ori.shape)

            # Random Dropout for X
            X_new = X_new_ori*z_list/(1-phi_list)

            # Fit input learning algorithm with new data
            reg = algo(X_new, Y_new)
            # Predict Y using X test
            pred = reg.predict(X_test)

            # Calculate MSE/MAD using the model obtained above
            if criteria == "MSE":
                error.append(mean_squared_error(Y_test, pred))
            elif criteria == "MAD":
                error.append(mean_absolute_error(Y_test, pred))
            # Raise error if input is not MSE or MAD
            else:
                raise ValueError('Please input either MSE or MAD!')

        cv_error.append(np.mean(error))
    
    # Find optimal upper bound for probability value that has the smallest CV error
    p_up_opt = p_up_list[np.argmin(cv_error)]
    
    # Use the optimal p_up value to train the predicted model
    X_new_ori = np.repeat(X, M, axis=0)
    Y_new = np.repeat(Y, M, axis=0)
    
    # Generate independent Bernoulli random variables Z
    center = np.mean(X_new_ori, axis=0)
    dist = [distance.euclidean(point, center) for point in X_new_ori]
    quant_list = np.quantile(dist, [0, 0.25, 0.5, 0.75, 1])
    dist_df = pd.DataFrame({'dist': dist})
    dist_df['category'] = pd.cut(dist_df['dist'], quant_list, 
                                 labels=['0-25%', '25-50%', '50-75%', '75-100%'])
    dist_df['prob'] = 0
    dist_df.loc[dist_df.category == '0-25%', 'prob'] = p_up_opt
    dist_df.loc[dist_df.category == '25-50%', 'prob'] = p_up_opt - 0.1
    dist_df.loc[dist_df.category == '50-75%', 'prob'] = p_up_opt - 0.2
    dist_df.loc[dist_df.category == '75-100%', 'prob'] = p_up_opt - 0.3
    phi_list = dist_df['prob'].values
    phi_list = phi_list.reshape(-1,1)
    z_list = np.random.binomial(1, phi_list, size=X_new_ori.shape)

    # Random Dropout for X
    X_new = X_new_ori*z_list/(1-phi_list)

    # Fit input learning algorithm with new data
    reg = algo(X_new, Y_new)
    
    return reg

In [62]:
def lgt_drop(X, Y):
    return LogisticRegression(random_state=42, penalty='none').fit(X, Y)

In [175]:
lgt = LogisticRegression(random_state=42, penalty='none')
lgt.fit(X_train, y_train)
np.mean(lgt.predict(X_test) == y_test)

0.9125

In [106]:
X = X_train.values
y = y_train.values

In [171]:
lgt_drop_fit = dropout(lgt_drop, X, y, 100, 5, 'MSE')

In [172]:
np.mean(lgt_drop_fit.predict(X_test) == y_test)

0.925

In [168]:
lgt_dropup_fit = dropout_updated(lgt_drop, X, y, 100, 5, 'MSE')

In [169]:
np.mean(lgt_dropup_fit.predict(X_test) == y_test)

0.9375

The above indicates that the updated dropout has the best performance among original dropout and logistic regression model.