In [62]:
import pandas as pd
import numpy as np
import pickle, warnings

from diffprivlib import models as dp
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, f1_score, roc_curve, auc, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from datetime import datetime

scaler = StandardScaler()
warnings.filterwarnings("ignore")

In [63]:
df = pd.read_csv('dataset_featured_v2.csv')
df_raw = df.copy()
df.head()

Unnamed: 0,Curricular_units_2nd_sem_approved,Tuition_fees_up_to_date,Course_Technology,Mother_qualification_Others,Scholarship_holder,Application_mode_First_Phase,Course_Others,Debtor,Father_occupation_Others,Course_Communication,...,Displaced,Father_occupation_Skilled,Application_mode_Course_Change,morning_attendance,Father_occupation_Unskilled,Mother_qualification_High_School_Eq,Father_occupation_Highly_Skilled,Curricular_units_1st_sem_without_evaluations,Marital_status,Target
0,0,1,1,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,0,0,1
1,6,0,0,0,0,0,1,0,0,0,...,1,1,0,1,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,1,0,0,1,1,0,0,0,0,1
3,5,1,0,0,0,0,0,0,0,1,...,1,1,0,1,0,0,0,0,0,0
4,6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [64]:
df.columns

Index(['Curricular_units_2nd_sem_approved', 'Tuition_fees_up_to_date',
       'Course_Technology', 'Mother_qualification_Others',
       'Scholarship_holder', 'Application_mode_First_Phase', 'Course_Others',
       'Debtor', 'Father_occupation_Others', 'Course_Communication',
       'Curricular_units_1st_sem_credited', 'GDP', 'Course_Nature', 'Gender',
       'Course_Social_Service', 'enrollment_age', 'Course_Management',
       'Curricular_units_2nd_sem_without_evaluations', 'Application_order',
       'Previous_qualification_grade', 'Application_mode_OverAge',
       'Educational_special_needs', 'Inflation_rate', 'Admission_grade',
       'Application_mode_Last_Phase', 'Application_mode_Transfer',
       'Previous_qualification_Graduation_plus',
       'Mother_qualification_Graduation_plus', 'Course_Medical', 'Nacionality',
       'Application_mode_Courses_Holders',
       'Mother_qualification_Basic_Education', 'Displaced',
       'Father_occupation_Skilled', 'Application_mode_Cours

## Applying noise on enrollment_age

In [65]:
sensitivity = df.enrollment_age.max()/(df.shape[0]+1)
sensitivity

0.015819209039548022

In [66]:
original = df.enrollment_age.mean()

V=[]
ep=[]
count=[]

for j in range (1,6):
    epsilon = j/10
    for k in range(0,df.enrollment_age.shape[0]):
        value = df.enrollment_age.loc[k] + np.random.laplace(loc=0, scale=sensitivity/epsilon)
        V.append(value)
        ep.append(epsilon)
        count.append(k)

In [67]:
fra = pd.DataFrame({'Value':V, 'ep':ep, 'count':count})

In [68]:
print(original)

epsilon_difference = []

for eps, grouped_data in fra.groupby('ep'):
    grouped_mean = grouped_data['Value'].mean()
    epsilon_difference.append((eps, grouped_mean - original))
    print(eps, f"Epsilon_mean: {grouped_mean}\t Mean_Difference:{grouped_mean - original}")

23.265144665461122
0.1 Epsilon_mean: 23.266665329447413	 Mean_Difference:0.0015206639862910265
0.2 Epsilon_mean: 23.26617223299573	 Mean_Difference:0.0010275675346065327
0.3 Epsilon_mean: 23.263497386976855	 Mean_Difference:-0.0016472784842669341
0.4 Epsilon_mean: 23.265052977895113	 Mean_Difference:-9.168756600885786e-05
0.5 Epsilon_mean: 23.26545473677804	 Mean_Difference:0.0003100713169175151


In [69]:
nearest_epsilon = min(epsilon_difference, key=lambda x: abs(x[1]-0))
epsilon_to_consider = nearest_epsilon[0]
print(epsilon_to_consider)

0.4


In [70]:
for k in range(0,df.enrollment_age.shape[0]):
    df.enrollment_age.loc[k] = df.enrollment_age.loc[k] + np.random.laplace(loc=0, scale=sensitivity/epsilon_to_consider)

# Models

In [71]:
columns_to_drop = ['Marital_status', 'Gender', 'Nacionality', 'Target'] # Removed PII features and Target for modelling

In [72]:
X_dp = df.drop(columns=columns_to_drop, axis=1)
Y_dp = df.Target
X_raw = df_raw.drop(columns=columns_to_drop, axis=1)
Y_raw = df_raw.Target

X_train_dp, X_test_dp, y_train_dp, y_test_dp = train_test_split(X_dp, Y_dp, test_size=0.30, random_state=42)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_raw, Y_raw, test_size=0.30, random_state=42)

X_train_dp_scaled = scaler.fit_transform(X_train_dp)
X_test_dp_scaled = scaler.fit_transform(X_test_dp)

X_train_raw_scaled = scaler.fit_transform(X_train_raw)
X_test_raw_scaled = scaler.fit_transform(X_test_raw)

In [73]:
models_to_run = {
    'Raw_Data': {
        'DiffPrivLib': {
            'RandomForest': {
                'X_train': X_train_raw,
                'X_test': X_test_raw,
                'Y_train': y_train_raw,
                'Y_test': y_test_raw,
                'PickleFileName': "raw_diff_rf.pkl",
                'MinMax': (X_train_raw.min().min(), X_train_raw.max().max())
            },
            'LogisticRegression': {
                'X_train': X_train_raw_scaled,
                'X_test': X_test_raw_scaled,
                'Y_train': y_train_raw,
                'Y_test': y_test_raw,
                'PickleFileName': "raw_diff_lr.pkl",
                'MinMax': (X_train_raw_scaled.min().min(), X_train_raw_scaled.max().max())
            } 
        },
        'SKLearn': {
            'RandomForest': {
                'X_train': X_train_raw,
                'X_test': X_test_raw,
                'Y_train': y_train_raw,
                'Y_test': y_test_raw,
                'PickleFileName': "raw_sklearn_rf.pkl",
                'MinMax': (X_train_raw.min().min(), X_train_raw.max().max())
            },
            'LogisticRegression': {
                'X_train': X_train_raw_scaled,
                'X_test': X_test_raw_scaled,
                'Y_train': y_train_raw,
                'Y_test': y_test_raw,
                'PickleFileName': "raw_sklearn_lr.pkl",
                'MinMax': (X_train_raw_scaled.min().min(), X_train_raw_scaled.max().max())
            } 
        } 
    },
    'DP_Data': {
        'DiffPrivLib': {
            'RandomForest': {
                'X_train': X_train_dp,
                'X_test': X_test_dp,
                'Y_train': y_train_dp,
                'Y_test': y_test_dp,
                'PickleFileName': "dp_diff_rf.pkl",
                'MinMax': (X_train_dp.min().min(), X_train_dp.max().max())
            },
            'LogisticRegression': {
                'X_train': X_train_dp_scaled,
                'X_test': X_test_dp_scaled,
                'Y_train': y_train_dp,
                'Y_test': y_test_dp,
                'PickleFileName': "dp_diff_lr.pkl",
                'MinMax': (X_train_dp_scaled.min().min(), X_train_dp_scaled.max().max())
            } 
        },
        'SKLearn': {
            'RandomForest': {
                'X_train': X_train_dp,
                'X_test': X_test_dp,
                'Y_train': y_train_dp,
                'Y_test': y_test_dp,
                'PickleFileName': "dp_sklearn_rf.pkl",
                'MinMax': (X_train_dp.min().min(), X_train_dp.max().max())
            },
            'LogisticRegression': {
                'X_train': X_train_dp_scaled,
                'X_test': X_test_dp_scaled,
                'Y_train': y_train_dp,
                'Y_test': y_test_dp,
                'PickleFileName': "dp_sklearn_lr.pkl",
                'MinMax': (X_train_dp_scaled.min().min(), X_train_dp_scaled.max().max())
            } 
        } 
    }
}

# Run Models

In [74]:
forest_depth = 10

def get_predictions(model, inputs):
    model.fit(inputs['X_train'], inputs['Y_train'])
    y_pred = model.predict(inputs['X_test'])
    return y_pred

def get_FNR(y_test, y_pred):
    tpr = recall_score(y_test, y_pred)
    tnr = recall_score(y_test, y_pred, pos_label=0)

    fpr = 1 - tnr
    fnr = 1 - tpr

    return fnr

def get_metrics(y_test, y_pred):
    fnr_val = get_FNR(y_test, y_pred)
    f1_val = f1_score(y_test, y_pred)
    fpr, tpr, threshold = roc_curve(y_test, y_pred)
    auc_val = auc(fpr, tpr)
    acc_val = accuracy_score(y_test, y_pred)
    return fnr_val, f1_val, auc_val, acc_val

def get_metrics_for_model(model, inputs):
    y_pred = get_predictions(model, inputs)
    metrics = get_metrics(inputs['Y_test'], y_pred)
    return metrics

def get_best_metrics(metrices):
    fnrs = []
    f1s = []
    eps = []

    for metric in metrices:
        fnrs.append(metric[0][0])
        f1s.append(metric[0][1])
        eps.append(metric[1])

    m_df = pd.DataFrame({'FNR': fnrs, 'F1': f1s, 'Eps': eps})
    output = m_df.sort_values(by = ['FNR', 'F1'], ascending = [True, False])
    return output.iloc[0]

def get_max_l2_norm(inputs):
    train = inputs['X_train']
    norms = np.sqrt(np.square(X_test_raw).sum(axis=1))
    return norms.values.max()

def get_dp_model(modelType, epsilon, inputs):
    model = None
    if modelType == "RandomForest":
        minmax = inputs['MinMax']
        model = dp.RandomForestClassifier(epsilon=epsilon, n_estimators=100, random_state=42, classes=inputs['Y_train'].unique(), n_jobs=-1, max_depth=forest_depth)
    else:
        data_norm = get_max_l2_norm(inputs)
        model = dp.LogisticRegression(epsilon=epsilon, random_state=42, data_norm=data_norm, n_jobs=-1)
    return model

def get_sklearn_model(modelType):
    model = None
    if modelType == "RandomForest":
        model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=forest_depth)
    else:
        model = LogisticRegression(random_state=42, n_jobs=-1)
    return model

def get_best_epsilon(modelType, inputs):
    epsilons_to_consider = [round(x, 1) for x in list(np.arange(0.1, 1.1, 0.1))]
    metrices = []

    for epsilon in epsilons_to_consider:
        starttime = datetime.now()
        model = get_dp_model(modelType, epsilon, inputs)
        metrics = get_metrics_for_model(model, inputs)
        endtime = datetime.now()
        diff = endtime - starttime
        print(f"Model: {modelType}, Epsilon: {epsilon}, Time Taken: {diff}")
        metrices.append((metrics, epsilon))
    
    best_metrics = get_best_metrics(metrices)
    return best_metrics['Eps']

def get_model(package_name, model_name, inputs):
    if package_name == "DiffPrivLib":
        epsilon = get_best_epsilon(model_name, inputs)
        model = get_dp_model(model_name, epsilon, inputs)
        return model

    return get_sklearn_model(model_name)

def get_model_with_metrics(package_name, model_name, inputs):
    model = get_model(package_name, model_name, inputs)
    return get_metrics_for_model(model, inputs), model

In [75]:
datatype = []
package = []
model = []
accuracy = []
mae = []
mse = []
rmse = []

f1 = []
fnr = []
aucs = []

for data, models in models_to_run.items():
    for package_name, model_inputs in models.items():
        for model_name, inputs in model_inputs.items():          
            datatype.append(data)
            package.append(package_name)
            model.append(model_name)

            metrics, mdl = get_model_with_metrics(package_name, model_name, inputs)
            f1.append(metrics[1])
            fnr.append(metrics[0])
            aucs.append(metrics[2])
            accuracy.append(metrics[3])

            with open(f"picklefiles/{inputs['PickleFileName']}", 'wb') as input_file:
                pickle.dump(mdl, input_file)

Model: RandomForest, Epsilon: 0.1, Time Taken: 0:00:09.730074
Model: RandomForest, Epsilon: 0.2, Time Taken: 0:00:10.176580
Model: RandomForest, Epsilon: 0.3, Time Taken: 0:00:10.256929
Model: RandomForest, Epsilon: 0.4, Time Taken: 0:00:09.574286
Model: RandomForest, Epsilon: 0.5, Time Taken: 0:00:09.890543
Model: RandomForest, Epsilon: 0.6, Time Taken: 0:00:10.570636
Model: RandomForest, Epsilon: 0.7, Time Taken: 0:00:11.397912
Model: RandomForest, Epsilon: 0.8, Time Taken: 0:00:13.987103
Model: RandomForest, Epsilon: 0.9, Time Taken: 0:00:11.410411
Model: RandomForest, Epsilon: 1.0, Time Taken: 0:00:10.317713
Model: LogisticRegression, Epsilon: 0.1, Time Taken: 0:00:00.015513
Model: LogisticRegression, Epsilon: 0.2, Time Taken: 0:00:00.008089
Model: LogisticRegression, Epsilon: 0.3, Time Taken: 0:00:00.008063
Model: LogisticRegression, Epsilon: 0.4, Time Taken: 0:00:00.016493
Model: LogisticRegression, Epsilon: 0.5, Time Taken: 0:00:00.016571
Model: LogisticRegression, Epsilon: 0.6,

In [76]:
df_model_outputs_1 = pd.DataFrame({'DataType': datatype, 'Package': package, 'ModelName': model, 'F1': f1, 'FNR': fnr, 'AUC': aucs, 'Accuracy': accuracy})
df_model_outputs_1

Unnamed: 0,DataType,Package,ModelName,F1,FNR,AUC,Accuracy
0,Raw_Data,DiffPrivLib,RandomForest,0.25,0.834467,0.543308,0.670181
1,Raw_Data,DiffPrivLib,LogisticRegression,0.541446,0.303855,0.630485,0.608434
2,Raw_Data,SKLearn,RandomForest,0.780669,0.285714,0.828394,0.866717
3,Raw_Data,SKLearn,LogisticRegression,0.793727,0.253968,0.839758,0.871235
4,DP_Data,DiffPrivLib,RandomForest,0.250429,0.834467,0.543871,0.670934
5,DP_Data,DiffPrivLib,LogisticRegression,0.541446,0.303855,0.630485,0.608434
6,DP_Data,SKLearn,RandomForest,0.789346,0.260771,0.836356,0.868976
7,DP_Data,SKLearn,LogisticRegression,0.794686,0.253968,0.840321,0.871988
