# Social Ads

In [1]:
#! git clone https://github.com/cesar-claros/synergistic
#% cd synergistic/

## Dependencies 

In [None]:
#%%
# Command line installation
# ---------------------------
#! pip install torch
#! pip install gpytorch

# Imports
# ---------------------------
import io #Used as buffer
import sys
import matplotlib
# matplotlib.use('qt5Agg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import auxfunc.funcs as sgn
import seaborn as sns
import torch
import tensorflow as tf
from scipy.stats import entropy, spearmanr
from sklearn import model_selection, svm, ensemble, linear_model, pipeline, decomposition,\
     tree, neighbors, discriminant_analysis, gaussian_process, preprocessing
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, Matern
plt.style.use(['ggplot','style/style.mplstyle'])
import os
import random

# Trust Score request
# ---------------------------
# Downloads trustscore.py from the upstream repository and executes it in the
# notebook namespace; the `TrustScore` base class subclassed further below is
# expected to come from this code.
# NOTE(review): the URL tracks the moving `master` branch and the payload is
# exec'd without any integrity check. Consider pinning a commit hash or
# vendoring the file so results stay reproducible and the executed code is reviewed.
import urllib.request
ts_code  = 'https://raw.githubusercontent.com/google/TrustScore/master/trustscore.py'
# Context manager closes the HTTP response even if the read fails.
with urllib.request.urlopen(ts_code) as ts_req:
    read_req = ts_req.read()
exec(read_req)

## Auxiliary functions

In [3]:
#%%
# MODELS
# ====================
# Grid search for parameters and classifiers

# One entry per candidate estimator: (pipeline step name, estimator, search grid).
# Grid keys use the '<step name>__<parameter>' form so they address the named
# step of the pipeline assembled in init_model.
_model_specs = [
    ('SVM', svm.SVC(),
     {'SVM__kernel':['poly'],'SVM__degree':[3,4,5]}),
    # Kernel candidates are supplied by init_model, which knows the input dimension.
    ('GPClassifier', gaussian_process.GaussianProcessClassifier(),
     {'GPClassifier__kernel':[]}),
    ('LinReg', linear_model.LinearRegression(),
     {}),
    ('Lasso', linear_model.Lasso(),
     {'Lasso__alpha':np.linspace(0.01,1,10)}),
    ('SVR', svm.SVR(),
     {'SVR__kernel':['linear'], 'SVR__C':np.logspace(-1, 1, 10), 'SVR__epsilon':np.logspace(-2, 2, 10)}),
]

# Parallel lists: position i in 'predictor', 'name' and `parameters` describes the same model.
models = {
    'predictor': [spec[1] for spec in _model_specs],
    'name': [spec[0] for spec in _model_specs],
}
parameters = [spec[2] for spec in _model_specs]

In [4]:
#%%
# Signaling function fitting and evaluation
def signalingFunction(X_train, y_train, y_train_pred_th, X_val, y_val, y_val_pred_th, X_test, y_test, y_test_pred_th, kernel='exponential', norm='l01'):
    """Fit a signaling function and tabulate its validation and test results.

    The design matrices are expected to be scaled by the caller.

    Args:
      X_train, y_train, y_train_pred_th: training inputs, labels and the
        classifier's thresholded predictions, used to fit `sgn.signaling`.
      X_val, y_val, y_val_pred_th: validation counterparts, used by
        `evaluate` over a grid of rules and the budgets 0.1 and 0.15.
      X_test, y_test, y_test_pred_th: test counterparts, scored with the
        'rule' and 'eta' columns of the validation table.
      kernel: kernel name forwarded to `fit`.
      norm: norm name forwarded to the `sgn.signaling` constructor.

    Returns:
      (table, exp): the validation and test tables concatenated column-wise,
      and the fitted `sgn.signaling` object.
    """
    rule_grid = np.linspace(0,3,30, endpoint=False)
    rho_grid = [0.1, 0.15]

    exp = sgn.signaling(norm=norm)
    exp.fit(X_train, y_train, y_train_pred_th, kernel=kernel, n_iter=500, lr=0.01)

    table_val = exp.evaluate(X_val, y_val, y_val_pred_th, rule_grid=rule_grid, rho_grid=rho_grid)
    # The rule/threshold pairs reported on validation are re-used unchanged on test.
    val_rules = table_val['rule'].to_numpy()
    val_etas = table_val['eta'].to_numpy()
    table_test = exp.test(X_test, y_test, y_test_pred_th, val_rules, val_etas)

    return pd.concat([table_val,table_test],axis=1), exp

In [5]:
#%%
# Initialize model
def init_model(input_dim, models, parameters, clf):
    """Build a grid-searched, standardized pipeline for the requested classifier.

    Args:
      input_dim: number of input features; sizes the anisotropic RBF kernel
        used for the Gaussian-process classifier.
      models: dict with parallel lists under 'name' and 'predictor'.
      parameters: list of parameter grids, aligned with `models`.
        It is never modified by this function.
      clf: 'svm' or 'gpc'.

    Returns:
      An unfitted `GridSearchCV` (5-fold) wrapping scaler + estimator.

    Raises:
      ValueError: if `clf` is not one of the supported keys.
    """
    clf_index = {'svm': 0, 'gpc': 1}
    if clf not in clf_index:
        raise ValueError("clf must be one of {}, got {!r}".format(sorted(clf_index), clf))
    i = clf_index[clf]

    # Work on a copy of the grid: appending the kernel to the caller's list
    # would add one more duplicate candidate on every call (i.e. every CV fold).
    param_grid = dict(parameters[i])
    if clf == 'gpc':
        kernel = 1.0 * RBF(length_scale=1.0*np.ones(input_dim))
        existing = list(param_grid.get('GPClassifier__kernel', []))
        param_grid['GPClassifier__kernel'] = existing + [kernel]

    scaler = preprocessing.StandardScaler()
    steps = [('scaler', scaler), (models['name'][i], models['predictor'][i])]
    ppline = pipeline.Pipeline(steps) # define the pipeline object.

    return model_selection.GridSearchCV(ppline, param_grid=param_grid, cv=5)

In [6]:
#%%
# Soft and thresholded output predictions
def pred_output(model, X):
    """Return the soft and thresholded predictions of a fitted search object.

    Args:
      model: object exposing `best_estimator_`; whether `model` itself has a
        `decision_function` attribute selects the kind of soft output.
      X: samples to predict.

    Returns:
      (y_pred_soft, y_pred_th): the decision values as a single column when
      available, otherwise the `predict_proba` output as returned; and the
      labels from `predict`.
    """
    use_margin = hasattr(model, "decision_function")
    best = model.best_estimator_
    if use_margin:
        soft = best.decision_function(X)[:,None]
    else:
        soft = best.predict_proba(X)
    return soft, best.predict(X)

In [7]:
#%%
# Jaccard similarity index
def jaccard_similarity(list1, list2):
    """Jaccard index |A ∩ B| / |A ∪ B| of two collections of hashable items.

    Duplicates are ignored (inputs are converted to sets).

    Returns:
      A float in [0, 1]. When both inputs are empty the index is undefined and
      NaN is returned instead of raising ZeroDivisionError, matching the NaN
      used by the baseline comparison when a method signals nothing.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return float('nan')
    return len(s1 & s2) / len(union)

In [8]:
#%%
# Baseline comparison
def baselineCriteria(y_val, y_val_pred_soft, y_val_pred_th, y_test, y_test_pred_soft, y_test_pred_th, table, exp, clf,trust_val,trust_test):
      """Evaluate two baseline rejection criteria and compare them with the signaling function.

      Criterion 1 is derived from the classifier's soft output (absolute
      decision value for 'svm', base-2 entropy otherwise); criterion 2 is the
      trust score. For each row of `table` the instances signaled on the test
      set by the baseline are compared with those signaled by the signaling
      function's upper confidence bound (Jaccard index), and the two rankings
      are compared with Spearman's correlation.

      Args:
        y_val, y_test: true labels of the validation / test splits.
        y_val_pred_soft, y_test_pred_soft: soft classifier outputs.
        y_val_pred_th, y_test_pred_th: thresholded classifier predictions.
        table: signaling-function report; its 'rule' and 'eta' columns are read.
        exp: fitted signaling object; `gpr_mean_test` and `gpr_var_test` are read.
        clf: classifier key; 'svm' selects the margin criterion.
        trust_val, trust_test: trust scores of the validation / test splits.

      Returns:
        (crit_table, score_table): reports for criterion 1 and criterion 2,
        each extended with 'jaccard' and 'spearman' columns (criterion 1 also
        gets 'gamma').
      """
      if clf=='svm':
          # Margin criterion: a small |decision value| is treated as uncertain.
          direction = 'closer'
          crit_val  = np.abs(y_val_pred_soft.ravel())
          crit_test = np.abs(y_test_pred_soft.ravel())
      else:
          # Entropy criterion: a large entropy is treated as uncertain.
          # NOTE(review): pred_output returns predict_proba unchanged, so if the
          # soft output already has one column per class, p_val/p_test end up
          # with twice as many columns; scipy's entropy renormalizes each row.
          # Confirm this is intended.
          direction = 'further'
          p_val = np.concatenate((y_val_pred_soft,1-y_val_pred_soft),axis=1)
          crit_val  = entropy(p_val, axis=1, base=2)
          p_test    = np.concatenate((y_test_pred_soft,1-y_test_pred_soft),axis=1)
          crit_test = entropy(p_test, axis=1, base=2)
      # Criteria 1
      critFunc   = sgn.critEvaluation(norm='l01',direction=direction)
      d_val      = critFunc.evaluate(y_val, y_val_pred_th, crit_val, rho_grid=[0.1, 0.15])
      d_test     = critFunc.test(y_test, y_test_pred_th, crit_test, d_val['thresh'].to_numpy())
      crit_table = pd.concat([d_val,d_test],axis=1)   
      # Best rules from signaling function on val are used to get UCBs on test
      gamma      = table['rule'].to_numpy().reshape(-1,1)
      # Column vector gamma broadcasts against the test predictions: one UCB row per rule.
      # assumes exp.gpr_mean_test / exp.gpr_var_test are 1-D over test instances - TODO confirm
      f_test     = exp.gpr_mean_test + gamma*np.sqrt(exp.gpr_var_test)
      # Threshold values on val data. UCB signaling (eta) and new criteria (theta) 
      eta        = table['eta'].to_numpy().reshape(-1,1)
      theta      = crit_table['thresh'].to_numpy().reshape(-1,1)        
      # np.nonzero on the 2-D comparison returns (row, column) indices:
      # *_mask holds the threshold (row) index, *_idx the test-instance index.
      if direction == 'closer':
        crit_mask, crit_idx = np.nonzero(crit_test.reshape(1,-1)<theta)
      else:
        crit_mask, crit_idx = np.nonzero(crit_test.reshape(1,-1)>theta)
      f_mask, f_idx = np.nonzero(f_test>eta)
      # Rows for which both methods signal at least one instance; other rows get NaN below.
      shared = set(list(np.unique(f_mask))).intersection(set(list(np.unique(crit_mask))))
      # Jaccard index btw signaled instances using both methods for ith rule-threshold
      J = [jaccard_similarity(crit_idx[crit_mask==i],f_idx[f_mask==i]) if i in shared else np.nan for i in range(f_test.shape[0])]            
      crit_table['jaccard']=J
      # Rank agreement between each UCB row and the baseline criterion over all test instances.
      Sp = [spearmanr(f_test[i,:],crit_test)[0] for i in range(f_test.shape[0])]
      crit_table['spearman'] = Sp
      crit_table['gamma'] = gamma
        
      # Criteria 2
      # Trust score: instances whose score falls below the threshold are signaled.
      critFuncSc = sgn.critEvaluation(norm='l01',direction='closer')
      s_val      = critFuncSc.evaluate(y_val, y_val_pred_th, trust_val, rho_grid=[0.1, 0.15])
      s_test     = critFuncSc.test(y_test, y_test_pred_th, trust_test, s_val['thresh'].to_numpy())
      score_table= pd.concat([s_val,s_test],axis=1)
      # Threshold values on val data. TrustScore (theta0) 
      theta0     = score_table['thresh'].to_numpy().reshape(-1,1) 
      crit_mask0, crit_idx0 = np.nonzero(trust_test.reshape(1,-1)<theta0)
      # Jaccard index btw signaled instances using both methods for ith rule-threshold  
      shared0    = set(list(np.unique(f_mask))).intersection(set(list(np.unique(crit_mask0))))
      J0         = [jaccard_similarity(crit_idx0[crit_mask0==i],f_idx[f_mask==i]) if i in shared0 else np.nan for i in range(f_test.shape[0])]
      Sp0        = [spearmanr(f_test[i,:],trust_test)[0] for i in range(f_test.shape[0])]
      score_table['jaccard']  = J0 
      score_table['spearman'] = Sp0      
      return crit_table,score_table

In [9]:
#%%
# Trust score adaptation for python3 (xrange)
class trust_score(TrustScore):
    """Python 3 adaptation of `TrustScore`: `fit` and `get_score` are re-implemented with `range`."""

    def __init__(self,k=10, alpha=0., filtering="none", min_dist=1e-12):
        super().__init__(k,alpha,filtering,min_dist)

    def fit(self, X, y):
        """Initialize trust score precomputations with training data.

        WARNING: assumes that the labels are 0-indexed (i.e.
        0, 1,..., n_labels-1).

        Args:
          X: an array of sample points.
          y: corresponding labels.

        Raises:
          ValueError: if `self.filtering` is not "none", "density" or
            "uncertainty".
        """
        self.n_labels = int(np.max(y)) + 1
        self.kdtrees = [None] * self.n_labels
        if self.filtering == "uncertainty":
            X_filtered, y_filtered = self.filter_by_uncertainty(X, y)
        for label in range(self.n_labels):
            if self.filtering == "none":
                X_to_use = X[np.where(y == label)[0]]
            elif self.filtering == "density":
                X_to_use = self.filter_by_density(X[np.where(y == label)[0]])
            elif self.filtering == "uncertainty":
                X_to_use = X_filtered[np.where(y_filtered == label)[0]]
            else:
                # Previously surfaced as a NameError on X_to_use.
                raise ValueError("Unknown filtering mode: {!r}".format(self.filtering))

            # Warn before building the tree so the hint is shown even if the
            # tree construction rejects an empty point set.
            if len(X_to_use) == 0:
                print("Filtered too much or missing examples from a label! Please lower alpha or check data.")
            self.kdtrees[label] = KDTree(X_to_use)

    def get_score(self, X, y_pred):
        """Compute the trust scores.

        Given a set of points, determines the distance to each class.

        Args:
          X: an array of sample points.
          y_pred: The predicted labels for these points (integer class indices).

        Returns:
          The trust score, which is ratio of distance to closest class that was not
          the predicted class to the distance to the predicted class, as a
          float array.
        """
        # Float storage (instead of an object array filled with None) keeps the
        # sort, comparison and division below on numeric dtypes.
        d = np.empty((X.shape[0], self.n_labels), dtype=float)
        for label_idx in range(self.n_labels):
            # Distance to the farthest of the 2 nearest neighbours within this class.
            d[:, label_idx] = self.kdtrees[label_idx].query(X, k=2)[0][:, -1]

        sorted_d = np.sort(d, axis=1)
        d_to_pred = d[np.arange(d.shape[0]), y_pred]
        # If the nearest class is the predicted one, fall back to the second nearest.
        d_to_closest_not_pred = np.where(sorted_d[:, 0] != d_to_pred,
                                         sorted_d[:, 0], sorted_d[:, 1])
        return d_to_closest_not_pred / (d_to_pred + self.min_dist)

## Signaling function & baselines

In [10]:
#%%
# Load data
# ---------------------------
# NOTE(review): read_table splits on tabs by default; presumably the file is
# tab-delimited despite its .csv extension - confirm.
df = pd.read_table('datasets/Social_Network_Ads.csv')
# Features and label are picked by column position; the surface plots below
# label the two features as Age and Salary.
Data_X = df.iloc[:,[2,3]].to_numpy()
Data_y = df.iloc[:,4].to_numpy()

# Seed definition for reproducibility
# ---------------------------
SEED = 123
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed every random number generator used by the notebook's dependencies.
for _seed_fn in (random.seed,
                 np.random.seed,
                 tf.random.set_seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(SEED)

In [None]:
#%%
# Cross validation setup
# ---------------------------
report_table    = []      # per-fold signaling-function tables
report_criteria = []      # per-fold soft-output baseline tables
trust_criteria  = []      # per-fold trust-score baseline tables
addPredictions  = True    # append the classifier's soft output to the signaling design matrix
applyPCA        = False   # no PCA step exists in this pipeline; the flag only tags the figure file names
clf      = 'gpc'
accuracy = 0              # best validation accuracy seen so far
fold     = 1
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# ML model fit + Signaling function + Baseline comparison
# ---------------------------
for sample, test in kf.split(Data_X, Data_y):
    # Outer split: held-out test fold; inner split: train/validation.
    X = Data_X[sample]
    y = Data_y[sample]
    X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.20, random_state=SEED)
    X_test = Data_X[test]
    y_test = Data_y[test]

    # ML Model fit & prediction
    model = init_model(input_dim=X.shape[1], models=models, parameters=parameters, clf=clf)
    model.fit(X_train, y_train)
    y_train_pred_soft, y_train_pred_th = pred_output(model, X_train)
    y_val_pred_soft  , y_val_pred_th   = pred_output(model, X_val)
    y_test_pred_soft , y_test_pred_th  = pred_output(model, X_test)
    print('accuracy(Train)={}'.format(np.sum(y_train==y_train_pred_th)/np.size(y_train)))

    # Signaling function. Design matrix
    X_train_GP = X_train
    X_val_GP   = X_val
    X_test_GP  = X_test
    if addPredictions:
        X_train_GP = np.concatenate((X_train_GP, y_train_pred_soft), axis=1)
        X_val_GP   = np.concatenate((X_val_GP, y_val_pred_soft), axis=1)
        X_test_GP  = np.concatenate((X_test_GP, y_test_pred_soft), axis=1)
    # Scaler statistics come from the training split only.
    scaleX_GP  = preprocessing.StandardScaler().fit(X_train_GP)
    X_train_GP = scaleX_GP.transform(X_train_GP)
    X_val_GP   = scaleX_GP.transform(X_val_GP)
    X_test_GP  = scaleX_GP.transform(X_test_GP)

    # Signaling function. Call
    table, exp = signalingFunction(X_train_GP, y_train, y_train_pred_th, X_val_GP, y_val, y_val_pred_th, X_test_GP, y_test, y_test_pred_th)
    report_table.append(pd.concat([pd.DataFrame({'fold':[fold]*table.shape[0]}),table],axis=1))

    # Trust Score fitted on train data to evaluate loss reduction on val-test data
    trust_model = trust_score()
    trust_model.fit(X=X_train_GP,y=y_train)
    trust_val  = trust_model.get_score(X_val_GP, y_val_pred_th)
    trust_test = trust_model.get_score(X_test_GP, y_test_pred_th)

    # Baseline for comparison
    crit_table,score_table = baselineCriteria(y_val, y_val_pred_soft, y_val_pred_th, y_test, y_test_pred_soft, y_test_pred_th, table, exp, clf, trust_val, trust_test)
    report_criteria.append(pd.concat([pd.DataFrame({'fold':[fold]*crit_table.shape[0]}),crit_table],axis=1))
    trust_criteria.append(pd.concat([pd.DataFrame({'fold':[fold]*score_table.shape[0]}),score_table],axis=1))

    # Keep the artefacts of the fold with the best validation accuracy for the
    # surface plots; the score is computed once instead of twice.
    val_accuracy = model.best_estimator_.score(X_val,y_val)
    if accuracy < val_accuracy:
        accuracy = val_accuracy
        classifier = model.best_estimator_
        X_test_surface_plot = X_test
        y_test_surface_plot = y_test
        X_train_surface_plot = X_train
        y_train_surface_plot = y_train
        X_val_surface_plot = X_val
        y_val_surface_plot = y_val
        scaler_surface_plot = scaleX_GP
        exp_surface_plot = exp
        table_surface_plot = table
    fold +=1
    del model

In [12]:
# Stack the per-fold signaling-function tables. The sorted view below is only
# displayed: sort_values returns a new frame and report_table_concat keeps its order.
report_table_concat = pd.concat(report_table)
report_table_concat.sort_values(by=['fold','rho_user'])

Unnamed: 0,fold,rho_user,rule,corrected_val,queries_val,total_wrong_val,loss_query_val,rho_hat_val,%loss_red_val,eta,p_value,corrected_test,queries_test,total_wrong_test,loss_query_test,rho_hat_test,%loss_red_test
0,1,0.1,0.0,1.0,6,6.0,0.17,0.09,16.67,0.249682,0.269006,4.0,10,10.0,0.4,0.12,40.0
1,1,0.15,0.0,1.0,9,6.0,0.11,0.14,16.67,0.171871,0.431673,5.0,12,10.0,0.42,0.15,50.0
0,2,0.1,0.0,3.0,6,11.0,0.5,0.09,27.27,0.258228,0.013808,4.0,9,6.0,0.44,0.11,66.67
1,2,0.15,0.1,5.0,9,11.0,0.56,0.14,45.45,0.247508,0.000576,4.0,10,6.0,0.4,0.12,66.67
0,3,0.1,0.0,1.0,6,6.0,0.17,0.09,16.67,0.197382,0.269006,5.0,13,9.0,0.38,0.16,55.56
1,3,0.15,0.0,1.0,9,6.0,0.11,0.14,16.67,0.075075,0.431673,6.0,23,9.0,0.26,0.29,66.67
0,4,0.1,0.0,1.0,6,4.0,0.17,0.09,25.0,0.260077,0.142057,3.0,10,9.0,0.3,0.12,33.33
1,4,0.15,0.0,1.0,9,4.0,0.11,0.14,25.0,0.169816,0.267048,5.0,18,9.0,0.28,0.22,55.56
0,5,0.1,0.0,0.0,6,4.0,0.0,0.09,0.0,0.271372,0.753838,3.0,9,5.0,0.33,0.11,60.0
1,5,0.15,0.0,0.0,9,4.0,0.0,0.14,0.0,0.24183,0.802906,3.0,12,5.0,0.25,0.15,60.0


In [13]:
# Stack the per-fold tables of the soft-output baseline (criterion 1). The sorted
# view below is only displayed; report_criteria_concat keeps its original order.
report_criteria_concat = pd.concat(report_criteria)
report_criteria_concat.sort_values(by=['fold','rho_user'])

Unnamed: 0,fold,rho_user,corrected_val,queries_val,total_wrong_val,loss_query_val,rho_hat_val,%loss_red_val,thresh,corrected_test,queries_test,total_wrong_test,loss_query_test,rho_hat_test,%loss_red_test,jaccard,spearman,gamma
0,1,0.1,1.0,6,6.0,0.17,0.09375,16.67,1.83868,3.0,6,10.0,0.5,0.08,30.0,0.6,0.94371,0.0
1,1,0.15,1.0,9,6.0,0.11,0.140625,16.67,1.802707,3.0,7,10.0,0.43,0.09,30.0,0.461538,0.94371,0.0
0,2,0.1,3.0,6,11.0,0.5,0.09375,27.27,1.829422,4.0,9,6.0,0.44,0.11,66.67,0.636364,0.844183,0.0
1,2,0.15,5.0,9,11.0,0.56,0.140625,45.45,1.778266,4.0,11,6.0,0.36,0.14,66.67,0.75,0.86551,0.1
0,3,0.1,1.0,6,6.0,0.17,0.09375,16.67,1.785725,5.0,12,9.0,0.42,0.15,55.56,0.666667,0.786528,0.0
1,3,0.15,1.0,9,6.0,0.11,0.140625,16.67,1.687872,6.0,18,9.0,0.33,0.22,66.67,0.464286,0.786528,0.0
0,4,0.1,2.0,6,4.0,0.33,0.09375,50.0,1.860635,4.0,7,9.0,0.57,0.09,44.44,0.416667,0.776643,0.0
1,4,0.15,2.0,9,4.0,0.22,0.140625,50.0,1.796995,4.0,13,9.0,0.31,0.16,44.44,0.55,0.776643,0.0
0,5,0.1,1.0,6,4.0,0.17,0.09375,25.0,1.815566,1.0,7,5.0,0.14,0.09,20.0,0.333333,0.881944,0.0
1,5,0.15,1.0,9,4.0,0.11,0.140625,25.0,1.639567,4.0,22,5.0,0.18,0.28,80.0,0.36,0.881944,0.0


In [14]:
# Stack the per-fold tables of the trust-score baseline (criterion 2). The sorted
# view below is only displayed; trust_criteria_concat keeps its original order.
trust_criteria_concat = pd.concat(trust_criteria)
trust_criteria_concat.sort_values(by=['fold','rho_user'])

Unnamed: 0,fold,rho_user,corrected_val,queries_val,total_wrong_val,loss_query_val,rho_hat_val,%loss_red_val,thresh,corrected_test,queries_test,total_wrong_test,loss_query_test,rho_hat_test,%loss_red_test,jaccard,spearman
0,1,0.1,1.0,6,6.0,0.17,0.09375,16.67,1.501108,5.0,9,10.0,0.56,0.11,50.0,0.357143,-0.556193
1,1,0.15,2.0,9,6.0,0.22,0.140625,33.33,2.219297,5.0,14,10.0,0.36,0.18,50.0,0.368421,-0.556193
0,2,0.1,3.0,6,11.0,0.5,0.09375,27.27,1.7317,3.0,9,6.0,0.33,0.11,50.0,0.384615,-0.550482
1,2,0.15,4.0,9,11.0,0.44,0.140625,36.36,2.02298,4.0,13,6.0,0.31,0.16,66.67,0.533333,-0.575373
0,3,0.1,1.0,6,6.0,0.17,0.09375,16.67,1.738069,4.0,13,9.0,0.31,0.16,44.44,0.529412,-0.657407
1,3,0.15,1.0,9,6.0,0.11,0.140625,16.67,2.136126,4.0,14,9.0,0.29,0.18,44.44,0.541667,-0.657407
0,4,0.1,1.0,6,4.0,0.17,0.09375,25.0,1.434645,2.0,6,9.0,0.33,0.08,22.22,0.333333,-0.494055
1,4,0.15,1.0,9,4.0,0.11,0.140625,25.0,1.978568,4.0,14,9.0,0.29,0.18,44.44,0.454545,-0.494055
0,5,0.1,1.0,6,4.0,0.17,0.09375,25.0,1.482466,3.0,10,5.0,0.3,0.12,60.0,0.583333,-0.86373
1,5,0.15,2.0,9,4.0,0.22,0.140625,50.0,1.808942,3.0,11,5.0,0.27,0.14,60.0,0.533333,-0.86373


## Output visualization 

In [None]:
# Install LaTeX tooling (presumably for matplotlib's LaTeX text rendering - confirm against the style sheet).
# -y answers apt's confirmation prompt, which cannot be answered from a notebook cell;
# unzip -o and mkdir -p let the cell be re-run without prompting or failing on existing files.
! sudo apt-get install -y texlive-latex-recommended #1
! sudo apt-get install -y dvipng texlive-fonts-recommended #2
! wget http://mirrors.ctan.org/macros/latex/contrib/type1cm.zip #3
! unzip -o type1cm.zip -d /tmp/type1cm #4
! cd /tmp/type1cm/type1cm/ && sudo latex type1cm.ins  #5
! sudo mkdir -p /usr/share/texmf/tex/latex/type1cm #6
! sudo cp /tmp/type1cm/type1cm/type1cm.sty /usr/share/texmf/tex/latex/type1cm #7
! sudo texhash #8

In [None]:
# Box plots: test loss reduction per budget and method (left) and Jaccard index per budget (right).
# NOTE(review): this cell reads `df` with columns 'rho_user', '%reduction_test' and
# 'label', and a separate `df_jaccard` frame. In the visible notebook `df` is the raw
# dataset and `df_jaccard` is never created, and the report tables name the column
# '%loss_red_test' - presumably a results frame has to be assembled first. TODO confirm.
fig, ax = plt.subplots(1,2,figsize=(15, 5.1), constrained_layout=False, dpi=90)
pal = sns.color_palette('Paired')
sns.boxplot(x=df['rho_user'], y=df['%reduction_test'], hue='label', data=df, ax=ax[0], palette=pal)
ax[0].set_xlabel(r'budget $\rho$')
ax[0].set_ylabel(r'Loss reduction $r_{test}(\%)$')
ax[0].legend(loc='upper left')
pal = sns.color_palette('BuGn_r')
sns.boxplot(x=df_jaccard['rho_user'], y=df_jaccard['jaccard'], data=df_jaccard, ax=ax[1], palette=pal)
ax[1].set_xlabel(r'budget $\rho$')
ax[1].set_ylabel(r'Jaccard index $J$')
plt.tight_layout()
# `applyPCA` must be defined before this cell runs; it only tags the output file name.
# NOTE(review): the output directory is a hard-coded Google Drive path.
path_fig_fxgx = "drive/My Drive/NIPS2020/results/socialadsnet/fig_fxgx_{clf}_yhat{yhat}_pca{pca}.pdf".format(clf=clf, pca=applyPCA, yhat=addPredictions)
plt.savefig(path_fig_fxgx, bbox_inches='tight', facecolor='w')

In [None]:
  #%%
# PLOT DECISION SURFACE
# ==================
# Plot training instances and decision surface
# ----------------------------------------------
# Visualising the Train set results
fig1 = plt.figure(figsize=(10,8),dpi=120)
ax1 = fig1.add_subplot(111)
X_set = X_train_surface_plot
# Recode labels 0 -> -1 on a copy so the labels stored by the cross-validation
# cell are not modified in place.
y_set = np.where(y_train_surface_plot == 0, -1, y_train_surface_plot)
has_margin = hasattr(classifier, "decision_function")
if has_margin:
    # Hinge-style slack: 1 - y * f(x)
    d_set = 1-y_set*classifier.decision_function(X_set)
else:
    # +1 when the prediction agrees with the label, -1 otherwise
    pred_x = classifier.predict(X_set)
    pred_x[pred_x==0] = -1
    d_set = y_set*pred_x
xi_set = np.maximum(0.0, d_set).ravel()

# Evaluation grid; X1, X2, Z and the axis limits are reused by the validation and test plots.
aranged_ages = np.arange(start = X_set[:, 0].min()-5, stop = X_set[:, 0].max()+5, step = 0.025)
aranged_salaries = np.arange(start = X_set[:, 1].min()-4000, stop = X_set[:, 1].max()+4000, step = 500)

X1, X2 = np.meshgrid(aranged_ages, aranged_salaries)
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
Z = classifier.predict(grid_points).reshape(X1.shape)
if has_margin:
    d = classifier.decision_function(grid_points).reshape(X1.shape)
    band = 1                      # margin half-width in decision-function units
    misclassified = xi_set>1      # slack above 1 <=> wrong side of the boundary
else:
    d = classifier.predict_proba(grid_points)[:, 1].reshape(X1.shape)-0.5
    band = 0.25                   # probability band around 0.5
    misclassified = xi_set==0     # agreement of -1 is clipped to 0
# Shade the band around the boundary (contourf has no `label` argument, so none is passed).
ax1.contourf(X1, X2, np.where((np.abs(d)<band),np.abs(d),np.nan), alpha = 0.6, cmap='gist_gray')
ax1.contour(X1, X2, d, levels=[-band, 0, band], colors='black', linestyles='dashed')
ax1.contourf(X1, X2, Z, alpha = 0.3, cmap = matplotlib.colors.ListedColormap(('red', 'blue')))

dots = ['red','blue']
for i, j in enumerate(np.unique(y_set)):
    ax1.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = dots[i], label = '{}'.format('no purchase' if i==0 else 'purchase'),alpha=0.7, s=65, linewidths=5)
# Circle the misclassified instances (local names avoid overwriting the notebook-level `y`).
ax1.scatter(X_set[misclassified, 0], X_set[misclassified, 1], s=150, facecolors='none', edgecolors='g', label='misclassified', linewidths=5)

ax1.set_xlabel('Age', fontsize=40)
ax1.set_ylabel('Salary', fontsize=40)
ax1.set_title('Training set', fontsize=40)
ax1.legend(loc='lower left', framealpha=0.5, prop={'size': 30}, labelspacing=0.0)
formatter0 = matplotlib.ticker.EngFormatter()
ax1.yaxis.set_major_formatter(formatter0)
X1_min, X1_max = X1.min(), X1.max()
X2_min, X2_max = X2.min(), X2.max()
ax1.set_xlim(X1_min, X1_max)
ax1.set_ylim(X2_min, X2_max)
plt.tight_layout()
# `applyPCA` must be defined before this cell runs; it only tags the output file name.
path_fig_fx_train = "drive/My Drive/NIPS2020/results/socialadsnet/fig_fx_train_{clf}_yhat{yhat}_pca{pca}.svg".format(clf=clf, pca=applyPCA, yhat=addPredictions)
plt.savefig(path_fig_fx_train, bbox_inches='tight', facecolor='w')

In [None]:
#%%
# PLOT DECISION SURFACE
# ==================
# Plot validation instances and decision surface
# ----------------------------------------------
# Visualising the Validation set results
# Reuses the grid (X1, X2), the grid predictions Z and the axis limits computed
# by the training-set plot.
fig1 = plt.figure(figsize=(8,8),dpi=120)
ax1 = fig1.add_subplot(111)
X_set = X_val_surface_plot
# Recode labels 0 -> -1 on a copy so the labels stored by the cross-validation
# cell are not modified in place.
y_set = np.where(y_val_surface_plot == 0, -1, y_val_surface_plot)
has_margin = hasattr(classifier, "decision_function")
if has_margin:
    # Hinge-style slack: 1 - y * f(x)
    d_set = 1-y_set*classifier.decision_function(X_set)
else:
    # +1 when the prediction agrees with the label, -1 otherwise
    pred_x = classifier.predict(X_set)
    pred_x[pred_x==0] = -1
    d_set = y_set*pred_x
xi_set = np.maximum(0.0, d_set).ravel()

grid_points = np.column_stack((X1.ravel(), X2.ravel()))
if has_margin:
    d = classifier.decision_function(grid_points).reshape(X1.shape)
    band = 1                      # margin half-width in decision-function units
    misclassified = xi_set>1      # slack above 1 <=> wrong side of the boundary
else:
    d = classifier.predict_proba(grid_points)[:, 1].reshape(X1.shape)-0.5
    band = 0.25                   # probability band around 0.5
    misclassified = xi_set==0     # agreement of -1 is clipped to 0
# Shade the band around the boundary (contourf has no `label` argument, so none is passed).
ax1.contourf(X1, X2, np.where((np.abs(d)<band),np.abs(d),np.nan), alpha = 0.6, cmap='gist_gray')
ax1.contour(X1, X2, d, levels=[-band, 0, band], colors='black', linestyles='dashed')
ax1.contourf(X1, X2, Z, alpha = 0.3, cmap = matplotlib.colors.ListedColormap(('red', 'blue')))

dots = ['red','blue']
for i, j in enumerate(np.unique(y_set)):
    ax1.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = dots[i], label = '{}'.format('no purchase' if i==0 else 'purchase'),alpha=0.7,s=150, marker='*', linewidths=5)
# Circle the misclassified instances (local names avoid overwriting the notebook-level `y`).
ax1.scatter(X_set[misclassified, 0], X_set[misclassified, 1], s=150, facecolors='none', edgecolors='g', label='misclassified', linewidths=5)

ax1.set_xlabel('Age',fontsize=40)
ax1.set_yticks([])
ax1.set_title('Validation set',fontsize=40)
ax1.legend(loc='lower left', framealpha=0.5, prop={'size': 30},labelspacing=0.0)
ax1.set_xlim(X1_min, X1_max)
ax1.set_ylim(X2_min, X2_max)
plt.tight_layout()
# `applyPCA` must be defined before this cell runs; it only tags the output file name.
path_fig_fx_val = "drive/My Drive/NIPS2020/results/socialadsnet/fig_fx_val_{clf}_yhat{yhat}_pca{pca}.svg".format(clf=clf, pca=applyPCA, yhat=addPredictions)
plt.savefig(path_fig_fx_val, bbox_inches='tight', facecolor='w')

In [None]:
# PLOT DECISION SURFACE
# ==================
# Plot test instances, decision boundary and signaling-function level sets
# ----------------------------------------------
# Visualising the Test set results
# Reuses the grid (X1, X2), the grid predictions Z and the axis limits computed
# by the training-set plot.
t = np.stack((X1.ravel(), X2.ravel()), axis=1)
has_margin = hasattr(classifier, "decision_function")
# Classifier soft output on the grid, computed once and used both for the
# boundary contour and for the signaling design matrix.
if has_margin:
    soft_grid = classifier.decision_function(t)[:, None]
    d = soft_grid[:, 0].reshape(X1.shape)
    boundary_level = 0
else:
    soft_grid = classifier.predict_proba(t)
    d = soft_grid[:, 1].reshape(X1.shape)
    boundary_level = 0.5

# The scaler and the signaling GP were fitted on [features, soft output] when
# addPredictions is set, so the grid needs the same extra columns; passing the
# two raw features alone would not match the fitted feature count.
if addPredictions:
    t_design = np.concatenate((t, soft_grid), axis=1)
else:
    t_design = t
t_scaled = scaler_surface_plot.transform(t_design)
f,v = exp_surface_plot.gpr.predict(t_scaled)
f = f.reshape(X1.shape)

fig1 = plt.figure(figsize=(10,8),dpi=120)
ax1 = fig1.add_subplot(111)
X_set = X_test_surface_plot
# Recode labels 0 -> -1 on a copy so the labels stored by the cross-validation
# cell are not modified in place.
y_set = np.where(y_test_surface_plot == 0, -1, y_test_surface_plot)
if has_margin:
    # Hinge-style slack: 1 - y * f(x); above 1 <=> wrong side of the boundary
    d_set = 1-y_set*classifier.decision_function(X_set)
    xi_set = np.maximum(0.0, d_set).ravel()
    misclassified = xi_set>1
else:
    # +1 when the prediction agrees with the label, -1 (clipped to 0) otherwise
    pred_x = classifier.predict(X_set)
    pred_x[pred_x==0] = -1
    d_set = y_set*pred_x
    xi_set = np.maximum(0.0, d_set).ravel()
    misclassified = xi_set==0
ax1.contour(X1, X2, d, levels=[boundary_level], colors='black', linestyles='dashed')
ax1.contourf(X1, X2, Z, alpha = 0.3, cmap = matplotlib.colors.ListedColormap(('red', 'blue')))

# Contour levels: the validation thresholds (eta) plus the surface maximum,
# rounded and de-duplicated (np.unique also sorts them in increasing order).
f_set = table_surface_plot['eta'].to_numpy()[::-1]
f_set = np.unique(np.around(np.append(f_set, f.max()), decimals=2))
cs = ax1.contourf(X1, X2, f, f_set, origin='upper', cmap='gray', alpha=0.5)
ax1.contour(X1, X2, f, f_set, colors='black')
cbar = fig1.colorbar(cs, pad=0.0, shrink=0.80)
cbar.ax.set_title(r'$f$', fontsize=40, loc='left')
cbar.set_label(r'$\eta$', labelpad=-10, y=1.10, rotation=0, fontsize=30)
dots = ['red','blue']
for i, j in enumerate(np.unique(y_set)):
    ax1.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = dots[i], label = '{}'.format('no purchase' if i==0 else 'purchase'), alpha=0.7, s=150,marker='x', linewidth=5)
# Circle the misclassified instances (local names avoid overwriting the notebook-level `y`).
ax1.scatter(X_set[misclassified, 0], X_set[misclassified, 1], s=150, facecolors='none', edgecolors='g', label='misclassified', linewidths=5)

ax1.set_xlabel('Age',fontsize=40)
ax1.set_yticks([])
ax1.set_title('Test set',fontsize=40)
ax1.legend(loc='lower left', framealpha=0.5, prop={'size': 30}, labelspacing=0.0)
ax1.set_xlim(X1_min, X1_max)
ax1.set_ylim(X2_min, X2_max)
plt.tight_layout()
# `applyPCA` must be defined before this cell runs; it only tags the output file name.
path_fig_fx_test = "drive/My Drive/NIPS2020/results/socialadsnet/fig_fx_test_{clf}_yhat{yhat}_pca{pca}.svg".format(clf=clf, pca=applyPCA, yhat=addPredictions)
plt.savefig(path_fig_fx_test, bbox_inches='tight', facecolor='w')

In [None]:
def _group_by_row_label(frames):
    """Stack the per-fold tables and group the rows that share an index label."""
    stacked = pd.concat(frames)
    return stacked, stacked.groupby(stacked.index)

# Across-fold statistics of the signaling-function tables, one row per index label
# (in the tables displayed above each label corresponds to one rho_user budget).
report_table_concat, table_by_row_index = _group_by_row_label(report_table)
report_table_mean, report_table_std, report_table_median = (
    table_by_row_index.mean(),
    table_by_row_index.std(),
    table_by_row_index.median(),
)
report_table_q1, report_table_q3 = (
    table_by_row_index.quantile(q=q) for q in (0.25, 0.75)
)

# Same statistics for the soft-output baseline tables.
report_criteria_concat, table_by_row_index = _group_by_row_label(report_criteria)
report_criteria_mean, report_criteria_std, report_criteria_median = (
    table_by_row_index.mean(),
    table_by_row_index.std(),
    table_by_row_index.median(),
)
report_criteria_q1, report_criteria_q3 = (
    table_by_row_index.quantile(q=q) for q in (0.25, 0.75)
)