In [328]:
%load_ext autoreload
%autoreload 2
import sys,os
sys.path.append('../')   
import numpy as np
import pandas as pd
from model.erroneousPreference import erroneousPreference
from kernel import RBF
from utility import  paramz
from sklearn.model_selection import train_test_split, GroupShuffleSplit
# for plotting
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import arviz as az
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GroupShuffleSplit 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [329]:
df = pd.read_csv("datasets/train.csv",index_col=0)
df.reset_index(drop=True,inplace=True)
print(df.shape)
df.head()

(2929, 11)


Unnamed: 0,id,choiceid,choice,price_A,time_A,change_A,comfort_A,price_B,time_B,change_B,comfort_B
0,1,1,A,2400,150,0,1,4000,150,0,1
1,1,2,A,2400,150,0,1,3200,130,0,1
2,1,3,A,2400,115,0,1,4000,115,0,0
3,1,4,B,4000,130,0,1,3200,150,0,0
4,1,5,B,2400,150,0,1,3200,150,0,0


In [330]:
df[(df.time_A==150) & (df.change_A==1) & (df.time_B==150) & (df.change_B==1)]

Unnamed: 0,id,choiceid,choice,price_A,time_A,change_A,comfort_A,price_B,time_B,change_B,comfort_B
242,21,13,A,2240,150,1,1,2520,150,1,0
524,43,2,A,3500,150,1,2,3850,150,1,1
691,56,5,A,5750,150,1,1,5000,150,1,2
2568,206,14,B,3000,150,1,0,2100,150,1,1
2839,228,17,B,3900,150,1,0,2600,150,1,1


## Split train and test

In [331]:
splitter = GroupShuffleSplit(test_size=0.5, n_splits=1, random_state = 10)
split = splitter.split(df, groups=df['id'])
ind_train, ind_test = next(split)
df_train = df.iloc[ind_train,:]
df_test  = df.iloc[ind_test,:]

## Make preferences

In [332]:
def compute_pair(df):
    Xa = np.unique(df[['price_A', 'time_A', 'change_A',
       'comfort_A']],axis=0)
    Xb = np.unique(df[['price_B', 'time_B', 'change_B',
           'comfort_B']],axis=0)
    X = np.unique(np.vstack([Xa,Xb]),axis=0)# objects
    Pair = []
    for el in range(0,df.shape[0]):
        rowi=df.iloc[el][['price_A', 'time_A', 'change_A',
           'comfort_A']]
        rowj=df.iloc[el][['price_B', 'time_B', 'change_B',
           'comfort_B']]
        i = np.where( np.isin(X, rowi).all(axis=1))[0][0]
        j = np.where( np.isin(X, rowj).all(axis=1))[0][0]
        if df.iloc[el].choice=="A":
            Pair.append([i,j])
        else:
            Pair.append([j,i])
    Pair=np.vstack(Pair)
    return Pair, X


Pair_tr,X_tr = compute_pair(df_train)
Pair_te,X_te = compute_pair(df_test)




In [334]:
preprocessor = ColumnTransformer(
        transformers=[
            ('std', StandardScaler() , [0,1]),
        ], remainder='passthrough')

scalerx = preprocessor.fit(np.vstack([X_tr,X_te]))
X_tr_n = scalerx.transform(X_tr)
X_te_n = scalerx.transform(X_te)

In [337]:
lengthscale = np.array([2.46811807e+00, 1.06970835e+01, 9.61723050e-03, 4.12986443e+00])
variance = 14.0
lengthscale = np.array([2.45749795e+00, 6.42781558e+00, 1.05724232e+01, 3.07650136e-03])
variance = 7.28
lengthscale =np.array([4.27103877e+00, 1.06897984e+01, 9.97192222e-06, 4.78859211e+00])
variance = 10.0
#0.1+np.random.rand(1,data["X"].shape[1]) 
#lengthscale = np.array([3.14554881e+01, 6.84333683e-04, 1.58214102e-04,   2.54469097e+01])

In [338]:
# data dictionary
data = {}
data["Pairs"] = Pair_tr
data["X"] = X_tr_n

# define kernel and hyperparams
Kernel = RBF

# kernel parameter dictionary
params={}
params['lengthscale']={'value':lengthscale, #0.1+np.random.rand(1,data["X"].shape[1]) ,
                            'range':np.vstack([[1e-6, 40.0]]*data["X"].shape[1]),
                            'transform': paramz.logexp()}
params['variance']={'value':np.array([variance]), 
                            'range':np.vstack([[0.1, 80.0]]),
                            'transform': paramz.logexp()}



# define preference model 
model = erroneousPreference(data,Kernel,params,inf_method="laplace",jitter=1e-6)
# compute hyperparameters
model.optimize_hyperparams(num_restarts=1,niterations=50) 
print(model.params)
# sample from posterior
#model.sample(nsamples=5000, tune=500)

  0%|          | 0/50 [00:00<?, ?it/s]

Iteration 0  -878.0389344545171
{'lengthscale': {'value': array([4.20778564e+00, 8.38412972e+00, 9.97192222e-06, 3.63928156e+00]), 'range': array([[1.e-06, 4.e+01],
       [1.e-06, 4.e+01],
       [1.e-06, 4.e+01],
       [1.e-06, 4.e+01]]), 'transform': <utility.paramz.logexp object at 0x7f1b9cb87910>}, 'variance': {'value': array([15.26707598]), 'range': array([[ 0.1, 80. ]]), 'transform': <utility.paramz.logexp object at 0x7f1b9cb87c70>}}


In [219]:
model.sample(nsamples=2000, tune=1000)

100%|██████████| 3000/3000 [06:19<00:00,  7.91it/s]


In [220]:
upred = np.mean(model.predict(X_te_n),axis=1)
upred.shape

(588,)

In [221]:
Acc = []
for i in range(Pair_te.shape[0]):
    if upred[Pair_te[i][0]]>upred[Pair_te[i][1]]:
        Acc.append(1.0)
    else:
        Acc.append(0.0)
np.mean(Acc)

0.6894273127753304

In [313]:
import pandas as pd
import numpy as np 
import GPy as GPy
from GPy_preferenceKernels import PreferenceKern
from sklearn.model_selection import train_test_split

In [325]:
X_tr = df_train.iloc[:,3:].values
X_te =  df_test.iloc[:,3:].values

preprocessor = ColumnTransformer(
    transformers=[
        ('std', StandardScaler(), [0, 1]),
    ('pass', 'passthrough', [2,3]),
    ('std1', StandardScaler(), [4, 5]),
    ], remainder='passthrough'
)



def random_swap(X0,seed=1):
    #random_swap
    np.random.seed(seed)
    X=X0.copy()
    Y=np.zeros((X.shape[0],1))
    for i in range(X.shape[0]):
        if np.random.rand(1)<0.5:
            tmp = X[i,0:4].copy()
            X[i,0:4]=X[i,4:].copy()
            X[i,4:] = tmp.copy()
            Y[i]=0
        else:
            Y[i]=1
    return X,Y
        
#random_swap
X_tr_n0,Y_tr = random_swap(X_tr,15)
X_te_n0,Y_te = random_swap(X_te,20)


scalerx = preprocessor.fit(X_tr_n0)
X_tr_n = scalerx.transform(X_tr_n0)
X_te_n = scalerx.transform(X_te_n0)

In [327]:
np.unique(np.vstack([X_tr_n0,X_te_n0]),axis=0).shape

(2920, 8)

In [294]:
lengthscale = np.array([3.14554881e+01, 6.84333683e-04, 1.58214102e-04,   2.54469097e+01])
variance=6.0328
kernel = PreferenceKern(X_tr_n.shape[1], name="PreferenceKern", 
                        lengthscale=lengthscale, variance=variance, ARD=True)
#kernel = GPy.kern.Linear(X_tr_n.shape[1],ARD=True)
inference  = GPy.inference.latent_function_inference.Laplace()
likelihood = GPy.likelihoods.Bernoulli()
# Model definition
#m.kern.lengthscale.constrain_bounded(0.01,50)
m = GPy.core.GP(
    X_tr_n,Y_tr, kernel=kernel, likelihood=likelihood, inference_method=inference)
for i in range(0):
    m.randomize()
    m.optimize_restarts(1)
    print(repr(m.param_array))

In [295]:
m.kern.lengthscale

index,gp.PreferenceKern.lengthscale,constraints,priors
[0],31.4554881,+ve,
[1],0.00068433,+ve,
[2],0.00015821,+ve,
[3],25.4469097,+ve,


In [296]:
X_te_n.shape

(1480, 8)

In [297]:
Y_pred=m.predict(X_te_n)[0]
np.mean((Y_pred>0.5)==Y_te)

0.9554054054054054

False

In [250]:
m.predict(X_te_n)

(array([[0.078184  ],
        [0.91605766],
        [0.05717934],
        [0.08390796],
        [0.07551436],
        [0.09316835],
        [0.060072  ],
        [0.06612089],
        [0.07558736],
        [0.9288635 ],
        [0.31934312],
        [0.7697965 ],
        [0.42635585],
        [0.66943297],
        [0.42210054],
        [0.78558023],
        [0.35076472],
        [0.5       ],
        [0.43548261],
        [0.41594431],
        [0.8241773 ],
        [0.81577496],
        [0.19019815],
        [0.83314615],
        [0.76768611],
        [0.8866981 ],
        [0.21787981],
        [0.12819139],
        [0.15655527],
        [0.6949131 ],
        [0.104309  ],
        [0.17880167],
        [0.88913212],
        [0.92652544],
        [0.88808796],
        [0.18194804],
        [0.74974725],
        [0.78779424],
        [0.05930568],
        [0.90138765],
        [0.92108561],
        [0.92099835],
        [0.05911075],
        [0.88508589],
        [0.10103457],
        [0