# Label Preference
This notebook fits a label-preference model to the mode-choice dataset from the following article:

    @article{muller2020analysis,
      title={Analysis of active school transportation in hilly urban environments: A case study of Dresden},
      author={M{\"u}ller, Sven and Mejia-Dorantes, Lucia and Kersten, Elisa},
      journal={Journal of transport geography},
      volume={88},
      pages={102872},
      year={2020},
      publisher={Elsevier}
    }


In [1]:
%load_ext autoreload
%autoreload 2
import sys,os
sys.path.append('../')   
import numpy as np
import pandas as pd
from model.erroneousPreference import erroneousPreference
from kernel import BlockRBF
from utility import  paramz
from sklearn.model_selection import train_test_split
# for plotting
import matplotlib.pyplot as plt
import arviz as az


# Label-preference dataset: tab-separated file whose first column is used as the index.
df = pd.read_csv("datasets/DDModeChoice.txt", sep="\t", index_col=0)
# Non-mutating de-duplication keeps the cell idempotent on re-run.
df = df.drop_duplicates()
# Sub-select data for homogeneity.
keep = (df.Season == 0) & (df.CarAvail == 0) & (df.Choice != 4) & (df.Grade > 11)
# Explicit copy: the filtered frame is modified below, and writing into a
# boolean-mask selection without a copy can trigger SettingWithCopyWarning.
df = df.loc[keep].copy()
# Shift the labels so that they start at 0.
df["Choice"] = df["Choice"] - 1
# df.Choice: 0: walk, 1: bike, 2: public transport

In [2]:
# Covariates used as model inputs.
CovCol = ['Distance', 'School_location', 'Grade', 'Age', 'Gender',
          'CB_location', 'Leistung']
Labels = [0, 1, 2]  # possible labels
latent_dim = len(Labels)

# Fixed seed so that the train/test split (and everything derived from it)
# is reproducible under Restart & Run All.
SEED = 0
df_train, df_test = train_test_split(df, test_size=0.3, random_state=SEED)

# train
X_tr = df_train[CovCol].values
n = X_tr.shape[0]

# Build one pair [ii + n*choice, ii + n*d] per training row ii and per label d
# different from the chosen one. The offsets n*k index into an input matrix
# made of latent_dim stacked copies of the n training rows.
# NOTE(review): presumably the first entry of each pair is the preferred item —
# confirm against erroneousPreference.
pairs = []
# Read the label by column name rather than by position, so the code does not
# depend on 'Choice' being the first column of the frame.
for ii, choice in enumerate(df_train["Choice"].to_numpy()):
    choice = int(choice)  # keep the pair indices integer even if the column is float
    for d in np.setdiff1d(Labels, choice):
        pairs.append([ii + n * choice, ii + n * d])
pairs_tr = np.vstack(pairs)


In [3]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training covariates only; the same fitted scaler is
# reused later for the test covariates.
scalerx = StandardScaler()
scalerx.fit(X_tr)
X_tr_n = scalerx.transform(X_tr)

# Stack latent_dim copies of the standardised inputs on top of each other.
Xaugm_tr = np.tile(X_tr_n, (latent_dim, 1))
Xaugm_tr.shape

(723, 7)

In [None]:
# Hard-coded hyperparameter values: one row of 7 lengthscales and one variance
# per latent function. They match the 'lengthscale_i' / 'variance_i' values
# printed from model.params in this notebook.
# NOTE(review): presumably the 7 columns follow the order of CovCol — confirm.
lengthscales = np.array(
    [
        [0.299236, 0.3, 0.3, 0.17083029, 0.26493936, 0.17910861, 3.0],
        [0.1, 0.3, 0.3, 0.17180382, 0.37671323, 0.30235557, 0.61603046],
        [2.10192633, 0.3, 0.3, 3.0, 0.25359043, 0.433541, 3.0],
    ]
)
variances = np.array([0.62091424, 5.02522299, 13.90812498])

In [6]:
# Inputs handed to the model: the preference pairs and the stacked covariates.
data = {"Pairs": pairs_tr, "X": Xaugm_tr}

# Kernel class used by the model.
Kernel = BlockRBF

# Kernel hyperparameters: for every latent function, one lengthscale per input
# column and a single variance. Each entry carries an initial value, a
# [lower, upper] range and a transform object.
n_features = data["X"].shape[1]
params = {}
for i in range(latent_dim):
    params[f"lengthscale_{i}"] = {
        "value": 0.3 * np.ones(n_features, float),
        "range": np.vstack([[0.1, 3.0]] * n_features),
        "transform": paramz.logexp(),
    }
    params[f"variance_{i}"] = {
        "value": np.array([1]),
        "range": np.vstack([[0.1, 200.0]]),
        "transform": paramz.logexp(),
    }

# Build the preference model with the Laplace inference method.
model = erroneousPreference(data, Kernel, params, inf_method="laplace")
# Optimise the kernel hyperparameters, then show the resulting values.
model.optimize_hyperparams(num_restarts=1)
print(model.params)
# Posterior sampling is left disabled in this cell:
# model.sample(nsamples=5000, tune=10000)

242.1468536183076
242.1468535927249
242.1468536183076
242.1468536183076
242.14685361809865
242.14685361828518
242.1468536183076
242.14685353277974
242.1468535831201
242.14685364539196
242.1468536183076
242.1468536183076
242.1468536182582
242.1468536183301
242.1468536183076
242.14685360524163
242.14685364533995
242.14685353730795
242.1468536183076
242.1468536183076
242.1468536181811
242.1468536183076
242.1468536183076
242.14685356673854
242.14685353566114
228.9801544448767
228.9801542179335
228.9801544448767
228.9801544448767
228.98015443456046
228.98015444471176
228.9801544448767
228.9801544340017
228.9801543883255
228.98015443076198
228.9801544448767
228.9801544448767
228.98015443455728
228.98015442771364
228.9801544448767
228.9801544314282
228.9801543862216
228.98015451033757
228.9801544448767
228.9801544448767
228.98015441536484
228.98015442506932
228.9801544275158
228.9801544818567
228.98015431795454
208.2143021638995
208.21430193968132
208.2143021638995
208.2143021638995
208.21430

176.32963453861223
176.32963453562903
176.32963450039847
176.32963452975142
176.32963452975142
176.32963458805665
176.32963452975142
176.32963452971123
176.32963456026442
176.3296345446356
175.58687523171483
175.58687523053408
175.58687523171483
175.58687523171483
175.5868752317275
175.58687523171238
175.58687523171483
175.58687523413573
175.58687523283214
175.58687523950266
175.58687523171483
175.58687523171483
175.586875231707
175.58687523172082
175.58687523171483
175.58687523709222
175.58687523327228
175.58687523629374
175.58687523171483
175.58687523171483
175.58687528501486
175.58687523171747
175.58687523171972
175.58687525513898
175.5868752235964
175.52462075994995
175.5246207594616
175.52462075994995
175.52462075994995
175.5246207599534
175.52462075995942
175.52462075994995
175.52462076239226
175.524620760753
175.52462076829752
175.52462075994995
175.52462075994995
175.52462075995413
175.5246207599588
175.52462075995624
175.52462076552922
175.5246207593072
175.52462076168172
175.

In [8]:
{'lengthscale_0': {'value': array([0.299236  , 0.3       , 0.3       , 0.17083029, 0.26493936,
         0.17910861, 3.        ]),
  'range': array([[0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebfd0>},
 'variance_0': {'value': array([0.62091424]),
  'range': array([[1.e-01, 2.e+02]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebe50>},
 'lengthscale_1': {'value': array([0.1       , 0.3       , 0.3       , 0.17180382, 0.37671323,
         0.30235557, 0.61603046]),
  'range': array([[0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebf10>},
 'variance_1': {'value': array([5.02522299]),
  'range': array([[1.e-01, 2.e+02]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebdf0>},
 'lengthscale_2': {'value': array([2.10192633, 0.3       , 0.3       , 3.        , 0.25359043,
         0.433541  , 3.        ]),
  'range': array([[0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebd00>},
 'variance_2': {'value': array([13.90812498]),
  'range': array([[1.e-01, 2.e+02]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebd30>}}

{'lengthscale_0': {'value': array([0.299236  , 0.3       , 0.3       , 0.17083029, 0.26493936,
         0.17910861, 3.        ]),
  'range': array([[0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebfd0>},
 'variance_0': {'value': array([0.62091424]),
  'range': array([[1.e-01, 2.e+02]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebe50>},
 'lengthscale_1': {'value': array([0.1       , 0.3       , 0.3       , 0.17180382, 0.37671323,
         0.30235557, 0.61603046]),
  'range': array([[0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ],
         [0.1, 3. ]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebf10>},
 'variance_1': {'value': array([5.02522299]),
  'range': array([[1.e-01, 2.e+02]]),
  'transform': <utility.paramz.logexp at 0x7ff17dfebdf0>},
 'lengthscale_2': {'

In [None]:
# Sampling settings passed to model.sample; named so they are easy to tune.
N_SAMPLES = 5000
N_TUNE = 200
model.sample(nsamples=N_SAMPLES, tune=N_TUNE)

In [None]:
# NOTE(review): leftover scratch expression, unrelated to the analysis — candidate for deletion.
2+3

In [9]:
# Test covariates, standardised with the scaler fitted on the training set.
X_te = df_test[CovCol].values
X_te_n = scalerx.transform(X_te)

# Stack latent_dim copies of the test inputs, mirroring the training layout.
Xaugm_te = np.tile(X_te_n, (latent_dim, 1))
# NOTE(review): the saved output of this cell is a matmul ValueError
# (size 0 vs 723); presumably model.sample(...) has to be run before
# model.predict — confirm.
predictions = model.predict(Xaugm_te)
# TODO: compute the lower and upper credible intervals, e.g.
# credib_int = az.hdi(predictions.T, 0.95)



ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 723)

In [12]:
# Shape of the stacked test inputs (latent_dim copies of the rows of X_te),
# computed without materialising the tiled array.
(latent_dim * X_te.shape[0], X_te.shape[1])

(312, 7)

In [17]:
# Average the predictions along axis 1, then cut the result into three
# consecutive segments: the first two have one entry per test row, the third
# takes the remainder.
# NOTE(review): presumably segment k corresponds to label k — confirm.
output = predictions.mean(axis=1)
n_te = X_te.shape[0]
u1, u2, u3 = np.split(output, [n_te, 2 * n_te])


In [20]:
# Display the third segment of the averaged predictions.
u3

array([ 6.73795614e-33,  3.06422275e-59,  1.67739701e-04, ...,
        2.27898815e-02,  4.22643721e-10, -5.01154800e-07])

In [24]:
# Element-wise difference between the second and first prediction segments.
np.subtract(u2, u1)

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        7.04620731e-10, -1.01097046e-06,  2.18395446e-07])