Mixture of Logit Models: Swissmetro


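Options set in the `few_draws.toml` parameter file:
- Second derivative set to 0
- Number of draws: 2000 (for now). Note: the estimation output recorded further down reports 500 draws, so either this value or the recorded output is out of date.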

In [9]:
import pandas as pd
import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta, Variable, bioDraws, MonteCarlo, log, Power, exp, Derive, RandomVariable, PanelLikelihoodTrajectory
from biogeme.tools import TemporaryFile

Preparing the data

In [10]:
# Source of the Swissmetro data set: a tab-separated text file that is
# downloaded again every time this cell runs.
url = "http://transp-or.epfl.ch/data/swissmetro.dat"

# Parse the file into a DataFrame, then wrap it in a Biogeme database
# registered under the name 'swissmetro'.
df = pd.read_table(url, sep='\t')
database_swissmetro = db.Database('swissmetro', df)

# Biogeme Variable expressions for the columns of the data file; the string
# passed to Variable is the column name. They are declared in the same order
# as before and only grouped by role for readability.

# Survey bookkeeping columns (SP and ID are used further down for the
# availability conditions and the panel structure).
GROUP = Variable('GROUP')
SURVEY = Variable('SURVEY')
SP = Variable('SP')
ID = Variable('ID')

# Trip and traveller attributes (PURPOSE and GA are used further down for the
# sample selection and the cost definitions).
PURPOSE = Variable('PURPOSE')
FIRST = Variable('FIRST')
TICKET = Variable('TICKET')
WHO = Variable('WHO')
LUGGAGE = Variable('LUGGAGE')
AGE = Variable('AGE')
MALE = Variable('MALE')
INCOME = Variable('INCOME')
GA = Variable('GA')
ORIGIN = Variable('ORIGIN')
DEST = Variable('DEST')

# Availability indicators, one per alternative.
TRAIN_AV = Variable('TRAIN_AV')
CAR_AV = Variable('CAR_AV')
SM_AV = Variable('SM_AV')

# Attributes of the train alternative (TT and CO enter the utility as
# travel time and cost).
TRAIN_TT = Variable('TRAIN_TT')
TRAIN_CO = Variable('TRAIN_CO')
TRAIN_HE = Variable('TRAIN_HE')

# Attributes of the Swissmetro (SM) alternative.
SM_TT = Variable('SM_TT')
SM_CO = Variable('SM_CO')
SM_HE = Variable('SM_HE')
SM_SEATS = Variable('SM_SEATS')

# Attributes of the car alternative.
CAR_TT = Variable('CAR_TT')
CAR_CO = Variable('CAR_CO')

# Observed choice, used as the dependent variable of the model.
CHOICE = Variable('CHOICE')

# Sample selection: keep only the observations associated with work trips
# (PURPOSE equal to 1 or 3) for which a choice was recorded (CHOICE != 0).
# The comparisons are combined arithmetically: a product acts as a logical
# AND, a sum as a logical OR, and "> 0" turns the result into a condition.
purpose_not_kept = (PURPOSE != 1) * (PURPOSE != 3)
choice_missing = CHOICE == 0
exclude = (purpose_not_kept + choice_missing) > 0

# Drop every row for which the exclusion condition holds.
database_swissmetro.remove(exclude)

# Derived columns. Each call adds a new column to the database; the objects
# returned are used later when writing the utility functions.
define_variable = database_swissmetro.DefineVariable
SCALE = 100  # divisor applied to travel times and costs

# Costs are multiplied by (GA == 0), i.e. set to zero when GA is non-zero.
SM_COST = define_variable('SM_COST', SM_CO * (GA == 0))
TRAIN_COST = define_variable('TRAIN_COST', TRAIN_CO * (GA == 0))

# Car and train availabilities are set to zero when SP equals 0.
CAR_AV_SP = define_variable('CAR_AV_SP', CAR_AV * (SP != 0))
TRAIN_AV_SP = define_variable('TRAIN_AV_SP', TRAIN_AV * (SP != 0))

# Rescaled travel times and costs that enter the utility functions.
TRAIN_TT_SCALED = define_variable('TRAIN_TT_SCALED', TRAIN_TT / SCALE)
TRAIN_COST_SCALED = define_variable('TRAIN_COST_SCALED', TRAIN_COST / SCALE)
SM_TT_SCALED = define_variable('SM_TT_SCALED', SM_TT / SCALE)
SM_COST_SCALED = define_variable('SM_COST_SCALED', SM_COST / SCALE)
CAR_TT_SCALED = define_variable('CAR_TT_SCALED', CAR_TT / SCALE)
CAR_CO_SCALED = define_variable('CAR_CO_SCALED', CAR_CO / SCALE)

# Declare the data as panel data: rows sharing the same value of the 'ID'
# column are treated as repeated observations of one individual. This is
# what PanelLikelihoodTrajectory relies on in the estimation cell.
database_swissmetro.panel('ID')



Defining Model Parameters

In [11]:
# Alternative-specific constants and the cost coefficient. Every parameter
# starts at 0 and has no lower or upper bound (None, None). The last argument
# is the estimation status: 0 means the parameter is estimated, 1 means it is
# kept at its starting value.
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 1)  # held at 0 and not estimated
B_COST = Beta('B_COST', 0, None, None, 0)

Defining the random parameter, using Halton draws, for Monte-Carlo simulation

In [12]:
# Random time coefficient:
#     B_TIME_RND = B_TIME + B_TIME_S * draw
# B_TIME is the location and B_TIME_S (starting value 1) the scale applied
# to the draws, which are requested with the type 'NORMAL_HALTON3'.
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_TIME_S = Beta('B_TIME_S', 1, None, None, 0)
time_draws = bioDraws('B_TIME_RND', 'NORMAL_HALTON3')
B_TIME_RND = B_TIME + B_TIME_S * time_draws

# TODO: consider generating the draws in Apollo, having Apollo write them
# out as a list, and importing that list into Biogeme.

Defining the Model

In [13]:
# Utility functions: an alternative-specific constant, plus the random time
# coefficient and the cost coefficient applied to the scaled attributes.
V1 = (  # train
    ASC_TRAIN
    + B_TIME_RND * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
)
V2 = (  # Swissmetro
    ASC_SM
    + B_TIME_RND * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
)
V3 = (  # car
    ASC_CAR
    + B_TIME_RND * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)

# Both dictionaries are keyed by the alternative code (1, 2, 3); they are
# passed to the logit model together with CHOICE.
V = {
    1: V1,
    2: V2,
    3: V3,
}

# Availability condition of each alternative.
av = {
    1: TRAIN_AV_SP,
    2: SM_AV,
    3: CAR_AV_SP,
}

Estimating the Model

In [14]:
# Free-text description handed to Biogeme as user notes.
USER_NOTES = (
    'Example of a mixture of logit models with three alternatives, '
    'approximated using Monte-Carlo integration.'
)

# Logit choice probability of the observed choice, conditional on the draws.
prob = models.logit(V, av, CHOICE)

# Panel data: combine the observations of each individual into a trajectory.
condprodIndiv = PanelLikelihoodTrajectory(prob)

# Integrate over the draws by Monte-Carlo simulation, then take the log to
# obtain the contribution to the log likelihood.
simulated_likelihood = MonteCarlo(condprodIndiv)
logprob = log(simulated_likelihood)

# Estimation settings (e.g. the number of draws) are read from the
# 'few_draws.toml' parameter file.
the_biogeme = bio.BIOGEME(
    database_swissmetro,
    logprob,
    userNotes=USER_NOTES,
    parameter_file='few_draws.toml',
)
the_biogeme.modelName = 'swissmetro_Halton_Mixture'

# NOTE(review): in the recorded output the initial log likelihood (-4360.995)
# is almost equal to the final one (-4360.846) although the parameters are
# declared with starting values 0 and 1. Possibly the starting values were
# restored from an earlier run -- confirm before comparing runs.
results = the_biogeme.estimate()


In [15]:
# Keep the general statistics of the estimation for further inspection
# (the object is not used by the other cells shown here) ...
general_stats = results.getGeneralStatistics()

# ... and print the formatted report of the same statistics.
stats_report = results.printGeneralStatistics()
print(stats_report)

Number of estimated parameters:	5
Sample size:	752
Observations:	6768
Excluded observations:	3960
Init log likelihood:	-4360.995
Final log likelihood:	-4360.846
Likelihood ratio test for the init. model:	0.2976259
Rho-square for the init. model:	3.41e-05
Rho-square-bar for the init. model:	-0.00111
Akaike Information Criterion:	8731.693
Bayesian Information Criterion:	8754.807
Final gradient norm:	1.0562E-02
Number of draws:	500
Draws generation time:	0:00:00.449905
Types of draws:	['B_TIME_RND: NORMAL_HALTON3']
Nbr of threads:	8



In [16]:
# Table of estimated parameters. In the recorded output its columns are the
# estimate, the robust standard error, the robust t-test and the robust
# p-value.
pandas_results = results.getEstimatedParameters()
# Bare last expression so that the notebook renders the table.
pandas_results

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,0.283116,0.107775,2.62692,0.008616155
ASC_TRAIN,-0.569443,0.14597,-3.901107,9.575365e-05
B_COST,-1.650727,0.292175,-5.649796,1.606385e-08
B_TIME,-3.228762,0.226838,-14.233798,0.0
B_TIME_S,3.637266,0.244926,14.850443,0.0
