In [1]:
import os, glob

import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, DefineVariable
from biogeme.models import loglogit, piecewiseFormula

# Render floats with 3 significant digits in every table displayed below.
pd.options.display.float_format = '{:.3g}'.format

# Read lpmc02.dat (pd.read_table defaults to tab-separated input) and wrap it
# in a Biogeme database named "LPMC".
# NOTE: the DataFrame is bound to the name `pandas`; the library itself is
# only available under the alias `pd`. Later cells read the frame by this name.
pandas = pd.read_table("lpmc02.dat")
database = db.Database("LPMC", pandas)

# Publish every entry of database.variables as a module-level name so that
# columns can be written directly inside the utility expressions.
globals().update(database.variables)

# Optional exclusion filter (currently disabled)
# exclude = (  ArrivalTimeHours_1   ==  -1  )
# database.remove(exclude)

# Dependent variable of the choice models
chosenAlternative = travel_mode

## 1. Data and Parameters to estimate

In [2]:
# Parameters to be estimated.
# Beta(...) arguments:
#   1  Name for the report. Typically, the same as the Python variable
#   2  Starting value
#   3  Lower bound
#   4  Upper bound
#   5  0: estimate the parameter, 1: keep it fixed

# Alternative-specific constants. Constant1 is kept fixed at its starting
# value 0; the other three are estimated.
Constant1 = Beta('Constant1', 0, None, None, 1)
Constant2, Constant3, Constant4 = (
    Beta(f'Constant{alt}', 0, None, None, 0) for alt in (2, 3, 4)
)

# Cost coefficient, shared by the utilities that include a cost term
Cost = Beta('Cost', 0, None, None, 0)

# One travel-time coefficient per alternative
Total_TT1, Total_TT2, Total_TT3, Total_TT4 = (
    Beta(f'Total_TT{alt}', 0, None, None, 0) for alt in (1, 2, 3, 4)
)

# Car-ownership coefficients for alternatives 2 to 4
CarOwn_2, CarOwn_3, CarOwn_4 = (
    Beta(f'CarOwn_{alt}', 0, None, None, 0) for alt in (2, 3, 4)
)

# Exponent parameter; the only parameter that starts at 1 instead of 0
LAMBDA = Beta('LAMBDA', 1, None, None, 0)

# socio-economic factors (interacting with Time)
Time_Age_1, Time_Age_2, Time_Age_3, Time_Age_4 = (
    Beta(f'Time_Age_{alt}', 0, None, None, 0) for alt in (1, 2, 3, 4)
)

## 2. Model specifications

### 2.1 Power on time

In [3]:
# Derived variables that are not directly available as columns of the data.

# Squared travel times used by this specification
dur_walking_squ = DefineVariable('dur_walking_squ', dur_walking ** 2, database)
dur_cycling_squ = DefineVariable('dur_cycling_squ', dur_cycling ** 2, database)
dur_driving_squ = DefineVariable('dur_driving_squ', dur_driving ** 2, database)

# Public-transport time is the sum of its four duration components
dur_public = DefineVariable(
    'dur_public',
    dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int,
    database,
)
dur_public_squ = DefineVariable('dur_public_squ', dur_public ** 2, database)

# Costs: transit fare for public transport, fuel plus congestion charge for driving
cost_public = DefineVariable('cost_public', cost_transit, database)
cost_driving = DefineVariable(
    'cost_driving', cost_driving_fuel + cost_driving_ccharge, database
)


# Utilities
#   1 = walking, 2 = cycling, 3 = public transport, 4 = driving
Opt1 = (
    Constant1
    + Total_TT1 * dur_walking_squ
    + Time_Age_1 * dur_walking * age
)
Opt2 = (
    Constant2
    + Total_TT2 * dur_cycling_squ
    + CarOwn_2 * car_ownership
    + Time_Age_2 * dur_cycling * age
)
Opt3 = (
    Constant3
    + Cost * cost_public
    + Total_TT3 * dur_public_squ
    + CarOwn_3 * car_ownership
    + Time_Age_3 * dur_public * age
)
Opt4 = (
    Constant4
    + Cost * cost_driving
    + Total_TT4 * dur_driving_squ
    + CarOwn_4 * car_ownership
    + Time_Age_4 * dur_driving * age
)


# Utility of each alternative, keyed by alternative number
V = {1: Opt1, 2: Opt2, 3: Opt3, 4: Opt4}
# Availability flag 1 for each of the four alternatives
av = dict.fromkeys(V, 1)

In [4]:
output_dir = "./model3-output"
filepath = os.path.join(output_dir, "logit_lpmc_model3_Power")
# Create the output directory if needed. exist_ok=True replaces the separate
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created as well.
os.makedirs(output_dir, exist_ok=True)

# Delete previously saved html and pickle files of this model so the run
# starts from a clean slate (everything whose path starts with `filepath`).
for file in glob.glob(f"{filepath}*"):
    os.remove(file)

# The choice model is a logit, with availability conditions
logprob = loglogit(V, av, chosenAlternative)
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = filepath
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
print(f"Nbr of observations: {database.getNumberOfObservations()}")
print(f"LL(0) =    {results.data.initLogLike:.3f}")
print(f"LL(beta) = {results.data.logLike:.3f}")
print(f"rho bar square = {results.data.rhoBarSquare:.3g}")
print(f"Output file: {results.data.htmlFileName}")

# Last expression of the cell: rendered as the parameter table
pandasResults

Nbr of observations: 5000
LL(0) =    -6931.472
LL(beta) = -3837.660
rho bar square = 0.444
Output file: ./model3-output/logit_lpmc_model3_Power.html


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
CarOwn_2,-0.155,0.137,-1.14,0.256,0.142,-1.09,0.276
CarOwn_3,-0.555,0.0743,-7.46,8.7e-14,0.0754,-7.35,1.91e-13
CarOwn_4,1.14,0.0705,16.2,0.0,0.0681,16.7,0.0
Constant2,-3.78,0.184,-20.6,0.0,0.209,-18.1,0.0
Constant3,-1.02,0.104,-9.77,0.0,0.129,-7.89,3.11e-15
Constant4,-2.33,0.113,-20.6,0.0,0.137,-17.0,0.0
Cost,-0.164,0.0161,-10.2,0.0,0.0162,-10.1,0.0
Time_Age_1,-0.121,0.00765,-15.9,0.0,0.0129,-9.42,0.0
Time_Age_2,-0.13,0.0163,-7.98,1.55e-15,0.0184,-7.09,1.38e-12
Time_Age_3,-0.0922,0.00866,-10.6,0.0,0.00978,-9.43,0.0


### 2.2 Box-Cox Transforms

In [5]:
# Utilities with a Box-Cox transform of travel time, (t ** LAMBDA - 1) / LAMBDA,
# sharing a single LAMBDA across the four alternatives.
#   1 = walking, 2 = cycling, 3 = public transport, 4 = driving
#
# dur_public, cost_public and cost_driving are not created in this cell; they
# must already exist in the kernel when it runs.
# NOTE(review): the ratio is undefined at LAMBDA == 0; biogeme.models appears
# to offer a guarded `boxcox` helper -- confirm against the installed version.
Opt1 = (
    Constant1
    + Total_TT1 * (dur_walking ** LAMBDA - 1) / LAMBDA
    + Time_Age_1 * dur_walking * age
)
Opt2 = (
    Constant2
    + Total_TT2 * (dur_cycling ** LAMBDA - 1) / LAMBDA
    + CarOwn_2 * car_ownership
    + Time_Age_2 * dur_cycling * age
)
Opt3 = (
    Constant3
    + Cost * cost_public
    + Total_TT3 * (dur_public ** LAMBDA - 1) / LAMBDA
    + CarOwn_3 * car_ownership
    + Time_Age_3 * dur_public * age
)
Opt4 = (
    Constant4
    + Cost * cost_driving
    + Total_TT4 * (dur_driving ** LAMBDA - 1) / LAMBDA
    + CarOwn_4 * car_ownership
    + Time_Age_4 * dur_driving * age
)


# Utility of each alternative, keyed by alternative number
V = {1: Opt1, 2: Opt2, 3: Opt3, 4: Opt4}
# Availability flag 1 for each of the four alternatives
av = dict.fromkeys(V, 1)

In [6]:
output_dir = "./model3-output"
filepath = os.path.join(output_dir, "logit_lpmc_model3_BoxCox")
# Create the output directory if needed. exist_ok=True replaces the separate
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created as well.
os.makedirs(output_dir, exist_ok=True)

# Delete previously saved html and pickle files of this model so the run
# starts from a clean slate (everything whose path starts with `filepath`).
for file in glob.glob(f"{filepath}*"):
    os.remove(file)

# The choice model is a logit, with availability conditions
logprob = loglogit(V, av, chosenAlternative)
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = filepath
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
print(f"Nbr of observations: {database.getNumberOfObservations()}")
print(f"LL(0) =    {results.data.initLogLike:.3f}")
print(f"LL(beta) = {results.data.logLike:.3f}")
print(f"rho bar square = {results.data.rhoBarSquare:.3g}")
print(f"Output file: {results.data.htmlFileName}")

# Last expression of the cell: rendered as the parameter table
pandasResults

Nbr of observations: 5000
LL(0) =    -6931.472
LL(beta) = -3657.128
rho bar square = 0.47
Output file: ./model3-output/logit_lpmc_model3_BoxCox.html


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
CarOwn_2,-0.258,0.138,-1.87,0.0614,0.145,-1.78,0.0748
CarOwn_3,-0.652,0.0809,-8.06,8.88e-16,0.0838,-7.78,7.33e-15
CarOwn_4,1.04,0.0749,13.9,0.0,0.0738,14.1,0.0
Constant2,-2.64,0.487,-5.44,5.47e-08,0.457,-5.79,7.09e-09
Constant3,2.09,0.256,8.14,4.44e-16,0.253,8.24,2.22e-16
Constant4,-0.934,0.356,-2.63,0.00865,0.392,-2.38,0.0172
Cost,-0.156,0.0166,-9.42,0.0,0.0172,-9.07,0.0
LAMBDA,0.0377,0.0939,0.401,0.688,0.109,0.345,0.73
Time_Age_1,-0.0605,0.00838,-7.22,5.12e-13,0.00854,-7.08,1.42e-12
Time_Age_2,-0.0499,0.0165,-3.03,0.00248,0.0165,-3.02,0.00249


### 2.3 Piecewise on Driving time

In [7]:
# Break points for the piecewise-linear driving-time term: 0.5x and 1.5x the
# sample mean of dur_driving. The None entries at both ends presumably mark
# open-ended outer segments (Biogeme convention -- confirm).
mean_dur_driving = pandas.dur_driving.mean()
thresholds = [None, 0.5 * mean_dur_driving, 1.5 * mean_dur_driving, None]
# Three zeros, passed as the third argument of piecewiseFormula
init_Betas_TT4 = [0] * 3

In [8]:
# Utilities: travel time enters linearly for alternatives 1-3 and through a
# piecewise-linear term (piecewiseFormula) for driving.
#   1 = walking, 2 = cycling, 3 = public transport, 4 = driving
#
# dur_public, cost_public and cost_driving are not created in this cell; they
# must already exist in the kernel when it runs.
Opt1 = (
    Constant1
    + Total_TT1 * dur_walking
    + Time_Age_1 * dur_walking * age
)
Opt2 = (
    Constant2
    + Total_TT2 * dur_cycling
    + CarOwn_2 * car_ownership
    + Time_Age_2 * dur_cycling * age
)
Opt3 = (
    Constant3
    + Cost * cost_public
    + Total_TT3 * dur_public
    + CarOwn_3 * car_ownership
    + Time_Age_3 * dur_public * age
)
# Total_TT4 is not used here: the piecewise term supplies the driving-time effect.
Opt4 = (
    Constant4
    + Cost * cost_driving
    + piecewiseFormula(dur_driving, thresholds, init_Betas_TT4)
    + CarOwn_4 * car_ownership
    + Time_Age_4 * dur_driving * age
)


# Utility of each alternative, keyed by alternative number
V = {1: Opt1, 2: Opt2, 3: Opt3, 4: Opt4}
# Availability flag 1 for each of the four alternatives
av = dict.fromkeys(V, 1)

In [9]:
output_dir = "./model3-output"
filepath = os.path.join(output_dir, "logit_lpmc_model3_Piecewise")
# Create the output directory if needed. exist_ok=True replaces the separate
# exists()/mkdir() pair: no check-then-create race, and missing parent
# directories are created as well.
os.makedirs(output_dir, exist_ok=True)

# Delete previously saved html and pickle files of this model so the run
# starts from a clean slate (everything whose path starts with `filepath`).
for file in glob.glob(f"{filepath}*"):
    os.remove(file)

# The choice model is a logit, with availability conditions
logprob = loglogit(V, av, chosenAlternative)
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = filepath
results = biogeme.estimate()

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
print(f"Nbr of observations: {database.getNumberOfObservations()}")
print(f"LL(0) =    {results.data.initLogLike:.3f}")
print(f"LL(beta) = {results.data.logLike:.3f}")
print(f"rho bar square = {results.data.rhoBarSquare:.3g}")
print(f"Output file: {results.data.htmlFileName}")

# Last expression of the cell: rendered as the parameter table
pandasResults

Nbr of observations: 5000
LL(0) =    -6931.472
LL(beta) = -3710.081
rho bar square = 0.462
Output file: ./model3-output/logit_lpmc_model3_Piecewise.html


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
CarOwn_2,-0.26,0.139,-1.87,0.0609,0.146,-1.78,0.0748
CarOwn_3,-0.647,0.0789,-8.2,2.22e-16,0.0805,-8.04,8.88e-16
CarOwn_4,1.04,0.0728,14.3,0.0,0.0702,14.8,0.0
Constant2,-4.66,0.21,-22.2,0.0,0.263,-17.7,0.0
Constant3,-2.14,0.14,-15.3,0.0,0.184,-11.6,0.0
Constant4,-2.77,0.177,-15.6,0.0,0.175,-15.9,0.0
Cost,-0.157,0.0168,-9.35,0.0,0.0175,-8.96,0.0
Time_Age_1,-0.0588,0.00923,-6.37,1.93e-10,0.00996,-5.9,3.71e-09
Time_Age_2,-0.0347,0.0213,-1.62,0.105,0.0221,-1.56,0.118
Time_Age_3,-0.037,0.0118,-3.13,0.00176,0.0126,-2.94,0.00328



#### Let's compare all these models with Model2, and find the best one among them

First, we can directly dismiss Model2.1 (Power on time), since it has the same number of estimated parameters as Model2 but a lower log-likelihood (-3837.660 vs. -3727.612).

##### => Let's compare Model2.2(Box Cox Transforms) with Model2

To compare the two models, we take Model 2.2 as the unrestricted model and Model2 as the restricted one. Model2 has 15 estimated parameters while Model 2.2 has 16. Thus we have to compare our likelihood-ratio test statistic with the chi-squared distribution with 1 degree of freedom, $\chi^2_{1}$.


In [10]:
# Final log-likelihoods, entered by hand as constants.
LL_Model2 = -3727.612    # restricted model (Model2)
LL_Model2_2 = -3657.128  # unrestricted model (Model2.2, Box-Cox)

# Likelihood-ratio statistic: 2 * (LL_unrestricted - LL_restricted)
lr_statistic = 2 * (LL_Model2_2 - LL_Model2)
lr_statistic

140.96799999999985

$$2 (LLModel_{22} - LLModel_2) = 140.968 > \chi^2_{1,0.99} = 6.63$$


Thus we can reject Model2 at 99% level of confidence, and we keep Model2.2 as our new preferred model.
Let's compare Model2.3 with Model 2.2

Model2.3 has more estimated parameters than Model2.2 (17 vs. 16) and yet reaches a lower log-likelihood (-3710.081 vs. -3657.128). Thus we can clearly keep Model 2.2 as our preferred model.
