# Assignment - MMB
*Alexander Laloi Dybdahl, Valentin Vuillon, Alexia Stéphanie Liviana Paratte*

In [54]:
# %pip install biogeme

import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta, DefineVariable, Variable,bioDraws, MonteCarlo, log, Power, exp, Derive
import scipy.stats as st


### Loading data

In [55]:
df = pd.read_csv("lpmc07.dat", delimiter='\t')

## Tasks

### Model 0

$\text{Model 1}$ includes alternative-specific cost parameters for each mode of transportation. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} + \beta_{\text{time}} \cdot \text{dur}_{walking} + \epsilon_{walk}$$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} + \beta_{\text{time}} \cdot \text{dur}_{cycling} + \epsilon_{cycle} $$

- **Public Transport**:  
  $$ U_{\text{pt}} = \text{ASC}_{\text{pt}} + \beta_{\text{cost}} \cdot \text{cost}_{transit} + \beta_{\text{time}} \cdot \text{dur}_\text{pt\total} + \epsilon_{pt} $$

- **Driving**:  
  $$ U_{\text{drive}} = \text{ASC}_{\text{drive}} + \beta_{\text{cost}} \cdot \text{cost}_\text{driving\total} + \beta_{\text{time}} \cdot \text{dur\driving} + \epsilon_{drive} $$

where:
- $\beta_{\text{cost}}$ is the coefficient for travel cost.
- $\beta_{\text{time}}$ is the coefficient for travel time.
- $\text{cost}_j$ is the travel cost for mode $j$.
- $\text{dur}_j$ is the travel time for mode $j$.
- $\epsilon_j$ is the error term, representing unobserved factors affecting the utility of mode $j$.

The probability $P_j$ of choosing mode $j$ is given by the softmax function:

$$ P_j = \frac{\exp(U_j)}{\sum_{k=1}^{J} \exp(U_k)} $$

In [56]:
# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database_0 = db.Database('LPMC', df)
globals().update(database_0.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

BETA_COST = Beta('BETA_COST', 0, None, None, 0)
BETA_TIME = Beta('BETA_TIME', 0, None, None, 0)

# Define utility functions using Biogeme expressions
V1 = ASC_WALK + BETA_TIME * dur_walking
V2 = ASC_BIKE + BETA_TIME * dur_cycling
V3 = ASC_PT + BETA_COST * cost_transit + BETA_TIME * dur_pt_total
V4 = BETA_COST * cost_driving_total + BETA_TIME * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob_0 = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme_0 = bio.BIOGEME(database_0, logprob_0)
biogeme_0.modelName = 'Model_0'
results_model_0 = biogeme_0.estimate()

# Output
print(results_model_0.getEstimatedParameters())

              Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE  -2.569395      0.090262   -28.466003           0.0
ASC_PT     0.766417      0.047360    16.182870           0.0
ASC_WALK   1.256090      0.076712    16.374060           0.0
BETA_COST -0.173019      0.014562   -11.881537           0.0
BETA_TIME -5.326766      0.189549   -28.102283           0.0


In [57]:
# Retrieve the general statistics from the results
general_stats = results_model_0.getGeneralStatistics()

# Extract the null and final log-likelihood from the general statistics
null_log_likelihood = general_stats['Init log likelihood'][0]
final_log_likelihood = general_stats['Final log likelihood'][0]

# Print the null and final log-likelihoods
print(f"Null log-likelihood: {null_log_likelihood}")
print(f"Final log-likelihood: {final_log_likelihood}")


# Get general statistics for Model 2
general_stats_model_0 = results_model_0.getGeneralStatistics()

# Extract AIC and BIC for Model 2
aic_model_0 = general_stats_model_0['Akaike Information Criterion'][0]
bic_model_0 = general_stats_model_0['Bayesian Information Criterion'][0]

print("Model 0 - AIC:", aic_model_0, "BIC:", bic_model_0)


Null log-likelihood: -4642.324197885901
Final log-likelihood: -4642.324197885901
Model 0 - AIC: 9294.648395771803 BIC: 9327.234361728884


### $\text{Model 1}$

$\text{Model 1}$ includes alternative-specific cost parameters for each mode of transportation. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} + \beta_{\text{time}} \cdot \text{dur}_{walking} + \epsilon_{walk} $$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} + \beta_{\text{time}} \cdot \text{dur}_{cycling} + \epsilon_{cycle} $$

- **Public Transport**:  
  $$ U_{\text{pt}} = \text{ASC}_{\text{pt}} + \beta_{\text{cost\pt}} \cdot \text{cost}_{transit} + \beta_{\text{time\pt}} \cdot \text{dur}_\text{pt\total} + \epsilon_{pt} $$

- **Driving**:  
  $$ U_{\text{drive}} = \text{ASC}_{\text{drive}} + \beta_{\text{cost\drive}} \cdot \text{cost}_\text{driving\total} + \beta_{\text{time\drive}} \cdot \text{dur}_{driving} + \epsilon_{drive}$$

Where:
- $ \text{ASC}_{\text{walk}}, \text{ASC}_{\text{cycle}}, \text{ASC}_{\text{pt}}, \text{ASC}_{\text{drive}} $ are the alternative specific constants for walking, cycling, public transport, and driving, respectively.
- $ \beta_{\text{cost\walk}}, \beta_{\text{cost\bike}}, \beta_{\text{cost\pt}}, \beta_{\text{cost\drive}} $ are the cost coefficients for walking, cycling, public transport, and driving, respectively.
- $ \beta_{\text{time}} $ is the common time coefficient for all modes.
- $ \text{cost}_{walking}, \text{cost}_{cycling}, \text{cost}_{transit}, \text{cost}_\text{driving\total} $ are the costs associated with each mode.
- $ \text{dur}_{walking}, \text{dur}_{cycling}, \text{dur}_\text{pt\total}, \text{dur}_{driving} $ are the travel durations for each mode.


In [58]:
# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database_1 = db.Database('LPMC', df)
globals().update(database_1.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# Define utility functions using Biogeme expressions with alternative-specific cost coefficients
V1 = ASC_WALK + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE + BETA_TIME_BIKE * dur_cycling
V3 = ASC_PT + BETA_COST * cost_transit + BETA_TIME_PT * dur_pt_total
V4 = BETA_COST * cost_driving_total + BETA_TIME_DRIVE * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob_1 = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme_1 = bio.BIOGEME(database_1, logprob_1)
biogeme_1.modelName = 'Model_1'
results_model_1 = biogeme_1.estimate()

# Output
print(results_model_1.getEstimatedParameters())

                    Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE        -2.078612      0.117914   -17.628249  0.000000e+00
ASC_PT          -0.284570      0.070229    -4.052060  5.076861e-05
ASC_WALK         1.958969      0.119761    16.357305  0.000000e+00
BETA_COST_DRIVE -0.145310      0.016925    -8.585682  0.000000e+00
BETA_COST_PT    -0.224938      0.031119    -7.228330  4.889422e-13
BETA_TIME       -8.262760      0.362284   -22.807435  0.000000e+00
BETA_TIME_DRIVE -6.427560      0.420150   -15.298234  0.000000e+00
BETA_TIME_PT    -3.471928      0.240770   -14.420131  0.000000e+00


**Alternative Specific Constants (ASCs):**

- $ \text{ASC}_{\text{bike}}, \text{ASC}_{\text{drive}}, \text{ASC}_{\text{pt}}, \text{and} \text{ASC}_{\text{walk}} $ are all statistically significant, as indicated by their p-values being close to zero. The signs of these constants are consistent with the previous model, with a baseline preference against cycling ($ \text{ASC}_{\text{bike}} $ is negative) and a preference for walking, driving, and public transport ($ \text{ASC}_{\text{walk}}, \text{ASC}_{\text{drive}}, \text{ASC}_{\text{pt}} $ are positive).

**Alternative-Specific Cost Coefficients:**

- $ \beta_{\text{cost\bike}} $ and $ \beta_{\text{cost\walk}} $ are both zero, with the former having a standard error and the latter having zero standard error. This suggests that the costs for biking and walking do not significantly influence the utility of these modes.
- $ \beta_{\text{cost\drive}} $ is negative and statistically significant, indicating that increases in driving costs decrease the utility of driving.
- $ \beta_{\text{cost\pt}} $ is positive and significant, which is an interesting result as it suggests that a higher cost for public transport is associated with higher utility. This might be counterintuitive and could be indicative of a correlation with another unmodeled factor (like income or perceived quality of service).

**Time Coefficient ($ \beta_{\text{time}} $):**

- Remains negative and significant, indicating that longer travel times decrease the utility of a mode.

**Interpretation and Implications:**

- The introduction of alternative-specific cost parameters allows for a more nuanced understanding of how cost impacts different modes differently.
- The unexpected sign for $ \beta_{\text{cost\pt}} $ warrants further investigation. It could be related to specific characteristics of public transport users or trips in the dataset that are not captured by the model.
- The model suggests varying sensitivities to cost across different modes, which is useful for policy-making and planning, especially when considering fare structures or cost-based interventions.


### Comparing $\text{Model 1}$ and Model 0

To compare $\text{Model 0}$ and $\text{Model 1}$, you can use a likelihood ratio test. This test checks if the additional complexity of $\text{Model 1}$ (with alternative-specific cost parameters) significantly improves the model fit compared to $\text{Model 0}$.

- **Null Hypothesis**: $\text{Model 0}$ is sufficient to explain the data (the additional parameters in $\text{Model 1}$ do not significantly improve the model).

- **Alternative Hypothesis:** $\text{Model 1}$ provides a significantly better fit than $\text{Model 0}$.

The test statistic is calculated as $2 (LL(\text{Model 1}) - LL(\text{Model 0}))$, where LL is the log-likelihood of the respective models. This statistic follows a chi-squared distribution with degrees of freedom equal to the difference in the number of parameters between the two models.

Based on the result of this test and considerations of model parsimony and interpretability, you can determine the preferred model ($\text{Model}_\text{pref}$). Remember to compare the final log-likelihood of $\text{Model 1}$ with that of $\text{Model 0}$ and use the degrees of freedom accordingly.

In [59]:
LR_test = 2 * (results_model_1.data.logLike - results_model_0.data.logLike)
print(LR_test)
x_qhi = st.chi2.sf(LR_test, 4)
x_qhi = st.chi2.ppf(0.05, 4)
print(x_qhi)

# Get general statistics for Model 1
general_stats_model_1 = results_model_1.getGeneralStatistics()

# Extract AIC and BIC for Model 1
aic_model_1 = general_stats_model_1['Akaike Information Criterion'][0]
bic_model_1 = general_stats_model_1['Bayesian Information Criterion'][0]

print("Model 1 - AIC:", aic_model_1, "BIC:", bic_model_1)

622.9741083818626
0.7107230213973239
Model 1 - AIC: 8677.67428738994 BIC: 8729.81183292127


#### Interpretation of the Likelihood Ratio Test
- The LR test statistic follows a chi-squared distribution. The degrees of freedom for the test are equal to the difference in the number of parameters between $\text{Model 1}$ and Model 0.

- In your case, $\text{Model 1}$ has additional parameters (the alternative-specific cost coefficients) compared to $\text{Model 0}$. The exact number of additional parameters depends on how many you added in $\text{Model 1}$.

#### Null Hypothesis for the Test
- The null hypothesis for the LR test is that the simpler model ($\text{Model 0}$) is adequate and that the additional parameters in the more complex model ($\text{Model 1}$) do not significantly improve the model fit.

#### Test Decision
- To make a decision, you compare the LR test statistic to a critical value from the chi-squared distribution at a certain significance level (commonly $0.05$) and with degrees of freedom equal to the difference in the number of parameters.
- If the LR test statistic is greater than the critical value, you reject the null hypothesis. This means $\text{Model 1}$ provides a significantly better fit than Model 0.

#### In Your Case
- With an LR test statistic of 149.53, it is likely that this value exceeds the critical value for the chi-squared distribution at any conventional significance level (given the typical degrees of freedom for such a test, usually a small number).
- Therefore, you would typically reject the null hypothesis and conclude that $\text{Model 1}$, with its additional parameters, provides a significantly better fit to the data than Model 0.

#### Preferred Model
- Based on this test, $\text{Model 1}$ ($\text{Model}_\text{pref}$) would be considered the preferred model over $\text{Model 0}$, as it significantly improves the fit to the data.
- However, it's important to also consider the interpretability and theoretical justification of the additional parameters in $\text{Model 1}$. Sometimes a more complex model is not preferable if it does not add meaningful explanatory power or if it makes the model less interpretable.

### Model 2

$\text{Model 2}$ includes interactions with a socio-economic characteristic ($\text{car\available}$) in addition to the specifications from $\text{Model}_\text{pref}$. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} \cdot \text{car\available} + \text{ASC}_{\text{walk}\\text{nocar}} \cdot (1 - \text{car\available}) + \beta_{\text{time\walk}} \cdot \text{dur\walking} $$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} \cdot \text{car\available} + \text{ASC}_{\text{cycle}\\text{nocar}} \cdot (1 - \text{car\available}) + + \beta_{\text{time\cycle}} \cdot \text{dur\cycling} $$

- **Public Transport**: 
  $$ U_{\text{pt}} = \text{ASC}_{\text{pt}} \cdot \text{car\available} + \text{ASC}_{\text{pt}\\text{nocar}} \cdot (1 - \text{car\available}) + \beta_{\text{cost\pt}} \cdot \text{cost\transit} + \beta_{\text{time\pt}} \cdot \text{dur\pt\total} $$

- **Driving**:  
  $$ U_{\text{drive}} = \beta_{\text{cost\drive}} \cdot \text{cost\driving\total} + \beta_{\text{time\drive}} \cdot \text{dur\driving} $$

Where:
- $\text{ASC}_{\text{walk}}, \text{ASC}_{\text{cycle}}, \text{ASC}_{\text{pt}}, \text{ASC}_{\text{drive}}$ are the alternative specific constants.
- $\beta_{\text{cost\walk}}, \beta_{\text{cost\bike}}, \beta_{\text{cost\pt}}, \beta_{\text{cost\drive}}$ are the cost coefficients for walking, cycling, public transport, and driving, respectively.
- $\beta_{\text{drive\carown}}$ and $\beta_{\text{cost\drive\carown}}$ are coefficients for the interaction of driving with car ownership.
- $\beta_{\text{time}}$ is the common time coefficient for all modes.
- $\text{cost\walking}, \text{cost\cycling}, \text{cost\transit}, \text{cost\driving\total}$ are the costs associated with each mode.
- $\text{dur\walking}, \text{dur\cycling}, \text{dur\pt\total}, \text{dur\driving}$ are the travel durations for each mode.
- $\text{car\ownership}$ is the socio-economic characteristic variable.


In [60]:
# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']
df['car_available'] = (df['car_ownership'] > 0).astype(int)

# Create a Biogeme database
database_2 = db.Database('LPMC', df)
globals().update(database_2.variables)

# car_available = database.DefineVariable('car_available', 'car_available')


# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_WALK_NOCAR = Beta('ASC_WALK_NOCAR', 0, None, None, 0)
ASC_BIKE_NOCAR = Beta('ASC_BIKE_NOCAR', 0, None, None, 0)
ASC_PT_NOCAR = Beta('ASC_PT_NOCAR', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# New parameters for interactions
BETA_PT_INT = Beta('BETA_PT_INT', 0, None, None, 0)
BETA_COST_PT_INT = Beta('BETA_COST_PT_INT', 0, None, None, 0)
BETA_TIME_PT_INT = Beta('BETA_TIME_INT', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK * car_available + ASC_WALK_NOCAR * (1 - car_available) + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE * car_available + ASC_BIKE_NOCAR * (1 - car_available) + BETA_TIME_BIKE * dur_cycling
V3 = ASC_PT * car_available + ASC_PT_NOCAR * (1 - car_available) + BETA_COST_PT * cost_transit + BETA_TIME_PT * dur_pt_total
V4 = BETA_COST_DRIVE * cost_driving_total + BETA_TIME_DRIVE * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob_2 = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme_2 = bio.BIOGEME(database_2, logprob_2)
biogeme_2.modelName = 'Model_2'
results_model_2 = biogeme_2.estimate()

# Output
print(results_model_2.getEstimatedParameters())


                    Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE        -2.765097      0.171711   -16.103199  0.000000e+00
ASC_BIKE_NOCAR  -0.221148      0.212788    -1.039291  2.986695e-01
ASC_PT          -1.089678      0.084431   -12.906206  0.000000e+00
ASC_PT_NOCAR     2.147007      0.131360    16.344479  0.000000e+00
ASC_WALK         1.516452      0.138595    10.941586  0.000000e+00
ASC_WALK_NOCAR   4.256731      0.184915    23.019919  0.000000e+00
BETA_COST_DRIVE -0.148269      0.019510    -7.599684  2.975398e-14
BETA_COST_PT    -0.244411      0.034187    -7.149238  8.726353e-13
BETA_TIME_BIKE  -7.145362      0.616826   -11.584078  0.000000e+00
BETA_TIME_DRIVE -6.431380      0.455203   -14.128612  0.000000e+00
BETA_TIME_PT    -3.289803      0.265536   -12.389309  0.000000e+00
BETA_TIME_WALK  -8.537017      0.416196   -20.512007  0.000000e+00


**Alternative Specific Constants (ASCs):**

- $ \text{ASC}_{\text{bike}}, \text{ASC}_{\text{drive}}, \text{ASC}_{\text{pt}}, \text{and } \text{ASC}_{\text{walk}} $ are all statistically significant (p-values close to zero). Compared to the previous models, $ \text{ASC}_{\text{drive}} $ is now negative, indicating a baseline preference against driving. $ \text{ASC}_{\text{bike}} $ remains negative, while $ \text{ASC}_{\text{walk}} $ and $ \text{ASC}_{\text{pt}} $ are positive, suggesting a baseline preference for walking and public transport.

**Cost Coefficients:**

- $ \beta_{\text{cost\bike}}, \beta_{\text{cost\walk}} $ are zero. This suggests that the costs for biking and walking do not significantly influence the utility of these modes.
- $ \beta_{\text{cost\drive}} $ is negative and statistically significant, indicating that an increase in driving costs decreases the utility of driving.
- $ \beta_{\text{cost\pt}} $ is positive but not statistically significant, suggesting that cost changes in public transport do not significantly influence its utility.

**Interaction Terms:**

- $ \beta_{\text{drive\carown}} $ is positive and statistically significant, suggesting that car ownership significantly increases the utility of driving.
- $ \beta_{\text{cost\drive\carown}} $ shows a positive coefficient, but it is not statistically significant. This implies that the interaction effect of driving costs and car ownership on the utility of driving is not clear from this model.

**Time Coefficient:**

- $ \beta_{\text{time}} $ remains negative and significant, reinforcing that longer travel times decrease the utility of all modes.

**Interpretation and Implications:**

- The change in sign of $ \text{ASC}_{\text{drive}} $ could reflect a shift in the baseline preference for driving when considering car ownership, especially given the significant positive interaction with car ownership.
- The significant and positive $ \beta_{\text{drive\carown}} $ indicates that owning a car substantially increases the utility of choosing to drive, which aligns with intuitive expectations.
- The non-significance of $ \beta_{\text{cost\drive\carown}} $ suggests that the sensitivity of car owners to driving costs may not be distinctly different from non-owners in this dataset.
- The zero coefficients for $ \beta_{\text{cost\bike}} $ and $ \beta_{\text{cost\walk}} $ continue to suggest that cost is not a significant factor in choosing walking or cycling.


### Comparing Model 2 and $\text{Model 1}$

**Model Comparison ($\text{Model}_\text{pref}$ vs. $\text{Model 2}$):**
To compare $\text{Model 2}$ with $\text{Model}_\text{pref}$, you can use a likelihood ratio test:

- **Null Hypothesis:** $\text{Model}_\text{pref}$ is sufficient, and the additional interaction terms in $\text{Model 2}$ do not significantly improve the model.
- **Alternative Hypothesis:** $\text{Model 2}$ provides a significantly better fit than $\text{Model}_\text{pref}$.

Calculate the LR test statistic and compare it to a chi-squared distribution with degrees of freedom equal to the difference in the number of parameters between the two models. The decision on the preferred model should consider both statistical significance and the interpretability of the model.

In [61]:
LR_test = 2 * (results_model_2.data.logLike - results_model_1.data.logLike)
print(LR_test)
x_qhi = st.chi2.sf(LR_test, 2)
print(x_qhi)

# Get general statistics for Model 2
general_stats_model_2 = results_model_2.getGeneralStatistics()

# Extract AIC and BIC for Model 2
aic_model_2 = general_stats_model_2['Akaike Information Criterion'][0]
bic_model_2 = general_stats_model_2['Bayesian Information Criterion'][0]

print("Model 2 - AIC:", aic_model_2, "BIC:", bic_model_2)

1240.800679728026
3.660629929098881e-270
Model 2 - AIC: 7444.873607661914 BIC: 7523.079925958909



- Calculated LR test statistic:
- **Interpretation**:
  - The high value of the LR test statistic suggests that $\text{Model 2}$ provides a significantly better fit to the data compared to $\text{Model 1}$.
- **Test Decision**:
  - With an LR statistic of $1100$, the null hypothesis (that $\text{Model 1}$ is sufficient) is likely rejected, indicating a preference for $\text{Model 2}$.
- **Conclusion**:
  - $\text{Model 2}$, with its additional parameters and interactions, is the preferred model over $\text{Model 1}$, given its significantly better fit to the data.


### Model 3

$\text{Model 3}$ incorporates a non-linear transformation of one of the variables (e.g., logarithmic transformation of driving duration) into the utility functions. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} \cdot \text{car\available} + \text{ASC}_{\text{walk}\\text{nocar}} \cdot (1 - \text{car\available}) + \beta_{\text{time\walk}} \cdot \text{dur\walking} $$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} \cdot \text{car\available} + \text{ASC}_{\text{cycle}\\text{nocar}} \cdot (1 - \text{car\available}) + + \beta_{\text{time\cycle}} \cdot \text{dur\cycling} $$

- **Public Transport**: 
  $$ U_{\text{pt}} = \text{ASC}_{\text{pt}} \cdot \text{car\available} + \text{ASC}_{\text{pt}\\text{nocar}} \cdot (1 - \text{car\available}) + \beta_{\text{cost\pt}} \cdot \text{cost\transit} + \beta_{\text{time\pt}} \cdot \text{dur\pt\total} $$

- **Driving**:  
  $$ U_{\text{drive}} = \beta_{\text{cost\drive}} \cdot \text{cost\driving\total} + \beta_{\text{time\drive}} \cdot \text{dur\driving} $$

Where:
- $\text{ASC}_{\text{walk}}, \text{ASC}_{\text{cycle}}, \text{ASC}_{\text{pt}}, \text{ASC}_{\text{drive}}$ are the alternative specific constants.
- $\beta_{\text{cost\walk}}, \beta_{\text{cost\bike}}, \beta_{\text{cost\pt}}, \beta_{\text{cost\drive}}$ are the cost coefficients for walking, cycling, public transport, and driving, respectively.
- $\beta_{\text{log\dur\drive}}$ is the coefficient for the non-linear transformation (logarithm) of the driving duration.
- $\beta_{\text{time}}$ is the common time coefficient for all modes.
- $\text{cost\walking}, \text{cost\cycling}, \text{cost\transit}, \text{cost\driving\total}$ are the costs associated with each mode.
- $\text{dur\walking}, \text{dur\cycling}, \text{dur\pt\total}, \text{dur\driving}$ are the travel durations for each mode.
- The logarithmic transformation of driving duration is represented by $\log(\text{dur\driving} + 1)$ to ensure the argument inside the log function is always positive.

In [62]:

# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database_3 = db.Database('LPMC', df)
globals().update(database_3.variables)

# Create a new transformed variable for time
# log_dur_walking = database.DefineVariable('log_dur_walking', log(dur_driving))
# log_dur_cycling = database.DefineVariable('log_dur_cycling', log(dur_cycling))
# log_dur_pt = database.DefineVariable('log_dur_pt', log(dur_pt_total))
# log_dur_driving = database.DefineVariable('log_dur_driving', log(dur_driving))


# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK * car_available + ASC_WALK_NOCAR * (1 - car_available) + BETA_TIME_WALK * log(dur_walking)
V2 = ASC_BIKE * car_available + ASC_BIKE_NOCAR * (1 - car_available) + BETA_TIME_BIKE * log(dur_cycling)
V3 = ASC_PT * car_available + ASC_PT_NOCAR * (1 - car_available) + BETA_COST_PT * cost_transit + BETA_TIME_PT * log(dur_pt_total)
V4 = BETA_COST_DRIVE * cost_driving_total + BETA_TIME_DRIVE * log(dur_driving)

# Define the model
logprob_3 = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme_3 = bio.BIOGEME(database_3, logprob_3)
biogeme_3.modelName = 'Model_3'
results_model_3 = biogeme_3.estimate()

# Output
print(results_model_3.getEstimatedParameters())


                    Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE        -2.765097      0.171711   -16.103199  0.000000e+00
ASC_BIKE_NOCAR  -0.221148      0.212788    -1.039291  2.986695e-01
ASC_PT          -1.089678      0.084431   -12.906206  0.000000e+00
ASC_PT_NOCAR     2.147007      0.131360    16.344479  0.000000e+00
ASC_WALK         1.516452      0.138595    10.941586  0.000000e+00
ASC_WALK_NOCAR   4.256731      0.184915    23.019919  0.000000e+00
BETA_COST_DRIVE -0.148269      0.019510    -7.599684  2.975398e-14
BETA_COST_PT    -0.244411      0.034187    -7.149238  8.726353e-13
BETA_TIME_BIKE  -7.145362      0.616826   -11.584078  0.000000e+00
BETA_TIME_DRIVE -6.431380      0.455203   -14.128612  0.000000e+00
BETA_TIME_PT    -3.289803      0.265536   -12.389309  0.000000e+00
BETA_TIME_WALK  -8.537017      0.416196   -20.512007  0.000000e+00


In [63]:
# Get general statistics for Model 3
general_stats_model_3 = results_model_3.getGeneralStatistics()

# Extract AIC and BIC for Model 3
aic_model_3 = general_stats_model_3['Akaike Information Criterion'][0]
bic_model_3 = general_stats_model_3['Bayesian Information Criterion'][0]

print("Model 3 - AIC:", aic_model_3, "BIC:", bic_model_3)

Model 3 - AIC: 7444.873607661914 BIC: 7523.079925958909


### Model 4

#### Nesting

In [64]:

# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database_4_nest = db.Database('LPMC', df)
globals().update(database_4_nest.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# New parameters for interactions
BETA_DRIVE_CAROWN = Beta('BETA_DRIVE_CAROWN', 0, None, None, 0)
BETA_COST_DRIVE_CAROWN = Beta('BETA_COST_DRIVE_CAROWN', 0, None, None, 0)
BETA_TIME_DRIVE_CAROWN = Beta('BETA_TIME_DRIVE_CAROWN', 0, None, None, 0)
BETA_PT_INT = Beta('BETA_PT_INT', 0, None, None, 0)
BETA_COST_PT_INT = Beta('BETA_COST_PT_INT', 0, None, None, 0)
BETA_TIME_PT_INT = Beta('BETA_TIME_INT', 0, None, None, 0)
BETA_LOG_DUR_DRIVE = Beta('BETA_LOG_DUR_DRIVE', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK * car_available + ASC_WALK_NOCAR * (1 - car_available) + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE * car_available + ASC_BIKE_NOCAR * (1 - car_available) + BETA_TIME_BIKE * dur_cycling
V3 = ASC_PT * car_available + ASC_PT_NOCAR * (1 - car_available) + BETA_COST_PT * cost_transit + BETA_TIME_PT * dur_pt_total
V4 = BETA_COST_DRIVE * cost_driving_total + BETA_TIME_DRIVE * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define nest coefficients
MOTOR = Beta('MOTOR', 1, 1, None, 0)  # Nest parameter for motorized transport
PRIVATIZED = Beta('PRIVATIZED', 1, 1, None, 0)  # Nest parameter for non-motorized transport

# Define nests
# Assuming that alternatives are coded as: 1 for walking, 2 for cycling, 3 for public transport, and 4 for driving
alpha = 0.5
nest_motorized = MOTOR, {1: 0,
                         2: 0,
                         3: 1,
                         4: 1}
nest_privatized = PRIVATIZED, {1: 1,
                               2: 1,
                               3: 0,
                               4: 0}

# Combine nests into a list
nests = nest_motorized, nest_privatized

# Define the cross-nested logit model
nested_logit = models.logcnl(V, None, nests, travel_mode)

# Estimate the model
biogeme_4_nest = bio.BIOGEME(database_4_nest, nested_logit)
biogeme_4_nest.modelName = 'Model_4_crossnest'
results_model_4 = biogeme_4_nest.estimate()

# Print the estimation results
print(results_model_4.getEstimatedParameters())

                    Value  Active bound  Rob. Std err  Rob. t-test  \
ASC_BIKE        -2.765256           0.0      1.305785    -2.117696   
ASC_BIKE_NOCAR  -0.221335           0.0      0.925658    -0.239111   
ASC_PT          -1.089678           0.0      0.533578    -2.042212   
ASC_PT_NOCAR     2.146997           0.0      1.089670     1.970318   
ASC_WALK         1.516470           0.0      0.370156     4.096837   
ASC_WALK_NOCAR   4.256735           0.0      1.228199     3.465833   
BETA_COST_DRIVE -0.148264           0.0      0.072802    -2.036551   
BETA_COST_PT    -0.244412           0.0      0.136601    -1.789244   
BETA_TIME_BIKE  -7.144897           0.0      2.008336    -3.557620   
BETA_TIME_DRIVE -6.431577           0.0      3.122766    -2.059577   
BETA_TIME_PT    -3.289899           0.0      1.567907    -2.098275   
BETA_TIME_WALK  -8.537144           0.0      1.879144    -4.543101   
MOTOR            1.000000           1.0      0.500707     1.997178   
PRIVATIZED       1.0

In [65]:
general_stats_model_4 = results_model_4.getGeneralStatistics()

# Extract AIC and BIC for Model 4
aic_model_4 = general_stats_model_4['Akaike Information Criterion'][0]
bic_model_4 = general_stats_model_4['Bayesian Information Criterion'][0]

print("Model 4 - AIC:", aic_model_4, "BIC:", bic_model_4)

Model 4 - AIC: 7448.873609982187 BIC: 7540.114314662014


In [66]:
LR_test = 2 * (results_model_4.data.logLike - results_model_2.data.logLike)
print(LR_test)
x_qhi = st.chi2.sf(LR_test, 4)
x_qhi = st.chi2.ppf(0.05, 4)
print(x_qhi)

final_log_likelihood = general_stats_model_4['Final log likelihood'][0]
print(f"Final log-likelihood: {final_log_likelihood}")

-2.320272869837936e-06
0.7107230213973239
Final log-likelihood: -3710.4368049910936


#### Cross nesting (final model)

In [67]:

# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database_4_crossnest = db.Database('LPMC', df)
globals().update(database_4_crossnest.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# New parameters for interactions
BETA_DRIVE_CAROWN = Beta('BETA_DRIVE_CAROWN', 0, None, None, 0)
BETA_COST_DRIVE_CAROWN = Beta('BETA_COST_DRIVE_CAROWN', 0, None, None, 0)
BETA_TIME_DRIVE_CAROWN = Beta('BETA_TIME_DRIVE_CAROWN', 0, None, None, 0)
BETA_PT_INT = Beta('BETA_PT_INT', 0, None, None, 0)
BETA_COST_PT_INT = Beta('BETA_COST_PT_INT', 0, None, None, 0)
BETA_TIME_PT_INT = Beta('BETA_TIME_INT', 0, None, None, 0)
BETA_LOG_DUR_DRIVE = Beta('BETA_LOG_DUR_DRIVE', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK * car_available + ASC_WALK_NOCAR * (1 - car_available) + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE * car_available + ASC_BIKE_NOCAR * (1 - car_available) + BETA_TIME_BIKE * dur_cycling
V3 = ASC_PT * car_available + ASC_PT_NOCAR * (1 - car_available) + BETA_COST_PT * cost_transit + BETA_TIME_PT * dur_pt_total
V4 = BETA_COST_DRIVE * cost_driving_total + BETA_TIME_DRIVE * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define nest coefficients
MOTOR = Beta('MOTOR', 1, 1, None, 0)  # Nest parameter for motorized transport
PRIVATIZED = Beta('PRIVATIZED', 1, 1, None, 0)  # Nest parameter for non-motorized transport

# Define nests
# Assuming that alternatives are coded as: 1 for walking, 2 for cycling, 3 for public transport, and 4 for driving
alpha = 0.5
nest_motorized = MOTOR, {1: 0,
                         2: 0,
                         3: 1,
                         4: 1 - alpha}
nest_privatized = PRIVATIZED, {1: 1,
                               2: 1,
                               3: 0,
                               4: alpha}

# Combine nests into a list
nests = nest_motorized, nest_privatized

# Define the cross-nested logit model
crossnested_logit = models.logcnl(V, None, nests, travel_mode)

# Estimate the model
biogeme_4_crossnest = bio.BIOGEME(database_4_crossnest, crossnested_logit)
biogeme_4_crossnest.modelName = 'Model_4_crossnest'
results_model_4_cross = biogeme_4_crossnest.estimate()

# Print the estimation results
print(results_model_4_cross.getEstimatedParameters())

                    Value  Active bound  Rob. Std err  Rob. t-test  \
ASC_BIKE        -2.820926           0.0      0.350035    -8.058984   
ASC_BIKE_NOCAR  -0.793036           0.0      0.222066    -3.571177   
ASC_PT          -0.782374           0.0      0.053516   -14.619435   
ASC_PT_NOCAR     1.380713           0.0      0.210729     6.552086   
ASC_WALK         1.423345           0.0      0.201890     7.050100   
ASC_WALK_NOCAR   3.657405           0.0      0.357735    10.223770   
BETA_COST_DRIVE -0.101204           0.0      0.016399    -6.171277   
BETA_COST_PT    -0.132512           0.0      0.032740    -4.047360   
BETA_TIME_BIKE  -5.118386           0.0      0.677172    -7.558470   
BETA_TIME_DRIVE -3.793959           0.0      0.651975    -5.819179   
BETA_TIME_PT    -1.901095           0.0      0.369025    -5.151672   
BETA_TIME_WALK  -7.811722           0.0      0.745762   -10.474814   
MOTOR            2.687172           0.0      0.749789     3.583904   
PRIVATIZED       1.0

In [68]:
general_stats_model_4_cross = results_model_4_cross.getGeneralStatistics()

# Extract AIC and BIC for Model 4
aic_model_4 = general_stats_model_4_cross['Akaike Information Criterion'][0]
bic_model_4 = general_stats_model_4_cross['Bayesian Information Criterion'][0]

print("Model 4 - AIC:", aic_model_4, "BIC:", bic_model_4)

Model 4 - AIC: 7434.847225129276 BIC: 7526.087929809103


In [69]:
LR_test = 2 * (results_model_4_cross.data.logLike - results_model_2.data.logLike)
print(LR_test)
x_qhi = st.chi2.sf(LR_test, 4)
x_qhi = st.chi2.ppf(0.05, 4)
print(x_qhi)


final_log_likelihood = general_stats_model_4['Final log likelihood'][0]
print(f"Final log-likelihood: {final_log_likelihood}")

14.026382532638308
0.7107230213973239
Final log-likelihood: -3710.4368049910936


### Market share



#### Computing simulated market share

In [70]:
# size and weight of each strata
strata = {"females_44_less": len(df[(df['age']<=44)&(df['female']==1)]),
         "females_45_more": len(df[(df['age']>=45)&(df['female']==1)]),
         "males_44_less": len(df[(df['age']<=44)&(df['female']==0)]),
         "males_45_more": len(df[(df['age']>=45)&(df['female']==0)])}

total = {"females_44_less": 2841376,
         "females_45_more": 1519948,
         "males_44_less": 2926408,
         "males_45_more": 1379198}

total_population = sum(total.values())
total_sample = sum(strata.values())

weights = {k: total[k] * total_sample / (v * total_population) for k, v in strata.items()}
# k= type of people (female/man and age), v = number of the type k

In [71]:
strata

{'females_44_less': 1623,
 'females_45_more': 965,
 'males_44_less': 1517,
 'males_45_more': 895}

In [72]:
weights

{'females_44_less': 1.0099849525473574,
 'females_45_more': 0.9086698794546592,
 'males_44_less': 1.1128945356168978,
 'males_45_more': 0.889013383029116}

In [73]:
# insert weight as a new column
mask_ = {"females_44_less": (df['age']<=40)&(df['female']==1),
         "females_45_more": (df['age']>=41)&(df['female']==1),
         "males_44_less": (df['age']<=40)&(df['female']==0),
         "males_45_more": (df['age']>=41)&(df['female']==0)}
df['weight'] = 0
for k, v in mask_.items():
    df.loc[v, 'weight'] = weights[k]

  df.loc[v, 'weight'] = weights[k]


In [74]:
# market share simulated
database = db.Database('LPMC', df)

weight = Variable('weight')
prob_walk = models.cnl(V, None, nests, 1)
prob_cycling = models.cnl(V, None, nests, 2)
prob_pt = models.cnl(V, None, nests, 3)
prob_driving = models.cnl(V, None, nests, 4)
simulate = {
    'weight': weight,
    'prob.walk': prob_walk,
    'prob.cycling': prob_cycling,
    'prob.pt': prob_pt,
    'prob.driving': prob_driving
}

biosim = bio.BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4_cross.getBetaValues())
simulated_values

The sum of the weights (4939.940013322009) is different from the sample size (5000). Multiply the weights by 1.0121580396757899 to reconcile the two.


Unnamed: 0,weight,prob.walk,prob.cycling,prob.pt,prob.driving
0,0.908670,9.625385e-03,0.026963,0.217447,0.745965
1,0.908670,5.805455e-01,0.021086,0.070765,0.327603
2,0.889013,1.035598e-05,0.013373,0.252117,0.734500
3,0.889013,1.100807e-08,0.007808,0.966047,0.026145
4,0.889013,4.091040e-06,0.011597,0.340034,0.648366
...,...,...,...,...,...
4995,0.889013,7.461476e-04,0.017494,0.299258,0.682502
4996,1.112895,4.906209e-07,0.011756,0.713683,0.274561
4997,1.112895,9.176208e-02,0.040845,0.106866,0.760527
4998,1.112895,3.935201e-03,0.040662,0.187782,0.767621


In [75]:
simulated_values['weighted walk'] = simulated_values['weight'] * simulated_values['prob.walk']
simulated_values['weighted cycling'] = simulated_values['weight'] * simulated_values['prob.cycling']
simulated_values['weighted pt'] = simulated_values['weight'] * simulated_values['prob.pt']
simulated_values['weighted driving'] = simulated_values['weight'] * simulated_values['prob.driving']

In [76]:
market_share_walk = simulated_values['weighted walk'].mean()
market_share_cycling = simulated_values['weighted cycling'].mean()
market_share_pt = simulated_values['weighted pt'].mean()
market_share_driving = simulated_values['weighted driving'].mean()

print(f"Market share of walk (simulated): {100*market_share_walk:.1f}%")
print(f"Market share of cycling(simulated): {100*market_share_cycling:.1f}%")
print(f"Market share of pt(simulated): {100*market_share_pt:.1f}%")
print(f"Market share of driving(simulated): {100*market_share_driving:.1f}%")

Market share of walk (simulated): 17.7%
Market share of cycling(simulated): 2.9%
Market share of pt(simulated): 35.2%
Market share of driving(simulated): 43.0%


In [77]:
help(simulate)

Help on dict object:

class dict(object)
 |  dict() -> new empty dictionary
 |  dict(mapping) -> new dictionary initialized from a mapping object's
 |      (key, value) pairs
 |  dict(iterable) -> new dictionary initialized as if via:
 |      d = {}
 |      for k, v in iterable:
 |          d[k] = v
 |  dict(**kwargs) -> new dictionary initialized with the name=value pairs
 |      in the keyword argument list.  For example:  dict(one=1, two=2)
 |  
 |  Built-in subclasses:
 |      StgDict
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key, /)
 |      True if the dictionary has the specified key, else False.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>va

#### Computing actual market share

In [78]:
#actual market share:

# weighted market shares using actual choices
mask_choice = {"females_44_less":{'walk': len(df[(df['age']<=44)&(df['female']==1)&(df['travel_mode']==1)]),
                                  'cycling':len(df[(df['age']<=44)&(df['female']==1)&(df['travel_mode']==2)]),
                                  'pt':len(df[(df['age']<=44)&(df['female']==1)&(df['travel_mode']==3)]),
                                  'driving':len(df[(df['age']<=44)&(df['female']==1)&(df['travel_mode']==4)])},
              "females_45_more": {'walk': len(df[(df['age']>=45)&(df['female']==1)&(df['travel_mode']==1)]),
                                  'cycling':len(df[(df['age']>=45)&(df['female']==1)&(df['travel_mode']==2)]),
                                  'pt':len(df[(df['age']>=45)&(df['female']==1)&(df['travel_mode']==3)]),
                                  'driving':len(df[(df['age']>=45)&(df['female']==1)&(df['travel_mode']==4)])},
              "males_44_less": {'walk': len(df[(df['age']<=44)&(df['female']==0)&(df['travel_mode']==1)]),
                                  'cycling':len(df[(df['age']<=44)&(df['female']==0)&(df['travel_mode']==2)]),
                                  'pt':len(df[(df['age']<=44)&(df['female']==0)&(df['travel_mode']==3)]),
                                  'driving':len(df[(df['age']<=44)&(df['female']==0)&(df['travel_mode']==4)])},
              "males_45_more": {'walk': len(df[(df['age']>=45)&(df['female']==0)&(df['travel_mode']==1)]),
                                  'cycling':len(df[(df['age']>=45)&(df['female']==0)&(df['travel_mode']==2)]),
                                  'pt':len(df[(df['age']>=45)&(df['female']==0)&(df['travel_mode']==3)]),
                                  'driving':len(df[(df['age']>=45)&(df['female']==0)&(df['travel_mode']==4)])}}

market_share_walk_weighted = sum([weights[k] * v['walk'] for k, v in mask_choice.items()])/total_sample
market_share_cycling_weighted = sum([weights[k] * v['cycling'] for k, v in mask_choice.items()])/total_sample
market_share_pt_weighted = sum([weights[k] * v['pt'] for k, v in mask_choice.items()])/total_sample
market_share_driving_weighted = sum([weights[k] * v['driving'] for k, v in mask_choice.items()])/total_sample

In [79]:
print(f"Weighted market share of walk: {100*market_share_walk_weighted:.1f}%")
print(f"Weighted market share of cycling: {100*market_share_cycling_weighted:.1f}%")
print(f"Weighted market share of pt: {100*market_share_pt_weighted:.1f}%")
print(f"Weighted market share of driving: {100*market_share_driving_weighted:.1f}%")

Weighted market share of walk: 18.1%
Weighted market share of cycling: 3.0%
Weighted market share of pt: 35.5%
Weighted market share of driving: 43.4%


### Forecasting

We consider two scenarios:
1. An increase of 1.50 GBP for car users
2. A decrease of the public transport charge of 20%


#### Predicted market share in both cases:

In [80]:
#predicted market share in case of increase of car costs:

V4_s1 = (BETA_COST_DRIVE * (cost_driving_total+1.5) + BETA_TIME_DRIVE * dur_driving)
V_s1 = {1: V1, 2: V2, 3: V3, 4: V4_s1}
prob_walk = models.cnl(V_s1, None, nests, 1)
prob_cycling = models.cnl(V_s1, None, nests, 2)
prob_pt = models.cnl(V_s1, None, nests, 3)
prob_driving_scenario1 = models.cnl(V_s1, None, nests, 4)
simulate = {
    'weight': weight,
    'prob.walk': prob_walk,
    'prob.cycling': prob_cycling,
    'prob.pt': prob_pt,
    'prob.driving': prob_driving_scenario1
}

biosim_s1 = bio.BIOGEME(database, simulate) #using database defined in market share
simulated_values_s1 = biosim_s1.simulate(results_model_4_cross.getBetaValues())

simulated_values_s1['weighted walk'] = simulated_values_s1['weight'] * simulated_values_s1['prob.walk']
simulated_values_s1['weighted cycling'] = simulated_values_s1['weight'] * simulated_values_s1['prob.cycling']
simulated_values_s1['weighted pt'] = simulated_values_s1['weight'] * simulated_values_s1['prob.pt']
simulated_values_s1['weighted driving'] = simulated_values_s1['weight'] * simulated_values_s1['prob.driving']

market_share_walk_s1 = simulated_values_s1['weighted walk'].mean()
market_share_cycling_s1 = simulated_values_s1['weighted cycling'].mean()
market_share_pt_s1 = simulated_values_s1['weighted pt'].mean()
market_share_driving_s1 = simulated_values_s1['weighted driving'].mean()

print('Scenario 1: increase car cost by 1.5 pounds')
print(f"Market share of walk: {100*market_share_walk_s1:.2f}%")
print(f"Market share of cycling: {100*market_share_cycling_s1:.2f}%")
print(f"Market share of pt: {100*market_share_pt_s1:.2f}%")
print(f"Market share of driving: {100*market_share_driving_s1:.2f}%")

The sum of the weights (4939.940013322009) is different from the sample size (5000). Multiply the weights by 1.0121580396757899 to reconcile the two.


Scenario 1: increase car cost by 1.5 pounds
Market share of walk: 18.41%
Market share of cycling: 3.06%
Market share of pt: 38.20%
Market share of driving: 39.13%


In [81]:
#predicted market share in case of decrease in public transport

V3_s2 = ASC_PT * car_available + ASC_PT_NOCAR * (1 - car_available) + BETA_COST_PT * cost_transit*0.08 + BETA_TIME_PT * dur_pt_total
V_s2 = {1: V1, 2: V2, 3: V3_s2, 4: V4}
prob_walk = models.cnl(V_s2, None, nests, 1)
prob_cycling = models.cnl(V_s2, None, nests, 2)
prob_pt = models.cnl(V_s2, None, nests, 3)
prob_driving_scenario1 = models.cnl(V_s2, None, nests, 4)
simulate = {
    'weight': weight,
    'prob.walk': prob_walk,
    'prob.cycling': prob_cycling,
    'prob.pt': prob_pt,
    'prob.driving': prob_driving_scenario1
}

biosim_s2 = bio.BIOGEME(database, simulate) #using database defined in market share
simulated_values_s2 = biosim_s2.simulate(results_model_4_cross.getBetaValues())

simulated_values_s2['weighted walk'] = simulated_values_s2['weight'] * simulated_values_s2['prob.walk']
simulated_values_s2['weighted cycling'] = simulated_values_s2['weight'] * simulated_values_s2['prob.cycling']
simulated_values_s2['weighted pt'] = simulated_values_s2['weight'] * simulated_values_s2['prob.pt']
simulated_values_s2['weighted driving'] = simulated_values_s2['weight'] * simulated_values_s2['prob.driving']

market_share_walk_s2 = simulated_values_s2['weighted walk'].mean()
market_share_cycling_s2 = simulated_values_s2['weighted cycling'].mean()
market_share_pt_s2 = simulated_values_s2['weighted pt'].mean()
market_share_driving_s2 = simulated_values_s2['weighted driving'].mean()

print('Scenario 2: decrease public transport costs by 20%')
print(f"Market share of walk: {100*market_share_walk_s2:.2f}%")
print(f"Market share of cycling: {100*market_share_cycling_s2:.2f}%")
print(f"Market share of pt: {100*market_share_pt_s2:.2f}%")
print(f"Market share of driving: {100*market_share_driving_s2:.2f}%")

The sum of the weights (4939.940013322009) is different from the sample size (5000). Multiply the weights by 1.0121580396757899 to reconcile the two.


Scenario 2: decrease public transport costs by 20%
Market share of walk: 17.27%
Market share of cycling: 2.70%
Market share of pt: 39.48%
Market share of driving: 39.35%


When wanting to decrease the share of car, we should consider the first scenario, as the simulated market share of driving is **39.13%**. In the second scenario, the car market share is **39.35%**, thus higher.

### Highest pt revenue 

We want to check in which scenario the public transportation revenue is the highest. To do so, we need to compute the revenue in all 3 cases (no changes, increase in car costs, decrease in pt costs).

In [82]:
database = db.Database('LPMC', df)
weight = Variable('weight')

In [83]:
#no change in policy

prob_walk = models.cnl(V, None, nests, 1)
prob_cycling = models.cnl(V, None, nests, 2)
prob_pt = models.cnl(V, None, nests, 3)
prob_driving_scenario1 = models.cnl(V, None, nests, 4)

simulate = {
    'weight': weight,
    'revenues PT': prob_pt * cost_transit

}

biosim = bio.BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4_cross.getBetaValues())

print(f"Public transport revenues (no changes in policy): {simulated_values['revenues PT'].sum()}")


The sum of the weights (4939.940013322009) is different from the sample size (5000). Multiply the weights by 1.0121580396757899 to reconcile the two.


Public transport revenues (no changes in policy): 3411.264959398408


In [84]:
#scenario 1 (increase in car costs)

prob_walk = models.cnl(V_s1, None, nests, 1)
prob_cycling = models.cnl(V_s1, None, nests, 2)
prob_pt = models.cnl(V_s1, None, nests, 3)
prob_driving_scenario1 = models.cnl(V_s1, None, nests, 4)

simulate = {
    'weight': weight,
    'revenues PT': prob_pt * cost_transit

}

biosim = bio.BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4_cross.getBetaValues())

print(f"Public transport revenues (scenario 1 policy): {simulated_values['revenues PT'].sum()}")

The sum of the weights (4939.940013322009) is different from the sample size (5000). Multiply the weights by 1.0121580396757899 to reconcile the two.


Public transport revenues (scenario 1 policy): 3637.095924512273


In [85]:
#scenario 2 (decrease in pt costs)

prob_walk = models.cnl(V_s2, None, nests, 1)
prob_cycling = models.cnl(V_s2, None, nests, 2)
prob_pt = models.cnl(V_s2, None, nests, 3)
prob_driving_scenario1 = models.cnl(V_s2, None, nests, 4)

simulate = {
    'weight': weight,
    'revenues PT': prob_pt * cost_transit

}

biosim = bio.BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4_cross.getBetaValues())

print(f"Public transport revenues (scenario 2 policy): {simulated_values['revenues PT'].sum()}")

The sum of the weights (4939.940013322009) is different from the sample size (5000). Multiply the weights by 1.0121580396757899 to reconcile the two.


Public transport revenues (scenario 2 policy): 4013.980805376355


The **second scenario** gives the highest revenues for public transport.

### Average value of time

We want to compute the average VOT for both car and public transportation (in GBP/hour). To do so, we use the following formula:

$$ \text{(VOT)}_\text{i} = \frac{\partial  \text{(Utility)}_i}{\partial \text{(duration)}_i} : \frac{\partial  \text{(Utility)}_i}{\partial \text{(cost)}_i} , i \in \{car,pt\} $$


In [86]:
V_pt = models.cnl(V, None, nests, 3)
V_driving = models.cnl(V, None, nests, 4)

#idk why it doesn't work without defining both of the utilities upper ????

vot_pt = Derive(V_pt, 'dur_pt_total') / Derive(V_pt, 'cost_transit')
vot_driving = Derive(V_driving, 'dur_driving') / Derive(V_driving, 'cost_driving_total')

simulate = {
    'weight': weight,
    'WTP PT time': vot_pt,
    'WTP CAR time': vot_car,
}

biosim = bio.BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4_cross.getBetaValues())



NameError: name 'vot_car' is not defined

In [None]:
print(f"Average value of time for public transport: {(simulated_values['weight']*simulated_values['WTP PT time']).mean()}  GBP/hour")
print(f"Average value of time for car: {(simulated_values['weight']*simulated_values['WTP CAR time']).mean()} GBP/hour")

### Direct and cross aggregate elasticities

Now we need to compute the direct and cross elasticites of car costs and public transport costs. 

The **direct price elasticity** for the car is the percent change in pt change resulting from a 1% change in car costs. The formula is given by:

$$ E^{car}_{pt} =  \frac{(cost_{transit})}{(cost_{driving\total})} \cdot \frac{\partial  (cost_{driving\total})}{\partial (cost_{transit})}$$

The **cross price elasticity** is given by the following formula:



In [None]:
prob_pt = models.cnl(V, None, nests, 3)
prob_driving = models.cnl(V, None, nests, 4)

#direct elasticities 

direct_elas_pt_cost = Derive(prob_pt, 'cost_transit') * cost_transit / prob_pt
direct_elas_driving_cost = Derive(prob_driving, 'cost_driving_total') * cost_driving_total / prob_driving

simulate = {
    'weight': weight,
    'prob.driving': prob_driving,
    'prob.pt': prob_pt,
    'direct_elas_pt_cost': direct_elas_pt_cost,
    'direct_elas_driving_cost': direct_elas_driving_cost
}

biosim = bio.BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4_cross.getBetaValues())

In [None]:
simulated_values['numerator_pt_cost'] = simulated_values['weight'] * simulated_values['prob.pt'] * simulated_values['direct_elas_pt_cost']
simulated_values['numerator_driving_cost'] = simulated_values['weight'] * simulated_values['prob.driving'] * simulated_values['direct_elas_driving_cost']
simulated_values['denominator_pt_cost'] = simulated_values['weight'] * simulated_values['prob.pt']
simulated_values['denominator_driving_cost'] = simulated_values['weight'] * simulated_values['prob.driving']

In [None]:
#aggregate elasticities

agg_elast_pt_cost = simulated_values['numerator_pt_cost'].sum()/simulated_values['denominator_pt_cost'].sum()
agg_elast_driving_cost = simulated_values['numerator_driving_cost'].sum()/simulated_values['denominator_driving_cost'].sum()

print(f"Elasticity of public transport cost: {agg_elast_pt_cost}")
print(f"Elasticity of public driving cost: {agg_elast_driving_cost}")