In [1]:
import os, glob

import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, DefineVariable
from biogeme.models import logit, loglogit, piecewiseFormula, nested
from biogeme.models import lognested
from biogeme.results import bioResults, pickle

In [2]:
# Show every float that pandas renders with three significant digits.
pd.set_option("display.float_format", '{:.3g}'.format)

# Read the tab-separated data file, taking its first column as the row index,
# and wrap the resulting frame in a biogeme database named "LPMC".
df = pd.read_csv("new_lpmc02.dat", sep="\t", index_col=0)
database = db.Database("LPMC", df)

# Publish the database variables as notebook-level names, so that later cells
# can use them directly in expressions (e.g. `travel_mode`, `dur_walking`).
globals().update(database.variables)

In [3]:
# Model specification

# Dependent variable: the observed choice.
chosenAlternative = travel_mode


# Parameters to be estimated.
# Positional arguments of Beta:
#   1  name used in the report (typically the same as the variable name)
#   2  starting value
#   3  lower bound
#   4  upper bound
#   5  status flag -- 0: estimate the parameter, 1: keep it fixed

# Alternative-specific constants; Constant1 is the only one kept fixed (at 0).
Constant1 = Beta('Constant1', 0, None, None, 1)
Constant2 = Beta('Constant2', 0, None, None, 0)
Constant3 = Beta('Constant3', 0, None, None, 0)
Constant4 = Beta('Constant4', 0, None, None, 0)

# Cost coefficient and alternative-specific travel time coefficients.
Cost = Beta('Cost', 0, None, None, 0)
Total_TT1 = Beta('Total_TT1', 0, None, None, 0)
Total_TT2 = Beta('Total_TT2', 0, None, None, 0)
Total_TT3 = Beta('Total_TT3', 0, None, None, 0)
Total_TT4 = Beta('Total_TT4', 0, None, None, 0)

# Car ownership coefficients for alternatives 2 to 4.
CarOwn_2 = Beta('CarOwn_2', 0, None, None, 0)
CarOwn_3 = Beta('CarOwn_3', 0, None, None, 0)
CarOwn_4 = Beta('CarOwn_4', 0, None, None, 0)

# Exponent of the transformed travel time terms, starting at 1.
LAMBDA = Beta('LAMBDA', 1, None, None, 0)

# Nest parameters: both start at 1 and are bounded below by 1.
N_SM = Beta('N_SM', 1, 1, None, 0)
N_MOTOR = Beta('N_MOTOR', 1, 1, None, 0)


# Socio-economic factors: coefficients of the (travel time x age) interaction.
Time_Age_1 = Beta('Time_Age_1', 0, None, None, 0)
Time_Age_2 = Beta('Time_Age_2', 0, None, None, 0)
Time_Age_3 = Beta('Time_Age_3', 0, None, None, 0)
Time_Age_4 = Beta('Time_Age_4', 0, None, None, 0)


# Utility functions
#
# Alternative numbering:
#   Opt1 = walking
#   Opt2 = cycling
#   Opt3 = public transport
#   Opt4 = driving


# Derived attributes, defined as additional variables of the database.
cost_public = DefineVariable('cost_public', cost_transit, database)
dur_public = DefineVariable(
    'dur_public',
    dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int,
    database,
)
cost_driving = DefineVariable(
    'cost_driving',
    cost_driving_fuel + cost_driving_ccharge,
    database,
)

# Every utility contains a travel time term of the form
#     coefficient * (duration ** LAMBDA - 1) / LAMBDA
# plus a linear (duration x age) interaction. Cost enters only for public
# transport and driving; car ownership enters for every mode except walking.
Opt1 = (
    Constant1
    + Total_TT1 * (dur_walking ** LAMBDA - 1) / LAMBDA
    + Time_Age_1 * dur_walking * age
)
Opt2 = (
    Constant2
    + Total_TT2 * (dur_cycling ** LAMBDA - 1) / LAMBDA
    + CarOwn_2 * car_ownership
    + Time_Age_2 * dur_cycling * age
)
Opt3 = (
    Constant3
    + Cost * cost_public
    + Total_TT3 * (dur_public ** LAMBDA - 1) / LAMBDA
    + CarOwn_3 * car_ownership
    + Time_Age_3 * dur_public * age
)
Opt4 = (
    Constant4
    + Cost * cost_driving
    + Total_TT4 * (dur_driving ** LAMBDA - 1) / LAMBDA
    + CarOwn_4 * car_ownership
    + Time_Age_4 * dur_driving * age
)


# Utilities keyed by alternative number, with every alternative flagged as
# available (1).
V = {1: Opt1, 2: Opt2, 3: Opt3, 4: Opt4}
av = {alternative: 1 for alternative in V}


# Definitions of nests: each nest is a (nest parameter, member alternatives)
# pair. Note that N_SM and N_MOTOR are rebound here, from the Beta parameters
# declared earlier to these pairs.
N_SM = (N_SM, [1, 2])
N_MOTOR = (N_MOTOR, [3, 4])

nests = (N_SM, N_MOTOR)

In [4]:
# Rescale the observation weights by (sample size / total weight).
S = database.getSampleSize()
sum_weights = database.data['Weights'].sum()
sample_normalized_weight = Weights * S / sum_weights

In [5]:
# Market share prediction: simulation set-up

# Choice probability of each alternative under the nested logit model.
prob_walking = nested(V, av, nests, 1)
prob_cycling = nested(V, av, nests, 2)
prob_public = nested(V, av, nests, 3)
prob_car = nested(V, av, nests, 4)

mode_probabilities = {
    'walking': prob_walking,
    'cycling': prob_cycling,
    'public': prob_public,
    'car': prob_car,
}

# Quantities to simulate: the four plain probabilities first, followed by the
# same probabilities multiplied by the normalized sample weight.
simulate = {f'Prob. {mode}': prob for mode, prob in mode_probabilities.items()}
simulate.update(
    {
        f'Weighted prob. {mode}': sample_normalized_weight * prob
        for mode, prob in mode_probabilities.items()
    }
)

# Path prefix of the saved estimation results; only the pickle is read here.
output_dir = "./model-nested-output"
filepath = os.path.join(output_dir, "logit_nested_lpmc_sm_motor")

biogeme = bio.BIOGEME(database, simulate)

# biogeme's freeBetaNames, and the beta values stored in the results pickle.
betas = biogeme.freeBetaNames
results = bioResults(pickleFile=f"{filepath}.pickle")
beta_values = results.getBetaValues()

In [7]:
# Evaluate the simulated quantities at the loaded beta values.
simulated_values = biogeme.simulate(beta_values)


def _mean_percentage(column):
    """Return the mean of one simulated column, expressed in percent."""
    return 100 * simulated_values[column].mean()


# Predicted market share of each mode: mean weighted probability, in percent.
marketShare_walking = _mean_percentage('Weighted prob. walking')
marketShare_cycling = _mean_percentage('Weighted prob. cycling')
marketShare_public = _mean_percentage('Weighted prob. public')
marketShare_car = _mean_percentage('Weighted prob. car')

In [8]:
# Confidence intervals of the simulated quantities.
#
# The betas used for the sensitivity analysis may be drawn at random, which
# would make the intervals change on every run. NumPy's global generator is
# therefore seeded first.
# NOTE(review): this assumes biogeme draws through numpy.random's global
# state; confirm against the installed biogeme version.
np.random.seed(42)

# 100 sets of beta values, then the bounds obtained with interval size 0.9.
b = results.getBetasForSensitivityAnalysis(betas, size=100)
left, right = biogeme.confidenceIntervals(b, 0.9)

In [9]:
# Display the `left` bounds returned by biogeme.confidenceIntervals
# (one column per simulated quantity).
pd.DataFrame(left)

Unnamed: 0,Prob. walking,Prob. cycling,Prob. public,Prob. car,Weighted prob. walking,Weighted prob. cycling,Weighted prob. public,Weighted prob. car
0,0.165,0.0481,0.474,0.16,0.177,0.0514,0.507,0.171
1,7.72e-05,0.0103,0.93,0.0246,8.26e-05,0.011,0.995,0.0263
2,0.0265,0.0121,0.0284,0.871,0.0284,0.013,0.0304,0.932
3,0.0409,0.0047,0.0679,0.782,0.0437,0.00503,0.0727,0.837
4,0.578,0.0144,0.0896,0.192,0.618,0.0154,0.0959,0.206
...,...,...,...,...,...,...,...,...
4995,0.18,0.0595,0.191,0.419,0.155,0.0512,0.164,0.36
4996,0.0343,0.0209,0.0238,0.851,0.0295,0.018,0.0205,0.732
4997,0.548,0.0394,0.0973,0.119,0.471,0.0339,0.0837,0.103
4998,3.21e-05,0.0126,0.126,0.783,2.76e-05,0.0109,0.108,0.674


In [10]:
# Display the `right` bounds returned by biogeme.confidenceIntervals
# (one column per simulated quantity).
pd.DataFrame(right)

Unnamed: 0,Prob. walking,Prob. cycling,Prob. public,Prob. car,Weighted prob. walking,Weighted prob. cycling,Weighted prob. public,Weighted prob. car
0,0.232,0.0942,0.573,0.241,0.249,0.101,0.613,0.258
1,0.0014,0.0243,0.959,0.0507,0.0015,0.026,1.03,0.0543
2,0.0506,0.0303,0.0566,0.924,0.0541,0.0324,0.0606,0.989
3,0.0801,0.0196,0.138,0.871,0.0858,0.021,0.147,0.932
4,0.662,0.0624,0.133,0.284,0.708,0.0667,0.143,0.304
...,...,...,...,...,...,...,...,...
4995,0.272,0.138,0.249,0.51,0.234,0.118,0.215,0.439
4996,0.0583,0.0574,0.0464,0.91,0.0501,0.0493,0.0399,0.782
4997,0.677,0.179,0.162,0.213,0.582,0.154,0.139,0.183
4998,0.000762,0.0309,0.2,0.86,0.000655,0.0265,0.172,0.739


In [11]:
# Report each predicted market share together with the mean of the `left` and
# `right` interval bounds of the matching weighted-probability column.
temp_names = ["marketShare_walking", "marketShare_cycling", "marketShare_public", "marketShare_car"]
lst_marketShares = [marketShare_walking, marketShare_cycling, marketShare_public, marketShare_car]

print("Predicted market shares:\n")
for share_name, share_value in zip(temp_names, lst_marketShares):
    # The text after the underscore ("walking", "car", ...) selects the column.
    column = f"Weighted prob. {share_name.split('_')[1]}"
    lower_pct = left[column].mean()*100
    upper_pct = right[column].mean()*100
    print(f"{share_name}: {share_value:.2f}% ({lower_pct:.2f}%, {upper_pct:.2f}%)")

Predicted market shares:

marketShare_walking: 17.59% (15.74%, 19.99%)
marketShare_cycling: 2.84% (2.21%, 5.90%)
marketShare_public: 36.43% (33.81%, 39.97%)
marketShare_car: 43.14% (37.86%, 45.27%)


In [60]:
# Sanity check: the first observed choices and the full list of columns.
df["travel_mode"].head(), df.columns

(0    3
 1    3
 2    4
 3    4
 4    1
 Name: travel_mode, dtype: int64,
 Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
        'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
        'travel_year', 'travel_month', 'travel_date', 'day_of_week',
        'start_time', 'age', 'female', 'driving_license', 'car_ownership',
        'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
        'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
        'dur_driving', 'cost_transit', 'cost_driving_fuel',
        'cost_driving_ccharge', 'driving_traffic_percent', 'Weights',
        'cost_public', 'dur_public', 'cost_driving'],
       dtype='object'))

In [29]:
# Compare with the actual choices.
# Alternative numbering used by travel_mode:
#   1 = walking, 2 = cycling, 3 = public transport, 4 = driving
#
# NOTE(review): these are plain (unweighted) sample shares, while the predicted
# market shares are means of the 'Weighted prob. ...' columns -- confirm that
# this is the intended comparison.

choices = ["walking", "cycling", "public", "driving"]
actual_choices = df.travel_mode.value_counts()/len(df)

print("Actual choices: \n")
for mode_number, mode_name in enumerate(choices, start=1):
    print(f"{mode_name}: {actual_choices.loc[mode_number]*100:.2f}%")

Actual choices: 

walking: 17.38%
cycling: 2.86%
public: 35.58%
driving: 44.18%


In [69]:
# Share of users whose chosen mode is not the mode with the highest
# (unweighted) simulated probability.
# Relies on `choices` (the list of mode names) defined in the cell that prints
# the actual choices.

# Alternative number of each simulated probability column.
temp = {"Prob. walking": 1, "Prob. cycling": 2, "Prob. public": 3, "Prob. car": 4}
simulated_probs = simulated_values[["Prob. walking", "Prob. cycling", "Prob. public", "Prob. car"]]

# Most probable mode per observation, translated to its alternative number.
# Series.map does the label -> number lookup directly and yields a numeric
# column, instead of going through DataFrame.replace on a column of strings.
choices_per_n = simulated_probs.idxmax(axis=1).map(temp).to_frame("mode_by_prob")
df_merged = df.merge(choices_per_n, left_index=True, right_index=True)

# Observations whose observed choice differs from the most probable mode.
deviates = df_merged[df_merged.travel_mode != df_merged.mode_by_prob]

print(f"Share of those choosing a mode despite having a higher prob. for another mode: "
      f"{deviates.shape[0]/df.shape[0] * 100:.2f}%")

# Specifically, within each mode. The counts are computed once, outside the
# loop, and a mode without any deviating observation is counted as 0 instead
# of raising a KeyError on the lookup.
mode_totals = df.travel_mode.value_counts()
mode_anomalies = deviates.travel_mode.value_counts().reindex(mode_totals.index, fill_value=0)
for i in range(len(choices)):
    total = mode_totals[i+1]
    anomalies = mode_anomalies[i+1]
    print(f"Out of those who choose {choices[i]}, {anomalies/total * 100:.2f}% have a higher prob. of using"
        " another mode")

Share of those choosing a mode despite having a higher prob. for another mode: 28.86%
Out of those who choose walking, 36.25% have a higher prob. of using another mode
Out of those who choose cycling, 100.00% have a higher prob. of using another mode
Out of those who choose public, 31.08% have a higher prob. of using another mode
Out of those who choose driving, 19.56% have a higher prob. of using another mode
