In [1]:
import glob
import os

import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, DefineVariable
from biogeme.models import logit, loglogit, piecewiseFormula, nested
from biogeme.models import lognested
from biogeme.results import bioResults, pickle

In [2]:
# Render floats with three significant digits in every displayed frame.
pd.set_option("display.float_format", '{:.3g}'.format)

# Tab-separated input file; its first column becomes the row index.
df = pd.read_csv("new_lpmc02.dat", sep="\t", index_col=0)
database = db.Database("LPMC", df)

# Publish every entry of database.variables as a module-level name, so that
# later cells can write e.g. `travel_mode` or `dur_walking` directly inside
# model expressions.
globals().update(database.variables)

In [3]:
# Model specification
#
# Nested-logit specification over four travel modes (1 = walking,
# 2 = cycling, 3 = public transport, 4 = driving). This cell only builds the
# Biogeme expressions (parameters, utilities, availabilities, nests); the
# probabilities are derived from them in a later cell.


# Choice
# Observed alternative, coded with the same ids (1-4) as the keys of V below.
# Not referenced again in the cells shown in this notebook.
chosenAlternative = travel_mode


# Parameters to be estimated
# Arguments:
#   1  Name for report. Typically, the same as the variable
#   2  Starting value
#   3  Lower bound
#   4  Upper bound
#   5  0: estimate the parameter, 1: keep it fixed
# Alternative-specific constants. Constant1 (walking) has status 1, i.e. it is
# kept fixed at its starting value 0 and serves as the reference level.
Constant1 = Beta('Constant1',0,None,None,1)
Constant2 = Beta('Constant2',0,None,None,0)
Constant3 = Beta('Constant3',0,None,None,0)
Constant4 = Beta('Constant4',0,None,None,0)
# Generic cost coefficient: the same parameter enters the public-transport and
# the driving utilities.
Cost = Beta('Cost',0,None,None,0)
# Alternative-specific coefficients of the transformed travel time.
Total_TT1 = Beta('Total_TT1',0,None,None,0)
Total_TT2 = Beta('Total_TT2',0,None,None,0)
Total_TT3 = Beta('Total_TT3',0,None,None,0)
Total_TT4 = Beta('Total_TT4',0,None,None,0)

# Alternative-specific car-ownership coefficients (walking has none).
CarOwn_2 = Beta('CarOwn_2',0,None,None,0)
CarOwn_3 = Beta('CarOwn_3',0,None,None,0)
CarOwn_4 = Beta('CarOwn_4',0,None,None,0)

# Exponent of the Box-Cox-style transform (x ** LAMBDA - 1) / LAMBDA applied to
# every travel time below; one value shared by all alternatives, starting at 1.
# NOTE(review): LAMBDA is unbounded and the utilities divide by it, so a value
# near 0 would be numerically unstable.
LAMBDA = Beta('LAMBDA',1,None,None,0)

# parameters relevant to the nests
# Both start at 1 and have a lower bound of 1 (third argument).
N_SM = Beta('N_SM',1,1,None, 0)
N_MOTOR = Beta('N_MOTOR',1,1,None, 0)


# socio-economic factors (interacting with Time)
# One coefficient per alternative on the product duration * age.
Time_Age_1 = Beta('Time_Age_1', 0, None, None, 0)
Time_Age_2 = Beta('Time_Age_2', 0, None, None, 0)
Time_Age_3 = Beta('Time_Age_3', 0, None, None, 0)
Time_Age_4 = Beta('Time_Age_4', 0, None, None, 0)


# Utilities

#Opt1 = walking
#Opt2 = cycling
#Opt3 = public transport
#Opt4 = driving


# Derived variables: public-transport cost, total public-transport duration
# (access + rail + bus + interchange) and total driving cost (fuel + congestion
# charge). The derived names also appear in df.columns in a later cell's
# output, so DefineVariable appears to add them to the underlying data as well.
cost_public = DefineVariable('cost_public', cost_transit ,database)
dur_public = DefineVariable('dur_public', (dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int),database)
cost_driving = DefineVariable('cost_driving', cost_driving_fuel + cost_driving_ccharge ,database)

# Each utility is: constant + [cost term] + time coefficient * transformed
# duration + [car-ownership term] + age interaction.
# Precedence reminder: `d ** LAMBDA -1` is (d ** LAMBDA) - 1, and
# `Total_TT * (...)/LAMBDA` is evaluated left to right.
# The age interaction uses the raw (untransformed) duration.
Opt1 = Constant1 + Total_TT1 * ((dur_walking) ** LAMBDA -1)/LAMBDA + Time_Age_1 * dur_walking * age
Opt2 = Constant2 + Total_TT2 * ((dur_cycling) ** LAMBDA -1)/LAMBDA+ CarOwn_2 * car_ownership +\
                    Time_Age_2 * dur_cycling * age
Opt3 = Constant3 + Cost * cost_public + Total_TT3 * (dur_public ** LAMBDA -1)/LAMBDA + CarOwn_3 * car_ownership +\
                    Time_Age_3 * dur_public * age
Opt4 = Constant4 + Cost * cost_driving + Total_TT4 * ((dur_driving) ** LAMBDA -1)/LAMBDA +\
                    CarOwn_4 * car_ownership + Time_Age_4 * dur_driving * age


# Utility of each alternative id, and availabilities: the constant 1 marks
# every alternative as available for every observation.
V = {1: Opt1,2: Opt2,3: Opt3,4: Opt4}
av = {1: 1, 2: 1, 3: 1, 4: 1}


#Definitions of nests
# Each nest is a (nest parameter, [alternative ids]) tuple. Note that N_SM and
# N_MOTOR are rebound here: from this point on they name the tuples, not the
# Beta parameters defined above.
# N_SM groups walking and cycling, N_MOTOR groups public transport and driving
# ("SM" presumably stands for soft modes - TODO confirm).
N_SM = N_SM, [1, 2]
N_MOTOR = N_MOTOR, [3, 4]

nests = N_SM, N_MOTOR

In [4]:
# Rescale the survey weights so that they add up to the sample size: averaging
# a weighted quantity over the observations then gives a weighted mean.
S = database.getSampleSize()
sum_weights = database.data["Weights"].sum()
sample_normalized_weight = Weights * S / sum_weights

In [5]:
# Market-share prediction: set up the quantities to simulate

# Choice probability of each alternative under the nested model.
prob_walking = nested(V, av, nests, 1)
prob_cycling = nested(V, av, nests, 2)
prob_public = nested(V, av, nests, 3)
prob_car = nested(V, av, nests, 4)

# Label used in the simulation column names -> probability expression.
mode_probabilities = {
    'walking': prob_walking,
    'cycling': prob_cycling,
    'public': prob_public,
    'car': prob_car,
}

# Column order matters downstream: raw probabilities first, then their
# weighted counterparts, then the two revenue expressions.
simulate = {f'Prob. {mode}': prob for mode, prob in mode_probabilities.items()}
simulate.update(
    {
        f'Weighted prob. {mode}': sample_normalized_weight * prob
        for mode, prob in mode_probabilities.items()
    }
)
# NOTE(review): the revenue expressions are not multiplied by the sample
# weights, unlike the "Weighted prob." columns - confirm this is intended.
simulate['Revenue public'] = prob_public * cost_public
simulate['Revenue driving'] = prob_car * cost_driving

# Location of the previously estimated model. Nothing in this notebook creates
# the directory or the pickle, so both must already exist (presumably written
# by a separate estimation run - TODO confirm).
output_dir = "./model-nested-output"
filepath = os.path.join(output_dir, "logit_nested_lpmc_sm_motor")

biogeme = bio.BIOGEME(database, simulate)
betas = biogeme.freeBetaNames

# The estimates are read from a pickle file: only load files you produced
# yourself, since unpickling untrusted data can execute arbitrary code.
pickle_path = f"{filepath}.pickle"
results = bioResults(pickleFile=pickle_path)
beta_values = results.getBetaValues()

In [6]:
# Evaluate every expression of `simulate` for each observation at the
# estimated parameter values.
simulated_values = biogeme.simulate(beta_values)

# Predicted market share (in %) of each mode: 100 times the mean of its
# weighted choice probability over the sample.
(
    marketShare_walking,
    marketShare_cycling,
    marketShare_public,
    marketShare_car,
) = (
    100 * simulated_values[f'Weighted prob. {mode}'].mean()
    for mode in ('walking', 'cycling', 'public', 'car')
)

In [7]:
# Expected public-transport revenue per observation (choice probability times
# cost), averaged and summed over the sample.
# NOTE(review): these figures are unweighted, whereas the market shares use
# the normalised sample weights - confirm which one is wanted here.
revenue_public = simulated_values["Revenue public"]
revenue_public_avg = revenue_public.mean()
revenue_public_total = revenue_public.sum()

revenue_public_total

3577.1343927456273

In [13]:
# Confidence intervals on the simulated quantities
#
# Parameter vectors are drawn from the distribution of the estimates, every
# simulated quantity is re-evaluated for each draw, and `left` / `right` hold
# the per-observation lower and upper bounds of the resulting interval.
SENSITIVITY_SEED = 42     # arbitrary; fixed so that the intervals are reproducible
SENSITIVITY_DRAWS = 100   # number of parameter draws
CONFIDENCE_LEVEL = 0.9    # coverage of the reported interval

# Without a seed the bounds change on every run. The draws are presumably
# taken from NumPy's global random generator - TODO confirm for the installed
# Biogeme version; seeding is harmless if they are not.
np.random.seed(SENSITIVITY_SEED)
b = results.getBetasForSensitivityAnalysis(betas, size=SENSITIVITY_DRAWS)
left, right = biogeme.confidenceIntervals(b, CONFIDENCE_LEVEL)

In [14]:
# Lower bounds of the simulated interval, one row per observation.
pd.DataFrame(data=left)

Unnamed: 0,Prob. walking,Prob. cycling,Prob. public,Prob. car,Weighted prob. walking,Weighted prob. cycling,Weighted prob. public,Weighted prob. car,Revenue public,Revenue driving
0,0.173,0.0427,0.481,0.165,0.185,0.0457,0.515,0.177,0,0.0661
1,7.73e-05,0.0111,0.919,0.0254,8.27e-05,0.0119,0.983,0.0272,3.77,0.0565
2,0.028,0.0124,0.0289,0.871,0.03,0.0133,0.0309,0.932,0.0433,0.374
3,0.0425,0.00632,0.067,0.752,0.0454,0.00676,0.0717,0.805,0,0.399
4,0.589,0.0143,0.0887,0.18,0.63,0.0153,0.0949,0.193,0,0.0396
...,...,...,...,...,...,...,...,...,...,...
4995,0.19,0.0592,0.19,0.436,0.164,0.0509,0.164,0.375,0.571,0.201
4996,0.0358,0.0217,0.0241,0.852,0.0308,0.0187,0.0208,0.733,0.0362,0.29
4997,0.591,0.0385,0.107,0.12,0.509,0.0331,0.092,0.103,0.16,0.0179
4998,3.73e-05,0.013,0.125,0.781,3.21e-05,0.0112,0.107,0.671,0.636,1.41


In [9]:
# Upper bounds of the simulated interval, one row per observation.
# NOTE(review): the saved output of this cell has no revenue columns and an
# older execution count than the cell computing the bounds, so it is
# presumably stale - re-run the notebook from a fresh kernel.
pd.DataFrame(data=right)

Unnamed: 0,Prob. walking,Prob. cycling,Prob. public,Prob. car,Weighted prob. walking,Weighted prob. cycling,Weighted prob. public,Weighted prob. car
0,0.237,0.101,0.6,0.247,0.254,0.108,0.642,0.264
1,0.00125,0.0259,0.96,0.0563,0.00134,0.0278,1.03,0.0602
2,0.0491,0.0277,0.0771,0.924,0.0525,0.0296,0.0826,0.989
3,0.0852,0.0212,0.16,0.878,0.0911,0.0226,0.172,0.939
4,0.662,0.0631,0.157,0.287,0.709,0.0675,0.168,0.307
...,...,...,...,...,...,...,...,...
4995,0.255,0.123,0.294,0.515,0.219,0.106,0.253,0.443
4996,0.0611,0.0596,0.0619,0.909,0.0526,0.0512,0.0532,0.782
4997,0.678,0.207,0.155,0.217,0.583,0.178,0.134,0.187
4998,0.000809,0.0253,0.23,0.848,0.000695,0.0218,0.198,0.729


In [10]:
# Report each predicted market share together with the sample mean of the
# per-observation lower and upper bounds (both in %).
temp_names = ["marketShare_walking", "marketShare_cycling", "marketShare_public", "marketShare_car"]
lst_marketShares = [marketShare_walking, marketShare_cycling, marketShare_public, marketShare_car]

print("Predicted market shares:\n")
for share_name, share_value in zip(temp_names, lst_marketShares):
    # "marketShare_car" -> "car": the suffix is the label used in the
    # simulation column names.
    column = f"Weighted prob. {share_name.split('_')[1]}"
    lower = left[column].mean()*100
    upper = right[column].mean()*100
    print(f"{share_name}: {share_value:.2f}% ({lower:.2f}%, {upper:.2f}%)")

Predicted market shares:

marketShare_walking: 17.59% (15.62%, 20.30%)
marketShare_cycling: 2.84% (2.16%, 6.09%)
marketShare_public: 36.43% (33.57%, 41.11%)
marketShare_car: 43.14% (35.23%, 45.56%)


In [60]:
# Sanity check: first observed choices and the full list of columns.
df["travel_mode"].head(), df.columns

(0    3
 1    3
 2    4
 3    4
 4    1
 Name: travel_mode, dtype: int64,
 Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
        'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
        'travel_year', 'travel_month', 'travel_date', 'day_of_week',
        'start_time', 'age', 'female', 'driving_license', 'car_ownership',
        'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
        'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
        'dur_driving', 'cost_transit', 'cost_driving_fuel',
        'cost_driving_ccharge', 'driving_traffic_percent', 'Weights',
        'cost_public', 'dur_public', 'cost_driving'],
       dtype='object'))

In [29]:
# Compare with the choices actually observed in the sample.
# Mode ids: 1 = walking, 2 = cycling, 3 = public transport, 4 = driving.
# NOTE(review): these shares are raw sample frequencies, while the predicted
# market shares are weighted - keep that in mind when comparing the two.

choices = ["walking", "cycling", "public", "driving"]
actual_choices = df["travel_mode"].value_counts() / len(df)

print("Actual choices: \n")
for mode_id, mode_name in enumerate(choices, start=1):
    # actual_choices is indexed by mode id, so this is a label lookup.
    print(f"{mode_name}: {actual_choices[mode_id]*100:.2f}%")

Actual choices: 

walking: 17.38%
cycling: 2.86%
public: 35.58%
driving: 44.18%


In [69]:
# Share of users whose chosen mode is not the mode with the highest
# (unweighted) predicted probability.

# Probability column -> mode id, in the same coding as df.travel_mode.
temp = {"Prob. walking": 1, "Prob. cycling": 2, "Prob. public": 3, "Prob. car": 4}
simulated_probs = simulated_values[list(temp)]

# For every observation, the id of the mode with the largest probability.
# Series.map makes the label -> id conversion explicit, instead of relying on
# DataFrame.replace to substitute the values.
choices_per_n = simulated_probs.idxmax(axis=1).map(temp).to_frame("mode_by_prob")
df_merged = df.merge(choices_per_n, left_index=True, right_index=True)

deviates = df_merged[df_merged.travel_mode != df_merged.mode_by_prob]

print(f"Share of those choosing a mode despite having a higher prob. for another mode: "
      f"{deviates.shape[0]/df.shape[0] * 100:.2f}%")

# specifically, within each mode
# Count once, outside the loop. `.get(..., 0)` keeps the report working when
# no observation of a mode deviates (or a mode is absent from the data), where
# a plain `[...]` lookup would raise KeyError.
total_by_mode = df.travel_mode.value_counts()
deviates_by_mode = deviates.travel_mode.value_counts()
for mode_id, mode_name in enumerate(choices, start=1):
    total = total_by_mode.get(mode_id, 0)
    anomalies = deviates_by_mode.get(mode_id, 0)
    # A mode nobody chose has no meaningful share: report nan rather than
    # dividing by zero.
    share = anomalies/total * 100 if total else float("nan")
    print(f"Out of those who choose {mode_name}, {share:.2f}% have a higher prob. of using"
        " another mode")

Share of those choosing a mode despite having a higher prob. for another mode: 28.86%
Out of those who choose walking, 36.25% have a higher prob. of using another mode
Out of those who choose cycling, 100.00% have a higher prob. of using another mode
Out of those who choose public, 31.08% have a higher prob. of using another mode
Out of those who choose driving, 19.56% have a higher prob. of using another mode


In [None]:
# revenues
