In [25]:
# Estimation of discrete choice model (multinomial logit)
# Binary choice between Uber and bus
# Sample: stated-preference survey administered at UC Berkeley in March 2023
# Bus: fixed travel time (30 min), wait time (12 min), cost ($0)
# Uber: varying levels of these attributes

In [1]:
import pandas as pd
import numpy as np
import pylogit as pl
from collections import OrderedDict
import warnings
# NOTE(review): this ignores every warning for the rest of the session,
# whichever library raises it — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Data processing

In [2]:
# Load the raw survey export. `skiprows=[1, 2]` skips the two lines that follow
# the header line (presumably extra descriptive header rows of the export —
# TODO confirm). Missing values are then replaced by empty strings.
df = pd.read_csv(
    'DDM-Bus-Uber_March 23, 2023_14.44.csv',
    skiprows=[1, 2],
)
df = df.fillna('')
df

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,F-7-2-3,F-8-3,F-8-1-3,F-8-2-3,F-9-3,F-9-1-3,F-9-2-3,F-10-3,F-10-1-3,F-10-2-3
0,2023-03-20 00:53:58,2023-03-20 00:54:42,Survey Preview,,100,43,True,2023-03-20 00:54:43,R_12L9n5w6AU7o5wP,,...,0,Travel Cost ($),8,0,Travel Cost ($),10,0,Travel Cost ($),8,0
1,2023-03-20 01:00:32,2023-03-20 01:01:30,Survey Preview,,100,58,True,2023-03-20 01:01:31,R_3EsyrHz314dFpbz,,...,0,Travel Cost ($),10,0,Travel Cost ($),10,0,Travel Cost ($),15,0
2,2023-03-20 01:13:05,2023-03-20 01:15:00,IP Address,76.133.252.103,100,115,True,2023-03-20 01:15:01,R_Q9TI2hCq3ck5p3b,,...,0,Travel Cost ($),15,0,Travel Cost ($),15,0,Travel Cost ($),15,0
3,2023-03-20 14:40:32,2023-03-20 14:43:25,IP Address,107.115.29.100,100,172,True,2023-03-20 14:43:26,R_2bPnFJ2CNCOoRPF,,...,0,Travel Cost ($),8,0,Travel Cost ($),15,0,Travel Cost ($),15,0
4,2023-03-20 20:54:35,2023-03-20 20:56:12,IP Address,192.184.206.215,100,97,True,2023-03-20 20:56:12,R_1fdVwMixGuc2SJm,,...,0,Travel Cost ($),10,0,Travel Cost ($),10,0,Travel Cost ($),10,0
5,2023-03-20 20:54:19,2023-03-20 20:56:27,IP Address,135.180.199.230,100,127,True,2023-03-20 20:56:28,R_3phS5Bk8jp4WX0P,,...,0,Travel Cost ($),10,0,Travel Cost ($),10,0,Travel Cost ($),12,0
6,2023-03-20 20:54:25,2023-03-20 20:58:47,IP Address,135.180.118.94,100,262,True,2023-03-20 20:58:48,R_2R7VQsoOhpWcjUk,,...,0,Travel Cost ($),12,0,Travel Cost ($),10,0,Travel Cost ($),8,0
7,2023-03-21 07:21:38,2023-03-21 07:22:37,IP Address,135.180.50.212,100,59,True,2023-03-21 07:22:37,R_afmC0uabIDFPZsJ,,...,0,Travel Cost ($),10,0,Travel Cost ($),12,0,Travel Cost ($),12,0
8,2023-03-21 10:45:42,2023-03-21 10:47:51,IP Address,136.152.143.83,100,128,True,2023-03-21 10:47:51,R_262q6pRLND5mTjw,,...,0,Travel Cost ($),10,0,Travel Cost ($),8,0,Travel Cost ($),15,0
9,2023-03-21 10:45:32,2023-03-21 10:48:45,IP Address,136.152.143.199,100,193,True,2023-03-21 10:48:46,R_2uxnVAJKlHzdrT3,,...,0,Travel Cost ($),8,0,Travel Cost ($),8,0,Travel Cost ($),12,0


In [3]:
#df = df[(df['Q13']!='') & (df['F-1-1-1']!='') & (df['Q8']!='')]

In [4]:
# Build the analysis table `sample` from the raw export `df`:
#   1. keep the trip-context questions, the ten '*_Page Submit' timing columns,
#      the ten stated choices and the attribute levels of every alternative;
#   2. give those columns readable names (`col_name`);
#   3. recode each stated choice as an alternative id (Ridehail -> 1, Bus -> 2).
# The column lists are generated instead of typed out, so the selection, the
# renaming and the recoding cannot drift apart.

n_situations = 10  # choice situations per respondent

context_names = {'Q24': 'time_day', 'Q25': 'alone', 'Q27': 'mode'}

# 'Timing _Page Submit' followed by Q28, Q30, ..., Q44.
timing_cols = ['Timing _Page Submit'] + [f'Q{q}_Page Submit' for q in range(28, 45, 2)]

# The choices are stored under Q1, then Q29, Q31, ..., Q45.
choice_questions = ['Q1'] + [f'Q{q}' for q in range(29, 46, 2)]
choice_cols = [f'choice_{s}' for s in range(1, n_situations + 1)]

# 'F-<situation>-<alternative>-<attribute>' -> '<attribute>_<situation><a|b>',
# e.g. 'F-3-2-1' -> 'travelt_3b'. Iteration order is situation, alternative,
# attribute, which is the column order the later reshaping relies on.
attribute_names = {
    f'F-{s}-{alt}-{k}': f'{attr}_{s}{suffix}'
    for s in range(1, n_situations + 1)
    for alt, suffix in ((1, 'a'), (2, 'b'))
    for k, attr in ((1, 'travelt'), (2, 'waitt'), (3, 'cost'))
}

col_name = {
    **context_names,
    **dict(zip(choice_questions, choice_cols)),
    **attribute_names,
}

selected_cols = (list(context_names) + timing_cols
                 + choice_questions + list(attribute_names))
sample = df[selected_cols].rename(columns=col_name).reset_index(drop=True)

# Answers other than the two labels below (e.g. the '' used for missing values)
# are not in the mapping and therefore become NaN.
choice_codes = {'Ridehail': 1, 'Bus': 2}
for choice_col in choice_cols:
    sample[choice_col] = sample[choice_col].map(choice_codes)

In [5]:
# Discard the first three survey rows. The first two are marked
# 'Survey Preview' in the raw export; the third is presumably a test
# submission as well — TODO confirm.
# The slice is label-based, so running this cell again straight away does not
# remove three more rows (the labels are only renumbered by a later
# `reset_index`).
sample = sample.loc[3:]

In [6]:
# Number of rows left after discarding the first three rows.
len(sample)

57

In [7]:
# Keep only the rows that have a recoded answer for every one of the ten choice
# situations, then renumber the rows from 0.
answered_all = sample[[f'choice_{k}' for k in range(1, 11)]].notna().all(axis=1)
sample = sample[answered_all].reset_index(drop=True)

In [8]:
# Number of rows left after removing rows with any unanswered choice situation.
len(sample)

56

In [9]:
# Names of the ten stated-choice columns: choice_1 ... choice_10.
cols = [f'choice_{k}' for k in range(1, 11)]

In [10]:
# Store the choice codes as integers. Any value still missing is coded -1
# before the cast, because NaN cannot be converted to int.
for choice_col in cols:
    as_int = sample[choice_col].fillna(-1).astype(int)
    sample[choice_col] = as_int

In [11]:
# Distribution of answers in the second choice situation (1 = Ridehail, 2 = Bus).
sample['choice_2'].value_counts()

2    44
1    12
Name: choice_2, dtype: int64

In [12]:
# If we only analyze drivers' choice
# sample = sample[sample['is_driver']==1].reset_index(drop=True)

In [13]:
# Give every remaining row a 1-based `id` and make it the first column.
# Any existing `id` is dropped first so the cell can be re-run: without that,
# `DataFrame.insert` raises ValueError because the column already exists.
sample = sample.drop(columns='id', errors='ignore')
sample.insert(0, 'id', [i + 1 for i in sample.index])

In [14]:
# Show the column layout; later cells select columns by position
# (e.g. the first 14), so this order matters.
sample.columns

Index(['id', 'time_day', 'alone', 'mode', 'Timing _Page Submit',
       'Q28_Page Submit', 'Q30_Page Submit', 'Q32_Page Submit',
       'Q34_Page Submit', 'Q36_Page Submit', 'Q38_Page Submit',
       'Q40_Page Submit', 'Q42_Page Submit', 'Q44_Page Submit', 'choice_1',
       'choice_2', 'choice_3', 'choice_4', 'choice_5', 'choice_6', 'choice_7',
       'choice_8', 'choice_9', 'choice_10', 'travelt_1a', 'waitt_1a',
       'cost_1a', 'travelt_1b', 'waitt_1b', 'cost_1b', 'travelt_2a',
       'waitt_2a', 'cost_2a', 'travelt_2b', 'waitt_2b', 'cost_2b',
       'travelt_3a', 'waitt_3a', 'cost_3a', 'travelt_3b', 'waitt_3b',
       'cost_3b', 'travelt_4a', 'waitt_4a', 'cost_4a', 'travelt_4b',
       'waitt_4b', 'cost_4b', 'travelt_5a', 'waitt_5a', 'cost_5a',
       'travelt_5b', 'waitt_5b', 'cost_5b', 'travelt_6a', 'waitt_6a',
       'cost_6a', 'travelt_6b', 'waitt_6b', 'cost_6b', 'travelt_7a',
       'waitt_7a', 'cost_7a', 'travelt_7b', 'waitt_7b', 'cost_7b',
       'travelt_8a', 'waitt_8a', '

In [15]:
# Seed the long-format table: take the first 14 columns (`id`, the three
# trip-context answers and the ten page-submit timings) and repeat each row
# 20 times — one row per alternative (2) in each choice situation (10).
repeated_rows = np.repeat(np.arange(len(sample)), 20)
sample_long = sample.iloc[repeated_rows, :14].reset_index(drop=True)

In [16]:
# Inspect the repeated respondent-level columns (20 identical rows per source row).
sample_long

Unnamed: 0,id,time_day,alone,mode,Timing _Page Submit,Q28_Page Submit,Q30_Page Submit,Q32_Page Submit,Q34_Page Submit,Q36_Page Submit,Q38_Page Submit,Q40_Page Submit,Q42_Page Submit,Q44_Page Submit
0,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515
1,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515
2,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515
3,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515
4,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,56,Evening (18:00-21:59),No,Private vehicle (either driver or passenger),15.5,3.633,3.157,2.297,1.725,1.767,3.004,2.938,3.331,211.515
1116,56,Evening (18:00-21:59),No,Private vehicle (either driver or passenger),15.5,3.633,3.157,2.297,1.725,1.767,3.004,2.938,3.331,211.515
1117,56,Evening (18:00-21:59),No,Private vehicle (either driver or passenger),15.5,3.633,3.157,2.297,1.725,1.767,3.004,2.938,3.331,211.515
1118,56,Evening (18:00-21:59),No,Private vehicle (either driver or passenger),15.5,3.633,3.157,2.297,1.725,1.767,3.004,2.938,3.331,211.515


In [17]:

# Reshape the wide table `sample` (one row per respondent) into the long table
# `sample_long` (one row per respondent x choice situation x alternative).
# Row order inside each block of 20 rows is: situation 1 alt 1, situation 1
# alt 2, situation 2 alt 1, ... Columns are addressed by name, so the result
# does not depend on their position in `sample`.

N_SITUATIONS = 10          # choice situations per respondent
ALT_SUFFIXES = ('a', 'b')  # column suffix of alternative 1 and alternative 2
n_alts = len(ALT_SUFFIXES)

assert len(sample_long) == len(sample) * N_SITUATIONS * n_alts, \
    'sample_long must hold one row per respondent, situation and alternative'


def stack_attribute(wide, prefix):
    """Return one attribute of `wide` as a flat list in long-row order.

    Reads the columns '<prefix>_<situation><suffix>' (e.g. 'travelt_3b') for
    every situation and alternative and flattens them respondent by
    respondent, situation by situation, alternative by alternative.
    """
    ordered = [f'{prefix}_{sit}{suffix}'
               for sit in range(1, N_SITUATIONS + 1)
               for suffix in ALT_SUFFIXES]
    # .tolist() hands pandas plain Python values, so the dtype of the new
    # column is inferred from the values themselves.
    return wide[ordered].to_numpy().ravel().tolist()


t_t = stack_attribute(sample, 'travelt')
t_w = stack_attribute(sample, 'waitt')
c = stack_attribute(sample, 'cost')

sample_long['travelt'] = t_t
sample_long['waitt'] = t_w
sample_long['cost'] = c

# One-hot choice indicator: 1 on the row of the chosen alternative, 0 on the
# other. A choice code that is neither 1 nor 2 leaves both rows at 0.
chosen_alt = sample[[f'choice_{sit}' for sit in range(1, N_SITUATIONS + 1)]].to_numpy()
alt_ids = np.arange(1, n_alts + 1)
choice = (chosen_alt[:, :, np.newaxis] == alt_ids).astype(int).ravel().tolist()
sample_long['choice'] = choice

# `alt` alternates 1, 2 within a situation; `sit` numbers the situations
# consecutively across all respondents, so it identifies one observation.
# Existing copies are dropped first so that re-running the cell does not make
# `DataFrame.insert` fail on an already-present column.
sample_long = sample_long.drop(columns=['alt', 'sit'], errors='ignore')
row_number = sample_long.index.to_numpy()
sample_long.insert(1, 'alt', row_number % n_alts + 1)
sample_long.insert(2, 'sit', row_number // n_alts + 1)

In [18]:
# First rows of the long table: one row per alternative, two consecutive rows
# per choice situation.
sample_long.head(20)

Unnamed: 0,id,alt,sit,time_day,alone,mode,Timing _Page Submit,Q28_Page Submit,Q30_Page Submit,Q32_Page Submit,Q34_Page Submit,Q36_Page Submit,Q38_Page Submit,Q40_Page Submit,Q42_Page Submit,Q44_Page Submit,travelt,waitt,cost,choice
0,1,1,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,10,2,10,1
1,1,2,1,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,30,12,0,0
2,1,1,2,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,20,4,12,0
3,1,2,2,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,30,12,0,1
4,1,1,3,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,10,6,10,1
5,1,2,3,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,30,12,0,0
6,1,1,4,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,25,6,15,0
7,1,2,4,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,30,12,0,1
8,1,1,5,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,20,10,15,1
9,1,2,5,Morning (06:00-11:59),Yes,BART,1.955,1.616,1.765,2.517,2.237,1.546,1.629,1.503,1.616,1.515,30,12,0,0


# Model formulation & parameter estimation

We noticed that the parameter for $t_\textit{shift}$ is not statistically significant. Possible reasons include: (a) respondents are less sensitive to this variable than to the other variables; (b) respondents generally ignored this variable when answering the survey. In the end, we used separate parameters for positive $t_\textit{shift}$ (depart later) and negative $t_\textit{shift}$ (depart earlier).

In [19]:
#sample_long['pos_shift'] = [t if t>0 else 0 for t in t_s]
#sample_long['neg_shift'] = [-t if t<0 else 0 for t in t_s]

In [20]:
#sample_long['cost_low'] = [i if j==1 else 0 for i, j in zip(c, sample_long['income'])]
#sample_long['cost_high'] = [i if (j==2 or j==3) else 0 for i, j in zip(c, sample_long['income'])]

## MNL model

In [21]:
# Utility specification for pylogit, keyed by column of the long table.
#   [2]      -> an intercept for alternative 2 only (the bus alternative);
#               alternative 1 is the reference.
#   [[1, 2]] -> one coefficient shared by alternatives 1 and 2.
basic_spec = OrderedDict([
    ('intercept', [2]),
    ('travelt', [[1, 2]]),
    ('waitt', [[1, 2]]),
    ('cost', [[1, 2]]),
])

# Display name of each estimated coefficient, in the same key order.
basic_names = OrderedDict([
    ('intercept', ['ASC']),
    ('travelt', ['t_travel (min)']),
    ('waitt', ['t_waiting (min)']),
    ('cost', ['cost ($)']),
])


### MNL

In [22]:
# Multinomial logit on the long table. Every value of `sit` is one observation
# (a single choice situation of a single respondent), so repeated choices by
# the same respondent enter the model as separate observations.
mnl = pl.create_choice_model(
    model_type='MNL',
    data=sample_long,
    obs_id_col='sit',
    alt_id_col='alt',
    choice_col='choice',
    specification=basic_spec,
    names=basic_names,
)

In [23]:
# Estimate the model by maximum likelihood, starting every coefficient at 0.
# The number of starting values is taken from the specification (one per
# coefficient name), so adding or removing a variable needs no change here.
n_params = sum(len(names) for names in basic_names.values())
mnl.fit_mle(np.zeros(n_params))
mnl.get_statsmodels_summary()

Log-likelihood at zero: -388.1624
Initial Log-likelihood: -388.1624
Estimation Time for Point Estimation: 0.01 seconds.
Final log-likelihood: -275.9645


0,1,2,3
Dep. Variable:,choice,No. Observations:,560.0
Model:,Multinomial Logit Model,Df Residuals:,556.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.289
Time:,20:34:27,Pseudo R-bar-squ.:,0.279
AIC:,559.929,Log-Likelihood:,-275.965
BIC:,577.241,LL-Null:,-388.162

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ASC,0.9545,0.574,1.663,0.096,-0.170,2.079
t_travel (min),-0.1281,0.020,-6.553,0.000,-0.166,-0.090
t_waiting (min),-0.1188,0.037,-3.183,0.001,-0.192,-0.046
cost ($),-0.2338,0.045,-5.216,0.000,-0.322,-0.146


In [24]:
# Save the long-format table. The row index is written as the first, unnamed
# column (the pandas default, spelled out here).
sample_long.to_csv('data_ddm.csv', index=True)