In [3]:
import sys
import keras as K
import tensorflow as tf

py_ver = sys.version
k_ver = K.__version__
tf_ver = tf.__version__

print("Using Python version " + str(py_ver))
print("Using Keras version " + str(k_ver))
print("Using TensorFlow version " + str(tf_ver))

import pandas as pd
import numpy as np
import pyblp
import statsmodels.api as sm
from linearmodels.iv import IV2SLS
from scipy.optimize import minimize
import scipy
from numba import jit, njit, prange
from numba.typed import List
import time
import multiprocessing as mp
import pickle
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot

import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
import biogeme.messaging as msg
from biogeme.expressions import Beta, DefineVariable

Using Python version 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
Using Keras version 2.6.0
Using TensorFlow version 2.6.0


Research that uses DNN to solve discrete choice model
- 
- 
- 


In [18]:
# import data (raw car data)
df = pd.read_csv(r'../data/BLP_1995_data/BLP_1995_data.csv')
print('number of observations: ', df.shape[0], ';', 'number of variables:', df.shape[1])

df[["ln_hpwt", "ln_space", "ln_mpg", "ln_mpd", "ln_price"]] = \
    df[["hpwt", "space", "mpg", "mpd", "price"]].apply(lambda x: np.log(x))

# instrument
df["trend"] = df["market"] + 70

df["cons"] = 1

df["s_0"] = np.log(1 - df.share.groupby(df["model_year"]).transform("sum"))

df["s_i"] = np.log(df.share)
df["dif"] = df.s_i - df.s_0
df["dif_2"] = np.log(df.share) - np.log(df.share_out)
df["ln_price"] = np.log(df.price)

df.head()

number of observations:  2217 ; number of variables: 16


Unnamed: 0,prodvec,modelvec,newmodv,model_year,id,firmid,market,hpwt,space,air,...,ln_space,ln_mpg,ln_mpd,ln_price,trend,cons,s_0,s_i,dif,dif_2
0,AMGREM,AMGREM,AMGREM71,71,129,15,1,0.528997,1.1502,0.0,...,0.139936,0.528862,0.635595,1.596515,71,1,-0.171483,-6.858013,-6.686531,-6.7303
1,AMHORN,AMHORN,AMHORN71,71,130,15,1,0.494324,1.278,0.0,...,0.245296,0.553885,0.660618,1.707662,71,1,-0.171483,-7.308233,-7.13675,-7.18052
2,AMJAVL,AMJAVL,AMJAVL71,71,132,15,1,0.467613,1.4592,0.0,...,0.377888,0.433729,0.540462,1.961311,71,1,-0.171483,-7.983628,-7.812146,-7.855915
3,AMMATA,AMMATA,AMMATA71,71,134,15,1,0.42654,1.6068,0.0,...,0.474245,0.416735,0.523468,1.922716,71,1,-0.171483,-7.557843,-7.38636,-7.43013
4,AMAMBS,AMAMBS,AMAMBS71,71,136,15,1,0.452489,1.6458,0.0,...,0.498227,0.301585,0.408318,2.189237,71,1,-0.171483,-7.724201,-7.552718,-7.596488


In [19]:
# estimated log income means for years 1971 - 1990
IncomeMeans = [2.01156, 2.06526, 2.07843, 2.05775, 2.02915, 2.05346, 2.06745,
               2.09805, 2.10404, 2.07208, 2.06019, 2.06561, 2.07672, 2.10437, 
               2.12608, 2.16426, 2.18071, 2.18856, 2.21250, 2.18377]


temp1 = pd.DataFrame(columns = ['model_year', 'log_income_mean'])
for i in range(20):
    temp1.loc[i] = [i+71, IncomeMeans[i]]
temp1.head()

Unnamed: 0,model_year,log_income_mean
0,71.0,2.01156
1,72.0,2.06526
2,73.0,2.07843
3,74.0,2.05775
4,75.0,2.02915


In [20]:
df = df.merge(temp1, how='left')
df.head()

Unnamed: 0,prodvec,modelvec,newmodv,model_year,id,firmid,market,hpwt,space,air,...,ln_mpg,ln_mpd,ln_price,trend,cons,s_0,s_i,dif,dif_2,log_income_mean
0,AMGREM,AMGREM,AMGREM71,71,129,15,1,0.528997,1.1502,0.0,...,0.528862,0.635595,1.596515,71,1,-0.171483,-6.858013,-6.686531,-6.7303,2.01156
1,AMHORN,AMHORN,AMHORN71,71,130,15,1,0.494324,1.278,0.0,...,0.553885,0.660618,1.707662,71,1,-0.171483,-7.308233,-7.13675,-7.18052,2.01156
2,AMJAVL,AMJAVL,AMJAVL71,71,132,15,1,0.467613,1.4592,0.0,...,0.433729,0.540462,1.961311,71,1,-0.171483,-7.983628,-7.812146,-7.855915,2.01156
3,AMMATA,AMMATA,AMMATA71,71,134,15,1,0.42654,1.6068,0.0,...,0.416735,0.523468,1.922716,71,1,-0.171483,-7.557843,-7.38636,-7.43013,2.01156
4,AMAMBS,AMAMBS,AMAMBS71,71,136,15,1,0.452489,1.6458,0.0,...,0.301585,0.408318,2.189237,71,1,-0.171483,-7.724201,-7.552718,-7.596488,2.01156


Product-related variables: constant, hpwt, air, mpd, space, price

Customer-related variables: log_income_mean

Outcome variable: share, i.e., the **market share** of the $j$th product in market $t$ 
  - $s_{jt}=f(constant, hpwt, air, mpd, space, price, log income mean)$

In [21]:
df_train = df.sample(frac = 0.8)
df_test = df[~df.index.isin(df_train.index)]
print('number of observations in train data: ', len(df_train))
print('number of observations in test data: ', len(df_test))

number of observations in train data:  1774
number of observations in test data:  443


- The model expects rows of data with 7 variables (the input_dim=7 argument)
- The first hidden layer has 12 nodes and uses the relu activation function.
- The second hidden layer has 8 nodes and uses the relu activation function.
- The third hidden layer has 8 nodes and uses the relu activation function.
- The output layer has one node and uses the sigmoid activation function.

In [27]:
from keras.models import Sequential
from keras.layers import Dense

In [38]:
init = K.initializers.glorot_uniform(seed=1)
model = K.models.Sequential()
model = Sequential()
model.add(Dense(12, input_dim=7, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer="Adam", metrics=['mse'])

In [40]:
trainX = df_train[["cons", "hpwt", "air", "mpd", "space", "log_income_mean", "price"]].values
trainy = df_train[["share"]].values
testX = df_test[["cons", "hpwt", "air", "mpd", "space", "log_income_mean", "price"]].values
testy = df_test[["share"]].values

In [44]:
# fit model
history = model.fit(trainX, trainy, epochs=1, verbose='auto')
# evaluate the model
_, train_mse = model.evaluate(trainX, trainy)
_, test_mse = model.evaluate(testX, testy)
print('Train: %.3f, Test: %.3f' % (train_mse, test_mse))

Train: 0.016, Test: 0.017


# Load Data

In [51]:
df = pd.read_csv(r'../data/swissmetro/swissmetro.dat', sep='\t')

In [52]:
df.head()

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,TRAIN_TT,TRAIN_CO,TRAIN_HE,SM_TT,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE
0,2,0,1,1,1,0,1,1,0,3,...,112,48,120,63,52,20,0,117,65,2
1,2,0,1,1,1,0,1,1,0,3,...,103,48,30,60,49,10,0,117,84,2
2,2,0,1,1,1,0,1,1,0,3,...,130,48,60,67,58,30,0,117,52,2
3,2,0,1,1,1,0,1,1,0,3,...,103,40,30,63,52,20,0,72,52,2
4,2,0,1,1,1,0,1,1,0,3,...,130,36,60,63,42,20,0,90,84,2
