In [1]:
# import packages
import pandas as pd
import numpy as np
import pyblp
import statsmodels.api as sm
from linearmodels.iv import IV2SLS

# Data Description

- The data is about the automotive market in the U.S.
- It has 2217 observations, and 16 variables
  - 20 markets
  - 26 firms
  - 20 years: from 1971 to 1990
 
- Key variables:

|Variable name |Description|
|----|----------------|
|mpg |tens of miles per gallon (also indicating miles per dollar)|
|hpwt | the ratio of horsepower to weight (in HP per 10 lbs.)|
|air |whether the car has air conditioning|
|quantity |in unit of 1000|
|price |in \$1000 units|
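|size |length times width (column `space` in the data)|
|share |market share of the model; market size was measured using the number of households in the U.S.|
|share_out |share of the outside good in the same market (the code below uses `log(share) - log(share_out)` as the mean utility of the simple logit)|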

In [2]:
# Load the raw BLP (1995) car data and report its dimensions
raw_data_path = r'../data/BLP_1995_data/BLP_1995_data.csv'
df = pd.read_csv(raw_data_path)
n_obs, n_vars = df.shape
print('number of observations: ', n_obs, ';', 'number of variables:', n_vars)
df.head()

number of observations:  2217 ; number of variables: 16


Unnamed: 0,prodvec,modelvec,newmodv,model_year,id,firmid,market,hpwt,space,air,mpd,price,mpg,quantity,share,share_out
0,AMGREM,AMGREM,AMGREM71,71,129,15,1,0.528997,1.1502,0.0,1.888146,4.935802,1.697,70.096,0.001051,0.880106
1,AMHORN,AMHORN,AMHORN71,71,130,15,1,0.494324,1.278,0.0,1.935989,5.516049,1.74,44.678,0.00067,0.880106
2,AMJAVL,AMJAVL,AMJAVL71,71,132,15,1,0.467613,1.4592,0.0,1.716799,7.108642,1.543,22.705,0.000341,0.880106
3,AMMATA,AMMATA,AMMATA71,71,134,15,1,0.42654,1.6068,0.0,1.687871,6.839506,1.517,34.821,0.000522,0.880106
4,AMAMBS,AMAMBS,AMAMBS71,71,136,15,1,0.452489,1.6458,0.0,1.504286,8.928395,1.352,29.499,0.000442,0.880106


In [5]:
# Read the BLP product table shipped with pyblp (source of the extra
# instrument columns) and preview the first rows
blp_products_path = pyblp.data.BLP_PRODUCTS_LOCATION
product_data = pd.read_csv(blp_products_path)
product_data.head()

Unnamed: 0,market_ids,clustering_ids,car_ids,firm_ids,region,shares,prices,hpwt,air,mpd,...,supply_instruments2,supply_instruments3,supply_instruments4,supply_instruments5,supply_instruments6,supply_instruments7,supply_instruments8,supply_instruments9,supply_instruments10,supply_instruments11
0,1971,AMGREM71,129,15,US,0.001051,4.935802,0.528997,0,1.888146,...,0.0,1.705933,1.595656,87.0,-61.959985,0.0,46.060389,29.786989,0.0,1.888146
1,1971,AMHORN71,130,15,US,0.00067,5.516049,0.494324,0,1.935989,...,0.0,1.68091,1.490295,87.0,-61.959985,0.0,46.060389,29.786989,0.0,1.935989
2,1971,AMJAVL71,132,15,US,0.000341,7.108642,0.467613,0,1.716799,...,0.0,1.801067,1.357703,87.0,-61.959985,0.0,46.060389,29.786989,0.0,1.716799
3,1971,AMMATA71,134,15,US,0.000522,6.839506,0.42654,0,1.687871,...,0.0,1.818061,1.261347,87.0,-61.959985,0.0,46.060389,29.786989,0.0,1.687871
4,1971,AMAMBS71,136,15,US,0.000442,8.928395,0.452489,0,1.504286,...,0.0,1.93321,1.237365,87.0,-61.959985,0.0,46.060389,29.786989,0.0,1.504286


In [4]:
# clean data and create new variables

# log-transform the continuous characteristics and price into ln_<name> columns
for col in ["hpwt", "space", "mpg", "mpd", "price"]:
    df["ln_" + col] = np.log(df[col])

# instrument: market index shifted by 70 (market 1 -> 71, matching model_year)
df["trend"] = df["market"] + 70

# constant term
df["cons"] = 1

# log of (1 - total inside share) within each model year
inside_share_total = df.groupby("model_year")["share"].transform("sum")
df["s_0"] = np.log(1 - inside_share_total)

# log inside share and the two versions of the log share difference:
#   dif   uses the outside share implied by the summed inside shares (s_0)
#   dif_2 uses the share_out column provided with the data
df["s_i"] = np.log(df["share"])
df["dif"] = df["s_i"] - df["s_0"]
df["dif_2"] = df["s_i"] - np.log(df["share_out"])

df.head()

Unnamed: 0,prodvec,modelvec,newmodv,model_year,id,firmid,market,hpwt,space,air,...,ln_space,ln_mpg,ln_mpd,ln_price,trend,cons,s_0,s_i,dif,dif_2
0,AMGREM,AMGREM,AMGREM71,71,129,15,1,0.528997,1.1502,0.0,...,0.139936,0.528862,0.635595,1.596515,71,1,-0.171483,-6.858013,-6.686531,-6.7303
1,AMHORN,AMHORN,AMHORN71,71,130,15,1,0.494324,1.278,0.0,...,0.245296,0.553885,0.660618,1.707662,71,1,-0.171483,-7.308233,-7.13675,-7.18052
2,AMJAVL,AMJAVL,AMJAVL71,71,132,15,1,0.467613,1.4592,0.0,...,0.377888,0.433729,0.540462,1.961311,71,1,-0.171483,-7.983628,-7.812146,-7.855915
3,AMMATA,AMMATA,AMMATA71,71,134,15,1,0.42654,1.6068,0.0,...,0.474245,0.416735,0.523468,1.922716,71,1,-0.171483,-7.557843,-7.38636,-7.43013
4,AMAMBS,AMAMBS,AMAMBS71,71,136,15,1,0.452489,1.6458,0.0,...,0.498227,0.301585,0.408318,2.189237,71,1,-0.171483,-7.724201,-7.552718,-7.596488


In [5]:
# prepare intermediate data (plain numpy arrays used by the estimation code)

# demand variables
X = df[["cons", "hpwt", "air", "mpd", "space"]].values

# supply variables
W = df[["cons", "ln_hpwt", "air", "ln_mpg", "ln_space", "trend"]].values

# price
Price = df.price.values

# initial delta_0 estimate: log(share) - log(share outside good)
Delta_0 = df.dif_2.values

# number of goods per market (one market per model year).
# Count rows per group directly instead of summing every column of the frame
# (which would also aggregate the string identifier columns) just to read one.
J = df.groupby("model_year").size().values

# number of draws per market
N = 500

# number of markets
T = len(J)

# estimated log income means for years 1971 - 1990
incomeMeans = [2.01156, 2.06526, 2.07843, 2.05775, 2.02915, 2.05346, 2.06745,
               2.09805, 2.10404, 2.07208, 2.06019, 2.06561, 2.07672, 2.10437,
               2.12608, 2.16426, 2.18071, 2.18856, 2.21250, 2.18377]

# one income mean is needed per market; fail early if the data and the
# hard-coded list ever get out of sync
assert len(incomeMeans) == T, "incomeMeans must have one entry per market"

# standard deviation of log incomes, assuming empirically given in 1995
sigma_v = 1.72

# number of terms that have the random coefficient
# according to table 4, they are constant, hpwt, air, mpd, space
# NOTE(review): the original note also listed price here, which would make six
# terms while k is 5 (the number of columns in X) - confirm price is handled
# separately from these random coefficients.
k = 5

# markets for ijt
markets = df.market.values

# unique markets
unique_markets = np.unique(df.market)

# firms, as a column vector of shape (n_products, 1)
firms = np.reshape(df.firmid.values, (-1, 1))

# Review on Choice Model
- The indirect utility of consumer $i$ from consuming product $j$ in market $t$: 
$$u_{ijt}=\alpha_i (y_i-p_{jt}) + {x}_{jt}^{T}{\beta}_i+\xi_{jt}+ \epsilon_{ijt}$$
  -  $\alpha_i$: consumer $i$’s marginal utility from income
  -  $y_i$: the income of consumer $i$
  - $p_{jt}$: price of product $j$ in market $t$
  - ${x}_{jt}$: K-dimensional (column) vector of non-price attributes of product $j$ in market $t$
  - ${\beta}_i$: K-dimensional (column) vector of individual-specific taste coefficients
  - $\xi_{jt}$: utility of unobserved attributes of product $j$ in market $t$
  - $\epsilon_{ijt}$: idiosyncratic unobserved utility



# 1 Standard (Simple) Logit Model 

- Assuming $\epsilon_{ijt}$ follows Type I Extreme Value Distribution, then the probability that consumer $i$ will choose product $j$ 
$$p_{ijt}(v_i)=\frac{exp(\delta_{jt}+v_{ijt})}{exp(\delta_{0}+v_{i0t})+\sum_{k=1}^J exp(\delta_{kt}+v_{ikt})}$$
  - $\delta_{jt}$: mean utility
  - $p_{ijt}(v_i)$: the probability of customer $i$ selecting product $j$ in market $t$

- If $v_{ijt}=0$, it reduces to the standard logit model , i.e., $$p_{ijt}(v_i)=\frac{exp(\delta_{jt} )}{exp(\delta_{0})+\sum_{k=1}^J exp(\delta_{kt})}$$
  - Then, $$ln(p_{jt}) - ln(p_{0}) = \delta_{jt} - \delta_{0}$$
  - market share $s_{jt} = p_{jt}$
  - WLOG, normalize $\delta_{0}$ to 0, so that the mean utility is $\delta_{jt} = ln(s_{jt}) - ln(s_{0})$

## 1.1 Demand Side: Regression Specification of Standard (Simple) Logit Model 
- Outcome variable: $\delta_{jt}$, i.e., $ln(s_{jt}) - ln(s_{0})$
- Formula:
>$\delta_{jt}$ ~ hpwt + air + mpd + space + price

Most coefficients are of the expected sign, although the (imprecisely estimated) negative coefficients on air conditioning and size are anomalies, as one would expect these attributes to yield positive marginal utility. On the other hand, these estimates have a distinctly implausible set of implications for own-price elasticities. **The estimated coefficient on price in Table III implies that 1494 of the 2217 models have inelastic demands** (in the simple logit model the own-price elasticity of product $j$ is $\alpha\, p_{jt}(1-s_{jt})$, where $\alpha$ is the price coefficient, so demand is inelastic whenever $|\alpha|\, p_{jt}(1-s_{jt}) < 1$). This is inconsistent with profit-maximizing price choices. Moreover, this is not simply a problem generated by an imprecise estimate of the price coefficient.

In [7]:
# Table 3 column (1) in BLP 1995: OLS logit demand
# dependent variable: log(share) - log(share of the outside good)
df["utility_simple_logit"] = np.log(df["share"]) - np.log(df["share_out"])
demand_formula = 'utility_simple_logit ~ 1 + hpwt + air + mpd + space + price'
demand_model = sm.OLS.from_formula(demand_formula, data=df)
ols_res = demand_model.fit()
print(ols_res.summary2(float_format="%.6f"))

                   Results: Ordinary least squares
Model:              OLS                  Adj. R-squared:     0.386    
Dependent Variable: utility_simple_logit AIC:                6648.6879
Date:               2021-08-12 10:46     BIC:                6682.9114
No. Observations:   2217                 Log-Likelihood:     -3318.3  
Df Model:           5                    F-statistic:        279.3    
Df Residuals:       2211                 Prob (F-statistic): 5.83e-232
R-squared:          0.387                Scale:              1.1716   
------------------------------------------------------------------------
             Coef.     Std.Err.      t       P>|t|     [0.025     0.975]
------------------------------------------------------------------------
Intercept   -10.0730     0.2528   -39.8458   0.0000   -10.5688   -9.5773
hpwt         -0.1231     0.2771    -0.4442   0.6570    -0.6666    0.4204
air          -0.0344     0.0728    -0.4728   0.6364    -0.1771    0.1083
mpd           

## 1.2 Supply Side: a Cobb-Douglas Form
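- Marginal cost is assumed to take a Cobb-Douglas form in the cost characteristics $w_1, ..., w_k$, i.e.
$$ \textit{marginal cost} = w_1^{\gamma_1}w_2^{\gamma_2}...w_k^{\gamma_k}e^{\epsilon} $$

- Taking logs of both sides gives the linear form
$$ ln(\textit{marginal cost}) = \gamma_1 ln(w_1) + \gamma_2 ln(w_2) + ... + \gamma_k ln(w_k) + \epsilon $$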

- Formula:
> ln_price ~ ln_hpwt + air + ln_mpg + ln_space + trend

In [6]:
# Table 3 column (3) in BLP 1995: OLS regression of log price on the
# (logged) cost characteristics and a time trend
supply_formula = 'ln_price ~ 1 + ln_hpwt + air + ln_mpg + ln_space + trend'
supply_model = sm.OLS.from_formula(supply_formula, data=df)
ols_res = supply_model.fit()
print(ols_res.summary2(float_format="%.6f"))

                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.656    
Dependent Variable: ln_price         AIC:                1146.0122
Date:               2021-08-12 11:55 BIC:                1180.2356
No. Observations:   2217             Log-Likelihood:     -567.01  
Df Model:           5                F-statistic:        844.9    
Df Residuals:       2211             Prob (F-statistic): 0.00     
R-squared:          0.656            Scale:              0.097915 
-------------------------------------------------------------------
                Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
-------------------------------------------------------------------
Intercept       1.8819    0.1188  15.8465  0.0000   1.6490   2.1148
ln_hpwt         0.5203    0.0351  14.8327  0.0000   0.4515   0.5891
air             0.6798    0.0188  36.2471  0.0000   0.6430   0.7165
ln_mpg         -0.4706    0.0485  -9.6943  0.0000  -0.5658  -0.3754
ln_spa

# 2 Using Instrumental Variables in the Logit Model

These abnormal elasticities are due to the endogeneity of price. Cars with better unobserved quality will tend to have higher prices as well. A simple remedy would be to instrument for price in the logit model. BLP propose using three sets of instruments
1. The observed product characteristics (which are assumed orthogonal to the unobserved characteristics)
2. The sum of product characteristics for all models marketed by a single firm in a given market.
3. The sum of product characteristics for all models in a given market.

In [None]:
# Logit demand with price instrumented (2SLS).
# The exogenous characteristics (constant, hpwt, air, mpd, space) instrument
# for themselves; price is instrumented with BLP-style sums of characteristics:
#   iv_own_*   : sum over the OTHER products of the same firm in the same market
#   iv_rival_* : sum over the products of all OTHER firms in the same market
# A market is a model year. The "cons" column turns the sums into product counts.
iv_source = df[["hpwt", "air", "mpd", "space"]].assign(cons=1.0)
firm_sum = iv_source.groupby([df["model_year"], df["firmid"]]).transform("sum")
market_sum = iv_source.groupby(df["model_year"]).transform("sum")

own_iv = (firm_sum - iv_source).add_prefix("iv_own_")      # exclude the product itself
rival_iv = (market_sum - firm_sum).add_prefix("iv_rival_")  # exclude the whole own firm

# keep df untouched: the instruments live in a separate estimation frame
iv_df = pd.concat([df, own_iv, rival_iv], axis=1)

# linearmodels syntax: dep ~ exog + [endog ~ instruments]
iv_terms = " + ".join([*own_iv.columns, *rival_iv.columns])
iv_formula = f"utility_simple_logit ~ 1 + hpwt + air + mpd + space + [price ~ {iv_terms}]"

iv_res = IV2SLS.from_formula(iv_formula, iv_df).fit(cov_type="robust")  # robust HCV
print(iv_res.summary)