In [305]:
"""
Demand Modeling - 1.202
Problem Set 2: Linear Time Series Regression
Alexandra Berke

"""
import pandas as pd
import numpy as np

# Load Data
df = pd.read_csv("Tran.csv")

In [220]:
"""
The following are the equations that will be used:

ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln psuburbst + b6 ln VMT/Vehiclet-1 + et (7)
Where b0 = ln k0, and et = ln ht is a stochastic disturbance representing variables omitted from the model and other noise in the data.

The vehicles-per-driver term reflects the size and rate of utilization of the vehicle stock. Since the vehicle stock may change in the long run, Pickrell modeled it as
ln Vehicles/drivert = g0 + g1 ln GDP/capitat + g2 ln Pfuel/gal,t + g3 ln Pveh,t + g4 ln LFPt + g5 ln psuburbst + g6 ln Vehicles/drivert-1 + xt


I create 2 new dataframes, one for each of the equations.
Each dataframe contains the variables for their equations, transformed as needed.
i.e. each row of a dataframe contains the explanatory variable, as well as each independent variable including
lagged variables, which are pulled from the previous row's data.

For variables that were already present in the data, I simply take their natural log.
Some variables were first constructed them from other variables as proportions.

"""
# Variables needed as columns of dataframes:
YEAR = "year"
LOG_VMT_PER_VEHICLE = "ln VMT/Vehiclet"
LOG_GDP_PER_CAPITA = "ln GDP/capitat"  # I compute GDP/capita as GDP/pop
LOG_PFUEL_PER_GALLON = "ln Pfuel/gal,t" # This is pgas in the dataset
LOG_MPG = "ln MPGt"  # average fuel efficiency of automobiles and two-axle, four-tire trucks (miles per gallon)
LOG_VEHICLES_PER_DRIVER = "ln Vehicles/drivert"
LOG_PSUBURBS = "ln psuburbst" # proportion of U.S. population living in urbanized areas who live in suburban portions of urban areas (using Census Bureau definitions)
LOG_LAGGED_VMT_PER_VEHICLE = "ln VMT/Vehiclet-1"
# Additional variables for vehicle stock equation
LOG_PRICEVEHICLE = "ln pvehicle"  # average price of a new automobile in 1987 U.S. dollars
LOG_LFP = "ln LFPt"  # percent of working-age U.S. population employed or actively seeking work
LOG_LAGGED_VEHICLES_PER_DRIVER = "ln Vehicles/drivert-1"

# Construct the dataframe for equation (7)
# ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln psuburbst + b6 ln VMT/Vehiclet-1 + et (7)
vmt_per_vehicle_df = pd.DataFrame(data={
    YEAR: df[YEAR],
    LOG_VMT_PER_VEHICLE: np.log(df["vmt"]/df["vehicles"]),
    LOG_GDP_PER_CAPITA: np.log(df["gdp"]/df["pop"]),
    LOG_PFUEL_PER_GALLON: np.log(df["pgas"]),
    LOG_MPG: np.log(df["mpg"]),
    LOG_VEHICLES_PER_DRIVER: np.log(df["vehicles"]/df["drivers"])
})
vmt_per_vehicle_df.set_index(YEAR)
# psuburbs values are missing for the last 4 years of data.
# I insert the last available psuburbs value for these missing datapoints.  This insertion
# seems valid because the values were gradually growing, thought somewhat stagnant in
# the final years of data.  Inserting zeros for the missing values would contribute error to the
# model.
psuburbs = pd.to_numeric(df["psuburbs"], errors="coerce")
for i in range(len(psuburbs)):
    ps = psuburbs[i-1] if pd.isna(psuburbs[i]) else psuburbs[i]
    psuburbs[i] = ps
    
vmt_per_vehicle_df[LOG_PSUBURBS] = np.log(psuburbs)

# Add in the lagged variable:
# To add in the lagged variables, I added to each row of data the value for the previous row.
# For example, the value of ln Vehicles/drivert-1 for t=1954 is the value of ln Vehicles/driver for t=1953.
# To handle the first year of data (1950) for which the previous year’s data was not available,
# I insert the first year’s data to serve as a proxy.
# i.e. Vehicles/drivert-1 = Vehicles/drivert when t=1950.
log_vmt_per_vehicles = vmt_per_vehicle_df[LOG_VMT_PER_VEHICLE]
log_lagged_vmt_per_vehicles = [log_vmt_per_vehicles[0] if i == 0 else log_vmt_per_vehicles[i-1] for i in range(len(log_vmt_per_vehicles))]
vmt_per_vehicle_df[LOG_LAGGED_VMT_PER_VEHICLE] = log_lagged_vmt_per_vehicles


# Construct the dataframe for equation (8)
# ln Vehicles/drivert = g0 + g1 ln GDP/capitat + g2 ln Pfuel/gal,t + g3 ln Pveh,t + g4 ln LFPt + g5 ln psuburbst + g6 ln Vehicles/drivert-1 + xt (8)

# pvehicle values are missing for the last 4 years of data.
# lfp values are missing for the last 2 years of data.
# I insert the last available value for these missing datapoints.
pvehicle = pd.to_numeric(df["pvehicle"], errors="coerce")
for i in range(len(pvehicle)):
    pvehicle[i] = pvehicle[i-1] if pd.isna(pvehicle[i]) else pvehicle[i]

lfp = pd.to_numeric(df["lfp"], errors="coerce")
for i in range(len(lfp)):
    lfp[i] = lfp[i-1] if pd.isna(lfp[i]) else lfp[i]

vehicles_per_driver_df = pd.DataFrame(data={
    YEAR: df[YEAR],
    LOG_VEHICLES_PER_DRIVER: np.log(df["vehicles"]/df["drivers"]),
    LOG_GDP_PER_CAPITA: np.log(df["gdp"]/df["pop"]),
    LOG_PFUEL_PER_GALLON: np.log(df["pgas"]),
    LOG_PRICEVEHICLE: np.log(pvehicle),
    LOG_LFP: np.log(lfp),
    LOG_PSUBURBS: np.log(psuburbs)
})
vehicles_per_driver_df.set_index(YEAR)

# Add in the lagged variable ln Vehicles/drivert-1
log_vehicles_per_driver = vehicles_per_driver_df[LOG_VEHICLES_PER_DRIVER]
log_lagged_vehicles_per_driver = [log_vehicles_per_driver[0] if i == 0 else log_vehicles_per_driver[i-1] for i in range(len(log_vehicles_per_driver))]
vehicles_per_driver_df[LOG_LAGGED_VEHICLES_PER_DRIVER] = log_lagged_vehicles_per_driver

vehicles_per_driver_df.head()


Unnamed: 0,year,ln Vehicles/drivert,ln GDP/capitat,"ln Pfuel/gal,t",ln pvehicle,ln LFPt,ln psuburbst,ln Vehicles/drivert-1
0,1950,6.615782,2.231855,0.41211,9.614004,4.080922,-1.461018,6.615782
1,1951,6.638924,2.314206,0.350657,9.60326,4.080922,-1.431292,6.615782
2,1952,6.624696,2.333203,0.350657,9.595399,4.077537,-1.406497,6.638924
3,1953,6.639866,2.353691,0.378436,9.588914,4.075841,-1.378326,6.624696
4,1954,6.649138,2.329221,0.385262,9.580869,4.074142,-1.350927,6.639866


In [221]:
"""2. Prompt: Replicate each of Pickrell’s models (equations 7 and 8) using OLS.

Interpret your estimates of the parameters.

Are they statistically significant?

What are the short- and long-run estimates of the elasticity of VMT with
respect to gasoline price and income (GDP per capita)? 
"""
import statsmodels.api as sm

# Modeling VMT/Vehicle model:
vmt_per_vehicle_model_X = sm.add_constant(vmt_per_vehicle_df[[
                                  LOG_GDP_PER_CAPITA,
                                  LOG_PFUEL_PER_GALLON,
                                  LOG_MPG,
                                  LOG_VEHICLES_PER_DRIVER,
                                  LOG_PSUBURBS,
                                  LOG_LAGGED_VMT_PER_VEHICLE
                              ]])
vmt_per_vehicle_model_Y = vmt_per_vehicle_df[LOG_VMT_PER_VEHICLE]
vmt_per_vehicle_results = sm.OLS(vmt_per_vehicle_model_Y, vmt_per_vehicle_model_X).fit()

print(vmt_per_vehicle_results.summary())

                            OLS Regression Results                            
Dep. Variable:        ln VMT/Vehiclet   R-squared:                       0.954
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     132.7
Date:                Mon, 18 Mar 2019   Prob (F-statistic):           6.26e-24
Time:                        11:28:51   Log-Likelihood:                 136.57
No. Observations:                  45   AIC:                            -259.1
Df Residuals:                      38   BIC:                            -246.5
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   2.6565    

In [222]:
"""Calculating long-run elasticities for VMT/vehicle w.r.t price of gas and income
The long-run estimates assume that equilibrium for VMT per vehicle has been reached,
i.e. that VMT/vehicle for year t is the same as VMT/vehicle for year t-1.
"""

#  For price of fuel/gallon
#  (coefficient for price of fuel/gallon)/(1 - coefficient for VMT/vehicle)
print((-0.0868)/(1 - 0.6332))

# For income (GDP/capita) the long-run elasticity  estimate is 
# = (coefficient of GDP/capita) / (1 - coefficient for VMT/vehicle)
print((0.1698)/(1 - 0.6332))

-0.2366412213740458
0.4629225736095965


In [223]:
# Modeling VMT/Vehicle model:
vehicles_per_driver_model_X = sm.add_constant(vehicles_per_driver_df[[
                                  LOG_GDP_PER_CAPITA,
                                  LOG_PFUEL_PER_GALLON,
                                  LOG_PRICEVEHICLE,
                                  LOG_LFP,
                                  LOG_PSUBURBS,
                                  LOG_LAGGED_VEHICLES_PER_DRIVER
                              ]])
vehicles_per_driver_model_Y = vehicles_per_driver_df[LOG_VEHICLES_PER_DRIVER]
vehicles_per_driver_results = sm.OLS(vehicles_per_driver_model_Y, vehicles_per_driver_model_X).fit()

print(vehicles_per_driver_results.summary())

                             OLS Regression Results                            
Dep. Variable:     ln Vehicles/drivert   R-squared:                       0.995
Model:                             OLS   Adj. R-squared:                  0.995
Method:                  Least Squares   F-statistic:                     1390.
Date:                 Mon, 18 Mar 2019   Prob (F-statistic):           6.21e-43
Time:                         11:28:54   Log-Likelihood:                 147.80
No. Observations:                   45   AIC:                            -281.6
Df Residuals:                       38   BIC:                            -269.0
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const               

In [224]:
"""Part c) Test for autocorrelation:
I use the Durbin’s Lagrange Multiplier Test in order to test the null hypothesis of no serial correlation.

h = (1-(d/2))sqrt(T/(1-Ts*s)))

Where
d = The Durbin-Watson statistic (which was included in the model output for the OLS regressions run)
T = Number of time periods (45 in total for 1950 - 1994).
s = standard error of the coefficient on the lagged endogenous variable.
"""

def durbin_lagrange_multiplier_test(d, T, s):
    """Returns h = (1-(d/2))sqrt(T/(1-Ts*s)))
    Where
    d = The Durbin-Watson statistic (which was included in the model output for the OLS regressions run)
    T = Number of time periods (45 in total for 1950 - 1994).
    s = standard error of the coefficient on the lagged endogenous variable.
    """
    return (1-(d/2))*np.sqrt(T/(1-(T*s*s)))

# Test for equation (7) VMT/vehicle
d = 1.476 # The Durbin-Watson statistic
T = 45
s = 0.076
h = durbin_lagrange_multiplier_test(d, T, s)
print('h stat for VMT/vehicle:', h)

print ("checking s*s > 1/T:", s*s > 1/T)

h stat for VMT/vehicle: 2.0429992830872603
checking s*s > 1/T: False


In [225]:
# Test for equation (8) Vehicles/Driver
d = 1.826 # The Durbin-Watson statistic
T = 45
s = 0.109
h = durbin_lagrange_multiplier_test(d, T, s)
print('h stat for VMT/vehicle:', h)

print ("checking s*s > 1/T:", s*s > 1/T)

h stat for VMT/vehicle: 0.8555262061724588
checking s*s > 1/T: False


In [226]:
"""Part d)
For each model where you conclude autocorrelation exists (based on (c)),
reestimate the Pickrell’s model without the lagged endogenous variable.
Examine specifications of your own.
Can you come up with a model that has more explanatory power than Pickrell’s?
Use the Durbin-Watson statistic to test for first order autocorrelation in your best model.
What are the properties (e.g., consistency, efficiency) of your OLS estimates when autocorrelation
is present and when autocorrelation is not present?


I estimate the 2 additional models without lagged endogenous variables:

Pickrell’s model without the lagged variable
(1)  ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln psuburbst + et

And a model of my own specification, which is similar to Pickrell’s original model, but different in the following
ways:
- The lagged endogenous variable VMT/Vehicles is removed
- The psuburbs variable is  removed because it was not statistically significant in the original model.
- I compute and add an additional variable: roads/vehicles.
My intuition for adding the roads/vehicles explanatory variable is inspired by the common belief of urban
planners “if you build it, they will come”.

(2)  ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln roads/vehiclest  + et
"""

# OLS Estimate for:
# (1)  ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln psuburbst + et

vmt_per_vehicle_model_1_X = sm.add_constant(vmt_per_vehicle_df[[
                                  LOG_GDP_PER_CAPITA,
                                  LOG_PFUEL_PER_GALLON,
                                  LOG_MPG,
                                  LOG_VEHICLES_PER_DRIVER,
                                  LOG_PSUBURBS
                              ]])
vmt_per_vehicle_model_1_Y = vmt_per_vehicle_df[LOG_VMT_PER_VEHICLE]
vmt_per_vehicle_results_1 = sm.OLS(vmt_per_vehicle_model_1_Y, vmt_per_vehicle_model_1_X).fit()

print(vmt_per_vehicle_results_1.summary())

                            OLS Regression Results                            
Dep. Variable:        ln VMT/Vehiclet   R-squared:                       0.872
Model:                            OLS   Adj. R-squared:                  0.856
Method:                 Least Squares   F-statistic:                     53.14
Date:                Mon, 18 Mar 2019   Prob (F-statistic):           2.28e-16
Time:                        11:29:00   Log-Likelihood:                 113.32
No. Observations:                  45   AIC:                            -214.6
Df Residuals:                      39   BIC:                            -203.8
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   5.3845    

In [115]:
# OLS Estimate for:
# (2)  ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln roads/vehiclest  + et
# first compute ln roads/vehicles variable
LOG_ROADS_PER_VEHICLES = "ln roads/vehicles"
# roads values are missing for the last 2 years of data -- insert last available values into missing rows
roads = pd.to_numeric(df["roads"], errors="coerce")
for i in range(len(roads)):
    roads[i] = roads[i-1] if pd.isna(roads[i]) else roads[i]
log_roads_per_vehicles = np.log(roads/df["vehicles"])
# Add this new variable to the df
vmt_per_vehicle_df[LOG_ROADS_PER_VEHICLES] = log_roads_per_vehicles

vmt_per_vehicle_model_2_X = sm.add_constant(vmt_per_vehicle_df[[
                                  LOG_GDP_PER_CAPITA,
                                  LOG_PFUEL_PER_GALLON,
                                  LOG_MPG,
                                  LOG_VEHICLES_PER_DRIVER,
                                  LOG_ROADS_PER_VEHICLES
                              ]])
vmt_per_vehicle_model_2_Y = vmt_per_vehicle_df[LOG_VMT_PER_VEHICLE]
vmt_per_vehicle_results_2 = sm.OLS(vmt_per_vehicle_model_2_Y, vmt_per_vehicle_model_2_X).fit()

print(vmt_per_vehicle_results_2.summary())

                            OLS Regression Results                            
Dep. Variable:        ln VMT/Vehiclet   R-squared:                       0.877
Model:                            OLS   Adj. R-squared:                  0.861
Method:                 Least Squares   F-statistic:                     55.54
Date:                Sun, 17 Mar 2019   Prob (F-statistic):           1.08e-16
Time:                        18:08:01   Log-Likelihood:                 114.19
No. Observations:                  45   AIC:                            -216.4
Df Residuals:                      39   BIC:                            -205.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.7919    

In [227]:
"""Part e) Use the Iterative Cochrane-Orcutt procedure described in Appendix C
to correct for first order autocorrelation and to estimate the correlation between
successive stochastic error terms.
With first order autocorrelation permitted, are your elasticity estimates substantially changed?
Which results are more plausible and credible, those in part (b) or here? Why?
"""

CO_MAX_ITERATIONS = 15
co_iterations = 0
p_estimate_old = None

# Step 1
#Ordinary least squares is used to estimate the original model, Eq. (9).
co_procedure_model_X = sm.add_constant(vmt_per_vehicle_df[[
                                  LOG_GDP_PER_CAPITA,
                                  LOG_PFUEL_PER_GALLON,
                                  LOG_MPG,
                                  LOG_VEHICLES_PER_DRIVER,
                                  LOG_PSUBURBS
                              ]])
X_columns = co_procedure_model_X.columns
co_procedure_model_Y = vmt_per_vehicle_df[LOG_VMT_PER_VEHICLE]

co_step1_results = sm.OLS(co_procedure_model_Y, co_procedure_model_X).fit()
params = co_step1_results.params

while(co_iterations <= CO_MAX_ITERATIONS):
    co_iterations += 1
    print('cochrane-orcutt iteration', co_iterations)
    
    # Step 2. The estimated parameters are used to construct residuals
    fittedvalues = np.dot(co_procedure_model_X, [param for param in params])
    co_procedure_residuals = co_procedure_model_Y - fittedvalues


    # These residuals are then used to perform the regression to get an estimate for p
    # et = p*et-1 + nt
    e_t = [e for e in co_procedure_residuals[1:]]
    e_t_1 = co_procedure_residuals[:-1]

    co_step2_results = sm.OLS(e_t, e_t_1).fit()
    p_estimate = co_step2_results.params[0]
    print('p_estimate', p_estimate)

    # Step 3. The estimated value of p is used to perform the generalized differencing transformation process,
    # and a new regression is run. The transformed equation is
    # Y_star_t =b0*(1-p) + b*X_star_t + vt
    # where
    # Y_star_t = Y_t - pY_t-1
    # X_star_t = X_t - p*X_t-1
    # v_t = e_t - p*e_t-1
    T = len(co_procedure_model_Y)
    Y_star = [co_procedure_model_Y[t] - p_estimate*co_procedure_model_Y[t-1] for t in range(1, T)]
    X_star = [co_procedure_model_X.iloc[t] - p_estimate*co_procedure_model_X.iloc[t-1] for t in range(1, T)]
    v = e_t - p_estimate*e_t_1

    # Step 4. The estimated transformed equation yields new estimates for the parameter vector b. These revised parameter estimates are substituted into the original equation, and new regression residuals are obtained. That is, the new estimate of b is used upon returning to Step 2 to obtain a new estimate of r.
    # The iterative process can be carried on for as many steps as the researcher desires. Standard procedure is to stop the iterations when the new estimates of r differ from the old ones by less than
    # 0.01 or 0.005, or after 10 or 20 estimates of r have been obtained. The specific choice made depends upon the computational costs involved. The primary difficulty with the Cochrane-Orcutt procedure is that there is no guarantee that the final estimate of r will be the optimal estimate, in the sense of minimizing the sum of squared residuals. The difficulty arises because the iterative technique may lead to a local rather than a global minimum.
    co_step1_results = sm.OLS(Y_star, X_star).fit()
    params = co_step1_results.params
    print('new params:', params, '\n')

    if (p_estimate_old is not None):
        p_estimate_diff = abs(p_estimate_old - p_estimate)
        print('abs(p_estimate_old - p_estimate)', abs(p_estimate_old - p_estimate))
        if (p_estimate_diff < 0.01):
            print('New p_estimate differs from last p_estimate by < 0.01.  Ending procedure')
            break
    p_estimate_old = p_estimate
    
print('completed cochrane-orcutt procedure after', co_iterations, 'iterations')
print('final parameters:', ", ".join([X_columns[i] + ': ' + str(params[i]) for i in range(len(params))]))



cochrane-orcutt iteration 1
p_estimate 0.7658540899060109
new params: [ 4.72160436  0.34217797 -0.12759275  0.17702364 -0.54939502  0.02331098] 

cochrane-orcutt iteration 2
p_estimate 0.8469407173766601
new params: [ 4.6353444   0.33293047 -0.12247375  0.2063709  -0.54855908 -0.00661573] 

abs(p_estimate_old - p_estimate) 0.08108662747064921
cochrane-orcutt iteration 3
p_estimate 0.8654673140474386
new params: [ 4.59928646  0.33103601 -0.12157045  0.21401217 -0.5476801  -0.02404608] 

abs(p_estimate_old - p_estimate) 0.018526596670778495
cochrane-orcutt iteration 4
p_estimate 0.8746969284566184
new params: [ 4.57681742  0.33002897 -0.12114081  0.21788787 -0.54696549 -0.0360723 ] 

abs(p_estimate_old - p_estimate) 0.009229614409179798
New p_estimate differs from last p_estimate by < 0.01.  Ending procedure
completed cochrane-orcutt procedure after 4 iterations
final parameters: const: 4.5768174216002535, ln GDP/capitat: 0.3300289667214744, ln Pfuel/gal,t: -0.12114081394586919, ln MPGt:

In [195]:
"""Part f) Now reestimate your best model specification from parts (b), (d) and (e) using only the data through 1970.
It may be argued that the oil shocks of the 1970s and growing environmental consciousness may have led to a 
structural change in travel demand patterns.
Do you agree with this assertion? Describe how you might test this assertion statistically.


I compare the modified Pickrell model specification (1) for data thought only 1970 versus all of the data.
"""
vmt_per_vehicle_model_1_1970_X = sm.add_constant(vmt_per_vehicle_df[[
  LOG_GDP_PER_CAPITA,
  LOG_PFUEL_PER_GALLON,
  LOG_MPG,
  LOG_VEHICLES_PER_DRIVER,
  LOG_PSUBURBS
]].iloc[:21])
vmt_per_vehicle_model_1_1970_Y = vmt_per_vehicle_df[LOG_VMT_PER_VEHICLE].iloc[:21]

vmt_per_vehicle_results_1970_1_results = sm.OLS(vmt_per_vehicle_model_1_1970_Y, vmt_per_vehicle_model_1_1970_X).fit()
print(vmt_per_vehicle_results_1970_1_results.summary())

                            OLS Regression Results                            
Dep. Variable:        ln VMT/Vehiclet   R-squared:                       0.871
Model:                            OLS   Adj. R-squared:                  0.828
Method:                 Least Squares   F-statistic:                     20.23
Date:                Sun, 17 Mar 2019   Prob (F-statistic):           3.50e-06
Time:                        22:56:29   Log-Likelihood:                 76.628
No. Observations:                  21   AIC:                            -141.3
Df Residuals:                      15   BIC:                            -135.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   5.6126    

In [263]:
"""Part g)
Many urban planners and transportation professionals believe the hypothesis of 
“if you build it, they will come,” that is, that any increase in road capacity 
is immediately consumed by additional travel at congestion levels similar to 
before the road construction. Mark Kiefer and Shomik Mehndiratta (1998) present 
a counter-argument that while VMT and congestion have risen dramatically from the
’50s to the ’90s, this growth in VMT should not be expected to continue in the future 
for two reasons: 
(1) the growth in labor-force participation should be tapering off now that labor-force 
participation among women is approaching that of men, and 
(2) the growth in household auto ownership appears to slowing to a rate of one vehicle per driver.
Forecast the VMT/Vehicle for the year 2016 (that is, 1994, the last year of the data, 
plus a 22- year planning horizon) assuming LFP continues to grow at 0.3% per year and auto ownership 
(vehicles per drive) continues to grow at 0.9% per year — these are the average growth rates for 1950-1994. 
(Other average growth rates are GDP per capita, 1.8% per year; gasoline price, -0.8% per year; vehicle cost, 
0.2% per year; fuel efficiency, 0.7% per year; and proportion of suburbs, 1.7% per year. Do final forecast 
variables look reasonable?)
Rerun your forecast of VMT/Vehicle for 2016 assuming that LFP stays constant at 66.3 percent, 
and auto ownership remains at 1.089 vehicles per driver. Is the difference in forecasts large enough that 
Kiefer and Mehndiratta’s hypothesis should be given more analysis?

---

To complete and compare the forecasts for year 2016, I again use the modified Pickrell model that excludes the use of a lagged variable:
(1)  ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln psuburbst + et

Note that this model relies on the Vehicles/driver variable, but not LFP.  Therefore LFP will not be directly used in this model’s forecasts for 2016.

I extended the available data into the year 2016 in two slightly different datasets:
The first dataset uses the average growth rates of the provided data for all variables to project into the future.
The second dataset is the same as the first, but it keeps the vehicles/drivers values constant at the last available value in the dataset (the value for 1994).

I forecast VMT/Vehicle for year 2016 using the 2 different datasets.
"""

# To complete and compare the forecasts for year 2016, I again use the modified Pickrell model that
# excludes the use of a lagged variable:
# (1)  ln VMT/Vehiclet = b0 + b1 ln GDP/capitat + b2 ln Pfuel/gal,t + b3 ln MPGt + b4 ln Vehicles/drivert + b5 ln psuburbst + et

# I extend the available data to 2016 by using the average growth rates of the provided data.
# I create 2 new dataframes that extend the current vmt_per_vehicle_df dataframe to Year 2016:
# 1. One dataframe vmt_per_vehicle_2016_1_df uses the current growth rates for all variables

# 2. The second dataframe vmt_per_vehicle_2016_2_df is the same as vmt_per_vehicle_2016_1_df but keeps
# vehicles/driver constant at the value for 1994.

# Need to extend values, but first without the log.  Then take log of entire series.
last_available_year_index = len(df) - 1
year_extension_range = range(1, 2016 - 1994 + 1)

def get_future_value(last_value, growth_rate, t):
    return last_value*((1 + growth_rate)**t)

years = pd.Series(range(1950, 2016 + 1))

# Make new gdp per capita series -- annual growth rate: 1.8%
gdp_per_capita = [v for v in df["gdp"]/df["pop"]]
for t in year_extension_range:
    gdp_per_capita.append(get_future_value(gdp_per_capita[last_available_year_index], 0.018, t))
    
gdp_per_capita = pd.Series(gdp_per_capita)

# Make new Pfuel/gallon series -- annual growth rate: -0.8%
pfuel_per_gallon = [v for v in df["pgas"]]
for t in year_extension_range:
    pfuel_per_gallon.append(get_future_value(pfuel_per_gallon[last_available_year_index], -0.008, t))

# Make new mpg series -- annual growth rate: 0.7%
mpg = [v for v in df["mpg"]]
for t in year_extension_range:
    mpg.append(get_future_value(mpg[last_available_year_index], 0.007, t))

# Make new vehicles/drivers series -- annual growth rate: 0.9%
vehicles_per_driver_1 = [v for v in df["vehicles"]/df["drivers"]]
for t in year_extension_range:
    vehicles_per_driver_1.append(get_future_value(vehicles_per_driver_1[last_available_year_index], 0.009, t))

# Make alternative vehicles/drivers series where the value remains at the 1994 value - annual growth rate: 0
vehicles_per_driver_2 = [v for v in df["vehicles"]/df["drivers"]]
for t in year_extension_range:
    vehicles_per_driver_2.append(get_future_value(vehicles_per_driver_2[last_available_year_index], 0, t))

# Make new psuburbs series -- annual growth rate: 1.7%
# Remember, first need to pad 1991-1994 values of psuburbs which are otherwise unavailable
psuburbs = [float(v) for v in df["psuburbs"][:-4]]
for i in range(4):
    psuburbs.append(psuburbs[-1])
for t in year_extension_range:
    psuburbs.append(get_future_value(psuburbs[last_available_year_index], 0.017, t))

# Dataset 1
vmt_per_vehicle_2016_1_df = pd.DataFrame({
    YEAR: years,
    LOG_GDP_PER_CAPITA: np.log(gdp_per_capita),
    LOG_PFUEL_PER_GALLON: np.log(pfuel_per_gallon),
    LOG_MPG: np.log(mpg),
    LOG_VEHICLES_PER_DRIVER: np.log(vehicles_per_driver_1),
    LOG_PSUBURBS: np.log(psuburbs)
})

# Data set 2 -- the difference is vehicles/driver
vmt_per_vehicle_2016_2_df = pd.DataFrame({
    YEAR: years,
    LOG_GDP_PER_CAPITA: np.log(gdp_per_capita),
    LOG_PFUEL_PER_GALLON: np.log(pfuel_per_gallon),
    LOG_MPG: np.log(mpg),
    LOG_VEHICLES_PER_DRIVER: np.log(vehicles_per_driver_2),
    LOG_PSUBURBS: np.log(psuburbs)
})

const                  1.000000
ln GDP/capitat         3.413999
ln Pfuel/gal,t        -0.099747
ln MPGt                3.124391
ln Vehicles/drivert    6.993319
ln psuburbst          -0.401334
Name: 66, dtype: float64

In [290]:
# Make prediction of VMT/Vehicles using dataset 1 prepared directly above
# The model is used with estimates from part (d)
vmt_per_vehicle_2016_1_X = sm.add_constant(vmt_per_vehicle_2016_1_df[[
    LOG_GDP_PER_CAPITA,
    LOG_PFUEL_PER_GALLON,
    LOG_MPG,
    LOG_VEHICLES_PER_DRIVER,
    LOG_PSUBURBS,
]]).iloc[-1]

vmt_per_vehicle_2016_1_prediction = np.dot(vmt_per_vehicle_results_1.params, vmt_per_vehicle_2016_1_X)
print("Prediction 1 for VMT/Vehicles for 2016:", np.exp(vmt_per_vehicle_2016_1_prediction)*1000)

# Make prediction of VMT/Vehicles using dataset 2 prepared directly above
vmt_per_vehicle_2016_2_X = sm.add_constant(vmt_per_vehicle_2016_2_df[[
    LOG_GDP_PER_CAPITA,
    LOG_PFUEL_PER_GALLON,
    LOG_MPG,
    LOG_VEHICLES_PER_DRIVER,
    LOG_PSUBURBS,
]]).iloc[-1]
vmt_per_vehicle_2016_2_prediction = np.dot(vmt_per_vehicle_results_1.params, vmt_per_vehicle_2016_2_X)
# Transform value into true value: raise to e and convert units.
# VMT is reported in millions of miles, and vehicles is reported in 1000's.  1,000,000/1000 = 1000
print("Prediction 2 for ln VMT/Vehicles for 2016:", np.exp(vmt_per_vehicle_2016_2_prediction)*1000)


Prediction 1 for VMT/Vehicles for 2016: 12308.602946157378
Prediction 2 for ln VMT/Vehicles for 2016: 13961.376970592748


In [302]:
"""
In order to interpret what these numbers mean in relation to Kiefer and Mehndiratta’s hypothesis,
I multiply these VMT/Vehicles rates by the forecasted Vehicles/drivers rates for 2016,
in order to end up with estimates for VMT/drivers.
"""

print("VMT/drivers for dataset 1:")
print(np.exp(vmt_per_vehicle_2016_1_prediction)*vehicles_per_driver_1[-1])

print("VMT/drivers for dataset 2:")
print(np.exp(vmt_per_vehicle_2016_2_prediction)*vehicles_per_driver_2[-1])

VMT/drivers for dataset 1:
16329.55884138165
VMT/drivers for dataset 2:
15208.566675171449


In [304]:
# Does the forecasted 2016 price of fuel per gallon seem reasonable?
print("forecasted price of fuel per gallon in 1987 US dollars:")
print(pfuel_per_gallon[-1])
# The value is 0.91 which is equivalent to about ~$1.90 2016 US dollars.
# This forecasted value is well below the true 2016 price of gas. 

forecasted price of fuel per gallon in 1987 US dollars:
0.9050666096172386
