# July 18, 2019 Multinomial Logistic regression with R
* name : Jikhan Jeong
* reference: https://rpubs.com/sallychen/313125
* reference: https://eml.berkeley.edu/books/choice2.html
* for latex equations: https://www.codecogs.com/latex/eqneditor.php

# Using latex format in Jupyter

k_{n+1} = n^2 + k_n^2 - k_{n-1}: put a $ at the beginning and at the end of the expression, and it will be rendered as a LaTeX equation, as follows:

$k_{n+1} = n^2 + k_n^2 -k_{n-1}$ 

In [1]:
install.packages('AER')

package 'AER' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\rstudio\RtmpsvrGAt\downloaded_packages


In [14]:
install.packages('mlogit')

also installing the dependencies 'miscTools', 'maxLik', 'statmod'



package 'miscTools' successfully unpacked and MD5 sums checked
package 'maxLik' successfully unpacked and MD5 sums checked
package 'statmod' successfully unpacked and MD5 sums checked
package 'mlogit' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\rstudio\RtmpgLwMlb\downloaded_packages


In [2]:
library(AER)

Loading required package: car
Loading required package: lmtest
Loading required package: zoo

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Loading required package: sandwich
Loading required package: survival


In [3]:
library(mlogit)
library(dplyr)

Loading required package: Formula
Loading required package: maxLik
Loading required package: miscTools

Please cite the 'maxLik' package as:
Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.

If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
https://r-forge.r-project.org/projects/maxlik/

Attaching package: 'dplyr'

The following object is masked from 'package:car':

    recode

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



# Data
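* Multinomial Logit
* Dependent variable = choice (yes/no) over a choice set of 4 different alternatives = {air, train, bus, car}; each individual has 4 rows (one per alternative) and exactly 1 "yes" among the 4 alternatives
* Columns = individual, mode, choice, wait, vcost, travel, gcost, income, size; the independent variables are wait, vcost, travel, gcost, income, size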

In [4]:
data("TravelMode", package="AER") # load data

In [5]:
head(TravelMode,8)

individual,mode,choice,wait,vcost,travel,gcost,income,size
1,air,no,69,59,100,70,35,1
1,train,no,34,31,372,71,35,1
1,bus,no,35,25,417,70,35,1
1,car,yes,0,10,180,30,35,1
2,air,no,64,58,68,68,30,2
2,train,no,44,31,354,84,30,2
2,bus,no,53,25,399,85,30,2
2,car,yes,0,11,255,50,30,2


In [6]:
dim(TravelMode) # 840 rows (210 individuals x 4 alternatives) and 9 columns

In [42]:
Travel = TravelMode

In [8]:
nrow(Travel)

In [9]:
ncol(Travel)

* how many individuals? nrow(=840) / #of alternatives (=4) = 210

In [10]:
length(unique(Travel$individual))

* missing data

In [11]:
table(is.na(Travel))


FALSE 
 7560 

# STEP1. Individual Likelihood Function 1 : Utility Function
* theta = 3 alternative-specific constants (air, train, bus) followed by 3 coefficients for the 3 independent variables (gcost, wait, travel)
* the coefficients are not estimated here; the values of the constants and of the 3 coefficients are simply given.

In [12]:
theta = c(1,2,3,-0.001, -0.003, -0.005) # one set of parameters

In [13]:
theta # mix of constant variable and 3 coefficient for the 3 independent varaibles (gcost, wait, travel)

In [14]:
sample <- filter(Travel, individual ==1) # filter the individual 1

In [15]:
sample

individual,mode,choice,wait,vcost,travel,gcost,income,size
1,air,no,69,59,100,70,35,1
1,train,no,34,31,372,71,35,1
1,bus,no,35,25,417,70,35,1
1,car,yes,0,10,180,30,35,1


* making new variable constant based on alternatives, car (=base group) =0, air =1, train =2, bus =3

In [16]:
sample$constant <-0

In [17]:
sample

individual,mode,choice,wait,vcost,travel,gcost,income,size,constant
1,air,no,69,59,100,70,35,1,0
1,train,no,34,31,372,71,35,1,0
1,bus,no,35,25,417,70,35,1,0
1,car,yes,0,10,180,30,35,1,0


In [18]:
sample$constant[sample$mode=="air"] = theta[1]

In [19]:
sample

individual,mode,choice,wait,vcost,travel,gcost,income,size,constant
1,air,no,69,59,100,70,35,1,1
1,train,no,34,31,372,71,35,1,0
1,bus,no,35,25,417,70,35,1,0
1,car,yes,0,10,180,30,35,1,0


In [20]:
sample$constant[sample$mode=="train"] = theta[2]

In [21]:
sample$constant[sample$mode=="bus"] = theta[3]

* Utility with constant, and 3 variables and 3 coefficient for 3 variables (gcost, wait, travel)

In [22]:
sample$utility = theta[4]*sample$gcost + theta[5]*sample$wait + theta[6]*sample$travel + sample$constant

In [23]:
sample

individual,mode,choice,wait,vcost,travel,gcost,income,size,constant,utility
1,air,no,69,59,100,70,35,1,1,0.223
1,train,no,34,31,372,71,35,1,2,-0.033
1,bus,no,35,25,417,70,35,1,3,0.74
1,car,yes,0,10,180,30,35,1,0,-0.93


# STEP2. Individual Likelihood Function 2: Probability 

In [24]:
sample

individual,mode,choice,wait,vcost,travel,gcost,income,size,constant,utility
1,air,no,69,59,100,70,35,1,1,0.223
1,train,no,34,31,372,71,35,1,2,-0.033
1,bus,no,35,25,417,70,35,1,3,0.74
1,car,yes,0,10,180,30,35,1,0,-0.93


* actual choice = choice(=yes), in the above case, car was selected among 4 possible alternatives

In [25]:
actual_choice <- filter(sample, choice =='yes')

In [26]:
actual_choice

individual,mode,choice,wait,vcost,travel,gcost,income,size,constant,utility
1,car,yes,0,10,180,30,35,1,0,-0.93


In [27]:
actual_choice$utility

In [28]:
sample$utility

In [29]:
exp(sample$utility)

In [30]:
sum(exp(sample$utility))

# predicted probability of car =1
$prob_{n=1}(i=car)=\frac{e^{\beta' X_{ni}}}{\sum_{j} e^{\beta' X_{nj}}}$
* n= individuals
* i= actual choice
* j= alternative choice sets 

In [31]:
(actual_choice_probability = exp(actual_choice$utility)/sum(exp(sample$utility)))

# STEP3. Utility Function for All Individuals 1
* write a function combining step 1 and step 2 for calculating the predicted probability of the actual choice for all individuals

In [32]:
# Probability that one individual picks the alternative they actually chose.
#
# sample: data frame holding the rows (one per alternative) of a single
#         individual, with a `choice` column ("yes" marks the chosen
#         alternative) and a numeric `utility` column.
# Returns the logit (softmax) probability of the chosen alternative,
#         exp(u_chosen) / sum(exp(u_all)); numeric(0) when no row is "yes"
#         or when `sample` has no rows.
choice.prob <- function(sample) {
    utility <- sample$utility
    if (length(utility) == 0L) {
        return(numeric(0))
    }
    # which() drops NA comparisons, mirroring how filter() discards them.
    chosen <- which(sample$choice == "yes")
    # Subtract the largest utility before exponentiating: the ratio is
    # mathematically unchanged, but exp() can no longer overflow to Inf
    # (Inf / Inf = NaN) when an optimizer tries large parameter values.
    shifted <- utility - max(utility)
    exp(shifted[chosen]) / sum(exp(shifted))
}

# STEP4. Utility Function for all Individuals 2

* group the dataset by individuals
* compute choice probability within group
* summarize choice probability across group
* group_by(), summarise(), do() function in dplyr package is used

In [35]:
group = group_by(Travel, individual) # group the data by individuals

In [37]:
head(group,8)

individual,mode,choice,wait,vcost,travel,gcost,income,size
1,air,no,69,59,100,70,35,1
1,train,no,34,31,372,71,35,1
1,bus,no,35,25,417,70,35,1
1,car,yes,0,10,180,30,35,1
2,air,no,64,58,68,68,30,2
2,train,no,44,31,354,84,30,2
2,bus,no,53,25,399,85,30,2
2,car,yes,0,11,255,50,30,2


In [40]:
head(summarise(group, avg_travel = mean(travel)),2) # calcuate the average travel time for each individuals, showing only 2 persons

individual,avg_travel
1,267.25
2,269.0


In [44]:
Travel$utility<-runif(nrow(Travel),0,1) # generate some random utility, each utility between 0 and 1

In [46]:
head(Travel,8)

individual,mode,choice,wait,vcost,travel,gcost,income,size,utility
1,air,no,69,59,100,70,35,1,0.7254003
1,train,no,34,31,372,71,35,1,0.6233933
1,bus,no,35,25,417,70,35,1,0.2258345
1,car,yes,0,10,180,30,35,1,0.287773
2,air,no,64,58,68,68,30,2,0.203789
2,train,no,44,31,354,84,30,2,0.6162616
2,bus,no,53,25,399,85,30,2,0.5138813
2,car,yes,0,11,255,50,30,2,0.9781352


In [51]:
Probability <- Travel %>% group_by(individual) %>% do(data.frame(prob=choice.prob( . )))

In [49]:
head(Probability,3)

individual,prob
1,0.204592
2,0.3589402
3,0.3585138


* sum up the log likelihood

In [53]:
sum(log(Probability$prob))

# Likelihood Function as a Function 
* regard the coefficients of each independent variable as given
* set the constant for each alternative
* get the utility for each alternative
* use the function choice.prob = probability of the actual choice
* return the negative sum of the log-likelihood (negative because optim() minimizes)

In [55]:
# Negative log-likelihood of the multinomial logit model, for use with optim().
#
# theta: numeric vector of length 6 --
#        theta[1:3] alternative-specific constants for air, train, bus
#                   (car is the base alternative, its constant stays 0);
#        theta[4:6] coefficients on gcost, wait, travel.
# Returns minus the sum, over individuals, of the log probability of the
#        alternative each individual actually chose.
# Reads the global data frame `Travel`; the assignments below only modify
# the function's local copy.
Likelihood <- function(theta) {
    Travel$constant <- 0
    # Index with Travel$mode. The 4-row `sample` of individual 1 used before
    # only worked through recycling and the fixed air/train/bus/car row order.
    Travel$constant[Travel$mode == "air"] <- theta[1]
    Travel$constant[Travel$mode == "train"] <- theta[2]
    Travel$constant[Travel$mode == "bus"] <- theta[3]
    Travel$utility <- theta[4] * Travel$gcost + theta[5] * Travel$wait +
        theta[6] * Travel$travel + Travel$constant

    # One row per individual: probability of that individual's actual choice.
    Probability <- Travel %>%
        group_by(individual) %>%
        do(data.frame(prob = choice.prob(.)))
    return(-sum(log(Probability$prob)))
}

# Maximum Likelihood Estimation : Optimization
* using the optim() function <- should know its options; here, the BFGS method is used
* initial values (the constant for car is fixed at 0 as the base group; the constants for air, train, bus start at 4, 4, 3 and the coefficients for gcost, wait, travel start from small negative values -0.001, -0.01, -0.001)

In [None]:
# Starting values follow the order in which Likelihood() reads theta:
# constants for air, train, bus, then coefficients for gcost, wait, travel.
# optim() minimizes by default; Likelihood() returns the negative log-likelihood.
estimation <- optim(c(4.0,4.0,3.0,-0.001,-0.01,-0.001), Likelihood, method ="BFGS")

In [60]:
estimation$par

* from above coefficients
---------------------------------
* Constant air: 4.084855252
* Constant train: 3.651420169
* Constant bus: 3.195951785
* Constant car: 0 (assumption in the begin)
* General Cost: -0.003250702
* Wait Time: -0.097567752
* Travel Time: -0.003434490

# Comparing the above results with mlogit (=K.Train based)
* step1: formatting the data as an mlogit data frame
* step2: run the multinomial logit with mlogit
* step3: apply for predicted probability
* step4: mean choice probability 

* step1: formatting the data as an mlogit data frame

In [61]:
Data <- mlogit.data(Travel, choice="choice", shape="long", chid.var = "individual", alt.var ="mode", drop.index= TRUE)

In [62]:
head(Data,4)

Unnamed: 0,choice,wait,vcost,travel,gcost,income,size,utility
1.air,False,69,59,100,70,35,1,0.7254003
1.train,False,34,31,372,71,35,1,0.6233933
1.bus,False,35,25,417,70,35,1,0.2258345
1.car,True,0,10,180,30,35,1,0.287773


* step 2: run a mlogit

In [63]:
ml.Data <- mlogit(choice ~ gcost + wait + travel, Data, reflevel ="car" ) # reference group for constant =0

In [64]:
summary(ml.Data)


Call:
mlogit(formula = choice ~ gcost + wait + travel, data = Data, 
    reflevel = "car", method = "nr", print.level = 0)

Frequencies of alternatives:
    car     air   train     bus 
0.28095 0.27619 0.30000 0.14286 

nr method
5 iterations, 0h:0m:1s 
g'(-H)^-1g = 0.000159 
successive function values within tolerance limits 

Coefficients :
                    Estimate Std. Error t-value  Pr(>|t|)    
air:(intercept)    4.0540450  0.8366245  4.8457 1.262e-06 ***
train:(intercept)  3.6445988  0.4427624  8.2315 2.220e-16 ***
bus:(intercept)    3.1957885  0.4519434  7.0712 1.536e-12 ***
gcost             -0.0028601  0.0060976 -0.4691  0.639032    
wait              -0.0974635  0.0103529 -9.4141 < 2.2e-16 ***
travel            -0.0034895  0.0011489 -3.0371  0.002388 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log-Likelihood: -195
McFadden R^2:  0.31281 
Likelihood ratio test : chisq = 177.52 (p.value = < 2.22e-16)

* step3: apply for fitted mean choice probability for each alternatives

In [66]:
apply(fitted(ml.Data, outcome=FALSE),2,mean) #  matrix 1 indicates rows, 2 indicates columns

In [70]:
help(apply)

* step4: mean choice probability (observed share of each alternative in the data)

In [71]:
ml.Data$freq/sum(ml.Data$freq)


      car       air     train       bus 
0.2809524 0.2761905 0.3000000 0.1428571 