In [1]:
import pandas as pd

In [2]:
# import the admissions dataset
adm = pd.read_csv("dta_admissions.csv")

In [3]:
# check for the first 10 records
adm.head(10)

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4
5,1,760,3.0,2
6,1,560,2.98,1
7,0,400,3.08,2
8,1,540,3.39,3
9,0,700,3.92,2


In [4]:
# check the datatype of the features of dataset
adm.dtypes

admit      int64
gre        int64
gpa      float64
rank       int64
dtype: object

## Binary Logit - Stats Model

In [5]:
# uncomment the below code to install the package
#pip install statsmodels

In [6]:
# importing the stats model

import statsmodels.formula.api as smf

## Rank as Numerical

In [7]:
# logit model for rank as numerical (continuous)

num_model = smf.logit('admit ~ gre + gpa + rank', data = adm).fit()
print(num_model.summary()) 

Optimization terminated successfully.
         Current function value: 0.574302
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      396
Method:                           MLE   Df Model:                            3
Date:                Fri, 07 Jun 2024   Pseudo R-squ.:                 0.08107
Time:                        14:03:11   Log-Likelihood:                -229.72
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 8.207e-09
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.4495      1.133     -3.045      0.002      -5.670      -1.229
gre            0.0023      0.

**Statistical Significance:**

*** : p < 0.001

** : 0.001 <= p < 0.01

*: 0.01 <= p < 0.05

. : 0.05 <= p < 0.1

## Rank As Categorical

In [9]:
# rank as categorical value

# converting the rank into category datatype
adm['rank_category'] = adm['rank'].astype('category')

print(adm.dtypes)

admit               int64
gre                 int64
gpa               float64
rank                int64
rank_category    category
dtype: object


In [10]:
# Logit model for rank as category
cat_model = smf.logit('admit ~ gre + gpa + rank_category',data=adm).fit()
print(cat_model.summary())

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Fri, 07 Jun 2024   Pseudo R-squ.:                 0.08292
Time:                        14:05:27   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             -3.9900      1.140     -3.500      0.000      -6.224      -1.756
rank_

**Statistical Significance:**

*** : p < 0.001

** : 0.001 <= p < 0.01

*: 0.01 <= p < 0.05

. : 0.05 <= p < 0.1

## Multinominal Logit

In [32]:
travel = pd.read_csv("TravelMode.csv")

In [33]:
travel.head(10)

Unnamed: 0.1,Unnamed: 0,individual,mode,choice,wait,vcost,travel,gcost,income,size
0,1,1,air,no,69,59,100,70,35,1
1,2,1,train,no,34,31,372,71,35,1
2,3,1,bus,no,35,25,417,70,35,1
3,4,1,car,yes,0,10,180,30,35,1
4,5,2,air,no,64,58,68,68,30,2
5,6,2,train,no,44,31,354,84,30,2
6,7,2,bus,no,53,25,399,85,30,2
7,8,2,car,yes,0,11,255,50,30,2
8,9,3,air,no,69,115,125,129,40,1
9,10,3,train,no,34,98,892,195,40,1


In [40]:
travel['choice'] = travel['choice'].astype('category')
travel['choice_code'] = travel['choice'].cat.codes  # Convert categories to codes for statsmodels

# Check the unique values in 'choice'
print(travel['choice'].unique())

# Create dummy variables for 'mode' column
travel_dummies = pd.get_dummies(travel, columns=['mode'], drop_first=True)
print(travel_dummies.head())  # Display the first few rows of the modified dataset

['no', 'yes']
Categories (2, object): ['no', 'yes']
   Unnamed: 0  individual choice  wait  vcost  travel  gcost  income  size  \
0           1           1     no    69     59     100     70      35     1   
1           2           1     no    34     31     372     71      35     1   
2           3           1     no    35     25     417     70      35     1   
3           4           1    yes     0     10     180     30      35     1   
4           5           2     no    64     58      68     68      30     2   

   choice_code  mode_bus  mode_car  mode_train  
0            0     False     False       False  
1            0     False     False        True  
2            0      True     False       False  
3            1     False      True       False  
4            0     False     False       False  


In [42]:
formula = 'choice_code ~ wait + vcost + mode_bus + mode_car + mode_train'

# Fit the model
model = smf.mnlogit(formula, data=travel_dummies).fit()

# Display the summary
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.453272
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:            choice_code   No. Observations:                  840
Model:                        MNLogit   Df Residuals:                      834
Method:                           MLE   Df Model:                            5
Date:                Fri, 07 Jun 2024   Pseudo R-squ.:                  0.1939
Time:                        19:32:44   Log-Likelihood:                -380.75
converged:                       True   LL-Null:                       -472.36
Covariance Type:            nonrobust   LLR p-value:                 1.095e-37
     choice_code=1       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              5.9078      0.790      7.478      0.000       4.359       7.456
mode_

In [66]:
predicted_probabilities = model.predict()

# Verify the shape of the predicted probabilities
print(predicted_probabilities)

[[0.86117711 0.13882289]
 [0.67836355 0.32163645]
 [0.82525354 0.17474646]
 ...
 [0.87442567 0.12557433]
 [0.97158324 0.02841676]
 [0.70994218 0.29005782]]
