# Lecture 23: Logistic Regression - Diagnostics

---

## Ex-Ante Diagnostics

The example below uses the Indian Time Use Survey (ITUS) to model the choice of shopping channel (no shopping, in-store, online, both) with a multinomial logistic regression, accounting for individual-specific variables (socio-demographic and socio-economic characteristics).

In [1]:
# Load necessary libraries
library(tidyr)
library(dplyr)
library(mlogit)
library(ggplot2)
# Default figure size for plots rendered in the notebook
options(repr.plot.height = 8, repr.plot.width = 12)

# 2024 ITUS individual-level records (modelling data set)
url <- "https://raw.githubusercontent.com/anmpahwa/CE5540/refs/heads/main/resources/ITUS_IND_BD.csv"
data <- read.csv(file = url)

# Inspect the column names and types of the loaded data frame
str(object = data)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: dfidx



'data.frame':	454192 obs. of  54 variables:
 $ Unique_ID        : chr  "2024-30010-1-241-17-13-11-2-2420-4-1-1" "2024-30010-1-241-17-13-11-2-2420-4-10-1" "2024-30010-1-241-17-13-11-2-2420-4-10-2" "2024-30010-1-241-17-13-11-2-2420-4-11-1" ...
 $ Unique_HH_ID     : chr  "2024-30010-1-241-17-13-11-2-2420-4-1" "2024-30010-1-241-17-13-11-2-2420-4-10" "2024-30010-1-241-17-13-11-2-2420-4-10" "2024-30010-1-241-17-13-11-2-2420-4-11" ...
 $ q1               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ q2               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ q3               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ q4               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ weekday          : int  1 1 1 0 0 0 0 0 1 1 ...
 $ weekend          : int  0 0 0 1 1 1 1 1 0 0 ...
 $ rural            : int  1 1 1 1 1 1 1 1 1 1 ...
 $ urban            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ north            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ west             : int  1 1 1 1 1 1 1 1 1 1 ...
 $ central          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ east

In [2]:
# Overall counts per shopping channel and their percentage shares
counts <- data %>%
  # Sum each channel indicator column, ignoring missing values
  summarise(
    across(c(none, instore, online, both), ~ sum(.x, na.rm = TRUE))
  ) %>%
  # Display labels for the four channels
  rename(None = none, InStore = instore, Online = online, Both = both) %>%
  pivot_longer(
    cols = everything(),
    names_to = "channel",
    values_to = "count"
  ) %>%
  # Share of each channel in the total, in percent
  mutate(share = count / sum(count) * 100)
print(counts)

[90m# A tibble: 4 × 3[39m
  channel  count   share
  <chr>    <int>   <dbl>
1 None    409897 90.2
2 InStore  39728  8.75
3 Online    4252  0.936
4 Both       315  0.0694


In [3]:
# Temporal: channel counts and within-quarter shares (Q1-Q4)
quarter_counts <- data %>%
  # One total per channel x quarter: channel indicator times quarter indicator
  summarise(
    None_Q1 = sum(none * q1, na.rm = TRUE),
    InStore_Q1 = sum(instore * q1, na.rm = TRUE),
    Online_Q1 = sum(online * q1, na.rm = TRUE),
    Both_Q1 = sum(both * q1, na.rm = TRUE),

    None_Q2 = sum(none * q2, na.rm = TRUE),
    InStore_Q2 = sum(instore * q2, na.rm = TRUE),
    Online_Q2 = sum(online * q2, na.rm = TRUE),
    Both_Q2 = sum(both * q2, na.rm = TRUE),

    None_Q3 = sum(none * q3, na.rm = TRUE),
    InStore_Q3 = sum(instore * q3, na.rm = TRUE),
    Online_Q3 = sum(online * q3, na.rm = TRUE),
    Both_Q3 = sum(both * q3, na.rm = TRUE),

    None_Q4 = sum(none * q4, na.rm = TRUE),
    InStore_Q4 = sum(instore * q4, na.rm = TRUE),
    Online_Q4 = sum(online * q4, na.rm = TRUE),
    Both_Q4 = sum(both * q4, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "channel_quarter",
    values_to = "count"
  ) %>%
  # Split "<channel>_<quarter>" back into its two parts
  separate(channel_quarter, into = c("channel", "quarter"), sep = "_") %>%
  # Fix the display order of the channels
  mutate(
    channel = factor(channel, levels = c("None", "InStore", "Online", "Both"))
  ) %>%
  # Percentage share of each channel within its quarter
  group_by(quarter) %>%
  mutate(share = count / sum(count) * 100) %>%
  ungroup()

# Tabulate: one row per channel, one column of shares per quarter
table <- quarter_counts %>%
  pivot_wider(id_cols = channel, names_from = quarter, values_from = share)
print(table)

[90m# A tibble: 4 × 5[39m
  channel      Q1      Q2      Q3      Q4
  <fct>     <dbl>   <dbl>   <dbl>   <dbl>
1 None    91.0    90.2    89.9    89.8
2 InStore  7.61    8.57    9.33    9.56
3 Online   1.31    1.10    0.699   0.601
4 Both     0.0713  0.0794  0.0704  0.0557


In [4]:
# Temporal: channel counts and shares by day type (WD = weekday, WE = weekend)
day_counts <- data %>%
  # One total per channel x day type: channel indicator times day indicator
  summarise(
    None_WD = sum(none * weekday, na.rm = TRUE),
    InStore_WD = sum(instore * weekday, na.rm = TRUE),
    Online_WD = sum(online * weekday, na.rm = TRUE),
    Both_WD = sum(both * weekday, na.rm = TRUE),

    None_WE = sum(none * weekend, na.rm = TRUE),
    InStore_WE = sum(instore * weekend, na.rm = TRUE),
    Online_WE = sum(online * weekend, na.rm = TRUE),
    Both_WE = sum(both * weekend, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "channel_day",
    values_to = "count"
  ) %>%
  # Split "<channel>_<day>" back into its two parts
  separate(channel_day, into = c("channel", "day"), sep = "_") %>%
  # Fix the display order of the channels
  mutate(
    channel = factor(channel, levels = c("None", "InStore", "Online", "Both"))
  ) %>%
  # Percentage share of each channel within its day type
  group_by(day) %>%
  mutate(share = count / sum(count) * 100) %>%
  ungroup()

# Tabulate: one row per channel, one column of shares per day type
table <- day_counts %>%
  pivot_wider(id_cols = channel, names_from = day, values_from = share)
print(table)

[90m# A tibble: 4 × 3[39m
  channel      WD      WE
  <fct>     <dbl>   <dbl>
1 None    90.7    89.2
2 InStore  8.38    9.73
3 Online   0.900   1.03
4 Both     0.0636  0.0847


In [5]:
# Spatial: channel counts and shares by sector (R = rural, U = urban)
sector_counts <- data %>%
  # One total per channel x sector: channel indicator times sector indicator
  summarise(
    None_R = sum(none * rural, na.rm = TRUE),
    InStore_R = sum(instore * rural, na.rm = TRUE),
    Online_R = sum(online * rural, na.rm = TRUE),
    Both_R = sum(both * rural, na.rm = TRUE),

    None_U = sum(none * urban, na.rm = TRUE),
    InStore_U = sum(instore * urban, na.rm = TRUE),
    Online_U = sum(online * urban, na.rm = TRUE),
    Both_U = sum(both * urban, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "channel_sector",
    values_to = "count"
  ) %>%
  # Split "<channel>_<sector>" back into its two parts
  separate(channel_sector, into = c("channel", "sector"), sep = "_") %>%
  # Fix the display order of the channels
  mutate(
    channel = factor(channel, levels = c("None", "InStore", "Online", "Both"))
  ) %>%
  # Percentage share of each channel within its sector
  group_by(sector) %>%
  mutate(share = count / sum(count) * 100) %>%
  ungroup()

# Tabulate: one row per channel, one column of shares per sector
table <- sector_counts %>%
  pivot_wider(id_cols = channel, names_from = sector, values_from = share)
print(table)

[90m# A tibble: 4 × 3[39m
  channel       R      U
  <fct>     <dbl>  <dbl>
1 None    91.6    88.0
2 InStore  7.66   10.6
3 Online   0.706   1.33
4 Both     0.0473  0.107


In [6]:
# Spatial: channel counts and shares by tier (T1-T3 from tierI-tierIII)
tier_counts <- data %>%
  # One total per channel x tier: channel indicator times tier indicator
  summarise(
    None_T1 = sum(none * tierI, na.rm = TRUE),
    InStore_T1 = sum(instore * tierI, na.rm = TRUE),
    Online_T1 = sum(online * tierI, na.rm = TRUE),
    Both_T1 = sum(both * tierI, na.rm = TRUE),

    None_T2 = sum(none * tierII, na.rm = TRUE),
    InStore_T2 = sum(instore * tierII, na.rm = TRUE),
    Online_T2 = sum(online * tierII, na.rm = TRUE),
    Both_T2 = sum(both * tierII, na.rm = TRUE),

    None_T3 = sum(none * tierIII, na.rm = TRUE),
    InStore_T3 = sum(instore * tierIII, na.rm = TRUE),
    Online_T3 = sum(online * tierIII, na.rm = TRUE),
    Both_T3 = sum(both * tierIII, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "channel_tier",
    values_to = "count"
  ) %>%
  # Split "<channel>_<tier>" back into its two parts
  separate(channel_tier, into = c("channel", "tier"), sep = "_") %>%
  # Fix the display order of the channels
  mutate(
    channel = factor(channel, levels = c("None", "InStore", "Online", "Both"))
  ) %>%
  # Percentage share of each channel within its tier
  group_by(tier) %>%
  mutate(share = count / sum(count) * 100) %>%
  ungroup()

# Tabulate: one row per channel, one column of shares per tier
table <- tier_counts %>%
  pivot_wider(id_cols = channel, names_from = tier, values_from = share)
print(table)

[90m# A tibble: 4 × 4[39m
  channel      T1      T2      T3
  <fct>     <dbl>   <dbl>   <dbl>
1 None    90.4    91.1    89.3
2 InStore  8.74    8.02    9.48
3 Online   0.825   0.811   1.17
4 Both     0.0776  0.0471  0.0833


## Model Diagnostics

In [None]:
# Convert data to mlogit format
# Collapse the four channel indicator columns into a single categorical
# choice. The "Both" branch is required: case_when() returns NA when no
# condition matches, so without it every respondent who used both channels
# would end up with a missing choice.
data <- data %>%
  mutate(
    shopping_choice = case_when(
      none == 1 ~ "None",
      instore == 1 ~ "InStore",
      online == 1 ~ "Online",
      both == 1 ~ "Both"
    ),
    # Same level order as the tabulations above, with "None" first
    shopping_choice = factor(
      shopping_choice,
      levels = c("None", "InStore", "Online", "Both")
    )
  )

# Fail early rather than pass records with an undefined choice to mlogit
if (anyNA(data$shopping_choice)) {
  stop(
    "Some records do not map to any shopping channel ",
    "(none/instore/online/both).",
    call. = FALSE
  )
}

# Wide shape: each row names its chosen alternative in `shopping_choice`.
# NOTE: this overwrites `data`, so re-running the cell requires reloading it.
data <- mlogit.data(
  data,
  choice = "shopping_choice",
  shape = "wide",
  varying = NULL,
  id.var = "Unique_ID"
)

In [None]:
# TODO: Model Development
# `weekend` and `urban` are individual-specific: they take one value per
# respondent and do not vary across the shopping alternatives. In an mlogit
# formula such covariates go in the second part (after `|`), where each one
# gets alternative-specific coefficients. In the first part they would be
# treated as alternative-specific variables with a single generic
# coefficient, which is not identified when the variable is constant across
# alternatives. `0` in the first part means no alternative-specific
# covariates; intercepts are still estimated through the second part.
# "None" (no shopping) is set as the base alternative, so the coefficients
# are read relative to not shopping.
model <- mlogit(
  shopping_choice ~ 0 | weekend + urban,
  data = data,
  reflevel = "None"
)

In [None]:
# TODO: Model Diagnostics
# For now this only displays the summary of the fitted `model` object.
summary(model)

## Ex-Post Diagnostics

...