# Matching

This is based on the code in https://arxiv.org/abs/2211.15849.
The core procedure in `match2C` is documented at https://cran.r-project.org/web/packages/match2C/vignettes/tutorial.html.

v0.1 Aggregated by Jiayao Zhang
June 14, 2023

In [1]:
library(tidyverse)
library(match2C)
library(ggplot2)
library(rigr)
library(tableone)
library(xtable)


── [1mAttaching core tidyverse packages[22m ───────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ─────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
rigr version 1.0.4: Regression, Inference, and General Data Analysis Tools in R



In [160]:
source('./src/functions_customized_dist_left.R')

In [23]:
# Load the raw design matrix from disk. It is bound to `design_mat_raw`
# because the type-coercion cell that follows reads its columns from
# `design_mat_raw` and only then creates `design_mat`; binding the CSV to
# `design_mat` left `design_mat_raw` undefined on a clean top-to-bottom run.
design_mat_raw <- read_csv("./design_mat.csv")


[1mRows: [22m[34m8979[39m [1mColumns: [22m[34m53[39m
[36m──[39m [1mColumn specification[22m [36m─────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (5): submission_id, full_decision, primary_keyword, secondary_keyword, ...
[32mdbl[39m (40): year, binary_decision, input_len, n_review, rating_avg, rating_max...
[33mlgl[39m  (8): arxiv_first, any_reported_f, fst_reported_f, any_perceived_f, fst_...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [64]:
# Build the analysis table from the raw design matrix by coercing selected
# columns to fixed types. The coercions are vectorised (one call per column)
# instead of looping over every element with apply().
#   * count / score columns        -> numeric
#   * indicator columns            -> logical
#   * `year` and `cluster`         -> character
# NOTE(review): because `year` and `cluster` become character, any later
# as.matrix() over columns that include them yields a character matrix.
design_mat <- design_mat_raw %>%
  mutate(
    across(
      all_of(c(
        "sub_fluency", "n_fig", "n_ref", "n_sec", "n_author",
        "cnt_reported_f"
      )),
      as.numeric
    ),
    across(
      all_of(c(
        "any_reported_f", "fst_reported_f", "demo_no_us", "arxiv_first"
      )),
      as.logical
    ),
    across(all_of(c("year", "cluster")), as.character)
  )

In [171]:
# Covariate column names, in the order used for both the propensity-score
# formula and the covariate matrix X. Order is significant: do not sort.
var_list <- c(
  "log_input_len",
  "year",
  "n_fig",
  "n_ref",
  "n_sec",
  "sub_fluency",
  "cluster",
  "n_author",
  "fst_reported_f",
  "any_reported_f",
  "cnt_reported_f",
  "demo_no_us",
  "log_ins_rank_min",
  "log_ins_rank_avg",
  "log_ins_rank_max",
  "log_author_cite_min",
  "log_author_cite_avg",
  "log_author_cite_max"
)

# Column used for the fine-balance distance further down.
fb_var <- "cluster"



In [58]:
# Name of the treatment-indicator column in `design_mat`.
A <- "arxiv_first"

In [101]:
# Treatment indicator as a one-column matrix, taken from the column named by A.
Z <- as.matrix(design_mat[, A])

In [103]:
# Group sizes: number of units with Z == TRUE, and the remainder.
n_tr <- sum(Z)
n_ct <- length(Z) - n_tr

In [151]:
# Estimated propensity scores: fitted values of a logistic regression of
# `arxiv_first` on every covariate in `var_list` (additive terms only).
# NOTE(review): fitted.values only covers rows glm() actually used; if any
# covariate has missing values its length may differ from nrow(design_mat).
propensity <- glm(
  formula = reformulate(var_list, response = "arxiv_first"),
  family = binomial(),
  data = design_mat
)$fitted.values



In [138]:
# Covariate matrix: the `var_list` columns of `design_mat`, in that order.
X <- design_mat %>%
  select(all_of(var_list)) %>%
  as.matrix()

In [166]:
# Distance list for the first (left) network of the two-part match, built with
# the customised helper create_list_from_scratch_overall().
# NOTE(review): that helper is not defined in this notebook; presumably it
# comes from the sourced ./src/functions_customized_dist_left.R -- confirm.
dist_list_left <- create_list_from_scratch_overall(
  Z,
  X,
  exact = c("year", "n_author"),
  p = propensity,
  penalty = 100,
  caliper_low = 1
)



In [168]:
# Scale the left-network distances by a factor of 100, in place.
# Re-running this cell multiplies by 100 again.
dist_list_left$d <- 100 * dist_list_left$d

In [180]:
# Rebuild the covariate matrix from the `var_list` columns of `design_mat`.
X <- as.matrix(design_mat[var_list])

In [181]:
# Distance list for the second (right) network: L1 distance computed on the
# propensity score itself, with the propensity score also supplied as `p`
# and a lower caliper of 1.
dist_list_right <- create_list_from_scratch(
  Z = Z,
  X = propensity,
  p = propensity,
  caliper_low = 1,
  k = NULL,
  method = "L1"
)

In [None]:
# Two-network match (without fine balance) using the left and right distance
# lists, one control per treated unit.
matching_output <- match_2C_list(
  Z = Z,
  dataset = design_mat,
  dist_list_1 = dist_list_left,
  dist_list_2 = dist_list_right,
  lambda = 100,
  controls = 1
)

In [None]:
# Persist the matched data (in matched order) from the match without fine balance.
write.csv(
  matching_output$matched_data_in_order,
  file = "matched_design_mat_ordered.csv"
)

In [None]:
### with fine-balance

In [191]:
# Fine-balance distance: 0/1 distance on the `fb_var` column of X, converted
# to numeric first.
dist_list_right_fb <- create_list_from_scratch(
  Z = Z,
  X = as.numeric(X[, fb_var]),
  p = propensity,
  caliper_low = 1,
  k = NULL,
  method = "0/1"
)

# Fold the fine-balance term into the right-network cost: 100 x the existing
# right-network distance plus 10000 x the 0/1 fine-balance distance.
# This overwrites dist_list_right$d in place, so re-running the cell rescales
# an already-combined cost; rebuild dist_list_right before re-running.
dist_list_right$d <- 100 * dist_list_right$d + 10000 * dist_list_right_fb$d





In [192]:
# Two-network match with fine balance: same call as the plain match, but
# dist_list_right now carries the combined (propensity + fine-balance) cost.
fb_matching_output <- match_2C_list(
  Z = Z,
  dataset = design_mat,
  dist_list_1 = dist_list_left,
  dist_list_2 = dist_list_right,
  lambda = 100,
  controls = 1
)

In [193]:
# Persist the matched data (in matched order) from the fine-balance match.
# This must read from `fb_matching_output`: writing `matching_output` here
# would store the non-fine-balance result under the fine-balance file name.
write.csv(
  fb_matching_output$matched_data_in_order,
  file = "fb_matched_design_mat_ordered.csv"
)