# Primary and NOC Analysis

v.1 Jiayao Zhang
June 20

In [12]:
from __future__ import print_function, absolute_import, division
%load_ext autoreload
%autoreload 2
from pathlib import Path
import numpy as np
import pandas as pd
import tqdm
import json
import uuid
import sqlite3


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

sns.set()
sns.set(font_scale=2.5,)
sns.set_style("white")
sns.set_palette("colorblind")
tqdm.tqdm.pandas()

import logging
logging.getLogger().setLevel(logging.CRITICAL)
import warnings
warnings.filterwarnings("ignore", )


# Load Processed Data

In [14]:
Ns = ['c365', 'c730', 'c1095']
A = 'arxiv_first'
Y = 'binary_decision'
C_cols = ['year',
 'log_input_len',
 'n_fig',
 'n_ref',
 'n_sec',
 'sub_fluency',
 'cluster',
 'n_author',
 'fst_reported_f',
 'any_reported_f',
 'cnt_reported_f',
 'demo_no_us',
 'log_ins_rank_min',
 'log_ins_rank_avg',
 'log_ins_rank_max',
 'log_author_cite_min',
 'log_author_cite_avg',
 'log_author_cite_max'
]

C_vars = ['C(year)',
 'log_input_len',
 'n_fig',
 'n_ref',
 'n_sec',
 'sub_fluency',
 'C(cluster)',
 'n_author',
 'C(fst_reported_f)',
 'C(any_reported_f)',
 'cnt_reported_f',
 'C(demo_no_us)',
 'log_ins_rank_min',
 'log_ins_rank_avg',
 'log_ins_rank_max',
 'log_author_cite_min',
 'log_author_cite_avg',
 'log_author_cite_max'
]


In [18]:
# change the path accordingly
import functools
citation_data_path = [
    ('c365', '../data/s2_citation/c_365.tsv', '\t'),
    ('c730', '../data/s2_citation/c_730.tsv', '\t'),
    ('c1095', '../data/s2_citation/c_1095.tsv', '\t'),
    ('c1825', '../data/s2_citation/c_1825.tsv', '\t')

]

cites_within_window = functools.reduce(
    lambda x,y: x.merge(y,on=['submission_id'], how='outer'),
    [(pd.read_csv(p, sep=s)
      .rename({'cites_within_year':l,'cites_within_window':l},axis=1,errors='ignore')
      [['submission_id',l]]
     )
     for l, p, s in citation_data_path],
)


FileNotFoundError: [Errno 2] No such file or directory: './data/s2_citation/c_365.tsv'

Load processed "design matrix"

In [5]:
design_mat=pd.read_csv('./data/design_mat.csv')


In [6]:
# stratification by subgroups
design_mat = (
    design_mat.drop(['c365','c1095','c1825'],axis=1)
    .merge(cites_within_window, on='submission_id', how='left')
)

design_mat['inst_level'] =\
    design_mat['log_ins_rank_min'].apply(lambda s : 'L' if s < 1 else ('M' if s < 2 else 'H')
)

design_mat['author_level'] =\
    design_mat['author_cite_avg'].apply(lambda s : 'L' if s < 500 else ('M' if s < 2000 else 'H')
)


In [8]:
def load_matched_df(path, df):
    matched_design_mat = pd.read_csv(path).drop('Unnamed: 0',axis=1)
    matched_design_mat=matched_design_mat[~matched_design_mat.matched_set.isna()]
    return pd.concat([
        matched_design_mat.iloc[::2].assign(grp='treated'),
        matched_design_mat.iloc[1::2].assign(grp='control'),]
    )


Load matched pairs

In [9]:
## with fine-balance on topic cluster
## this will be used in the subsequent analysis
fb_matched_dat = load_matched_df('./data/fb_matched_design_mat_ordered.csv', design_mat)


### Inspecting Sample Sizes

In [154]:
smpsize_df=pd.concat([dat[data_selector(dat,N)]
 .query(f"{N}>={t}")
 .groupby(
    ['arxiv_first','inst_level']).size()
 .to_frame('smps').assign(N=N,t=t)
 for N,t  in thres
])

for i, (N,t) in enumerate(thres):
    q = [50, 75, 90]
    smpsize_df.loc[(smpsize_df.N==N)&(smpsize_df.t==t),'t'] =\
        rf"${q[i%3]}\%$ (${t}$)"


In [156]:
print(
(smpsize_df.reset_index()
 .replace({'inst_level': {
     'H': r'Top-$10$',
     'L': r'Top-$10$ to $100$',
     'M': r'Others'
     },
    'N':{
         N:r"$\cc^{(" +f'{i+1}' + r")}$"
         for i,N in enumerate(Ns)
        },
    'arxiv_first': {
     True: r"$A=1$",
     False: r"$A=0$",
         }
    })
 .rename({'inst_level':'Author Institution', 'arxiv_first': r'Early arXiving ($A$)'},axis=1)
 .pivot(index=['N','t'], columns=['Author Institution', r'Early arXiving ($A$)'], values='smps')
 .sort_index(axis='columns', level='Author Institution',
            key=lambda s:s.map({r'Top-$10$':0,
                                r'Top-$10$ to $100$':1,
                                r'Others':2})
            )
).to_latex(escape=False)
)


\begin{tabular}{llrrrrrr}
\toprule
            & Author Institution & \multicolumn{2}{l}{Top-$10$} & \multicolumn{2}{l}{Top-$10$ to $100$} & \multicolumn{2}{l}{Others} \\
            & Early arXiving ($A$) &    $A=0$ & $A=1$ &             $A=0$ & $A=1$ &  $A=0$ & $A=1$ \\
N & t &          &       &                   &       &        &       \\
\midrule
$\cc^{(1)}$ & $50\%$ ($4$) &       36 &    46 &               320 &   398 &    214 &   322 \\
            & $75\%$ ($11$) &       14 &    16 &               171 &   225 &     86 &   139 \\
            & $90\%$ ($23$) &        5 &     4 &                77 &   102 &     26 &    42 \\
$\cc^{(2)}$ & $50\%$ ($10$) &       26 &    39 &               247 &   304 &    141 &   232 \\
            & $75\%$ ($29$) &       13 &    16 &               133 &   172 &     54 &   108 \\
            & $90\%$ ($65$) &        3 &     6 &                52 &    85 &     16 &    34 \\
$\cc^{(3)}$ & $50\%$ ($11$) &       20 &    26 &               168 &   205 &

In [188]:
print(
    dat.sort_values(['binary_decision'])
    .replace({'arxiv_first':{
        True: r"$A=1$",
        False: r"$A=0$",
    }})
    .assign(
    acc=lambda s:s['binary_decision'],
    rej=lambda s:s['binary_decision'],
).groupby(['year','arxiv_first']).agg({
    'acc': lambda s : sum(s==1),
    'rej': lambda s : sum(s==0),
        'c365': lambda s: np.round(np.mean(s),2),
        'c730': lambda s: np.round(np.mean(s),2),
    'c1095': lambda s: np.round(np.mean(s),2),
}).to_latex(escape=False)
)


\begin{tabular}{llrrrrr}
\toprule
     &       &  acc &  rej &   c365 &    c730 &   c1095 \\
year & arxiv_first &      &      &        &         &         \\
\midrule
2018 & $A=0$ &   10 &   10 &   9.85 &   27.45 &   47.40 \\
     & $A=1$ &   19 &    1 &  30.10 &  112.60 &  249.95 \\
2019 & $A=0$ &   24 &   24 &  14.52 &   38.70 &   63.52 \\
     & $A=1$ &   27 &   21 &  14.13 &   39.04 &   69.79 \\
2020 & $A=0$ &  190 &  312 &   8.32 &   20.53 &   34.03 \\
     & $A=1$ &  235 &  267 &  11.86 &   34.50 &   58.14 \\
2021 & $A=0$ &  182 &  320 &   8.99 &   25.15 &   12.71 \\
     & $A=1$ &  232 &  271 &   8.66 &   24.33 &   35.84 \\
2022 & $A=0$ &  198 &  216 &   8.76 &   14.24 &    0.29 \\
     & $A=1$ &  236 &  177 &   7.81 &   20.21 &    0.53 \\
\bottomrule
\end{tabular}



# Primary Analysis

In [21]:
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [22]:
C_formula = '+'.join(C_vars)
formula = f"{A} + " + C_formula
inst_formula = f"{A} + " + '+'.join([var for var in C_vars if 'inst_rank' not in var])
author_formula = f"{A} + " + '+'.join([var for var in C_vars if 'author_cite' not in var])


In [23]:
def print_cond_means_y(predictor, dat, A_col):
    Y_1 = np.mean(predictor.predict(dat[dat[A_col]]))
    Y_0 = np.mean(predictor.predict(dat[~dat[A_col]]))
    return pd.DataFrame([[Y_1,Y_0,Y_1-Y_0]],columns=['Y_1','Y_0','D1'])


### (Optional) Unmatched

In [15]:
Y_unadjust = smf.logit(formula=f"{Y} ~ {formula}", data=design_mat).fit()


Optimization terminated successfully.
         Current function value: 0.606045
         Iterations 6


In [17]:
print_cond_means_y(Y_unadjust, design_mat, 'arxiv_first')


Unnamed: 0,Y_1,Y_0,D1
0,0.504038,0.377019,0.127019


### Primary Analysis: on the Matched Sample (without NOC)

#### 1. On the full matched sample

In [400]:
Y_matched = smf.logit(formula=f"{Y} ~ {formula}", data=dat).fit()


Optimization terminated successfully.
         Current function value: 0.623841
         Iterations 5


In [618]:
# helper for bootstraping
def bootstrap_ci(dat, A_col, match_col='matched_set', n_rep=2000, est_name='D1'):
    idx_set = list(dat[match_col].unique())

    _bootstraps = []
    for _ in range(n_rep):
        smp_idx = set(np.random.choice(idx_set, len(idx_set), replace=True))
        dt = dat.loc[dat[dat[match_col].isin(smp_idx)].index].reset_index(drop=True)

        predictor = smf.logit(formula=f"{Y} ~ {formula}", data=dt).fit(disp=0)
        _bootstraps.append(
            np.mean(predictor.predict(dt[dt[A_col]])) - np.mean(predictor.predict(dt[~dt[A_col]]))
        )
    res = pd.DataFrame(_bootstraps, columns=[est_name])
    res['bidx'] = list(range(n_rep))
    return res


In [552]:
Y_matched_boot=bootstrap_ci(dat, 'arxiv_first',
                            match_col='matched_set', n_rep=2000,)


100%|██████████| 2000/2000 [02:37<00:00, 12.66it/s]


In [558]:
get_ci_table(Y_matched_boot.assign(grp=0),effect_col='D1', groups=['grp'],alpha=0.05)


Unnamed: 0_level_0,r,ll,hl
grp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.09762,0.070083,0.126071


#### 2. On subsets of matched sample determined by availability of citation counts

In [619]:
Y_matched_boot_byN=pd.concat([bootstrap_ci(
        dat[data_selector(dat,N)], 'arxiv_first',
        match_col='matched_set', n_rep=2000,est_name='DiD'
    ).assign(N=N)
 for N in tqdm.tqdm(Ns)])




  0%|          | 0/3 [00:00<?, ?it/s][A[A

  0%|          | 0/3 [05:00<?, ?it/s]3.58s/it][A[A


 67%|██████▋   | 2/3 [04:50<02:23, 143.90s/it][A[A

100%|██████████| 3/3 [06:49<00:00, 136.61s/it][A[A


In [623]:
get_ci_table(Y_matched_boot_byN.assign(grp=0),effect_col='DiD', groups=['grp','N'],alpha=0.05)


Unnamed: 0_level_0,Unnamed: 1_level_0,r,ll,hl
grp,N,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,c1095,0.100301,0.054199,0.143662
0,c365,0.097885,0.070897,0.124604
0,c730,0.098994,0.06801,0.131942


# Anlaysis with NOC

NOCs are defined by thresholding citation counts using the 0.5, 0.7, and 0.9 -th quantile
in the matched sample. Below is the threshold for each citation count.

In [24]:

_=[print([np.quantile(dat[N].dropna(),q) for q in [0.5,0.75,0.9]])
  for N in ['c365','c730','c1095']
]


[4.0, 11.0, 23.0]
[10.0, 29.0, 65.0]
[11.0, 41.0, 102.70000000000005]


In [25]:
thres = [
    ('c365', 4),
    ('c365', 11),
    ('c365', 23),
    ('c730', 10),
    ('c730', 29),
    ('c730', 65),
    ('c1095', 11),
    ('c1095', 41),
    ('c1095', 103),
]
def data_selector(df, N):
    """Select only valid subset of data."""
    if N not in ['c365','c730','c1095']:
        return (df.index >=0) | (df.index<0)
    return df.year <= {
        'c365': 2022,
        'c730': 2021,
        'c1095': 2020,
    }[N]


### Analysis on all matched samples

In [26]:
def fit_full_model(data, N, t):
    """Model for DiD (additive equi-confounding adjustment for DiD)"""
    sel = data_selector(data, N)
    var = f"N_full_{N}_{t}"
    data[var] = 1. * ( data[N] >= t)
    try:
        Y_model = smf.logit(f"{Y}~{formula}", data=data[sel]).fit(disp=0)
    except:
        Y_model = smf.logit(f"{Y}~{formula}", data=data[sel]).fit(disp=0,method='bfgs')

    try:
        N_model = smf.logit(f"{var}~{formula}", data=data[sel]).fit(disp=0)
    except:
        N_model = smf.logit(f"{var}~{formula}", data=data[sel]).fit(disp=0,method='bfgs')
    return Y_model, N_model


In [27]:
def print_cond_means(predictors, dat, A_col):
    """Helper for collecting the results."""
    Y_pred, N_pred = predictors
    X_tr, X_ct = dat[dat[A_col]], dat[~dat[A_col]]
    Y_1 = np.mean(Y_pred.predict(X_tr))
    Y_0 = np.mean(Y_pred.predict(X_ct))
    N_1 = np.mean(N_pred.predict(X_tr))
    N_0 = np.mean(N_pred.predict(X_ct))
    return pd.DataFrame([[
        Y_1, Y_0, N_1, N_0, N_1-N_0, Y_1-Y_0, (Y_1-N_1)-(Y_0-N_0)
    ]], columns=['Y_1','Y_0','N_1','N_0','DN','DY','DiD'])
    print(f"DN: {N_1-N_0:6.4f} vanilla ATET: {Y_1-Y_0:6.4f}\tNOC ATET: {(Y_1-N_1)-(Y_0-N_0):6.4f}")


In [28]:
def cond_means_and_ci(dat, A_col, N, t, match_col='matched_set', n_rep=2000,):
    """Bootstrap CI."""
    idx_set = list(dat[match_col].unique())


    _bootstraps = []

    for bidx in range(n_rep):
        smp_idx = set(np.random.choice(idx_set, len(idx_set), replace=True))
        dt = dat[dat[match_col].isin(smp_idx)].reset_index(drop=True)
        sel = data_selector(dt, N)
        predictors = fit_full_model(dt, N, t)

        _bootstraps.append(print_cond_means(predictors, dt[sel], A_col)
         .assign(N=N, t=t, n_N1=sum(dt[sel][N] > t))
         .assign(boot_idx=bidx)
        )
    return pd.concat(_bootstraps)


Sample sizes for each study

In [437]:
[dat[data_selector(dat, N)].shape[0] for N in Ns]


[2972, 2145, 1140]

#### NOC-adjusted Models

In [376]:
fitted_full_models = [
    fit_full_model(dat, N, t)
    for N, t in tqdm.tqdm(thres)
]



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:00<00:01,  7.25it/s][A
 22%|██▏       | 2/9 [00:00<00:00,  7.76it/s][A
 33%|███▎      | 3/9 [00:00<00:00,  7.96it/s][A
 56%|█████▌    | 5/9 [00:00<00:00,  9.10it/s][A
 67%|██████▋   | 6/9 [00:00<00:00,  9.22it/s][A
100%|██████████| 9/9 [00:00<00:00,  9.73it/s][A


In [377]:
full_model_atet = pd.concat(
    [(print_cond_means(predictors, dat[data_selector(dat, N)], 'arxiv_first')
     .assign(N=N, t=t, n_N1=sum(dat[data_selector(dat, N)][N] > t))
    ) for predictors, (N,t) in tqdm.tqdm(zip(fitted_full_models, thres))
])



0it [00:00, ?it/s][A
2it [00:00, 11.41it/s][A
4it [00:00, 11.84it/s][A
6it [00:00, 11.84it/s][A
9it [00:00, 12.06it/s][A


In [537]:
full_model_ci = pd.concat([
    cond_means_and_ci(dat, 'arxiv_first', N=N, t=t, n_rep=2000)
    for N, t in tqdm.tqdm(thres)
])


100%|██████████| 9/9 [42:55<00:00, 286.17s/it]


In [29]:
def get_ci_table(boot_df, effect_col='DiD', groups=['N','t'], alpha=0.05):
    ll = alpha / 2.
    hl = 1. - ll
    return pd.concat([
      boot_df.groupby(groups)[effect_col].apply(lambda s : np.mean(s)).to_frame('r'),
      boot_df.groupby(groups)[effect_col].apply(lambda s : np.quantile(s,ll)).to_frame('ll'),
     boot_df.groupby(groups)[effect_col].apply(lambda s : np.quantile(s,hl)).to_frame('hl'),
    ],axis=1)


In [None]:
full_model_ci.to_csv('./processed_data/nongrp_bootstrap_smps.csv',
                    header=True, index=False)


In [538]:
full_model_ci_table = get_ci_table(full_model_ci,alpha=0.05)


In [594]:
full_model_ci_table


Unnamed: 0_level_0,Unnamed: 1_level_0,r,ll,hl
N,t,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c1095,11,-0.091711,-0.13388,-0.048697
c1095,41,-0.02633,-0.06877,0.017409
c1095,103,0.021648,-0.027108,0.070624
c365,4,-0.006561,-0.037596,0.022369
c365,11,0.037272,0.009594,0.065131
c365,23,0.075585,0.047771,0.102834
c730,10,-0.040641,-0.072574,-0.008928
c730,29,0.007396,-0.025262,0.042002
c730,65,0.051294,0.016825,0.08658


In [541]:
print(
    r'$'+r'$ & $'.join(
     [f"{r:.4f}"for r in
        [full_model_ci_table.reset_index().query(f"(N=='{N}') and (t=={t})").r.iloc[0]
         for N,t in thres]
     ]
    )+r'$ \\'
)


$-0.0066$ & $0.0373$ & $0.0756$ & $-0.0406$ & $0.0074$ & $0.0513$ & $-0.0917$ & $-0.0263$ & $0.0216$ \\


In [450]:
print(
    r'$'+r'$ & $'.join(
     [f"{r:.4f}"for r in
        [full_model_ci_table.reset_index().query(f"(N=='{N}') and (t=={t})").r.iloc[0]
         for N,t in thres]
     ]
    )+r'$ \\'
)


$-0.0066$ & $0.0377$ & $0.0752$ & $-0.0402$ & $0.0072$ & $0.0508$ & $-0.0912$ & $-0.0264$ & $0.0210$ \\


In [542]:
print(
    r'$'+r'$ & $'.join(
     [f"({ll:.4f},{lh:.4f})"for ll,lh in
        [(full_model_ci_table.reset_index().query(f"(N=='{N}') and (t=={t})").ll.iloc[0],
         full_model_ci_table.reset_index().query(f"(N=='{N}') and (t=={t})").hl.iloc[0] )
         for N,t in thres]
     ]
    )+r'$ \\'
)


$(-0.0376,0.0224)$ & $(0.0096,0.0651)$ & $(0.0478,0.1028)$ & $(-0.0726,-0.0089)$ & $(-0.0253,0.0420)$ & $(0.0168,0.0866)$ & $(-0.1339,-0.0487)$ & $(-0.0688,0.0174)$ & $(-0.0271,0.0706)$ \\


In [451]:
print(
    r'$'+r'$ & $'.join(
     [f"({ll:.4f},{lh:.4f})"for ll,lh in
        [(full_model_ci_table.reset_index().query(f"(N=='{N}') and (t=={t})").ll.iloc[0],
         full_model_ci_table.reset_index().query(f"(N=='{N}') and (t=={t})").hl.iloc[0] )
         for N,t in thres]
     ]
    )+r'$ \\'
)


$(-0.0136,0.0002)$ & $(0.0295,0.0457)$ & $(0.0669,0.0836)$ & $(-0.0478,-0.0325)$ & $(-0.0016,0.0162)$ & $(0.0407,0.0611)$ & $(-0.1012,-0.0810)$ & $(-0.0387,-0.0142)$ & $(0.0078,0.0345)$ \\


## Stratified Anlaysis for Author Subgroups

In [73]:
def cond_means_ci_subgrp(dat, N, t, A_col, level_col, level_vars, level_lb,
                         all_lb='all',grp_name='level',
                         n_rep=2000, match_col='matched_set'):
    """DiD/NOC for stratified analysis."""

    estimates = []
    for lv, lb in zip([None,]+level_vars, [all_lb,]+level_lb):
        df = dat[dat[level_col] == lv] if lv is not None else dat
        df = df[data_selector(df, N)].reset_index(drop=True)
        idx_set = list(df[match_col].unique())

        _bootstraps = []
        for bidx in range(n_rep):
            smp_idx = set(np.random.choice(idx_set, len(idx_set), replace=True))
            dt = df[df[match_col].isin(smp_idx)].reset_index(drop=True)
            try:
                Y_pred, N_pred = fit_full_model(dt, N, t)
            except:
                continue
            X_tr, X_ct = dt[dt[A_col]], dt[~dt[A_col]]
            Y_1 = Y_pred.predict(X_tr)
            Y_0 = Y_pred.predict(X_ct)
            N_1 = N_pred.predict(X_tr)
            N_0 = N_pred.predict(X_ct)

            # DiD
            _bootstraps.append(
                (np.mean(Y_1) - np.mean(N_1)) -\
                (np.mean(Y_0) - np.mean(N_0))
            )
        res = pd.DataFrame(_bootstraps, columns=['DiD'])
#         res['bidx'] = list(range(n_rep))
        res['grp'] = grp_name
        res['n'] = len(idx_set)
        res[level_col] = lb
        estimates.append(res)
    return pd.concat(estimates).assign(N=N, t=t)


In [31]:
def get_ci_table(boot_df, effect_col='DiD', groups=['N','t'], alpha=0.05):
    ll = alpha / 2.
    hl = 1. - ll
    return pd.concat([
      boot_df.groupby(groups)[effect_col].apply(lambda s : np.mean(s)).to_frame('r'),
      boot_df.groupby(groups)[effect_col].apply(lambda s : np.quantile(s,ll)).to_frame('ll'),
     boot_df.groupby(groups)[effect_col].apply(lambda s : np.quantile(s,hl)).to_frame('hl'),
    ],axis=1)


In [None]:
author_model_ci = pd.concat([
        cond_means_ci_subgrp(dat, N=N, t=t,A_col='arxiv_first',
            level_col='author_level',level_vars=['L','M','H'],
            level_lb=['<500', '500-2000', '>2000'], grp_name='Max Author Citation',
            n_rep=1000
        )
    for N, t in tqdm.tqdm(thres+[(n,0) for n in Ns])
])

inst_model_ci = pd.concat([
        cond_means_ci_subgrp(dat, N=N, t=t,A_col='arxiv_first',
            level_col='inst_level',level_vars=['L','M','H'],
            level_lb=['top-10', '10-100', 'others'], grp_name='Min Institution Rank',
            n_rep=1000
        )
    for N, t in tqdm.tqdm(thres)
])


In [None]:
full_subgrp_boots = pd.concat([
    inst_model_ci.rename({'inst_level':'level'},axis=1).replace({'level':{'all':'All Institutions'}},),
    author_model_ci.rename({'author_level':'level'},axis=1).replace({'level':{'all':'All Authors'}},),
])


full_subgrp_boots['t_lb']=full_subgrp_boots.apply(lambda s :f"{s['N']}>{s['t']}",axis=1)
full_subgrp_boots['q']=full_subgrp_boots.apply(lambda s :f"{s['N']}>{s['t']}",axis=1)

for i, (N, t) in enumerate(thres):
    q = [r'50\%', r'75\%', r'90\%'][i%3]
    full_subgrp_boots.loc[
        (full_subgrp_boots.N==N)&(full_subgrp_boots.t==t), 'q'] = q
for N in Ns:
        full_subgrp_boots.loc[
        (full_subgrp_boots.N==N)&(full_subgrp_boots.t==0), 'q'] = r'0\%'


In [209]:
full_subgrp_boots.to_csv('./stratified_bootstrap_smps.csv',
                         header=True,
                         index=False)
