In [None]:
%pip install --force-reinstall --no-deps git+https://github.com/chrisjcameron/TexSoup.git@develop-main
#! pip install --editable /Users/cjc73/gits/arxiv/TexSoup/

In [1]:
#import zipfile
import tarfile
import io
import importlib
import os
import regex as re
import glob
import pandas as pd
import itertools as itr

In [2]:
import pyperclip   #copy text to clipboard for inspecting

In [3]:
from tqdm.auto import tqdm

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
# -

In [6]:
TS.__file__

'/Users/cjc73/miniconda3/envs/cforge/lib/python3.11/site-packages/TexSoup/__init__.py'

In [7]:
TS.__file__

'/Users/cjc73/miniconda3/envs/cforge/lib/python3.11/site-packages/TexSoup/__init__.py'

In [8]:
#importlib.reload(TS)

In [9]:
LOCAL_DATA_PATH = './data/2201_00_all/'

In [10]:
def pre_format(text):
    r'''Insert separating spaces into a few LaTeX character sequences before parsing.

    The substitutions are plain (non-regex) string replacements, applied one
    after another in the order listed:

        \}\   ->   \} \
        )}    ->   ) }
        )$    ->   ) $

    Args:
        text: LaTeX source as a single string.

    Returns:
        The source string with the substitutions applied.
    '''
    # (old, new) pairs; the doubled backslashes are Python escapes for a single "\".
    substitutions = (
        ('\\}\\', '\\} \\'),
        (')}', ') }'),
        (')$', ') $'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text
    # Experiments that are currently disabled (kept here as a reminder only):
    #   * normalising the spacing after `\left [`, `\left (` and `\left \{`
    #   * commenting out lines that start with `\newcommand` or `\def`

In [11]:
def find_doc_class(wrapped_file, name_match=False):
    r'''Scan a file for a \documentclass / \documentstyle line and return a score.

    Args:
        wrapped_file: Iterable of text lines (e.g. an open text-mode file).
        name_match: Whether the caller considers the file's name to look like
            a main document.

    Returns:
        -99999 if ``name_match`` is true and the first document-class line
               found names ``{standalone}`` or ``{subfiles}``;
        1      if ``name_match`` is true and the first document-class line
               found is any other class;
        0      otherwise.

    Only the first document-class line is inspected, so a later
    standalone/subfiles declaration in the same file goes unnoticed.

    NOTE(review): when ``name_match`` is false the whole file is still read
    but the result is always 0, even if a document class is present --
    confirm that this is the intended scoring.
    '''
    declares_class = re.compile(r"^\s*\\document(?:style|class)")
    declares_subdoc = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for line in wrapped_file:
        if not (declares_class.search(line) and name_match):
            continue
        return -99999 if declares_subdoc.search(line) else 1

    return 0


def find_main_tex_source_in_tar(tar_path, encoding='utf-8'):
    '''Identify the main TeX file in a tar archive.

    If the archive holds exactly one ``.tex`` member, that member is returned.
    Otherwise every ``.tex`` member is scored as
    ``find_doc_class(file, name_match=...) - depth`` and the highest-scoring
    member wins (the first one in archive order on a tie), where

    * ``name_match`` is true when the member path contains one of the
      keywords "paper", "main", "ms." or "article", and
    * ``depth`` is the number of ``/`` separators in the member path.

    Args:
        tar_path: Path to a tar archive (opened with mode ``'r'``) containing
            TeX source and support files.
        encoding: Text encoding used to read the candidate ``.tex`` members.

    Returns:
        The archive member name of the selected ``.tex`` file.

    Raises:
        ValueError: If the archive contains no readable ``.tex`` member.
        UnicodeDecodeError: If a candidate member cannot be decoded with
            ``encoding`` while it is being scanned.
    '''
    tex_names = ("paper", "main", "ms.", "article")

    with tarfile.open(tar_path, 'r') as in_tar:
        tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]

        if not tex_files:
            raise ValueError(f"no .tex file found in {tar_path!r}")

        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        main_files = {}
        for tf in tex_files:
            fp = in_tar.extractfile(tf)
            if fp is None:
                # extractfile() yields None for members that are not regular
                # files or links (e.g. a directory named "*.tex"); skip them.
                continue
            depth = tf.count('/')
            has_main_name = any(kw in tf for kw in tex_names)
            # newline=None -> universal newlines; the `with` closes the
            # wrapper even if scanning raises (e.g. on a decode error).
            with io.TextIOWrapper(fp, newline=None, encoding=encoding) as wrapped_file:
                main_files[tf] = find_doc_class(wrapped_file, name_match=has_main_name) - depth

        if not main_files:
            raise ValueError(f"no readable .tex file found in {tar_path!r}")

        # max() also covers the case of a single remaining candidate.
        return max(main_files, key=main_files.get)

In [12]:
def soup_from_tar(tar_path, encoding='utf-8', tolerance=0):
    '''Parse the main TeX file of a tar archive with TexSoup.

    The main file is chosen by ``find_main_tex_source_in_tar``, decoded with
    ``encoding``, passed through ``pre_format`` and then handed to
    ``TS.TexSoup`` with ``skip_envs=MATH_ENV_NAMES``.

    Args:
        tar_path: Path to the tar archive.
        encoding: Text encoding used to read the TeX source.
        tolerance: Forwarded unchanged to ``TS.TexSoup``.

    Returns:
        The object returned by ``TS.TexSoup``.
    '''
    main_member = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as archive:
        # newline=None turns on universal-newline handling
        reader = io.TextIOWrapper(
            archive.extractfile(main_member), newline=None, encoding=encoding
        )
        cleaned_source = pre_format(reader.read())
        return TS.TexSoup(cleaned_source, tolerance=tolerance, skip_envs=MATH_ENV_NAMES)

In [13]:
def source_from_tar(tar_path, encoding='utf-8'):
    '''Return the pre-formatted text of the main TeX file in a tar archive.

    The main file is chosen by ``find_main_tex_source_in_tar``, decoded with
    ``encoding`` and passed through ``pre_format``.

    Args:
        tar_path: Path to the tar archive.
        encoding: Text encoding used to read the TeX source.

    Returns:
        The pre-formatted source as a single string.
    '''
    main_member = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as archive:
        # newline=None turns on universal-newline handling
        reader = io.TextIOWrapper(
            archive.extractfile(main_member), newline=None, encoding=encoding
        )
        return pre_format(reader.read())

In [14]:
# Shared toggle: successive find_bad() calls alternate which half is tried first.
swap = itr.cycle((True, False))

def find_bad(current_text_lines, tolerance=0):
    '''Split the lines in two and return the first half that TexSoup fails to parse.

    Each call advances the module-level ``swap`` toggle; when it yields True
    the second half is tried before the first half.

    Args:
        current_text_lines: Sequence of source lines to bisect.
        tolerance: Forwarded to ``TS.TexSoup``. (Previously the body read an
            undefined name ``tolerance``; the resulting NameError was
            swallowed, so the first half tried was always reported as bad.)

    Returns:
        The half (a slice of ``current_text_lines``) whose parse raised an
        exception, or the string ``"--"`` when both halves parse on their own.
    '''
    mid = len(current_text_lines) // 2
    halves = [current_text_lines[:mid], current_text_lines[mid:]]
    if next(swap):
        halves.reverse()

    for half in halves:
        try:
            TS.TexSoup("\n".join(half), tolerance=tolerance, skip_envs=MATH_ENV_NAMES)
        except Exception:
            # Any parser error marks this half as bad. KeyboardInterrupt is
            # not an Exception subclass, so it still propagates.
            return half
    return "--"
    

def find_bad_lines(tar_path, encoding='utf-8'):
    '''Bisect the main TeX file of an archive down to the lines that fail to parse.

    The main file is read and pre-formatted, then repeatedly narrowed with
    ``find_bad`` until at most one line is left or ``find_bad`` stops making
    progress.

    Args:
        tar_path: Path to the tar archive.
        encoding: Text encoding used to read the TeX source.

    Returns:
        The narrowed list of lines. If ``find_bad`` reports that neither half
        fails on its own, its ``"--"`` sentinel string is returned instead.
        A source with zero or one line is returned unchanged as a list
        (this case used to raise UnboundLocalError).
    '''
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        # newline=None -> universal newlines
        with io.TextIOWrapper(fp, newline=None, encoding=encoding) as wrapped_file:
            current_text = pre_format(wrapped_file.read()).splitlines()

    while len(current_text) > 1:
        bad_half = find_bad(current_text)
        if bad_half == current_text:
            # No further narrowing possible (e.g. the "--" sentinel repeated).
            break
        current_text = bad_half

    return current_text

In [22]:
def show_context(text_path, offset, context_size=50):
    '''Return a short excerpt of a text file starting at a given position.

    The file is opened in text mode as UTF-8 first; if decoding raises
    ``UnicodeDecodeError`` the read is repeated with latin-1.

    Args:
        text_path: Path of the file to open. It is opened directly, so it
            must be the text file itself rather than an archive containing it.
        offset: Position passed to the text file's ``seek``.
        context_size: Maximum number of characters to read.

    Returns:
        The excerpt as a string.
    '''
    def _excerpt(encoding):
        with open(text_path, 'r', encoding=encoding) as handle:
            handle.seek(offset)
            return handle.read(context_size)

    try:
        return _excerpt('utf-8')
    except UnicodeDecodeError:
        # not valid UTF-8 -- retry below with latin-1
        pass
    return _excerpt('latin-1')

# file_path = 'Ising_v2.tex'
# offset_position = 805
# context = show_context(file_path, offset_position)
# print("Error context at offset 805:", context)

In [25]:
# Scratch: look at the file contents around offset 8584, the offset quoted in the
# TexSoup "Malformed argument" error for 2201.00468v1.
# NOTE(review): `infile_path` is only assigned in cells further down, so this cell
# fails on a fresh kernel. It also holds a .tar.gz path, which show_context opens
# directly -- presumably why the captured output is unreadable bytes; confirm.
show_context(infile_path, 8584)

'¯P ÿfãé£GO\x9f\x1a§æð\x06ÕÛ£Á\x19®åF6\x072Ó\x8b\x93\x89×\x82?`-Mðï8ð\x93\x18\x7fÀ¤ÄÊË\t'

## Check a file with parse errors

In [16]:
# Minimal reproduction: a bare command wrapped in braces.
# pre_format() applies the notebook's usual substitutions before parsing.
min_example=r"""
{\subsection}
""".strip()
try:
    TS.TexSoup(pre_format(min_example), tolerance=0)
except (AssertionError, TypeError) as e:
    # TexSoup reports this malformed argument as a TypeError, which the earlier
    # AssertionError-only handler let escape; show the error instead of failing.
    print(f"{type(e).__name__}: {e}")

TypeError: [Line: 0, Offset 0] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('subsection', [BraceGroup('}')])]

In [17]:
# Minimal reproduction: \renewcommand whose replacement is a bare command.
# pre_format() applies the notebook's usual substitutions before parsing.
min_example=r"""
\renewcommand{\tilde}{\widetilde}
""".strip()
try:
    TS.TexSoup(pre_format(min_example), tolerance=0)
except (AssertionError, TypeError) as e:
    # TexSoup reports this malformed argument as a TypeError, which the earlier
    # AssertionError-only handler let escape; show the error instead of failing.
    print(f"{type(e).__name__}: {e}")

TypeError: [Line: 0, Offset 21] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('widetilde', [BraceGroup('}')])]

In [18]:
# Inspect one archive: copy its pre-formatted main source to the clipboard,
# parse it, then list the title and section headings.
infile_path = "./data/2201_00_all/2201.00001v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

# The raw (pre-formatted) source goes to the clipboard for manual inspection.
text = source_from_tar(infile_path)
pyperclip.copy(text)
# Parsed with tolerance=1 here (the bulk check below uses TOLERANCE = 0).
soup = soup_from_tar(infile_path, tolerance=1)


# soup.find may return a falsy value when there is no \title; guard the print.
title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

title: ['Modeling Advection on Directed Graphs using  Mat', "\\'", 'e', 'rn Gaussian Processes for Traffic Flow']
 section: ['Introduction']
 section: ['Understanding the directed graph advection operator']
 section: ['Directed Graph Advection Mat', "\\'", 'e', 'rn Gaussian Process (DGAMGP) ']
 section: ['Numerical Results']
 section: ['Conclusions']
 section: ['Upwinding discretizations of linear advection']
 section: ['Examples of ', '$', 'L_', 'adv', '$', ' on balanced graphs resulting in finite difference discretizations of linear advection']
 section: ['Additional Experiments']


## Quick check a folder of tar files

In [None]:
# Try to parse every archive under LOCAL_DATA_PATH: first as UTF-8, then as
# latin-1 if decoding fails. Count the successes per encoding and record the
# exception type for every archive that could not be parsed.
files = glob.glob(f'{LOCAL_DATA_PATH}/*.tar.gz')
files_count = len(files)
utf_count = 0
latin_count = 0
err_files = {}

TOLERANCE = 0

# Two bars: the outer one counts failures, the inner one tracks overall progress.
with tqdm(total=files_count, desc="errors") as err_prog:
    for tar_file in tqdm(files, desc="Progress", display=True):
        # First attempt: UTF-8
        try:
            soup = soup_from_tar(tar_file, encoding='utf-8', tolerance=TOLERANCE)
        except UnicodeDecodeError:
            # not UTF-8 -- fall through to the latin-1 attempt below
            pass
        except KeyboardInterrupt:
            break
        except Exception as e:
            # includes EOFError; a failure under UTF-8 is not retried
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            continue
        else:
            utf_count += 1
            continue

        # Second attempt: latin-1
        try:
            soup = soup_from_tar(tar_file, encoding='latin-1', tolerance=TOLERANCE)
        except KeyboardInterrupt:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
        else:
            latin_count += 1

In [None]:
# Summary of the bulk check.
# NOTE(review): files_count is the number of archives found by glob; if the loop
# was interrupted, fewer were actually processed.
print(f"{files_count} processed, {len(err_files)} failures.")
print(f"UTF8: {utf_count}; Latin1: {latin_count}")
# Bare expression: displays the mapping of archive path -> recorded exception type.
err_files

## Scratch below here

In [24]:
# Scratch: same probe as earlier in the notebook.
# NOTE(review): the captured AttributeError ('str' object has no attribute 'decode')
# does not match the current show_context body, so it presumably comes from an
# older version of that function -- re-run or clear this output.
show_context(infile_path, 8584)

AttributeError: 'str' object has no attribute 'decode'

In [19]:
# Scratch: repeat the single-file inspection on an archive that fails to parse
# (the captured output is a TexSoup "Malformed argument" TypeError at offset 8584).
# NOTE(review): this rebinds the notebook-level TOLERANCE and infile_path.
TOLERANCE = 0
infile_path = "./data/2201_00_all/2201.00468v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

# The raw (pre-formatted) source goes to the clipboard for manual inspection.
text = source_from_tar(infile_path)
pyperclip.copy(text)
soup = soup_from_tar(infile_path, tolerance=TOLERANCE)


# Only reached when parsing succeeds.
title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

TypeError: [Line: 0, Offset 8584] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('widetilde', [BraceGroup('}')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hd')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('mathbf', [BraceGroup('d')]))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bs')), BraceGroup(TexCmd('bar', [BraceGroup('s')]), '_n')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hR')), BraceGroup(TexCmd('hat', [BraceGroup('R')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hD')), BraceGroup(TexCmd('hat', [BraceGroup('D')]), '_N')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hl')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('ell'))]), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hlo')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('ell'))]), '^', BraceGroup('(1) '), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hlz')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('ell'))]), '^', BraceGroup('(0) '), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hlt')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('ell'))]), '^', BraceGroup('(t) '), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hm')), BraceGroup(TexCmd('hat', [BraceGroup('m')]), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hateo')), BraceGroup(TexCmd('hat', [BraceGroup('e')]), '^', BraceGroup('(1) '), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hatez')), BraceGroup(TexCmd('hat', [BraceGroup('e')]), '^', BraceGroup('(0) '), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hatet')), BraceGroup(TexCmd('hat', [BraceGroup('e')]), '^', BraceGroup('(t) '), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('lnk')), BraceGroup(TexCmd('ell'), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('enk')), BraceGroup('e_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hf')), 
BraceGroup(TexCmd('hat', [BraceGroup('f')]), '_n')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hfs')), BraceGroup(TexCmd('hat', [BraceGroup('f')]), '_', BraceGroup(TexCmd('S')))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('phis')), BraceGroup(TexCmd('phi'), '^*')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tphi')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('phi'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('phihatn')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('phi'))]), '_n')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('phihatnk')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('phi'))]), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hmi')), BraceGroup(TexCmd('hat', [BraceGroup('m')]), '^', BraceGroup('(i) '))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('pihatN')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('pi'))]), '_N')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('pihatn')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('pi'))]), '_n')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('pihatnk')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('pi'))]), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('pis')), BraceGroup(TexCmd('pi'), '^*')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hai')), BraceGroup(TexCmd('hat', [BraceGroup('A')]), '_', BraceGroup('q,i'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bh')), BraceGroup(TexCmd('bar', [BraceGroup('h')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bez')), BraceGroup(TexCmd('beta'), '_0')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('vt')), BraceGroup(TexCmd('theta'), '_0')]), '\n', '%\\newcommand{\\vt}{{\\boldsymbol\\theta}}', '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bdelta')), BraceGroup(BraceGroup(TexCmd('boldsymbol'), TexCmd('delta')))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bfeta')), BraceGroup(BraceGroup(TexCmd('boldsymbol'), TexCmd('eta')))]), '\n', TexCmd('newcommand', 
[BraceGroup(TexCmd('bvt')), BraceGroup(TexCmd('bar', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hvt')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('thetahatsup')), BraceGroup(TexCmd('hvt'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SUP')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('thetahatss')), BraceGroup(TexCmd('hvt'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('thetahatora')), BraceGroup(TexCmd('hvt'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' ORA')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('muhatsup')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('mu'))]), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SUP')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('muhatss')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('mu'))]), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('muhatora')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('mu'))]), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' ORA')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('vti')), BraceGroup(TexCmd('theta'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' INIT')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('vtos')), BraceGroup(TexCmd('theta'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' OS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('thetahatinit')), BraceGroup(TexCmd('hvt'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' INIT')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('lams')), BraceGroup(TexCmd('lambda'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SUP')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('lamss')), BraceGroup(TexCmd('lambda'), '_', 
BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('lameff')), BraceGroup(TexCmd('lambda'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' EFF')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('lamate')), BraceGroup(TexCmd('lambda'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' ATE')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('zes')), BraceGroup(TexCmd('zeta'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SUP')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('zess')), BraceGroup(TexCmd('zeta'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('zeeff')), BraceGroup(TexCmd('zeta'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' EFF')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('oms')), BraceGroup(TexCmd('omega'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SUP')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('omss')), BraceGroup(TexCmd('omega'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('omeff')), BraceGroup(TexCmd('omega'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' EFF')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('sigsup')), BraceGroup(TexCmd('sigma'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SUP')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('sigss')), BraceGroup(TexCmd('sigma'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' SS')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('sigeff')), BraceGroup(TexCmd('sigma'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' EFF')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('sigqte')), BraceGroup(TexCmd('sigma'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' QTE')])))]), '\n', TexCmd('newcommand', 
[BraceGroup(TexCmd('muhatcc')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('mu'))]), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' CC')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('thetahatcc')), BraceGroup(TexCmd('hvt'), '_', BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('tiny'), ' CC')])))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('vts')), BraceGroup(TexCmd('vt'), '^*')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('mhatn')), BraceGroup(TexCmd('hat', [BraceGroup('m')]), '_n')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('mhatnk')), BraceGroup(TexCmd('hat', [BraceGroup('m')]), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bth')), BraceGroup(TexCmd('bar', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tth')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tqc')), BraceGroup(TexCmd('theta'), '_', BraceGroup(TexCmd('mq'), '^c'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tvt')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('te')), BraceGroup(TexCmd('tilde', [BraceGroup('E')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tc')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('hbox', [BraceGroup('Cov')]))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('home')), BraceGroup(TexCmd('hat'), ' ', TexCmd('omega'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hq')), BraceGroup(TexCmd('hat', [BraceGroup('q')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hpsi')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('psi'))]), '_', BraceGroup('n,k'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tabincell')), BraceGroup('[')]), '2', ']', BraceGroup(TexNamedEnv('tabular', ['#2'], [BraceGroup('@', BraceGroup(), '#1@', BraceGroup())])), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('nn')), BraceGroup('n^', BraceGroup('-1'))]), '\n', 
TexCmd('newcommand', [BraceGroup(TexCmd('n')), BraceGroup('N^', BraceGroup('-1'))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hth')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('heta')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('boldsymbol'), TexCmd('eta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hal')), BraceGroup(TexCmd('wh', [BraceGroup(TexCmd('balpha'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hb')), BraceGroup(TexCmd('hat', [BraceGroup('b')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hP')), BraceGroup(TexCmd('hat', [BraceGroup('P')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tbe')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('bbeta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bbe')), BraceGroup(TexCmd('boldsymbol', [BraceGroup(TexCmd('bar', [BraceGroup(TexCmd('beta'))]))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bgammahat')), BraceGroup(TexCmd('wh', [BraceGroup(TexCmd('bgamma'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bxihat')), BraceGroup(TexCmd('wh', [BraceGroup(TexCmd('bxi'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hg')), BraceGroup(TexCmd('wh', [BraceGroup('g')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hG')), BraceGroup(TexCmd('wh', [BraceGroup('G')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('bG')), BraceGroup(TexCmd('overline', [BraceGroup('G')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('mbG')), BraceGroup(TexCmd('mathbb', [BraceGroup('G')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('mbP')), BraceGroup(TexCmd('mathbf', [BraceGroup('P')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hmbP')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('mathbf', [BraceGroup('P')]))]), '_k')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('E')), BraceGroup(TexCmd('mathbb', [BraceGroup('E')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hT')), 
BraceGroup(TexCmd('wh', [BraceGroup('T')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hz')), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('zeta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tz')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('zeta'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('ta')), BraceGroup(TexCmd('tilde', [BraceGroup('a')]), '_n')]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tx')), BraceGroup(TexCmd('tilde', [BraceGroup(TexCmd('x'))]))]), '\n', '%\\newcommand{\\tz}{\\tilde{Z}}', '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tw')), BraceGroup(TexCmd('tilde', [BraceGroup('X')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hw')), BraceGroup(TexCmd('hat', [BraceGroup('X')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('hY')), BraceGroup(TexCmd('hat', [BraceGroup('Y')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('tX')), BraceGroup(TexCmd('widetilde', [BraceGroup(TexCmd('X'))]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('az')), BraceGroup(TexCmd('widetilde', [BraceGroup('Z')]))]), '\n', '%\\newcommand{\\be}{\\begin{equation}}', '\n\t', '%\\newcommand{\\ee}{\\end{equation}}', '\n', '%\\newcommand{\\md}{\\mathcal{D}}', '\n', TexCmd('def', [TexCmd('ms'), BraceGroup(TexCmd('mathcal', [BraceGroup('S')]))]), '\n', TexCmd('def', [TexCmd('mp'), BraceGroup(TexCmd('mathcal', [BraceGroup('P')]))]), '\n', TexCmd('def', [TexCmd('md'), BraceGroup(' ', BraceGroup(TexCmd('cal'), ' D'))]), '\n', TexCmd('def', [TexCmd('mx'), BraceGroup(' ', BraceGroup(TexCmd('cal'), ' X'))]), '\n', TexCmd('def', [TexCmd('mb'), BraceGroup(' ', TexCmd('mathcal', [BraceGroup('B')]))]), '\n', TexCmd('def', [TexCmd('mg'), BraceGroup(' ', TexCmd('mathcal', [BraceGroup('G')]))]), '\n', TexCmd('def', [TexCmd('cl'), BraceGroup(' ', TexCmd('mathcal', [BraceGroup('L')]))]), '\n', TexCmd('def', [TexCmd('cu'), BraceGroup(' ', TexCmd('mathcal', [BraceGroup('U')]))]), '\n', TexCmd('def', [TexCmd('ct'), BraceGroup(' ', TexCmd('mathcal', 
[BraceGroup('T')]))]), '\n', TexCmd('def', [TexCmd('xq'), BraceGroup('X_', BraceGroup(TexCmd('mq')))]), '\n', TexCmd('def', [TexCmd('xqc'), BraceGroup('X_', BraceGroup(TexCmd('mq'), '^c'))]), '\n', TexCmd('def', [TexCmd('xiq'), BraceGroup('X_', BraceGroup('i', TexCmd('mq')))]), '\n', TexCmd('def', [TexCmd('xiqc'), BraceGroup('X_', BraceGroup('i', TexCmd('mq'), '^c'))]), '\n', TexCmd('def', [TexCmd('tzero'), BraceGroup(TexCmd('mathbf', [BraceGroup('0')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('red')), BraceGroup(TexCmd('textcolor', [BraceGroup('red')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('blue')), BraceGroup(TexCmd('textcolor', [BraceGroup('blue')]))]), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('p')), BraceGroup(TexCmd('mathrm', [BraceGroup('pr')]))]), '\n', TexCmd('def', [TexCmd('si'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('i=1'), '^', BraceGroup('n+N'), '$')]))]), '\n', TexCmd('def', [TexCmd('sm'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('m=1'), '^', BraceGroup('M'), '$')]))]), '\n', TexCmd('def', [TexCmd('sl'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('i=1'), '^n', '$')]))]), '\n', TexCmd('def', [TexCmd('slk'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('i', TexCmd('in'), TexCmd('I'), '_', BraceGroup('k'), '^-'), '$')]))]), '\n', TexCmd('def', [TexCmd('su'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('i=n+1'), '^', BraceGroup('n+N'), '$')]))]), '\n', TexCmd('def', [TexCmd('sk'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('k=1'), '^', TexCmd('kK'), '$')]))]), '\n', TexCmd('def', [TexCmd('sjq'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sum'), '_', BraceGroup('j=1'), '^q', '$')]))]), '\n', TexCmd('def', [TexCmd('th'), BraceGroup('^', BraceGroup(TexCmd('rm'), ' th'))]), '\n', TexCmd('def', [TexCmd('mm'), BraceGroup(TexCmd('mathcal', 
[BraceGroup('M')]))]), '\n', TexCmd('def', [TexCmd('ttho'), BraceGroup(TexCmd('widetilde', [TexCmd('theta')]), '_', BraceGroup('k_0m_0', TexCmd('D'), "'"))]), '\n', TexCmd('def', [TexCmd('hv'), BraceGroup(TexCmd('hat', [BraceGroup('V')]), '_', BraceGroup('km', TexCmd('md')))]), '\n', TexCmd('def', [TexCmd('hve'), BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('varepsilon'))]), '_', BraceGroup('i'))]), '\n', TexCmd('def', [TexCmd('bze'), BraceGroup(TexCmd('mathbf', [BraceGroup('0')]))]), '\n', TexCmd('def', [TexCmd('bon'), BraceGroup(TexCmd('mathbf', [BraceGroup('1')]))]), '\n', TexCmd('def', [TexCmd('mbtv'), BraceGroup(TexCmd('mb'), '(', TexCmd('vt'), ',', TexCmd('v'), ') ')]), '\n', TexCmd('def', [TexCmd('mbbv'), BraceGroup(TexCmd('mb'), '(', TexCmd('bbeta'), '_0,', TexCmd('v'), ') ')]), '\n', TexCmd('def', [TexCmd('st'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('theta'), TexCmd('in'), TexCmd('ct')), '$')]))]), '\n', TexCmd('def', [TexCmd('sg'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('bgamma'), TexCmd('in'), TexCmd('ct')), '$')]))]), '\n', TexCmd('def', [TexCmd('sth'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('theta'), TexCmd('in'), TexCmd('Theta')), '$')]))]), '\n', TexCmd('def', [TexCmd('stx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('theta'), TexCmd('in'), TexCmd('ct')), '$')]))]), '\n', TexCmd('def', [TexCmd('sgx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('bgamma'), TexCmd('in'), TexCmd('ct')), '$')]))]), '\n', TexCmd('def', [TexCmd('sthk'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('theta'), TexCmd('in'), TexCmd('Theta'), '_k'), '$')]))]), '\n', TexCmd('def', [TexCmd('sthkm'), BraceGroup(TexCmd('hbox', [BraceGroup('$', 
TexCmd('sup'), '_', BraceGroup(TexCmd('theta'), TexCmd('in'), TexCmd('Theta'), '_', BraceGroup('km')), '$')]))]), '\n', TexCmd('def', [TexCmd('sthx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('theta'), TexCmd('in'), TexCmd('Theta')), '$')]))]), '\n', TexCmd('def', [TexCmd('sb'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('theta'), TexCmd('in'), TexCmd('mbtv')), '$')]))]), '\n', TexCmd('def', [TexCmd('sbk'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('theta'), TexCmd('in'), TexCmd('mb'), '_k'), '$')]))]), '\n', TexCmd('def', [TexCmd('sbx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('theta'), TexCmd('in'), TexCmd('mbtv')), '$')]))]), '\n', TexCmd('def', [TexCmd('sx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx')), '$')]))]), '\n', TexCmd('def', [TexCmd('sxx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), ',', TexCmd('X'), TexCmd('in'), TexCmd('mx')), '$')]))]), '\n', TexCmd('def', [TexCmd('sp'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('mbP'), TexCmd('in'), TexCmd('mp'), '_0'), '$')]))]), '\n', TexCmd('def', [TexCmd('sxp'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('mbP'), TexCmd('in'), TexCmd('mp')), '$')]))]), '\n', TexCmd('def', [TexCmd('sxop'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), '_', BraceGroup('0'), ',', '\\,', TexCmd('mbP'), TexCmd('in'), TexCmd('mp')), '$')]))]), '\n', TexCmd('def', [TexCmd('sxbe'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), 
TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('bbeta'), TexCmd('in'), TexCmd('mbbv')), '$')]))]), '\n', TexCmd('def', [TexCmd('sxthp'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('theta'), TexCmd('in'), TexCmd('mbtv'), ',', '\\,', TexCmd('mbP'), TexCmd('in'), TexCmd('mp')), '$')]))]), '\n', TexCmd('def', [TexCmd('sxthb'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('x'), TexCmd('in'), TexCmd('mx'), ',', '\\,', TexCmd('theta'), TexCmd('in'), TexCmd('mbtv'), ',', '\\,', TexCmd('bbeta'), TexCmd('in'), TexCmd('mbbv')), '$')]))]), '\n', TexCmd('def', [TexCmd('ss'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('s'), TexCmd('in'), TexCmd('ms')), '$')]))]), '\n', TexCmd('def', [TexCmd('ssx'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('sup'), '_', BraceGroup(TexCmd('s'), TexCmd('in'), TexCmd('ms'), '\\,', TexCmd('x'), TexCmd('in'), TexCmd('mx')), '$')]))]), '\n', TexCmd('def', [TexCmd('limn'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('lim'), '_', BraceGroup('n', TexCmd('to'), TexCmd('infty')), '$')]))]), '\n', TexCmd('def', [TexCmd('limh'), BraceGroup(TexCmd('hbox', [BraceGroup('$', TexCmd('lim'), '_', BraceGroup('h', TexCmd('to'), ' 0'), '$')]))]), '\n', TexCmd('def', [TexCmd('hL'), BraceGroup(TexCmd('hat', [BraceGroup('L')]), '_n')]), '\n', TexCmd('def', [TexCmd('hT'), BraceGroup(TexCmd('hat', [BraceGroup('T')]))]), '\n', TexCmd('def', [TexCmd('mn'), BraceGroup(TexCmd('mathcal', [BraceGroup('N')]))]), '\n', TexCmd('def', [TexCmd('Enk'), BraceGroup(TexCmd('E'), '_', BraceGroup('n,k'))]), '\n', '%\\def\\bmu{\\mbox{\\boldmath $\\mu$}}', '\n', TexCmd('def', [TexCmd('bmu'), BraceGroup(TexCmd('mathbf', [BraceGroup('M')]))]), '\n', TexCmd('def', [TexCmd('bPsi'), BraceGroup(TexCmd('mbox', [BraceGroup(TexCmd('boldmath'), ' ', '$', TexCmd('Psi'), '$')]))]), '\n', TexCmd('def', [TexCmd('bbetahat'), 
BraceGroup(TexCmd('hat', [BraceGroup(TexCmd('bbeta'))]))]), '\n', TexCmd('def', [TexCmd('Xarrow'), BraceGroup(TexCmd('overrightarrow', [BraceGroup(TexCmd('X'))]))]), '\n', '\n', TexCmd('makeatletter'), '\n', '%\\renewcommand{\\algocf@captiontext}[2]{#1\\algocf@typo. \\AlCapFnt{}#2} % text of caption', '\n', '%\\renewcommand{\\AlTitleFnt}[1]{#1\\unskip}% default definition', '\n', '%\\def\\@algocf@capt@plain{top}', '\n', '%\\renewcommand{\\algocf@makecaption}[2]{%', '\n\t', '%  \\addtolength{\\hsize}{\\algomargin}%', '\n\t', '%  \\sbox\\@tempboxa{\\algocf@captiontext{#1}{#2}}%', '\n\t', '%  \\ifdim\\wd\\@tempboxa >\\hsize%     % if caption is longer than a line', '\n\t', '%    \\hskip .5\\algomargin%', '\n\t', '%    \\parbox[t]{\\hsize}{\\algocf@captiontext{#1}{#2}}% then caption is not centered', '\n\t', '%  \\else%', '\n\t', '%    \\global\\@minipagefalse%', '\n\t', '%    \\hbox to\\hsize{\\box\\@tempboxa}% else caption is centered', '\n\t', '%  \\fi%', '\n\t', '%  \\addtolength{\\hsize}{-\\algomargin}%', '\n\t', '%}', '\n', '%\\makeatother', '\n', '\n', TexCmd('def', [TexCmd('trans'), BraceGroup('^', BraceGroup(TexCmd('rm'), ' T'))]), '\n', TexCmd('def', [TexCmd('t'), BraceGroup(TexCmd('rm'), ' T')]), '\n', TexCmd('def', [TexCmd('boxit'), BraceGroup('#')]), BraceGroup(TexCmd('vbox', [BraceGroup(TexCmd('hrule'), TexCmd('hbox', [BraceGroup(TexCmd('vrule'), TexCmd('kern'), '6pt  ', TexCmd('vbox', [BraceGroup(TexCmd('kern'), '6pt#1', TexCmd('kern'), '6pt')]), TexCmd('kern'), '6pt', TexCmd('vrule'))]), TexCmd('hrule'))])), '\n', TexCmd('def', [TexCmd('rjccomment'), BraceGroup('#')]), BraceGroup(TexCmd('vskip'), ' 2mm', TexCmd('boxit', [BraceGroup(TexCmd('vskip'), ' 2mm', BraceGroup(TexCmd('color', [BraceGroup('black')]), TexCmd('bf'), '#1'), ' ', BraceGroup(TexCmd('color', [BraceGroup('blue')]), TexCmd('bf'), ' -- RJC', TexCmd('vskip'), ' 2mm'))]), TexCmd('vskip'), ' 2mm'), '\n', TexCmd('def', [TexCmd('rcom'), BraceGroup('#')]), BraceGroup(BraceGroup(TexCmd('color', 
[BraceGroup('red')]), TexCmd('bf'), '#1'), ' '), '\n', TexCmd('def', [TexCmd('bcom'), BraceGroup('#')]), BraceGroup(BraceGroup(TexCmd('color', [BraceGroup('blue')]), TexCmd('bf'), '#1'), ' '), '\n', TexCmd('def', [TexCmd('daicomment'), BraceGroup('#')]), BraceGroup(TexCmd('vskip'), ' 2mm', TexCmd('boxit', [BraceGroup(TexCmd('vskip'), ' 2mm', BraceGroup(TexCmd('color', [BraceGroup('black')]), TexCmd('bf'), '#1'), ' ', BraceGroup(TexCmd('color', [BraceGroup('blue')]), TexCmd('bf'), ' -- Dai', TexCmd('vskip'), ' 2mm'))]), TexCmd('vskip'), ' 2mm'), '\n', '%%% User-defined macros should be placed here, but keep them to a minimum.', '\n', TexCmd('def', [TexCmd('Bka'), BraceGroup(BraceGroup(TexCmd('it'), ' Biometrika'))]), '\n', TexCmd('def', [TexCmd('AIC'), BraceGroup(TexCmd('textsc', [BraceGroup('aic')]))]), '\n', TexCmd('def', [TexCmd('T'), BraceGroup(BraceGroup(' ', TexCmd('mathrm', [BraceGroup(TexCmd('scriptscriptstyle'), ' T')]), ' '))]), '\n', TexCmd('def', [TexCmd('v'), BraceGroup(BraceGroup(TexCmd('varepsilon')))]), '\n', '\n', '%\\addtolength\\topmargin{35pt}', '\n', '%\\DeclareMathOperator{\\Thetabb}{\\mathcal{C}}', '\n', TexCmd('def', [TexCmd('bse'), BraceGroup(TexNamedEnv('eqnarray*', ['}\n\t\\def\\ese{'], []))]), '\n', TexCmd('def', [TexCmd('be'), BraceGroup(TexNamedEnv('eqnarray', ['}\n\t\\def\\ee{'], []))]), '\n', TexCmd('pagenumbering', [BraceGroup('arabic')]), '\n', TexCmd('newlength', [BraceGroup(TexCmd('gnat'))]), '\n', TexCmd('setlength', [BraceGroup(TexCmd('gnat')), BraceGroup('22pt')]), '\n', TexCmd('baselineskip'), '=', TexCmd('gnat'), '\n', '\n', '%%% Extra definitions being added by me. Please don\'t delete - AC. I have a "Definitions-AC" file as well that has many standard defs. But apparently some of them are conflicting with those you have here. So I am having to define on my own whatever I need here. 
%%%', '\n', '\n', TexCmd('def', [TexCmd('ghat'), BraceGroup(TexCmd('widehat', [BraceGroup('g')]))]), '\n', TexCmd('newcommand', [TexCmd('ind'), BraceGroup(TexCmd('protect'), TexCmd('mathpalette', [BraceGroup(TexCmd('protect'), TexCmd('independenT')), BraceGroup(TexCmd('perp'))]))]), '\n', TexCmd('def', [TexCmd('independenT'), BraceGroup('#')]), BraceGroup(TexCmd('mathrel', [BraceGroup(TexCmd('rlap', [BraceGroup('$', '#1#2', '$')]), TexCmd('mkern'), '4mu', BraceGroup('#1#2'))])), '\n', TexCmd('def', [TexCmd('thetahat'), BraceGroup(TexCmd('widehat', [BraceGroup(TexCmd('theta'))]))]), '\n', TexCmd('def', [TexCmd('bbeta'), BraceGroup(TexCmd('boldsymbol', [BraceGroup(TexCmd('beta'))]))]), '\n', TexCmd('def', [TexCmd('mbY'), BraceGroup(TexCmd('mathbb', [BraceGroup('Y')]))]), '\n', TexCmd('def', [TexCmd('R'), BraceGroup(TexCmd('mathbb', [BraceGroup('R')]))]), '\n', TexCmd('def', [TexCmd('mq'), BraceGroup(TexCmd('mathcal', [BraceGroup('Q')]), '_', BraceGroup('n,N,k'))]), '\n', TexCmd('def', [TexCmd('mh'), BraceGroup(TexCmd('mathcal', [BraceGroup('H')]), '_', BraceGroup('N'))]), '\n', '\n', TexCmd('def', [TexCmd('tcr'), BraceGroup(TexCmd('textcolor', [BraceGroup('red')]))]), '\n', TexCmd('def', [TexCmd('tcm'), BraceGroup(TexCmd('textcolor', [BraceGroup('magenta')]))]), '\n', TexCmd('def', [TexCmd('tcb'), BraceGroup(TexCmd('textcolor', [BraceGroup('blue')]))]), '\n', TexCmd('def', [TexCmd('tcn'), BraceGroup(TexCmd('textcolor', [BraceGroup('teal')]))]), '\n', TexCmd('def', [TexCmd('tcg'), BraceGroup(TexCmd('textcolor', [BraceGroup('bluegreen')]))]), '\n', TexCmd('def', [TexCmd('tcn'), BraceGroup(TexCmd('textcolor', [BraceGroup('navyblue')]))]), '\n', '\n', TexCmd('def', [TexCmd('cred'), BraceGroup(TexCmd('color', [BraceGroup('red')]))]), '\n', TexCmd('def', [TexCmd('cmag'), BraceGroup(TexCmd('color', [BraceGroup('magenta')]))]), '\n', '\n', TexCmd('def', [TexCmd('tcr'), BraceGroup(TexCmd('textcolor', [BraceGroup('black')]))]), '\n', TexCmd('def', [TexCmd('tcg'), 
BraceGroup(TexCmd('textcolor', [BraceGroup('black')]))]), '\n', '\n', TexCmd('def', [TexCmd('cred'), BraceGroup(TexCmd('color', [BraceGroup('black')]))]), '\n', TexCmd('def', [TexCmd('cmag'), BraceGroup(TexCmd('color', [BraceGroup('black')]))]), '\n', '\n', TexCmd('endlocaldefs'), '\n', '\n', TexNamedEnv('document', ['\n\t\n\t\\begin{frontmatter}\n\t\t\\title{A General Framework for Treatment Effect Estimation in Semi-supervised and High Dimensional Settings}\n\t\t\n\t\t%\\title{A sample article title with some additional note\\thanksref{t1}}\n\t\t\\runtitle{Semi-supervised Treatment Effect Estimation}\n\t\t%\\thankstext{T1}{A sample additional note to the title.}\n\t\t\n\t\t\\begin{aug}\n\t\t\t%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\t\t\t%%Only one address is permitted per author. %%\n\t\t\t%%Only division, organization and e-mail is %%\n\t\t\t%%included in the address.                  %%\n\t\t\t%%Additional information can be included in %%\n\t\t\t%%the Acknowledgments section if necessary. %%\n\t\t\t%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\t\t\t\\author[A]{\\fnms{Abhishek} \\snm{Chakrabortty}\\ead[label=e1]{abhishek@stat.tamu.edu}},\n\t\t\t%\\and\n\t\t\t\\author[B]{\\fnms{Guorong} \\snm{Dai}\\ead[label=e2]{guorongdai@fudan.edu.cn}}\n\t\t\t\\and\n\t\t\t\\author[C]{\\fnms{Eric} \\snm{Tchetgen Tchetgen}\\ead[label=e3]{ett@wharton.upenn.edu}}\n\t\t\t\n\t\t\t%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\t\t\t%% Addresses                                %%\n\t\t\t%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\t\t\t\\address[A]{Department of Statistics, Texas A\\&M University, \\href{mailto:abhishek@stat.tamu.edu}{abhishek@stat.tamu.edu}} %\\printead{e1}}\n\n%%NOTE: This \\printead command seems to be a HUGE problem with arxiv submission\'s compiling (and sometimes even in own computer/Overleaf compiling!! 
Not sure what the problem is (possibly due to some conflict with other pkgs./commands here, as I have done this before (e.g., for the SS MAR paper) in the new IMS template + sty file edited) without issues!!\n%% Ultimately, using a brute-force approach using href!\n\t\t\n\t\t\\address[B]{Department of Statistics and Data Science, School of Management, Fudan University, \\href{mailto:guorongdai@fudan.edu.cn}{guorongdai@fudan.edu.cn}} %\\printead{e2}}\n\t\n\t\\address[C]{Department of Statistics, The Wharton School, University of Pennsylvania, \\href{mailto:ett@wharton.upenn.edu}{ett@wharton.upenn.edu}} %\\printead{e3}}\n\n%%\\customfootnotetext{$^1$}{Supported in part by a grant from the National Cancer Institute (U01-CA057030).}\n%\\customfootnotetext{$^{1,2}$}{Joint first authors.}\n\\customfootnotetext{\\tcr{$^1$}}{\\tcr{AC\'s research was supported in part by the National Science Foundation grant NSF DMS-2113768.}}\n\\customfootnotetext{$^2$}{Corresponding author. (Previously at Texas A\\&M University during the initial preparation of this work.) \\vspace{0.05in}}\n\\end{aug}\n\n\\vspace{-0.05in} %% Adding this to take care of strange vertical space above abstract. Can make this 0.15in as well.\n\\begin{abstract}\nIn this article, we \\tcr{aim to provide a general and complete understanding of {\\it semi-supervised} (SS) causal inference for treatment effects, using two such estimands as prototype cases. 
Specifically,} we consider estimation of\\tcr{:} (a) \\tcr{the} {\\it average treatment effect} and (b) \\tcg{the} {\\it quantile treatment effect}\\tcr{,} in an \\tcr{SS} %{\\it semi-supervised (SS) }\nsetting, which is characterized by two available data sets: (i) a {\\it labeled data set} of size $n$, providing %a few%\nobservations for a response and a set of potentially high dimensional covariates, \\tcr{as well as %including\na binary treatment indicator;} and\n%\\tcr{as well as a binary treatment; and}\n(ii) \\emph{an unlabeled data set} of size $N$, \\emph{much \\tcr{larger}} than $n$, %\\emph{much \\tcr{larger in size},}\t\n\\tcr{but without the response observed.} %\\tcr{with observations only on \\tcg{the} covariates.}\n%only for the covariates and the treatment.\n%ETT: NO TREATMENT?.\\tcr{**Addressed -- AC.**}\nUsing these two data sets, we develop a \\tcr{\\it family} of SS estimators which are guaranteed to \\tcr{be:  (1) } more robust \\tcr{\\emph{and} (2) more} efficient\\tcr{,} than their supervised counterparts \\tcr{based on the} %restricted to\nthe labeled data set only. \\tcg{\\tcr{Moreover, b}eyond the ``standard\'\' double robustness results \\tcr{(in terms of consistency) } that can be achieved by supervised methods as well,} %ETT NOT SURE THIS IS SURPRISING \\tcr{**Addressed -- AC.**}\nwe further establish  \\tcr{\\it root-$n$ consistency and asymptotic normality} of our SS estimators whenever the propensity score in the model is correctly specified, \\tcg{\\emph{without requiring specific forms of the nuisance functions involved}.} %ETT NOT SURE IF THIS IS SURPRISING? \\tcr{**Addressed -- AC.**}\nSuch an improvement \\tcr{in} %of\nrobustness arises from the use of \\tcr{the} massive unlabeled data, so it is generally not attainable in a purely supervised setting. In addition, our estimators are shown to be semi-parametrically efficient \\tcr{also} as long as all the nuisance functions are correctly specified. %ETT IN WHAT MODEL? 
EFFICIENCY BOUND IS WRT A MODEL.SO THIS IS LOCAL EFFICIENCY? CAN WE GET GLOBAL EFFICIENCY IF NUISANCE MODELS ARE ESTIMATED NONPARAMETRICALLY? \\tcr{**Addressed. Yes, the model is fully non-parametric (upto the extra info that is given for sure in SS setup). And yes, we are getting global efficiency here as well. But we are avoiding all these details in the abstract to keep things short -- AC.**}\nMoreover, as an illustration of the nuisance function estimation, we consider \\tcg{{inverse-probability-weighting type}} kernel smoothing estimators involving possibly unknown %ETT WHY DO WE SUDDENLY HAVE SAMPLE WEIGHTS? \\tcr{**Addressed. This was just a semantic issue -- we meant unknown IPW weights in the kernel smoother - AC.**}\ncovariate %generating\n\\tcg{transformation} mechanisms,\n%\\tcr{(or transformations) }\nand establish in high dimensional scenarios novel results \\tcr{on} their uniform convergence rates. \\tcr{These results should be of independent interest.} %\\tcr{**REVISIT NEXT PART}\n\\tcg{Numerical results on both simulated and real data validate the advantage of our methods over their supervised counterparts with respect to both robustness and efficiency.}\n\\end{abstract}%ETT I HONESTLY THINK THAT THE ABSTRACT IS TOO LONG AND A BIT INCOHERENT, CAN WE SHORTEN AND STREAMLINE AND ONLY FOCUS ON MAJOR CONTRIBUTIONS. \\tcr{**Addressed. 
It\'s shorter now, hopefully this is better -- AC.**}\n\n\\begin{comment}\n\\begin{keyword}[class=MSC2010]\n\\kwd[Primary ]{00X00}\n\\kwd{00X00}\n\\kwd[; secondary ]{00X00}\n\\end{keyword}\n\\end{comment}\n\n\\par\\bigskip\n\\begin{keyword}\n\\kwd{\\tcg{Average/Quantile treatment effects}}\n\\kwd{\\tcg{Semi-supervised \\tcr{causal} inference}}\n\\kwd{\\tcg{\\tcr{Double r}obust- ness and efficiency}}\n\\kwd{\\tcg{High dimensional nuisance estimators}}\n\\kwd{\\tcr{Robust root-$n$ rate inference.}}\n\\end{keyword}\n\n\\end{frontmatter}\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n%% Please use \\tableofcontents for articles %%\n%% with 50 pages and more                   %%\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n%\\tableofcontents\n\n\n\\section{Introduction}\\label{seci}\nSemi-supervised (SS) learning has received increasing attention as one of the most promising areas in statistics and machine learning in recent years.\n%\\tcr{in recent years as a promising area in modern statistics and machine learning}.\nWe refer interested readers to \\citet{zhu2005semi} and \\citet{chapelle2010semi} for a detailed overview on this topic, including its definition, \\tcr{goals}, %theory,\napplications and \\tcr{the} fast growing literature. Unlike traditional supervised or unsupervised learning settings, a\\tcr{n} {\\it SS setting}, as the name suggests, represents a \\tcr{confluence} %junction\nof these two kinds of settings, in the sense that it involves two data sets: (i) a {\\it labeled data set} $\\cl$ containing observations for an outcome $\\mbY$ and a set of covariates $\\X$ (that are possibly high dimensional), and (ii) a \\emph{much larger} {\\it unlabeled data set} $\\cu$ where only $\\X$ is observed. Such situations arise naturally when $\\X$ is easily available for a large number of individuals while the corresponding observations for $\\mbY$ are much harder to collect owing to cost or time constraints.  
The SS setting is common\nto a broad class of practical problems in the modern era of ``big data\'\', including \\tcr{machine learning applications like} text mining, web page classification, speech recognition, natural language processing etc.\n\n%In\n\\tcr{Among} biomedical \\tcr{applications}, SS settings have turned out to be increasingly relevant \\tcr{in} %to\nmodern integrative genomics, especially \\tcr{in} %the research of\nexpression quantitative trait loci \\tcr{(eQTL) } studies \\citep{michaelson2009detection} combining genetic association studies with gene expression profiling. %Roughly speaking, these are association mapping studies between gene expression levels and genetic variants.\n%Lately, they\n\\tcr{These} have become instrumental in understanding various important questions in genomics, including gene regulatory networks \\citep{gilad2008revealing, hormozdiari2016colocalization}. However, one issue with such studies is that they are often under-powered due to the limited size of the gene expression data which are expensive \\citep{flutre2013statistical}. On the other hand, records on the genetic variants are cheaper and often available for a massive cohort, thus naturally leading to SS settings while necessitating robust and efficient strategies that can leverage this extra information to produce more powerful association mapping tools as well as methods for detecting the causal effects of the genetic variants. \\tcr{Moreover, SS settings also have great relevance in the analysis of electronic health records data, which are popular resources for discovery research but also suffer from a major bottleneck in obtaining validated outcomes due to logistical constraints; see\\tcr{, e.g.,}\n\\citet{chakrabortty2018efficient} and \\citet{cheng2020robust} for more details.} %tcr{**Add 1-2 sentences discussion on EHR appln. + relevance of SS + one ref. 
related}\n\n\\subsection{Problem setup}\\label{sec:psetup}\n\\tcr{In this paper, we consider causal inference problems in SS settings. To characterize the \\tcr{basic} setup,} suppose our sample consists of two independent data sets: the labeled \\tcr{(or supervised) } data $\\cl:=\\{(\\mbY_i,T_i,\\X_i\\trans)\\trans:i=1,\\ldots,n\\}$, and the unlabeled \\tcr{(or unsupervised) } data $\\cu:=\\{(T_i,\\X\\trans_i)\\trans:i=n+1,\\ldots,n+N\\}$ \\tcr{(with $N \\gg n$ possibly) }, containing \\tcr{$n$ and $N$}\nindependent copies of $\\Z:=(\\mbY,T,\\X\\trans)\\trans$ and $(T,\\X\\trans)\\trans$, respectively, where $T\\in\\{0,1\\}$ serves as\na {\\it treatment indicator}, i.e., $T=1$ or $0$ represents whether an individual is treated or not. %\\tcr{and we possibly have $N \\gg n$}.\n\\tcr{(\\emph{Note}: Though not the main focus of this paper, we \\tcr{also} consider the setting where the treatment $T$ is \\emph{unobserved} in $\\cu$\\tcr{,} in Section \\ref{sec_ate_u_dagger}.) }\n%ETT WHY USE BLACKBOARD Y HERE, I WOULD CHANGE TO REGULAR Y WHICH YOU USE LATER. ALSO TECHNICALLY, THIS IS THE CONSISTENCY ASSUMPTION, PLEASE SET IT AS THAT \\tcr{**Addressed. The reason why had to use bbY here was because we actually use $Y$ to generically denote $Y(1) $ later on -- we only work with one of the counterfactuals and denote it generically as $Y$; see the ``clarification\'\' para just before Sec. 1.2 -- AC.**}\nThe covariates \\tcr{(often also called confounders) } $\\X\\in\\mx\\subset\\rR^p$ \\tcr{are (possibly) \\emph{high dimensional}, with dimension} %with the dimensionality\n$p\\equiv p_n$ allowed to diverge and \\tcr{possibly exceed} %be greater than\n$n$ \\tcr{(including $p \\gg n$) }, while the {\\it observed outcome} \\tcr{is given by:}\n\\bse\n\\mbY~:=~TY(1) + (1-T)Y(0),\n\\ese\nwhere $Y(t) $ is the {\\it potential outcome} of an individual with $T=t$ $(t=0,1) $ \\citep{rubin1974estimating, imbens2015causal}. 
\\tcr{Thus, $(\\mbY \\mid T = t) ~\\equiv~ Y(t) $ (also called the consistency assumption).}\n\n\\tcr{A} major challenge \\tcr{(and a key feature) } in the above %configuration\n\\tcr{framework} arises from the \\tcr{(possibly) } {\\it disproportion\\tcr{ate} \\tcr{sizes}} of %between\n$\\cl$ and $\\cu$, \\tcr{namely $|\\cu| \\gg |\\cl|$}, \\tcr{an issue} widely \\tcr{encountered} %existing\nin modern \\tcr{(often digitally recorded) observational} datasets \\tcr{of massive sizes}, such as electronic health records\n\\citep{cheng2020robust}. We therefore assume \\tcr{(rather, {\\it allow} for) }:\n\\be\n\\nu~:=~\\hbox{$\\lim_{n,\\tcr{N}\\to\\infty}$}n/(n+N)~=~0,\n\\label{disproportion}\n\\ee\nas in \\citet{chakrabortty2018efficient} and \\citet{gronsbell2018semia}. An example \\tcr{of \\eqref{disproportion}} is the {\\it ideal SS setting} where $n<\\infty$ and $N=\\infty$ (i.e., the distribution of $(T,\\X\\trans)\\trans$ is known). Essentially, the condition \\eqref{disproportion} distinguishes our framework from that of traditional missing data theory, which typically requires the proportion of complete cases in the sample to be bounded away from zero %,\n\\tcr{-- often known as the ``positivity condition\'\' \\citep{imbens2004nonparametric, tsiatis2007semiparametric}. The \\emph{natural violation} of this condition in SS settings is what makes them %quite\n\\emph{unique} and more \\emph{challenging} than traditional missing data problems. %Lastly,\n\\tcg{On the other hand, we \\tcr{do assume} %make the following assumption\nthroughout this paper %, saying\n\\tcr{that $\\cl$ and $\\cu$ have the same underlying distribution (i.e., $\\mbY$ in $\\cu$ are missing completely at random) } which is the typical (and often implicit) setup in the traditional SS literature \\citep{zhu2005semi, chapelle2010semi}. %\\tcr{**ADD REFS}.\n\\tcr{We formalize this below.}}} %ETT CAN WE STATE THIS FORMALLY AS AN ASSUMPTION FOR THE PAPER THAT WE ARE ASSUMING MCAR. 
\\tcr{**Addressed -- AC.**}\n\\begin{assumption}\\label{ass_equally_distributed}\n\\tcg{\\tcr{The observations in $\\cl$ and $\\cu$ have the same underlying distribution, so that $\\{(\\mbY_i,T_i,\\X_i\\trans)\\trans: i=1,\\ldots,n\\}$ and $\\{(T_i,\\X_i\\trans)\\trans: i=n+1,\\ldots,n+N\\}$ respectively\nare $n$ and $N$ independent realizations from %follow\nthe distributions of $(\\mbY,T,\\X\\trans)\\trans$ and $(T,\\X\\trans)\\trans$.}} %($i=1,\\ldots,n;\\, \\tcr{i} =n+1,\\ldots,n+N$).} \\tcr{**Revisit}\n\\end{assumption}\n\n\n\\paragraph*{Causal parameters of interest} Based on the available data $\\cl\\cup\\cu$, we aim to estimate:\n\\vskip0.05in%\\\\\n\\tcr{(i) } the {\\it average treatment effect} (ATE)\\tcr{:}\n\\be\n\\mu_0(1) - \\mu_0(0)~:=~\\E\\{Y(1)\\}-\\E\\{Y(0)\\}, \\tcr{~~\\mbox{and}}\n\\label{ate}\n\\ee\n%and\n\\hspace{0.15in} \\tcr{(ii) } the {\\it quantile treatment effect} (QTE)\\tcr{:}\n\\be\n\\vt(1,\\tau)-\\vt(0,\\tau)~\\equiv~\\vt(1)-\\vt(0),\n\\label{qte}\n\\ee\nwhere $\\vt(t,\\tau)\\equiv\\vt(t) $ represents the $\\tau$\\tcr{-}quantile of $Y(t) $ for some fixed and known $\\tau\\in(0,1) $, defined as the solution to the equation\\tcr{:}\n\\be\n\\E[ \\psi\\{Y(t),\\vt(t,\\tau)\\}]  ~:=~  \\E[I\\{ Y(t) < \\vt(t,\\tau)\\} - \\tau] ~=~ 0 \\quad (t=0,1)\\tcr{,}\n\\label{defqte}\n\\ee\nwith $I(\\cdot) $ \\tcr{being} the indicator function. It is \\tcr{worth noting that} %important to notice that,\nby setting $T\\equiv 1$ and $\\mu_0(0)=\\vt(0)\\equiv 0$, the above problems \\tcr{also}\ncover SS estimation of \\tcr{the} response mean and quantile as \\tcr{\\emph{special cases}}.\n\\tcr{The ATE and the QTE are both well-studied choices of causal estimands in supervised settings; see Section \\ref{sec_literature} for an overview of these literature(s). 
%While the ATE is perhaps more popular, the QTE is arguably more useful and informative\nWhile the ATE is perhaps \\tcr{the more common choice,} the QTE is \\tcr{often} more useful and informative\\tcr{, especially} in settings where the causal effect of the treatment is heterogeneous and/or the outcome distribution\\tcr{(s) }\nis highly skewed so that the average causal effect may be of limited value.}\n%in terms of giving a better picture of the outcome distribution(s).\n%%\n%These together represent a fairly inclusive set of causal parameters, and also present a diverse spectrum of methodological and technical challenges (especially for the QTE; see Remark \\ref{qte_challenges}).However, our methods and the underlying principle \\emph{can} easily be adapted and extended to a much more \\emph{general} family of parameters based on any estimating equation; see Section \\ref{sec_conclusion_discussion} for some discussion, though we skip the details for sake of brevity and minimal obfuscation.} %a detailed treatment here for sake of brevity.}\n%\n%ETT THIS APPEARS UNNECESSARY\\tcr{**Addressed. We removed it from here (and added a bit of the main message (about generalizations) little later where it seemed more appropriate) -- AC.}\n\n\\vskip0.05in\n\\tcr{Our \\emph{goal} here, in general, is to investigate how, when, and to what extent, one can exploit the full data $\\cl \\cup \\cu$ to develop SS estimators of these parameters that can ``improve\'\' standard supervised approaches using $\\cl$ only, where the term ``improve\'\' could be in terms of efficiency or robustness or \\emph{both}. 
The rest of this paper is dedicated to a thorough understanding of such questions via a \\emph{complete characterization} of the possible SS estimators and their properties.} %and general goal is to provide a thorough understanding of how and when one can exploit.}\n\n\\vskip0.05in\n\\tcr{We also clarify here that we choose the ATE and QTE as two \\emph{representative} causal estimands -- presenting diverse methodological and technical challenges -- to exemplify the key features of our SS approach and its benefits, without compromising much on the clarity of the main messages. Extensions to other more general functionals (e.g., those based on general estimating equations) %, though technically more nuanced,\nare indeed possible -- as we discuss later in Section \\ref{sec_conclusion_discussion} -- though we skip any details on their technical nuances for the sake of brevity and minimal obfuscation.}\n\n\\paragraph*{\\tcr{Basic assumptions}} To %guarantee\n\\tcr{ensure} that the parameters %$\\mu_0(1) $, $\\mu_0(0) $, $\\vt(1) $ and $\\vt(0) $ are\n\\tcr{$\\{\\mu_0(t),\\vt(t)\\}_{t = 0}^1$} are identifiable and estimable \\tcr{from the observed data}, we make the \\tcr{following standard assumptions \\citep{imbens2004nonparametric}:}\n\\be\nT \\ind \\{Y(0), Y(1)\\} \\mid \\X,\\quad \\mbox{and} \\quad \\pi(\\x)~:=~\\E(T\\mid\\X=\\x) ~\\in (c,1-c)\\tcr{,} \\label{mar_positivity}\n\\ee\nfor any $\\x\\in\\mx$ and some constant $c\\in(0,1) $. \\tcr{The quantity $\\pi(\\x) $ is also known as the \\emph{propensity score} for the treatment. 
\\eqref{mar_positivity} encodes some well known conditions \\citep{imbens2015causal}.} The first part of \\eqref{mar_positivity} is \\tcr{often} known as the \\emph{no unmeasured confounding} assumption, equivalent to the {\\it missing at random} assumption in the context of missing data \\citep{tsiatis2007semiparametric, little2019statistical}, while the second part is the {\\it positivity} (or {\\it overlap}) assumption \\tcr{\\it on the treatment}. %\\tcr{\\citep{imbens2004nonparametric, imbens2015causal}}.\n%ETT I WOULD TAKE THIS OUT \\tcr{**Addressed -- AC.**}\n%(and should {\\it not} be confused with any labeling-related positivity condition which can clearly be {\\it violated} in our setting). In general, the condition \\eqref{mar_positivity} is fairly standard in the literature of causal inference \\citep{imbens2004nonparametric, imbens2015causal, h2020}.\n\n\\paragraph*{Clarification}\nConsidering that the \\tcr{corresponding case} %counterpart\nof $Y(0) $ is analogous, we would henceforth focus on the mean and quantile estimation of $Y(1) $ without loss of generality,\n\\tcr{and} %let\n\\be\n\\tcr{\\mbox{let~$\\{Y,\\mu_0,\\vt\\}$~~generically denote~~$\\{Y(1),\\mu_0(1),\\vt(1)\\}$.}} \\label{generic_notation}\n\\ee\n\n%\\subsection{Literature review}\\label{sec_literature}\n\\subsection{\\tcr{Related literature} }\\label{sec_literature}\n\\tcr{The setup and contributions of our work naturally relate to \\emph{three} different facets of existing literature, namely: (a) ``traditional\'\' (non-causal) SS inference, (b) supervised causal inference, and finally, (c) SS causal inference. 
Below we briefly summarize the relevant works in each of these areas, followed by a detailed account of our contributions.}\n\n\\paragraph*{SS learning and inference}\nFor estimation in an SS setup, the primary and most critical \\tcr{goal} %concern\nis \\tcr{to investigate} when and how its robustness and efficiency can be improved, compared to {\\it supervised} methods using the labeled data $\\cl$ only, by exploiting the unlabeled data $\\cu$. Chapter 2 of \\citet{Chakrabortty_Thesis_2016} provided an elaborate discussion on this question, claiming that the answer is generally determined by the \\tcr{nature of the relationship} %connections\nbetween the parameter of interest and the marginal distribution, $\\P_\\X$, of $\\X$\\tcr{,} as $\\cu$ provides information regarding $\\P_\\X$ only. Therefore\\tcr{,} many existing algorithms \\tcr{for} %of\nSS learning \\tcr{that target} %targeting\n$\\E(\\mbY\\mid\\X) $, including, for instance, generative modeling \\citep{nigam2000text, nigam2001using}, graph-based methods \\citep{zhu2005semi} and manifold regularization \\citep{belkin2006manifold}, rely to some extent on assumptions relating $\\P_\\X$ to the conditional distribution of $\\mbY$ given $\\X$. When these assumptions are violated, \\tcr{however,} they may perform even worse than the corresponding supervised methods \\citep{cozman2001unlabeled, cozman2003semi}. Such undesirable degradation highlights the need for safe usage of the unlabeled data $\\cu$. To achieve this goal, \\citet{chakrabortty2018efficient} advocated the {\\it robust} and {\\it adaptive} property for SS approaches, i.e., being consistent for the target parameters while \\tcr{being} at least as efficient as their supervised counterparts and more efficient whenever possible. 
Adopting such a perspective explicitly or implicitly, robust and adaptive %processes of\n\\tcr{procedures for} SS estimation and inference have been developed under the semi-parametric framework recently for various problems\\tcr{,} including mean estimation \\citep{zhang2019semi,zhang2019high}, linear regression \\citep{azriel2016semi, chakrabortty2018efficient}, general $Z$-estimation \\citep{kawakita2013semi, Chakrabortty_Thesis_2016}, prediction accuracy evaluation \\citep{gronsbell2018semia} and covariance functionals \\citep{tony2020semisupervised, chan2020semi}. \\tcr{However,} different from our work considering causal inference and treatment effect estimation, most of this recent progress focused on relatively ``standard\'\' \\tcr{(non-causal) } problems defined {\\it without} the potential outcome framework \\tcg{(and its ensuing challenges, e.g., confounding\\tcr{,} and \\tcr{the}\nmissingness of one of the potential outcomes %\\tcr{$\\{Y(t)\\}_{t=0}^1$}\ninduced by the treatment assignment \\tcr{$T$}) }.\n%ETT WHAT DOES MISSINGNESS FROM T MEAN? \\tcr{**Addressed - AC.**} %%while causal inference has been somewhat less considered in the SS literature.\n\n%Along these lines, we are interested in another important parameters, {\\it average treatment effect} (ATE) and {\\it quantile treatment effect} (QTE), somewhat less studied under SS settings.\n\n%\\tcr{**NEW (12/18) -- Working on this part onwards -- AC.}\n\\paragraph*{Average treatment effect} Both \\tcr{the} ATE and \\tcr{the} QTE are fundamental and popular causal estimands which have been extensively studied in the context of supervised causal inference based on a wide range of approaches; see \\citet{imbens2004nonparametric} and \\citet{tsiatis2007semiparametric} for an overview of the ATE literature. 
In particular, these include inverse probability weighted (IPW) approaches \\citep{rosenbaum1983central, rosenbaum1984reducing, robins1994estimation, hahn1998role, hirano2003efficient,  ertefaie2020nonparametric} involving approximation of the propensity score $\\pi(\\X) $, as well as \\tcr{\\emph{doubly robust}} (DR) methods \\citep{robins1994estimation, robins1995semiparametric, rotnitzky1998semiparametric, scharfstein1999adjusting, kang2007demystifying, vermeulen2015bias} which require estimating both $\\E(Y\\mid\\X) $ and $\\pi(\\X) $. As the name implies, the DR estimators are consistent whenever one of the two nuisance models is correctly specified, while attaining the semi-parametric efficiency bound for the unrestricted model,  as long as both are correctly specified. When the number of covariates is fixed, semi-parametric inference via such DR methods has a rich literature; see \\citet{bang2005doubly}, \\citet{tsiatis2007semiparametric}, \\citet{kang2007demystifying} and \\citet{graham2011efficiency} for a review. In recent times, there has also been substantial interest in the extension of these approaches to high dimensional scenarios,\nleading to a flurry of work\\tcr{, e.g., \\citet{farrell2015robust, chernozhukov2018double, athey2018approximate, smucler2019unifying}, among many others}. \\tcg{\\tcr{Most of t}hese papers generally impose one of the following two conditions on the nuisance function\\tcr{s\'} estimation to attain $n^{1/2}$\\tcr{-}consistency and asymptotic normality for valid (supervised) inference \\tcr{based on their ATE estimators}:}\n%\\tcr{**For the next part -- the new text added below -- I am not sure if putting everything together in detail here and then saying \\emph{very little} in Point I in Section 1.3 makes a lot of sense. That section is afterall is truly talking about \\emph{our} contributions, and not this part here. I think it is important to distribute the discussion somewhat. 
Mention just the key points -- and the citations -- here, and then maybe get back to these again in Point I in Section \\ref{sec_contributions} to make the comparison and our contribution more explicit. (Perhaps focusing on the last paragraph\'s content more in Our contributions. This seems a bit out of place here (also remember this is part is for ATE only. But our claims actually apply both to ATE and QTE. All the more reason to say this in more detail in Point I which talks about both parameters together -- AC.}\n\\begin{enumerate}[(a)]\n\\item \\tcg{Both $\\E(Y\\mid\\X) $ and $\\pi(\\X) $ are correctly specified, and the product of  their estimators\' convergence rates vanishes fast enough \\tcr{(typically, faster than $n^{-1/2}$) }\n\\citep{belloni2014inference, farrell2015robust, belloni2017program, chernozhukov2018double}.} %,athey2018approximate}.}\n\n\\item \\hspace{-0.056in}\n\\tcg{Either $\\E(Y\\mid\\X) $ or $\\pi(\\X) $ is correctly specified by a linear/logistic regression model\\tcr{, while} some \\tcr{carefully tailored} %sophisticated\nbias corrections are applied\\tcr{,}\nand some rate conditions are satisfied \\tcr{as well} \\citep{smucler2019unifying, tan2020model, dukes2021inference}.}\n\\end{enumerate}\n\\tcg{However, we will show that, \\tcr{under our SS setup,} through\nusing the massive unlabeled data, \\tcr{there are some striking \\emph{robustification benefits} that ensure} these requirements \\tcr{\\emph{can}} be substantially relaxed, %(or removed),\n\\tcr{and that \\emph{$n^{1/2}$-rate inference} on the ATE (or QTE) \\emph{can} be achieved in a \\emph{seamless} way, \\emph{without} requiring any specific forms of the nuisance model(s) nor any sophisticated\nbias correction techniques under  misspecification}; see Point (I) in Section \\ref{sec_contributions} for details.}\n\n%\\tcr{**NEW (12/18) -- Need to do better here (I mean the whole above portion in green)! 
-- AC.}\n\n\n%\\tcg{However, we will show that, through using the massive unlabeled data, these requirements can be substantially relaxed. To establish $n^{1/2}$ consistency and asymptotic normality of our estimators, we only need $\\pi(\\X) $ to be correctly specified and well estimated. Also, we allow for any reasonable strategies (parametric, semi-parametric or non-parametric), in addition to linear/logistic regression, used to estimate $\\E(Y\\mid\\X) $ and $\\pi(\\X) $, making our method much more flexible; see Point (I) in Section \\ref{sec_contributions} for the details of these improvements.}\n\n%ETT ADD REFERENCES TO TAN, AND SMUCLER ET AL AND VANSTEELANDT DUKES ET AL. \\tcr{**Addressed - AC.**}\n\n\\paragraph*{Quantile treatment effect} \\tcr{The} marginal QTE, though technically a more challenging parameter due to the \\tcr{inherently} {\\it non-smooth} nature of the quantile estimating equation \\eqref{defqte}, provides a \\tcr{more complete} %whole\npicture of the causal effect on \\tcr{the} outcome distribution, beyond just its mean\\tcr{.}\n%and %\\tcr{also}\n%measure the central tendency of heavy-tailed data, whose population mean may not exist.\n%\n\\tcr{There is a fairly rich literature on \\tcr{(supervised) } QTE estimation as well.}\nFor example, \\citet{firpo2007efficient} developed an IPW estimator \\tcr{that attains} %attaining\nsemi-parametric efficiency under some smoothness assumptions. \\citet{hsu2020qte} viewed the quantile $\\vt$ \\tcr{from the\nperspective of the conditional distribution,} as the solution to the equation $\\tau=\\E\\{F(\\vt\\mid\\X)\\}$\\tcr{,} %from a perspective of the conditional distribution,\nwhere $F(\\cdot\\mid\\x):=\\P(Y<\\cdot\\mid\\X=\\x) $. Their method thus requires estimating the whole conditional distribution\nof $Y$ given $\\X$. 
To avoid such a burdensome task, \\citet{kallus2019localized} recently proposed the localized debiased machine learning approach, which only involves estimation of $F(\\cdot\\mid\\X) $ at a preliminary estimate of the quantile and can leverage a broad range of machine learning methods besides kernel smoothing used by \\citet{hsu2020qte}. Moreover, \\citet{zhang2012causal} compared methods based on the propensity score $\\pi(\\X) $ and the conditional distribution $F(\\cdot\\mid\\X) $. They also devised a DR estimator for the QTE under parametric specification of $\\pi(\\X) $ and $F(\\cdot\\mid\\X) $. Nevertheless, all \\tcr{these} aforementioned work\\tcr{s are still} %was \\tcr{still}\nrestricted to the supervised domain involving only \\tcr{the} labeled data $\\cl$.\n\n\\paragraph*{SS inference for treatment effect\\tcr{s}} Although there \\tcr{has} %have\nbeen \\tcr{work} on a variety of problems in SS settings, as listed in the first paragraph of Section \\ref{sec_literature}, less attention, however, has been paid to causal inference and treatment effect estimation \\tcr{problems}, except \\tcr{for some %a little %little\n(very recent) } progress \\citep{zhang2019high, kallus2020role, cheng2020robust}. When there exist post-treatment surrogate variables that are potentially predictive of the outcome, \\citet{cheng2020robust} combined imputing and inverse probability weighting, building on the\\tcr{ir} technique of ``double-index\'\' propensity scores \\citep{cheng2020estimating}, to devise \\tcr{an IPW-type} SS estimator for \\tcr{the} ATE, which is doubly robust. 
\\tcr{Though not explicitly stated, their approach, however, only applies to low dimensional $(p \\ll n) $ settings, and more importantly, their estimator being of an IPW type,  does not have a naturally ``orthogonal\'\' structure (in the sense of \\citet{chernozhukov2018double}), and therefore, is not first order insensitive to estimation errors of the nuisance functions, unlike our proposed approach. This feature is particularly crucial in situations involving  high dimensional and/or non-parametric nuisance estimators.} \\citet{kallus2020role} also considered the role of surrogates in SS estimation of the ATE, but \\tcr{mostly} %(though not entirely)\n%ETT THIS IS CONFUSING, IF THEY CONSIDERED A SETTING WHERE THIS IS NOT IMPOSED MAYBE WE SHOULD NOT STATE THAT THEY DID NOT  \\tcr{**Addressed. I removed the ``though not entirely\'\' phrase. Our main point was they only considered it tangentially and not with any implementation details, just theory -- AC.}\nin cases where the labeling fractions are bounded below. Further, with a largely theoretical focus, their main aims were characterizations of efficiency and optimality\\tcr{,} rather than \\tcr{implementation.}\n%methodological developments and implementation.\n%ETT AGAIN I DONT KNOW WHAT THIS MEANS, LOOKS LIKE YOU ARE GOING OUT OF YOUR WAY TO MINIMIZE THEIR CONTRIBUTION, DO YOU MEAN THEY ACTUALLY DO NOT OFFER CONCRETE ESTIMATOR WITH CORRESPONDING ASYMPTOTIC THEORY. \\tcr{**Addressed. Yes, exactly (they don\'t talk much about estimator implementation etc.) 
Anyway, not to sound like we are minimizing them, I have shortened our critique and removed some phrases towards the end -- AC.**}\n%\n\\tcg{In a setting similar to \\citet{kallus2020role}, with surrogates available, \\citet{hou2021efficient}, a very recent work we noticed at the final stage\\tcr{s} of our preparation \\tcr{of} %for\nthis paper,\n%\\tcr{--} a very recent work \\tcr{that we noticed while this paper was in its final stages of preparation --}\n\\tcr{also} developed %their\nSS estimators \\tcr{for} %of\nthe ATE. Unlike our data structure\\tcr{,} where $\\cu$ provides observations for both $\\X$ and $T$, \\citet{hou2021efficient} %\\tcr{they}\nassumed %\\tcr{that}\nthe treatment indicator is missing in the unlabeled data, \\tcr{and} so their estimators have fairly different robustness guarantees from ours. This case, with $T$ unobserved in $\\cu$, is not of our \\tcr{primary} interest\\tcr{.} %main interest, but we will briefly address it\nBut we will briefly address \\tcr{it as well} in Section \\ref{sec_ate_u_dagger}.}\n%\\tcr{**NEW -- Need to mention they have different robustness guarantees ... - AC.}\n%\\tcr{REVISIT - Good place to add the Hou et. al. reference as well from two weeks ago.}\nLastly, \\citet{zhang2019high} extended their SS mean estimation method using a linear working model for $\\E(Y \\mid \\X) $ to the case of the ATE. While all these articles mostly investigated the efficiency of their approaches, none of them clarified the potential gain of \\tcr{\\it robustness} from leveraging \\tcr{the} unlabeled data $\\cu$. In addition, \\tcg{\\citet{zhang2019high} and \\citet{cheng2020robust}\n%\\tcr{**NEW (12/15) -- I am not very sure if mentioning \\citet{hou2021efficient} is appropriate here. They have a mix of results actually -- some general and some model specific -- AC.} \\tcg{**I noticed they used flexible methods, e.g., B-spline, to estimate the outcome model for the low dimensional case. 
So I removed \\citet{hou2021efficient} from this sentence -- GD.**}\nmainly focused on some specific working models for $\\E(Y\\mid\\X) $ and/or $\\pi(\\X) $}, and \\citet{zhang2019high} %just\n\\tcr{only} briefly discussed the\nATE estimation \\tcr{problem --}\nas an illustration of their SS mean estimation approach; see Remark \\ref{remark_comparison_zhang2019} for a more\ndetailed comparison of our work \\tcr{with} %\\tcr{that of}\n\\citet{zhang2019high}.\n\n\\vskip0.05in\n\\tcr{As for} \\tcr{the} QTE, its SS estimation has, to the best of our knowledge, not been studied in \\tcr{any of} the existing works. \\tcr{Our work here appears to be the \\emph{first} contribution in this regard.}\n\n\\subsection{Our contributions}\\label{sec_contributions}\n%To fill \\tcr{these} gaps in the literature of SS causal inference, this article makes several significant contributions that are summarized as follows.\n\\tcr{This paper aims to bridge some of these major gaps in the existing literature, towards a better and unified understanding -- both methodological and theoretical -- of SS causal inference and its benefits. We summarize our main contributions below.}\n\n%\\tcr{**Check my comments on top of page 5 regarding changes that should be made to Point I below -- AC.}\n\\begin{enumerate}[(I)]\n\n\\item We develop under the SS setting \\eqref{disproportion} a \\emph{family} of DR estimators for\\tcr{:} (a) the ATE \\tcr{(Section \\ref{secos}) } and (b) the QTE \\tcr{(Section \\ref{secqte}) }, which take the \\tcr{\\emph{whole}} data $\\cl\\cup\\cu$ into consideration and enable us to employ arbitrary methods for estimating the nuisance functions as long as some high level conditions are satisfied.  
\\tcr{These estimators, apart from affording a \\emph{flexible} and \\emph{general} construction (\\tcr{involving} %including\nimputation and IPW strategies, along with \\tcr{the}\nuse of cross fitting, applied to $\\cl\\cup\\cu$), also enjoy several desirable properties and advantages.} In addition to \\tcr{being} DR in \\tcr{terms of} consistency, we further prove that, whenever the propensity score $\\pi(\\X) $ is correctly \\tcg{specified and estimated at a suitably fast rate} \\tcr{-- something that is indeed {\\it achievable} under our SS setting,}\n%\\tcg{that is achievable in our setting},\n%\n%ETT WHAT DOES IT MEAN TO BE CORRECTLY ESTIMATED, DO YOU NEED RATES FOR ESTIMATION OF NUISANCE PARAMETERS?\\tcr{**Addressed -- Changed the language, hope this makes more sense now -- AC.**}\nour estimators are \\emph{$n^{1/2}$-consistent and asymptotically normal} \\emph{even if the outcome model is misspecified \\tcg{and none of the nuisance functions has a specific (\\tcr{e.g.,} linear$/$logistic) form}}; see Theorems \\ref{thate} and \\ref{thqte} as well as Corollaries \\ref{corate} and \\ref{corqte}, along with the discussions in the \\tcr{subsequent Remarks \\ref{remark_ate_robustness} and \\ref{remark_qte_property}.} %following remarks.\n{\\it Agnostic to the construction of nuisance function estimators}, this \\tcr{robustness} property \\tcr{-- a \\emph{$n^{1/2}$-rate robustness property} of sorts --} is particularly desirable for inference, \\tcg{while \\emph{generally not achievable in purely supervised settings} \\tcr{without extra targeted (and nuanced) bias corrections which \\tcr{do}\nrequire specific (linear$/$logistic) forms of the nuisance function estimators along with %strong rate\n\\tcr{other} conditions, as discussed in our review of (supervised) ATE estimation in Section \\ref{sec_literature}.}}\n%unless one imposes some much more stringent conditions, e.g., those enlisted at the end of the ``\\emph{Average treatment effect}\'\' paragraph in 
Section \\ref{sec_literature}, which are typically required in the supervised ATE literature}.\n%\n%ETT THIS SHOULD HAVE BEEN DISCUSSED AS A LIMITATION OF PRIOR METHODS BEFORE GETTING HERE. \\tcr{**Addressed -- yes, that\'s a good idea. We have completely rewritten this portion (along with appropriate changes in Section \\ref{sec_literature} as well. Hope this looks better now -- AC.**}\n%\\tcr{**while \\emph{generally unachievable in supervised settings} without extra bias corrections which \\tcr{do} require specific (linear$/$logistic) forms of the nuisance function estimators \\citep{vermeulen2015bias, smucler2019unifying}.**}\n\\tcg{In contrast, our \\tcr{SS approach is} %method is\nmuch more flexible \\tcr{and seamless}, allowing for \\emph{any} reasonable strategies (parametric, semi-parametric or non-parametric) %used to estimate\n\\tcr{for estimating} the nuisance functions.}\n%\nMoreover, \\tcr{even if this improvement in robustness is set aside}, %as clarified in Remarks \\ref{remark_ate_efficiency} and \\ref{remark_qte_efficiency},\nour \\tcr{SS} estimators are %\\tcr{also}\nensured to be \\emph{more efficient} than their supervised counterparts, %even if the \\tcr{improvement} in robustness is ignored, %difference\n%ETT ITS WEIRD TO REFER TO FUTURE REMARK, I WOULD EITHER MAKE THE POINT HERE WITHOUT REFERRING TO THE REMARK OR NOT MAKE IT AT ALL HERE, JUST REALLY WORDY \\tcr{**Addressed -- Yes, I agree. 
I have removed the reference to the remarks at the end of the description.**}\nand are \\tcr{also} semi-parametrically \\emph{optimal} when correctly specifying both the propensity score $\\pi(\\X) $ and the outcome model, i.e., $\\E(Y\\mid\\X) $ or $F(\\cdot\\mid\\X) $ for the ATE or the QTE, respectively\\tcr{; see Remarks \\ref{remark_ate_efficiency} and \\ref{remark_qte_efficiency}, in particular, regarding these efficiency claims, and Table \\ref{table_ate_summary} for a full %complete\ncharacterization of the robustness and efficiency benefits of our SS estimators.}\n%. ETT IT\'S A BIT CONFUSING WHETHER YOU PLAN TO USE PARAMETRIC NUISANCE MODELS OR NONPARAMETRIC NUISANCE MODELS, AS YOU APPEAR TO MAKE REFERENCE TO BOTH. \\tcr{**Addressed -- AC.**}\n%\n%ETT THE ABOVE PARAGRAPH IS A BIT OF A MESS, CAN YOU STREAMLINE LANGUAGE AND GET STRAIGHT TO THE POINT.I THINK PART OF THE ISSUE IS THAT YOU ARE CONSTANTLY COMPARING TO EXISTING METHODS, BUT YOU ALREADY SPENT FIRST 5 PAGES OF PAPER TALKING ABOUT THOSE METHODS AND ESTABLISHING GAPS IN THE LITERATURE. HERE YOU MIGHT WANT TO ONLY FOCUS ON YOUR METHODS. \\tcr{**Addressed -- we have completely rewritten this portion. Hope this is better now -- AC.**}\n\n\\vskip0.1in%\\par\\smallskip\n\\item Compared to the case of the ATE, the QTE estimation is substantially more \\emph{challenging} in both theory and implementation due to the non-separability of $Y$ and $\\theta$ in the quantile estimating equation \\eqref{defqte}. To overcome these difficulties, we establish novel results of\nempirical process theory for deriving the properties of our QTE estimators; see Lemma \\ref{1v2} in \\tcr{Appendix} \\ref{sm_lemmas}.\n%Section \\ref{sm_lemmas} of the Supplementary Material.\nIn addition, we adopt the strategy of \\emph{one-step update} \\citep{van2000asymptotic, tsiatis2007semiparametric} in the construction of our QTE estimators to facilitate computation. 
This strategy also avoids the laborious task of recovering the conditional distribution function $F(\\cdot\\mid\\X) $ for the whole parameter space of $\\theta_0$. Instead, we \\emph{only} need to estimate $F(\\cdot\\mid\\X) $ at one \\emph{single} point. Such an advantage was advocated by \\citet{kallus2019localized} as well. \\tcr{Our QTE (as well as ATE) estimators thus have %In general, both our ATE and QTE estimators have %fairly\n\\emph{simple implementations}, in general.}\n\n\\vskip0.1in%\\par\\smallskip\n\\item \\tcr{Finally, a}nother major contribution \\tcr{of this work, though of a somewhat different flavor,} %, which is separate from the above ones in methodology, \\tcr{is} % is the results of\n\\tcr{are our results on} the \\emph{nuisance function\\tcr{s\'} estimation} \\tcr{(Section \\ref{secnf}) } \\tcr{-- an important component in all our SS estimators\' implementation --} for which we consider a \\emph{variety} of reasonable and flexible approaches\\tcr{,} including kernel smoothing \\tcr{(with possible use of dimension reduction) }, parametric regression and random forest. \\tcr{In particular,} as a \\tcr{detailed} illustration, we verify the high-level conditions required by our methods for \\emph{IPW type kernel smoothing estimators with \\tcr{so-called} ``generated\'\' covariates} \\citep{mammen2012nonparametric, escanciano2014uniform, mammen_rothe_schienle_2016} \\tcr{involving (unknown) transformations of possibly \\emph{high dimensional} covariates}. Specifically, we investigate in detail their \\emph{uniform \\tcr{($L_{\\infty}$) } convergence rates, extending the existing theory to cases involving high dimensionality and \\tcg{IPW schemes that need to be estimated}}; see Theorems \\ref{theorem_ks_ate} and \\ref{thhd}. \\tcg{These %conclusions\nresults are novel to the best of our knowledge, and can be %can be\napplicable more generally in other problems. 
Thus they should be %thereby should be\nof independent interest.}\n%\n%ETT THIS SEEMS TO COME OUT OF NOWHERE, THIS SEEMS TANGENTIAL TO THE MAIN CONTRIBUTIONS OF THE PAPER PARTICULARLY SAMPLING WEIGHTS WHICH WAS NOT PART OF THE SET UP DESCRIBED IN SECTION 1.1. IF THESE ARE PART OF THE OBJECTIVES OF THE PAPER SHOULD HAVE BEEN MADE CLEAR IN THAT SECTION. \\tcr{**Addressed -- we have rewritten this portion to make things a little better. And we also mention in the abstract as a separate contribution. So hopefully this suffices (and reads well) -- AC.**}\n\\end{enumerate}\n\n\\begin{comment}\nproposes a family of estimators for (a) the ATE and (b) the QTE, which takes the whole data $\\cl\\cup\\cu$ into consideration and enables us to employ arbitrary methods for estimating the nuisance functions as long as some high level conditions are satisfied. In addition to DR, which can be achieved by supervised approaches as well, we prove the $n^{1/2}$-consistency and asymptotic normality of our estimators whenever the \\tcr{propensity} score $\\pi(\\X) $ is correctly specified. These properties are desirable for inference and generally unachievable in supervised settings without extra bias corrections \\citep{vermeulen2015bias, smucler2019unifying}. Moreover, our methods are semi-parametrically efficient when correctly specifying both the propensity score $\\pi(\\X) $ and the outcome model, i.e., $\\E(Y\\mid\\X) $ or $F(\\cdot\\mid\\X) $ for the ATE or QTE, respectively. In the construction of our QTE estimator, we adopt the strategy of one-step update \\citep{van2000asymptotic, tsiatis2007semiparametric} to overcome computational difficulty incurred by the inseparability of $Y$ and $\\vt$ in (\\ref{defqte}). It also allows us to estimate the conditional distribution function $F(\\cdot\\mid\\X) $ at only one single point, an advantage advocated by \\citet{kallus2019localized}. 
In addition, to approximate the nuisance functions in our methods, we investigate the uniform convergence of IPW type kernel smoothing estimators with generated covariates \\citep{mammen2012nonparametric, escanciano2014uniform, mammen_rothe_schienle_2016}, extending the existing theory to cases involving high dimensionality and unknown weighting mechanisms. Our conclusions can be applicable more generally and thereby should be of independent interest. In summary, the main contribution of this article is as follows.\n\\begin{enumerate}[(I)]\n\\item We develop under the semi-supervised setting \\eqref{disproportion} a family of DR estimators for (a) ATE and (b) QTE, which attain $n^{1/2}$-consistency and asymptotic normality whenever $\\pi(\\cdot) $ is correctly specified, while achieving semi-parametric optimality given both the propensity score and the outcome model are correctly specified.\n\n\\item As an illustration of estimating the nuisance functions, we establish novel results of IPW type kernel smoothing estimators with generated covariates in high dimensional scenarios.\n\\end{enumerate}\n\\end{comment}\n\n\\subsection{Organization \\tcr{of the rest} of the article} We introduce our \\tcr{family of}\nSS estimators for (a) the ATE and (b) the QTE, as well as establish %ing\ntheir asymptotic properties, in Sections \\ref{secos} and \\ref{secqte}, respectively. Then the choice and estimation of \\tcr{the} nuisance functions \\tcr{involved} in our approaches, \\tcr{along with their theoretical properties}, are discussed in Section \\ref{secnf}. Section \\ref{sec_simulations} presents \\tcr{detailed} simulation results \\tcr{under various data generating settings to validate the claimed properties and improvements} of our proposed methods, followed by an empirical data example in Section \\ref{sec_data_analysis}. 
%A\nConcluding remark\\tcr{s} along with discussion on possible extension\\tcr{s} of our work \\tcr{are} provided %by\n\\tcr{in} Section \\ref{sec_conclusion_discussion}. \\tcr{All} technical materials, \\tcr{including proofs of all results}, and further numerical results can be found in the Supplementary Material \\tcr{(Appendices \\ref{sm_technical}--\\ref{sm_data_analysis}) }.\n\n\n\\section{SS estimation for the ATE}\\label{secos}\n\\tcr{Following our clarification at the end of Section \\ref{sec:psetup}, it suffices to focus only on the SS estimation of $\\mu_0$, as in \\eqref{generic_notation}, which will be our primary goal in Sections \\ref{sec_ate_sup}--\\ref{sec_ate_u_dagger}, after which we formally address SS inference for the ATE in Section \\ref{sec_ate_difference}.}\n\n\\vspace{-0.02in}\n\\subsection*{Notation\\tcr{s}}\n\\tcr{We first introduce some notations that will be used throughout the paper.} %In the following, we\n\\tcr{We} use the lowercase letter $c$ to represent a generic positive constant, including $c_1$, $c_2$, etc., which may vary from line to line. For a $d_1\\times d_2$ matrix $\\mbP$ whose $(i,j) $th component is $\\mbP_{[ij]}$, \\tcr{we} let\n\\bse\n&&\\hbox{$\\|\\mbP\\|_0~:=~\\max_{1\\leq j\\leq d_2}\\{\\sum_{i=1}^{d_1}I(\\mbP_{[ij]}\\neq 0)\\},~~ \\|\\mbP\\|_1~:=~\\max_{1\\leq j\\leq d_2}(\\sum_{i=1}^{d_1}|\\mbP_{[ij]}|) $}, \\\\\n&&\\|\\mbP\\|~:=~\\hbox{$\\max_{1\\leq j\\leq d_2}\\{(\\sum_{i=1}^{d_1}\\mbP_{[ij]}^2)^{1/2}\\},~~ \\tcr{\\mbox{and}}~~  \\|\\mbP\\|_\\infty~:=~\\max_{1\\leq i\\leq d_1, 1\\leq j\\leq d_2}|\\mbP_{[ij]}|$}.\n\\ese\nThe bold numbers $\\bon_d$ and $\\bze_d$ refer to $d$-dimensional vectors of ones and zeros, respectively. \\tcr{We d}enote $\\mb(\\alpha,\\v):=\\{a:|a-\\alpha|\\leq\\v\\}$ as a generic neighborhood of a scalar $\\alpha$ with some \\tcr{radius} $\\v>0$. We use $\\balpha_{[j]}$ to %mean\n\\tcr{denote} the $j$th component of a vector $\\balpha$. 
For two data sets $\\ms_1$ and $\\ms_2$, \\tcr{we} define $\\P_{\\ms_1}(\\cdot\\mid\\ms_2) $ as the conditional probability with respect to $\\ms_1$ given $\\ms_2$. For any random function $\\hg(\\cdot,\\theta) $ and a random vector $\\W$ with copies $\\W_1,\\ldots,\\W_{n+N}$, \\tcr{we} denote\n\\bse\n\\E_{\\W}\\{\\hg(\\W,\\theta)\\}~:=~ \\hbox{$\\int$} \\hg(\\w, \\theta) d\\,\\P_{\\W}(\\w)\n\\ese\nas the expectation of $\\hg(\\W,\\theta) $ with respect to $\\W$\\tcr{,} treating $\\hg(\\cdot,\\theta) $ as a non\\tcr{-}random function, where $\\P_{\\W}(\\cdot) $ is the distribution function of $\\W $. For $M\\in\\{n,n+N\\}$, \\tcr{we} write\n\\bse\n&&\\E_M\\{\\hg(\\W,\\theta)\\}~:=~ M^{-1}\\hbox{$\\sum_{i=1}^M$} \\hg(\\W_i,\\theta),\\\\\n&&\\mbG_M\\{\\hg(\\W,\\theta)\\}~:=~ M^{1/2}[\\E_M\\{\\hg(\\W,\\theta)\\}-\\E_\\W\\{\\hg(\\W,\\theta)\\}], ~~\\tcr{\\mbox{and}}\\\\ &&\\var_M\\{\\hg(\\W,\\theta)\\}~:=~\\E_M[\\{\\hg(\\W,\\theta)\\}^2]-[\\E_M\\{\\hg(\\W,\\theta)\\}]^2.\n\\ese\nAlso, \\tcr{we} define\n\\bse\n&&\\E_N\\{\\hg(\\W,\\theta)\\}~:=~ N^{-1}\\hbox{$\\sum_{i=n+1}^{n+N}$} \\hg(\\W_i,\\theta), ~~\\tcr{\\mbox{and}}\\\\\n&&\\mbG_N\\{\\hg(\\W,\\theta)\\}~:=~ N^{1/2}[\\E_N\\{\\hg(\\W,\\theta)\\}-\\E_\\W\\{\\hg(\\W,\\theta)\\}].\n\\ese\nLastly, we \\tcr{let} $f(\\cdot) $ and $F(\\cdot) $ \\tcr{denote} %as\nthe density and distribution functions of $Y$, while %letting\n$f(\\cdot\\mid\\w) $ and $F(\\cdot\\mid\\w) $ represent the conditional density and distribution functions of $Y$ given $\\W=\\w$.\n\n\n\\subsection{Supervised \\tcg{estimator}} \\label{sec_ate_sup}\nAs noted earlier, %\\tcr{in \\eqref{generic_notation}},\nfor estimating \\tcr{the} ATE, we \\tcr{can simply} focus on $\\mu_0\\equiv\\E(Y) $ with $Y\\equiv Y(1) $. To this end, we first observe the following representation \\tcr{(and identification) } of $\\mu_0$. Let $m(\\X):=\\E(Y\\mid\\X) $ \\tcr{and recall $\\pi(\\X) \\equiv \\E(T\\mid \\X) $. 
%, we have\nWe then have:}\n\\bse\n\\mu_0 &~=~& \\E\\{m(\\X) \\}+ \\E[\\{\\pis(\\X)\\}^{-1} T\\{Y-m(\\X)\\}] \\\\\n& ~=~& \\E\\{ m^*(\\X) \\} + \\E[\\{\\pi(\\X)\\}^{-1} T\\{Y-m^*(\\X)\\}]\\tcr{,}\n\\ese\nfor some \\emph{arbitrary} functions $\\pis(\\cdot) $ and $m^*(\\cdot) $, implying that the equivalence\\tcr{:}\n\\be\n\\mu_0 &~=~& \\E\\{m^*(\\X) \\}+ \\E[\\{\\pis(\\X)\\}^{-1} T\\{Y -   m^*(\\X) \\}]\n\\label{ate_dr_representation}\n\\ee\nholds given either $\\pis(\\X)=\\pi(\\X) $ or $ m^*(\\X) = m(\\X) $ but {\\it not} necessarily both. The equation \\eqref{ate_dr_representation} is thus a DR representation of $\\mu_0$\\tcr{, involving the nuisance functions $\\pi(\\cdot) $ and $m(\\cdot) $}. Using the empirical version of \\eqref{ate_dr_representation} based on $\\cl$ precisely leads to the traditional DR estimator of the mean $\\mu_0$ \\citep{bang2005doubly, chernozhukov2018double}, i.e., the \\emph{supervised estimator}\n\\be\n\\muhatsup~:=~\\E_n\\{\\mhatn(\\X)\\}+\\E_n[\\{\\pihatn(\\X)\\}^{-1}T\\{Y-\\mhatn(\\X)\\}], ~~\\tcr{\\mbox{where}}\n\\label{sup_ate}\n\\ee\n%where\n$\\{\\pihatn(\\cdot),\\mhatn(\\cdot)\\}$ are \\tcr{\\it some} estimators of $\\{\\pi(\\cdot),m(\\cdot)\\}$ from $\\cl$ with possibly misspecified limits $\\{\\pis(\\cdot),m^*(\\cdot)\\}$. Apart from \\tcr{being} DR, the estimator $\\muhatsup$ also possesses the two nice properties below as long as \\tcr{the} models for $\\{\\pi(\\cdot),m(\\cdot)\\}$ are \\tcr{\\it both} correctly specified and %some certain\n\\tcr{certain rate conditions \\citep{chernozhukov2018double} on the convergence of $\\{\\pihatn(\\cdot),\\mhatn(\\cdot)\\}$} are satisfied.\n\\begin{enumerate}[(i)]\n\\item First-order insensitivity %:\n\\tcr{-- When both nuisance models are correctly specified, t}he influence function of $\\muhatsup$ is not affected by the estimation errors of $\\{\\pihatn(\\cdot),\\mhatn(\\cdot)\\}$ \\citep{robins1995semiparametric, chernozhukov2018double, chakrabortty2019high}. 
This feature is directly relevant to the {\\it debiasing} term $\\E_n[\\{\\pihatn(\\X)\\}^{-1}T\\{Y-\\mhatn(\\X)\\}]$ in \\eqref{sup_ate} and is desirable for inference, particularly when the construction of $\\{\\pihatn(\\cdot),\\mhatn(\\cdot)\\}$ involves non-parametric calibrations or \\tcr{if} $\\X$ is high dimensional \\tcr{(leading to rates slower than $n^{-1/2}$) }.\n\n%ETT THIS IS TECHNICALLY ONLY TRUE UNDER NONPARAMETRIC INFERENCE WITH SUFFICIENT FAST RATES OR AT INTERSECTION OF PARAMETRIC MODELS BUT NOT IN GENERAL IN THE UNION OF PARAMETRIC MODELS. ALSO THIS RESULT IS AS OLD AS SEMIPARAMETRIC THEORY DATING BACK TO EIGHTIES, WELL BEFORE VICTOR LEARNED ABOUT IT. \\tcr{**Addressed -- Yes, we meant these claims hold under both nuisance models correctly specified. I have added that now (and it\'s also mentioned in the last para). And yes, indeed these results were known before. So we have added one such earlier citation here -- AC.**}\n\n%\\vskip0.1in\n\\par\\smallskip\n\\item Semi-parametric optimality among all regular and asymptotically linear estimators for\n$\\mu_0$ \\tcr{--} $\\muhatsup$ attains the semi-parametric efficiency bound for estimating $\\mu_0$ under a fully non-parametric (i.e., unrestricted up to the condition \\eqref{mar_positivity}) family \\tcr{of} %of the\ndistributions of $(Y,T,\\X\\trans)\\trans$ \\citep{robins1994estimation, robins1995semiparametric, graham2011efficiency}\\tcr{.}\n\\end{enumerate}\nIn the sense of the aforementioned advantages, $\\muhatsup$ is the ``best\'\' achievable estimator for $\\mu_0$ under a purely supervised setting \\tcr{\\citep{robins1995semiparametric, chernozhukov2018double}}.\n\n\\subsection[A family of SS estimators \\tcr{for mu0}]{A family of SS estimators \\tcr{for $\\mu_0$}}\\label{sec_ate_ss}\nDespite the above desirable properties, the supervised DR estimator $\\muhatsup$ may, however, be suboptimal when the unlabeled data $\\cu$ is available, owing to ignoring the extra observations for 
$(T,\\X\\trans)\\trans$ \\tcr{therein}. An intuitive interpretation is that, since $\\E(Y-\\mu_0\\mid\\X)\\neq 0$ with a positive probability if we exclude the trivial case where $\\E(Y\\mid\\X)=\\mu_0$ almost surely, the marginal distribution $\\P_\\X$ of $\\X$ actually plays a role in the definition of $\\mu_0$ and the information of $\\P_\\X$ provided by $\\cu$ can therefore help estimate $\\mu_0$; see Chapter 2 of \\citet{Chakrabortty_Thesis_2016} for further insights in a more general context.\n%\\tcr{Moreover, with observations for $T$ also available in $\\cu$, it provides a much better chance to estimate the propensity score $\\pi(\\cdot) $ more efficiently (or `correctly\') and possibly at a faster rate (if $N \\gg n$)!}\n\n\\tcr{To} utilize $\\cu$, we notice that the term $\\E_n\\{\\mhatn(\\X)\\}$ in \\eqref{sup_ate} can be replaced by $\\E_{n+N}\\{\\mhatn(\\X)\\}$ which integrates $\\cl$ and $\\cu$. Moreover, estimation of the propensity score can certainly be improved by using $\\cu$ as well, since $\\pi(\\X) $ is entirely determined by the distribution of $(T,\\X\\trans)\\trans$. \\tcr{This provides a much better chance to estimate $\\pi(\\cdot) $ more \\emph{robustly} (possibly at a faster rate!).}\n\n\\vskip0.02in\n\\tcr{Thus,} with %an\n\\tcr{\\emph{any} estimators (with possibly misspecified limits) } $\\pihatN(\\cdot) $ for $\\pi(\\cdot) $, based on $\\cu$, and $\\mhatn(\\cdot) $  \\tcr{for $m(\\cdot) $} from $\\cl$\\tcr{,} same as before, we propose a family of \\tcr{\\emph{SS estimators} of $\\mu_0$:} %given by:}\n\\be\n\\muhatss~:=~\\E_{n+N}\\{\\mhatn(\\X)\\}+\\E_n[\\{\\pihatN(\\X)\\}^{-1}T\\{Y-\\mhatn(\\X)\\}]\\tcr{,} \\label{ss_ate}\n\\ee\n%for $\\mu_0$,\nindexed by $\\{\\pihatN(\\cdot),\\mhatn(\\cdot)\\}$. Here\\tcr{,} we apply the strategy of \\tcr{\\emph{cross fitting}} \\citep{chernozhukov2018double, newey2018cross} when estimating $\\mhatn(\\cdot) $. 
Specifically, for some fixed integer $\\kK\\geq 2$, we divide the index set $\\I=\\{1,\\ldots,n\\}$ into $\\kK$ disjoint subsets $\\I_1,\\ldots,\\I_\\kK$ of the same size $n_\\kK:=n/\\kK$ without loss of generality. Let $\\mhatnk(\\cdot) $ be an estimator for $m^*(\\cdot) $ using the set $\\cl_k^-:=\\{\\bfZ_i:i\\in\\I_k^-\\}$ of size $n_{\\kK^-}:=n-n_\\kK$, where $\\I_k^-:=\\I\\setminus\\I_k$. Then\\tcr{,} we define\\tcr{:}\n\\be\n\\mhatn(\\X_i)&~:=~&\\kK^{-1}\\sk\\mhatnk(\\X_i)\\quad (i=n+1,\\ldots,n+N), \\quad \\tcr{\\mbox{and}} \\label{ds1}\\\\\n\\mhatn(\\X_i)&~:=~&\\mhatnk(\\X_i)\\quad (i\\in\\I_k;\\ k=1,\\ldots,\\kK). \\label{ds2}\n\\ee\nThe motivation \\tcr{for the} %of\ncross fitting is to bypass technical challenges from the dependence of $\\mhatn(\\cdot) $ and $\\X_i$ in the term $\\mhatn(\\X_i) $ $(i=1,\\ldots,n) $. Without cross fitting, the same theoretical conclusions require more \\emph{stringent} assumptions in the same spirit as the stochastic equicontinuity conditions in the classical theory of empirical processes. These assumptions are generally hard to verify and less likely to hold in high dimensional scenarios. Essentially, using cross fitting makes the second-order errors in the stochastic expansion of $\\muhatss$ easier to control while \\emph{not} changing the first-order properties, i.e., the influence function of $\\muhatss$. See Theorem 4.2 and the following discussion in \\citet{chakrabortty2018efficient}, as well as \\citet{chernozhukov2018double} and \\citet{newey2018cross}, for more discussion concerning cross fitting. Analogously, when estimating $\\pi(\\cdot) $, we use $\\cu$ only so that $\\pihatN(\\cdot) $ and $\\X_i$ are independent in $\\pihatN(\\X_i) $ $(i=1,\\ldots,n) $. 
Discarding $\\cl$ herein is asymptotically negligible owing to the assumption \\eqref{disproportion}.\n\n\\vskip0.1in\nThe definition \\eqref{ss_ate} equips us with a \\emph{family} of SS estimators for $\\mu_0$, \\emph{indexed} by $\\pihatN(\\cdot) $ and $\\mhatn(\\cdot) $. To derive their limiting properties, we need the following \\tcr{(high-level) } conditions.\n\n\\begin{assumption}\\label{api4}\nThe function $\\hD(\\x):=\\{\\pihatN(\\x)\\}^{-1}-\\{\\pis(\\x)\\}^{-1}$ satisfies\\tcr{:}\n\\be\n&&(\\E_\\X[\\{\\hD(\\X)\\}^2])^{1/2}~=~O_p(s_N), ~~ \\tcr{\\mbox{and}} \\label{sn2} \\\\\n&&\\{\\E_\\Z([\\hD(\\X)\\{Y- m^*(\\X) \\}]^2)\\}^{1/2}~=~O_p(b_N)\\tcr{,} \\label{sn4}\n\\ee\nfor some positive sequences $s_N$ and $b_N$ that \\tcr{can possibly diverge,} %are possibly divergent,\nwhere $\\pis(\\cdot) $ is \\tcr{\\it some} function \\tcr{(target of $\\pihatN(\\cdot) $) } such that $\\pis(\\x)\\in(c,1-c) $ for any $\\x\\in\\mx$ and some constant $c\\in(0,1) $.\n\\end{assumption}\n\n\\begin{assumption}\\label{ahmu}\nThe estimator $\\mhatnk(\\cdot) $ satisfies\\tcr{: for {\\it some} function $m^*(\\cdot) $,} %{$\\in L_2(\\P_{\\x}) $,}\n\\be\n&&\\E_\\X\\{|\\hat{m}_{n,k}(\\X)- m^*(\\X) |\\}~=~O_p(w_{n,1}), ~~ \\tcr{\\mbox{and}} \\label{wn1}\\\\\n&&(\\E_\\X[\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\}^2])^{1/2}~=~O_p(w_{n,2})\\quad (k=1,\\ldots,\\kK)\\tcr{,}\n\\label{wn2}\n\\ee\nfor some positive sequences $w_{n,1}$ and $w_{n,2}$ that are possibly divergent.\n\\end{assumption}\n\n\\begin{remark}\\label{remark_ate_assumptions}\nAssumptions \\ref{api4}--\\ref{ahmu} impose some rather mild \\tcr{(and \\emph{high-level}) } regulations on the behavior of the estimators $\\{\\pihatN(\\cdot),\\mhatn(\\cdot)\\}$ and their possibly \\tcr{\\it misspecified} limit\\tcr{s} $\\{\\pis(\\cdot),m^*(\\cdot)\\}$. 
The\ncondition \\eqref{sn4} is satisfied when, for example, the function $\\hD(\\X) $ is such that $(\\E_\\X[\\{\\hD(\\X)\\}^4])^{1/4}=O_p(b_N) $\\tcr{,} while $Y$ and $m^*(\\X) $ have finite fourth moments. The restriction on $\\pis(\\cdot) $ in Assumption \\ref{api4} is the counterpart of the second condition in \\eqref{mar_positivity} under model misspecification, ensuring our estimators $\\muhatss$ have influence functions with finite variances; see Theorem \\ref{thate}. Moreover, it is noteworthy that all the sequences in Assumptions \\ref{api4}--\\ref{ahmu} are allowed to \\tcr{\\emph{diverge},} while specifying \\tcr{\\emph{only}} the rates of finite norms \\tcr{(i.e., $L_r$ moments for some finite $r$) } %for\n\\tcr{of} $\\hD(\\X) $ and $\\mhatnk(\\X)-m^*(\\X) $, \\tcr{which is} weaker than requiring their convergences uniformly \\tcr{over} %\n$\\x\\in\\mx$ \\tcr{(i.e., $L_{\\infty}$ convergence) }. These assumptions will be verified for some choices of $\\{\\pihatN(\\cdot),\\mhatn(\\cdot),\\pis(\\cdot),m^*(\\cdot)\\}$ in Section \\ref{secnf}.\n\\end{remark}\n\nIn the theorem below, we present the stochastic expansion \\tcr{(and a complete characterization of the asymptotic properties) } of our SS estimators $\\muhatss$ defined in \\eqref{ss_ate}.\n\n\\begin{theorem}\\label{thate}\nUnder Assumptions \\ref{ass_equally_distributed} and \\ref{api4}--\\ref{ahmu}, the stochastic expansion of $\\muhatss$ is\\tcr{:}\n\\bse\n&&\\muhatss-\\mu_0~=~\\nn\\sl\\zeta_{n,N}(\\Z_i)~+~O_p\\{n^{-1/2}(w_{n,2}+b_N)+s_N\\,w_{n,2}\\}~+ \\\\\n&& \\phantom{\\muhatss-\\mu_0~=~}~I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(w_{n,1})~+~I\\{ m^*(\\X) \\neq m(\\X) \\}O_p(s_N)\\tcr{,}\n\\ese\nwhen $\\nu\\geq 0$, where \\tcr{$I(\\cdot) $ is the indicator function as defined earlier, and}\n\\bse\n\\zeta_{n,N}(\\Z)~:=~\\{\\pis(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}~+~\\E_{n+N}\\{ m^*(\\X) \\}~-~\\mu_0\\tcr{,}\n\\ese\n%satisfying\n\\tcr{with} $\\E\\{\\zeta_{n,N}(\\Z)\\}=0$ \\tcr{if} %given\neither 
$\\pis(\\X)=\\pi(\\X) $ or $m^*(\\X) = m(\\X) $ but not necessarily both.\n\\end{theorem}\n\nTheorem \\ref{thate} establishes the \\tcr{\\emph{asymptotic linearity}} of $\\muhatss$ for the \\tcr{\\emph{general}} case where $\\nu\\geq 0$, i.e., the labeled and unlabeled data sizes are either comparable or not. Considering, \\tcr{however, the typical case is that} the number of the extra observations for $(T,\\X\\trans)\\trans$, whose distribution completely determines the propensity score $\\pi(\\X) $, from the unlabeled data $\\cu$ is much larger than the labeled data size $n$ in the SS setting \\eqref{disproportion}, \\tcr{i.e., $\\nu = 0$}, it is fairly reasonable to assume that $\\pi(\\X) $ can be correctly specified \\tcr{(i.e., $\\pis(\\cdot) = \\pi(\\cdot) $) \\emph{and}} estimated \\tcr{from $\\cu$} at a rate \\tcr{\\emph{faster}} than $n^{-1/2}$. We therefore study the asymptotic behavior of our proposed estimators $\\muhatss$ under such an assumption in the next corollary, which directly follows from Theorem \\ref{thate}.\n\n\\begin{corollary}\\label{corate}\nSuppose that the conditions in Theorem \\ref{thate} hold true, that $\\nu=0$, \\tcr{as in \\eqref{disproportion}}, and that $\\pis(\\X)=\\pi(\\X) $. Then the stochastic expansion of $\\muhatss$ is\\tcr{:}\n\\bse\n&&\\muhatss-\\mu_0~=~\\nn\\sl\\zess(\\Z_i)~+~O_p\\{n^{-1/2}(w_{n,2}+b_N)+s_N\\,w_{n,2}\\}~+ \\\\\n&&\\phantom{\\muhatss-\\mu_0~=~}~I\\{ m^*(\\X) \\neq m(\\X) \\}O_p(s_N),\n\\ese\nwhere\n\\bse\n\\zess(\\Z)~:=~\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\} ~+~ \\E\\{ m^*(\\X) \\} ~-~ \\mu_0\\tcr{,}\n\\ese\nsatisfying $\\E\\{\\zess(\\Z)\\}=0$\\tcr{, and with $m^*(\\cdot) $ being arbitrary (i.e., not necessarily equal to $m(\\cdot) $) }. 
Further, if either $s_N=o(n^{-1/2}) $ or $ m^*(\\X) = m(\\X) $ but not necessarily both, and\n\\bse\nn^{-1/2}(w_{n,2}+b_N)+s_N\\,w_{n,2}~=~o(n^{-1/2}),\n\\ese\nthe limiting distribution of $\\muhatss$ is\\tcr{:}\n\\be\nn^{1/2}\\lamss^{-1}(\\muhatss-\\mu_0)~\\xrightarrow{d}~\\mn(0,1)\\quad (n,\\tcr{N} \\to\\infty),\n\\label{ate_normality}\n\\ee\nwhere the asymptotic variance $\\lamss^2:=\\E[\\{\\zess(\\Z)\\}^2]=\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}]$ can be estimated by $\\var_n[\\{\\pihatN(\\X)\\}^{-1}T\\{Y-\\mhatn(\\X)\\}]$.\n\\end{corollary}\n\n\\begin{remark}[Robustness \\tcr{benefits} and first-order insensitivity of  $\\muhatss$]\\label{remark_ate_robustness}\nAccording to the conclusions in Theorem \\ref{thate}, as long as the residual terms in the expansion vanish asymptotically, our proposed estimators $\\muhatss$ converge to $\\mu_0$ in probability given  either $\\pihatN(\\cdot) $ targets the true $\\pi(\\cdot) $ or $\\mhatnk(\\cdot) $ estimates the true $m(\\cdot) $\\tcr{,} but \\tcr{\\it not} necessarily both. Apart from such \\tcr{a} DR property\\tcr{,} which can be attained using only the labeled data $\\cl$ as well \\citep{bang2005doubly, kang2007demystifying}, Corollary \\ref{corate} further establishes the $n^{1/2}$-consistency and asymptotic normality of $\\muhatss$, two critical properties for inference, \\tcr{\\it whenever} $\\pihatN(\\X) $ converges to $\\pi(\\X) $ at a rate faster than $n^{-1/2}$, via exploiting the information regarding the distribution of $(T,\\X\\trans)\\trans$ from the unlabeled data $\\cu$. \\tcr{Notably, this holds {\\it regardless} of whether $m(\\cdot) $ is correctly specified or not}. 
To attain the same \\tcr{kind of}\nresult without $\\cu$, it is generally necessary to require that $\\{\\pi(\\cdot),m(\\cdot)\\}$ are both correctly specified unless additional bias corrections are applied \\tcr{(and in a nuanced targeted manner) } and \\tcg{specific (linear$/$logistic) forms of $\\{\\pi(\\cdot),m(\\cdot)\\}$ are assumed} \\citep{vermeulen2015bias, smucler2019unifying, tan2020model, dukes2021inference}. \\tcg{Such a significant relaxation of the requirements demonstrates that our SS ATE estimators %for the ATE\nactually enjoy \\tcr{much} better robustness relative to the ``best\'\' achievable estimators in purely supervised setups.} \\tcr{These %robustness\nbenefits of SS causal inference ensure {\\it $n^{1/2}$-rate inference on the ATE (or QTE) can be achieved in a seamless\nway}, regardless of the misspecification of the outcome model, and moreover, without requiring any specific forms for either of the nuisance model(s).}\n\\tcg{\\tcr{It should also be noted that these benefits are} %This is\nquite different \\tcr{in flavor} from \\tcr{those in} many ``standard\'\' \\tcr{(non-causal) } SS problems, such as mean estimation \\citep{zhang2019semi, zhang2019high} and linear regression \\citep{azriel2016semi, chakrabortty2018efficient}, where the supervised methods possess full robustness \\tcr{(\\tcg{as} %\\tcg{since} %and\nthe parameter needs no nuisance function for \\tcr{its} identification) } and the \\tcr{main goal of SS inference is efficiency improvement.} %only concern is efficiency.\n\\tcr{For causal inference, however, %(under a potential outcome framework), however,\nwe have a more challenging setup,\n%We actually consider a more challenging setting\nwhere the supervised methods have to deal with nuisance functions -- inherently required for the parameter\'s identification and consistent estimation -- and are} %while\nno longer fully robust. 
%This feature of the potential outcome framework enables our\n\\tcr{The} SS \\tcr{setup enables one} %estimators\nto attain extra robustness, compared to purely supervised methods, from leveraging the unlabeled data.} \\tcr{Thus, for causal inference, the SS setting in fact provides a {\\it broader scope of improvement -- in both robustness and efficiency} -- we discuss the latter aspect in Section \\ref{sec_ate_efficiency_comparison} below.}\n%\n%ETT I DONT THINK THIS IS TECHNICALLY TRUE, THE OTHER SETTINGS ARE PURELY ROBUST AS THEY DONT REQUIRE ANY NUISANCE PARAMETER IN THE SUPERVISED SETTING. I THINK BETTER TO STRESS THAT THE SITUATION HERE IS MORE CHALLENGING BECAUSE THE SUPERVISED ESTIMATOR HAS TO DEAL W NUISANCE FUNCTION WHICH UNLABELED DATA CAN ROBUSTIFY. \\tcr{**Addressed -- Yes, totally agree. We have completely rewritten this portion to make this more clear. Hope this reads better -- AC.**}\n%\n\\tcr{Lastly, a}nother notable feature of $\\muhatss$ is \\tcr{its} \\tcr{\\it first-order insensitivity}, i.e., the influence function $\\zeta_{n,N}(\\Z) $ in Theorem \\ref{thate} is not affected by estimation errors or %construction knowledge\n\\tcr{any knowledge of the mode of construction} of the nuisance estimators. This is \\tcr{particularly} desirable for \\tcr{($n^{1/2}$-rate) } inference %particularly\nwhen $\\{\\pihatN(\\cdot),\\mhatn(\\cdot)\\}$ involves non-parametric calibrations\\tcr{, or machine learning methods, with slow/unclear first order rates,} %unclear first order properties %like kernel smoothing,\nor \\tcr{if} $\\X$ is high dimensional.\n\\end{remark}\n%ETT: THIS RESULT IS REALLY COOL !!!! \\tcr{** Thanks!! -- AC.**}\n\n\\subsection{Efficiency comparison}\\label{sec_ate_efficiency_comparison}\nIn this \\tcr{s}ection, we analyze the efficiency gain of $\\muhatss$ relative to its  supervised counterparts. 
We have \\tcr{already} clarified in Remark \\ref{remark_ate_robustness} %that\nthe robustness \\tcr{benefits} of $\\muhatss$ %is\n\\tcr{that are} generally not attainable by purely supervised methods.\n%\n%ETT NOT SURE I UNDERSTAND WHAT THIS IS DOING, ARE YOU TRYING TO SHOW THAT THE EFFICIENCY BOUNDS ARE NOT THE SAME FOR SUP VS SS?WHY NOT SAY AS MUCH? IM UNCLEAR WHY CONSIDER THE PSEUDO SUPERVISED ESTIMATOR?iTHINK REVIEWERS WILL STRUGGLE TO UNDERSTAND WHAT IS GOING ON HERE. CAN YOU DESCRIBE ALL RESULTS IN ENGLISH BEFORE STATING THEM FORMALLY \\tcr{**Addressed -- I have rewritten this portion a bit. Hope this looks better now -- AC.**}\n%\n\\tcr{Therefore, setting aside this already existing improvement (which is partly due to the fact that the SS setup allows $\\pi(\\cdot) $ to be estimated better, via $\\pihatN(\\cdot) $ from $\\cu$), and to ensure}\n%To eradicate such inequality \\tcg{in robustness} and make\na ``fair\'\' comparison \\tcr{(with minimum distraction) }, focusing \\tcr{\\it solely} on efficiency, we consider \\tcg{the} {\\it pseudo-supervised} estimator\\tcr{(s):}\n\\be\n\\muhatsup^*~:=~  \\E_{n}\\{\\mhatn(\\X)\\}+\\E_n[\\{\\pihatN(\\X)\\}^{-1}T\\{Y-\\mhatn(\\X)\\}],\n\\label{pseudo_sup_ate}\n\\ee\nwhich estimates $\\pi(\\cdot) $ by $\\pihatN(\\cdot) $\\tcr{,}\nbut does not employ $\\cu$ to approximate $\\E_\\X\\{\\mhatn(\\X)\\}$. \\tcr{(So it is essentially a version of the purely supervised estimator $\\muhatsup$ in \\eqref{sup_ate} with $\\pihatn(\\cdot) $ therein replaced by $\\pihatN(\\cdot) $, due to the reasons stated above.) } \\tcg{Here we emphasize that, as the name ``pseudo-supervised\'\' suggests, \\emph{they \\tcr{\\it cannot} actually be constructed in purely supervised settings and are proposed just for efficiency comparison}}. 
\\tcr{In a sense, this gives the supervised estimator its best chance to succeed -- in terms of efficiency (setting aside any of its robustness drawbacks) -- and yet, as we will discuss in Remark \\ref{remark_ate_efficiency}, they are {\\it still} outperformed by our SS estimator(s).}\n\n\\vskip0.05in\nWe state %their properties\n\\tcr{the properties of these pseudo-supervised estimator(s) } in the following corollary, which can be proved analogously to Theorem \\ref{thate} and Corollary \\ref{corate}, \\tcr{and then compare their efficiency (i.e., the ideal supervised efficiency) to that of our SS estimator(s) in Remark \\ref{remark_ate_efficiency}.} %, where we also demonstrate that our SS estimators under certain conditions are also semi-parametric efficient.} %(i.e., the best case supervised efficiency) to our SS estimators in Remark \\ref{remark_ate_efficiency} (i.e., the best} %case supervised efficiency) to our SS estimators in Remark \\ref{remark_ate_efficiency}}\n\n\\begin{corollary}\\label{coratesup}\nUnder the \\tcr{same} conditions \\tcr{as} in Corollary \\ref{corate}, the pseudo-supervised estimator $\\muhatsup^*$ in \\eqref{pseudo_sup_ate} \\tcr{satisfies the following expansion:} %is such that\n\\be\n&&\\muhatsup^*-\\mu_0~=~n^{-1}\\sl\\zes(\\Z_i)~+~O_p\\{n^{-1/2}(w_{n,2}+b_N)+s_N\\,w_{n,2}\\}~+ \\nonumber\\\\\n&&\\phantom{\\muhatsup^*-\\mu_0~=~}~I\\{ m^*(\\X) \\neq m(\\X) \\}O_p(s_N), ~~\\tcr{\\mbox{and}}\\nonumber\\\\\n&&n^{1/2}\\lams^{-1}(\\muhatsup^*-\\mu_0)~\\xrightarrow{d}~\\mn(0,1)\\quad (n, \\tcr{N} \\to\\infty), ~~\\tcr{\\mbox{where}} \\label{ate_sup_normality}\n\\ee\n%where\n$\\zes(\\Z):=\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}+ m^*(\\X) -\\mu_0$\\tcr{,} satisfying $\\E\\{\\zes(\\Z)\\}=0$\\tcr{,} and\n\\bse\n&&\\lams^2~:=~\\E[\\{\\zes(\\Z)\\}^2]~=~\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}]- \\var\\{ m^*(\\X) \\}~+\\\\ &&\\phantom{\\lams^2~:=~\\E[\\{\\zes(\\Z)\\}^2]~=~}~2\\,\\E\\{ m^*(\\X) 
(Y-\\mu_0)\\}.\n\\ese\n\\end{corollary}\n\n\\begin{remark}[Efficiency improvement \\tcr{of $\\muhatss$ and semi-parametric optimality}]\\label{remark_ate_efficiency}\nIf the conditions in Corollary \\ref{corate} hold and the imputation function takes the form\\tcr{:}\n\\be\nm^*(\\X)~\\equiv~\\E\\{Y\\mid \\bfg(\\X)\\} \\tcr{,}\n\\label{mstarX}\n\\ee\nwith some \\tcr{(possibly) } unknown function $\\bfg(\\cdot) $, the SS variance $\\lamss^2$ in \\eqref{ate_normality} is less than or equal to the supervised variance $\\lams^2$ in \\eqref{ate_sup_normality}, i.e.,\n\\bse\n\\lamss^2~=~\\lams^2-2\\,\\E\\{ m^*(\\X) (Y-\\mu_0)\\}+\\var\\{ m^*(\\X) \\}~=~\\lams^2-\\var\\{ m^*(\\X) \\}~\\leq~ \\lams^2,\n\\ese\nwhich implies that $\\muhatss$ is equally or more efficient compared to the pseudo-supervised estimator $\\muhatsup^*$. An example of the function $\\bfg(\\x) $ is the linear transformation $\\bfg(\\x)\\equiv\\mbP_0\\trans\\x$, where $\\mbP_0$ is some unknown $r\\times p$ matrix with a fixed $r\\leq p$ and can be estimated\\tcr{, e.g.,} by dimension reduction techniques such as %the\n\\tcr{sliced} inverse regression \\citep{li1991sliced, lin2019sparse}\\tcr{, as well as by standard parametric (e.g., linear/logistic) regression (for the special case $r=1$).}\n\nFurther, when the outcome model is correctly specified, i.e., $m^*(\\X)=\\E(Y\\mid \\X) $, we have\\tcr{:}\n\\be\n\\lamss^2&~\\equiv~&\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}]\\nonumber\\\\\n&~=~&\\E[\\{\\pi(\\X)\\}^{-2}T\\{Y- \\E(Y\\mid\\X) \\}^2]\\label{ate_eff}\\\\\n&~\\leq~&\\E[\\{\\pi(\\X)\\}^{-2}T\\{Y- g(\\X) \\}^2]\\tcr{,} \\nonumber\n\\ee\nfor any function $g(\\cdot) $ and the equality holds only if $g(\\X)=\\E(Y\\mid\\X) $ almost surely. This fact demonstrates the asymptotic \\emph{optimality} of $\\muhatss$ among all regular and asymptotically linear estimators of $\\mu_0$, whose influence functions take the form $\\{\\pi(\\X)\\}^{-1}T\\{Y-g(\\X)\\}$ for some function $g(\\cdot) $. 
Under the semi-parametric model of $(Y,\\X\\trans,T)\\trans$, \\tcg{given by the following class of allowable distributions \\tcr{(the most unrestricted class naturally allowed under our SS setup) }:}\n\\be\n\\tcg{\\{\\P_{(Y,T,\\X\\trans)\\trans}: \\hbox{ \\eqref{mar_positivity} is satisfied, }\\P_{(T,\\X\\trans)\\trans} \\hbox{ is known and } \\P_{Y\\mid(T,\\X\\trans)\\trans}  \\hbox{ is unrestricted}\\},}\n\\label{semiparametric_model}\n\\ee\n%satisfying \\eqref{mar_positivity}, where the distribution of $(T,\\X\\trans)\\trans$ is known and that of $Y$ is unrestricted,\none can show that (\\ref{ate_eff}) equals the efficient asymptotic variance for estimating $\\mu_0$, that is, the estimator $\\muhatss$ \\emph{achieves the semi-parametric efficiency bound}; \\tcr{see Remark 3.1 of \\citet{chakrabortty2018efficient}\\tcr{, and also the results of \\citet{kallus2020role},} for similar bounds}. In Section \\ref{sec_nf_ate}, we would detail the above choices of $m^*(\\cdot) $ and some corresponding estimators $\\mhatnk(\\cdot) $. \\tcr{Lastly, it is worth noting that the efficiency bound here not surprisingly is lower compared to the supervised case showing the scope of efficiency gain (apart from robustness) in SS setups.}\n\\end{remark}\n\n\\subsection[Case where T is not observed in U]{Case where $T$ is not observed in $\\cu$}\\label{sec_ate_u_dagger}\nSo far, we have focused on \\tcr{the case} %cases\nwhere the unlabeled data contains observations for both the treatment indicator $T$ and the covariates $\\X$. We now briefly discuss settings where $T$ is \\emph{not} observed in the unlabeled data. Based on the sample $\\cl\\cup\\cu^\\dag$\\tcr{,} with $\\cu^\\dag:=\\{\\X_i:i=n+1,\\ldots,n+N\\}$, we introduce \\tcr{the \\emph{SS estimators $\\muhatss^\\dag$}:}%, given by:}\n\\be\n\\muhatss^\\dag~:=~  \\E_{n+N}\\{\\mhatn(\\X)\\}+\\E_n[\\{\\pihatn(\\X)\\}^{-1}T\\{Y-\\mhatn(\\X)\\}]\n\\label{hatmuss_dag}\n\\ee\nfor $\\mu_0$. 
Here $\\pihatn(\\cdot) $ is constructed \\tcr{-- this time solely from $\\cl$ --} through a cross fitting procedure similar to \\eqref{ds2}\\tcr{,} so that $\\pihatn(\\cdot) $ and $\\X_i$ are independent in $\\pihatn(\\X_i) $ $(i=1,\\ldots,n) $. Specifically, we let $\\pihatn(\\X_i):=\\pihatnk(\\X_i) $ $(i\\in\\cl_k) $ with $\\pihatnk(\\cdot) $ some estimator for $\\pi(\\cdot) $ based on \\tcr{$\\cl_k^-$} $(k=1,\\ldots,\\kK) $. See the discussion below \\eqref{ds2} for the motivation and benefit of cross fitting.\n\nCompared to $\\muhatss$, the estimators $\\muhatss^\\dag$ substitute $\\pihatn(\\cdot) $ for $\\pihatN(\\cdot) $, approximating the working propensity score model $\\pis(\\cdot) $ using $\\cl$ only. We thus impose the following condition on the behavior of $\\pihatn(\\cdot) $, \\tcr{as} a counterpart of \\tcr{our earlier} Assumption \\ref{api4}.\n\n\\begin{assumption}\\label{apin4}\nThe function $\\hat{D}_{n,k}(\\x):=\\{\\pihatnk(\\x)\\}^{-1}-\\{\\pis(\\x)\\}^{-1}$ satisfies\\tcr{:}\n\\bse\n(\\E_\\X[\\{\\hat{D}_{n,k}(\\X)\\}^2])^{1/2}~=~O_p(s_n), ~~\\tcr{\\mbox{and}}~~ \\{\\E_\\Z([\\hat{D}_{n,k}(\\X)\\{Y- m^*(\\X) \\}]^2)\\}^{1/2}~=~O_p(b_n)\\tcr{,}\n\\ese\nfor some positive sequences $s_n$ and $b_n$ $(k=1,\\ldots,\\kK) $.\n\\end{assumption}\n\nReplacing $\\pihatN(\\cdot) $ by $\\pihatn(\\cdot) $ in Corollary \\ref{corate}, we immediately obtain the next corollary regarding the properties of $\\muhatss^\\dag$. \\tcr{(This serves as the counterpart of our Corollary \\ref{corate} on $\\muhatss$.) 
}\n\n\\begin{corollary}\\label{corate_dagger}\nUnder Assumptions \\ref{ass_equally_distributed}, \\ref{ahmu} and \\ref{apin4} as well as the condition that $\\nu=0$ \\tcr{as in \\eqref{disproportion}}, the SS estimator $\\muhatss^\\dag$ defined by \\eqref{hatmuss_dag} has the stochastic expansion\\tcr{:}\n\\bse\n&&\\muhatss^\\dag-\\mu_0~=~\\nn\\sl\\zess(\\Z_i)~+~O_p\\{n^{-1/2}(w_{n,2}+b_n)+s_n\\,w_{n,2}\\}~+ \\\\\n&&\\phantom{\\muhatss-\\mu_0~=~}~I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(w_{n,1})~+~I\\{ m^*(\\X) \\neq m(\\X) \\}O_p(s_n), ~~\\tcr{\\mbox{where}}\n\\ese\n%where\n{\\cred $ %\\bse\n\\zess(\\Z)~\\equiv~\\{\\pis(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}+\\E\\{ m^*(\\X) \\}~-~\\mu_0\\tcr{,}\n$} %\\ese\n\\tcr{as in Corollary \\ref{corate},} satisfying $\\E\\{\\zess(\\Z)\\}=0$ given either $\\pis(\\X)=\\pi(\\X) $ or $ m^*(\\X) = m(\\X) $ but not necessarily both.\n\n\\vskip0.05in\n\\tcr{Further,} if $\\pis(\\X)=\\pi(\\X) $, $ m^*(\\X) = m(\\X) $ and\n{\\cred $ %\\bse\nn^{-1/2}(w_{n,2}+b_n)+s_n\\,w_{n,2}~=~o(n^{-1/2}),\n$} %\\ese\n%then\n\\be\n\\tcr{\\mbox{then}} ~~~ n^{1/2}\\lamss^{-1}(\\muhatss^\\dag-\\mu_0)~\\xrightarrow{d}~\\mn(0,1)\\quad (n, \\tcr{N} \\to\\infty)\\tcr{,}\n\\label{ate_ss_dagger_normality}\n\\ee\nwith $\\lamss^2\\equiv\\E[\\{\\zess(\\Z)\\}^2]=\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m(\\X) \\}]$.\n\\end{corollary}\n\n\\begin{remark}[Comparison of estimators using different types of data]\\label{remark_hatmuss_dag}\nWe can see \\tcr{from} %in\nCorollary \\ref{corate_dagger} that $\\muhatss^\\dag$ possesses the same robustness as the supervised estimator $\\muhatsup$ in \\eqref{sup_ate}. Specifically, it is consistent whenever one \\tcr{among} %of\n$\\{\\pi(\\cdot), m(\\cdot)\\}$ is correctly specified, while its $n^{1/2}$-consistency and asymptotic normality in \\eqref{ate_ss_dagger_normality} require both \\tcr{to be correct}. 
As regards %to\nefficiency, as long as the limiting distribution \\eqref{ate_ss_dagger_normality} holds, the asymptotic variance $\\lamss^2$ of $\\muhatss^\\dag$ equals that of $\\muhatss$ in Theorem \\ref{thate}, implying that $\\muhatss^\\dag$ outperforms $\\muhatsup$ and enjoys semi-parametric optimality as discussed in Remark \\ref{remark_ate_efficiency}. We summarize in Table \\ref{table_ate_summary} \\tcr{the} achievable properties of \\tcr{all} the ATE estimators based on different types of available data. Estimation of the QTE using the data $\\cl\\cup\\cu^\\dag$ is similar in spirit while technically more laborious. We will therefore omit the relevant discussion considering that such a\nsetting is not %of\n\\tcr{our} main interest.\n\\end{remark}\n\n\\vskip-0.2in\n\\begin{table}[H]\n\\def~{\\hphantom{0}}\n\\caption{\n\\tcr{SS ATE estimation and its benefits: a complete picture of %. The table shows\nthe a}chievable \\tcr{robustness and efficiency} properties of the ATE estimators based on different types of available data. Here\\tcr{,} the efficiency (Eff.) gain is relative to the supervised estimator \\eqref{sup_ate} when $\\{m^*(\\cdot),\\pi^*(\\cdot)\\}=\\{m(\\cdot),\\pi(\\cdot)\\}$, \\tcr{while} the optimality (Opt.) %means\n\\tcr{refers to} attaining the \\tcr{corresponding} semi-parametric efficiency bound. The abbreviation $n^{1/2}$-CAN stands for $n^{1/2}$-consistency and asymptotic normality\\tcr{, while DR stands for doubly robust (in terms of consistency only).}}\n{\n\\begin{tabular}{c||c|c|c|c|c}\n\\hline\n\\multirow{3}{*}{Data} & \\multirow{3}{*}{DR} & \\multicolumn{2}{c|}{$n^{1/2}$-CAN} & \\multirow{3}{*}{Eff. 
gain} & \\multirow{3}{*}{Opt.} \\\\ \\cline{3-4}\n& &$ \\pis(\\cdot)=\\pi(\\cdot) $ & $ \\pis(\\cdot)=\\pi(\\cdot) $& & \\\\\n& & $m^*(\\cdot)=m(\\cdot) $ &$m^*(\\cdot)\\neq m(\\cdot) $ & & \\\\\n\\hline\n$\\cl$ & \\cmark & \\cmark & \\xmark & \\xmark & \\xmark \\\\\n$\\cl\\cup\\cu^\\dag$ & \\cmark & \\cmark & \\xmark & \\cmark & \\cmark \\\\\n$\\cl\\cup\\cu$ & \\cmark & \\cmark & \\cmark & \\cmark &\\cmark \\\\\n\\hline\n\\end{tabular}}\n\\label{table_ate_summary}\n\\end{table}\n\n\\subsection{Final \\tcr{SS} estimator for the ATE}\\label{sec_ate_difference}\nIn %the above,\n\\tcr{Sections \\ref{sec_ate_ss}--\\ref{sec_ate_efficiency_comparison},}\nwe have established the asymptotic properties of our SS estimator $\\muhatss\\equiv\\muhatss(1) $ for $\\mu_0\\equiv\\mu_0(1) $. We now propose  \\tcr{our \\emph{final SS estimator for the ATE,} i.e., the difference $\\mu_0(1)-\\mu_0(0) $ in \\eqref{ate}, as: $\\muhatss(1)-\\muhatss(0) $, with} %\\tcg{for the ATE}, i.e., the difference $\\mu_0(1)-\\mu_0(0) $, with\n\\bse\n\\muhatss(0)~:=~\\E_{n+N}\\{\\mhatn(\\X,0)\\}+\\E_n[\\{1-\\pihatN(\\X)\\}^{-1}(1-T)\\{Y-\\mhatn(\\X,0)\\}],\n\\ese\nwhere the estimator $\\mhatn(\\X,0) $ is constructed by cross fitting procedures similar to \\eqref{ds1}--\\eqref{ds2} and has a probability limit $m^*(\\X,0) $, a working outcome model for the conditional expectation $\\E\\{Y(0)\\mid\\X\\}$. Adapting Theorem \\ref{thate} and Corollary \\ref{corate} with $\\{Y,T\\}$ therein replaced by $\\{Y(0),1-T\\}$, we can directly obtain theoretical results \\tcr{for} %of\n$\\muhatss(0) $ including its stochastic expansion and limiting distribution. By arguments analogous to those in Remarks \\ref{remark_ate_robustness}--\\ref{remark_ate_efficiency}, one can easily conclude the double robustness, asymptotic normality, efficiency gain compared to the supervised counterparts and semi-parametric optimality of $\\muhatss(0) $. 
Also, it is straightforward to show that these properties are possessed by the difference estimator $\\muhatss(1)-\\muhatss(0) $ as well. Among all the above conclusions, a particularly important one is that\\tcr{:}\n\\be\nn^{1/2}\\lamate^{-1}[\\{\\muhatss(1)-\\muhatss(0)\\}-\\{\\mu_0(1)-\\mu_0(0)\\}]~\\xrightarrow{d}~\\mn(0,1)\\quad (n, \\tcr{N} \\to\\infty)\\tcr{,}\n\\label{ate_difference_distribution}\n\\ee\nunder the conditions in Corollary \\ref{corate} for $\\muhatss(1) $ as well as their counterparts for $\\muhatss(0) $, where the asymptotic variance\\tcr{:}\n\\bse\n\\lamate^2~:=~\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}-\\{1-\\pi(\\X)\\}^{-1}(1-T)\\{Y(0)- m^*(\\X,0) \\}]\n\\ese\ncan be estimated by\\tcr{:}\n\\bse\n\\var_n[\\{\\pihatN(\\X)\\}^{-1}T\\{Y- \\mhatn(\\X) \\}-\\{1-\\pihatN(\\X)\\}^{-1}(1-T)\\{Y(0)- \\mhatn(\\X,0) \\}].\n\\ese\nIn theory, the limiting distribution \\eqref{ate_difference_distribution} provides the basis for \\tcr{our SS} inference regarding the ATE\\tcr{:} $\\mu_0(1)-\\mu_0(0) $; see the data analysis in Section \\ref{sec_data_analysis} for an instance of its application.\n\n%\\begin{remark}[Comparison with \\citet{zhang2019high}]\\label{remark_comparison_zhang2019}\n%\tIt is worth mentioning that our work on ATE has some resembles with a recent article of \\citet{zhang2019high}, who discussed SS inference for the ATE as an illustration of their SS mean estimation method and mainly focused on using a linear working model for $\\E(Y\\mid\\X) $. We, however, treat this problem in a more general and detailed manner. Specifically, we allow for a wide range of methods to approximate the nuisance functions with possible misspecification while having reached in this section a thorough and complete understanding on the robustness and efficiency of our estimators. 
Further, although employing random forest and XGBoost, besides linear regression, to construct their estimators in the numerical study, \\citet{zhang2019high} did not justify these methods theoretically. In contrast, we will study carefully a more principled outcome model estimator based on kernel smoothing, inverse probability weighting and dimension reduction, establishing novel results of its uniform convergence rate, which verify the high-level conditions required in Corollary \\ref{corate} and ensure the efficiency superiority of our method discussed in Remark \\ref{remark_ate_efficiency}; see Section \\ref{sec_nf_ate} for detail. More importantly, we also consider in the next section QTE estimation, an entire novelty in the area of SS inference.\n%\\end{remark}\n\n\\begin{remark}[Comparison with \\citet{zhang2019high}]\\label{remark_comparison_zhang2019}\nIt is worth mentioning \\tcr{here} that our work on the ATE bears \\tcr{some resemblance} %resembles\nwith \\tcr{the} %a\nrecent article by \\citet{zhang2019high}, who discussed SS inference for the ATE as an illustration of their SS mean estimation method and mainly focused on using a linear working model for $\\E(Y\\mid\\X) $. We, however, treat this problem in more generality \\tcr{-- both in methodology and theory}. 
Specifically, we allow for a wide range of methods to estimate \\tcr{the} nuisance functions \\tcr{in our estimators,} %allowing for some degree of\n\\tcr{allowing flexibility in terms of} model misspecification\\tcr{, and also establish through this whole section a suite of generally applicable results -- with only high-level conditions on the nuisance estimators -- giving a complete understanding/characterization of our SS ATE estimators\' properties, uncovering in the process, various interesting aspects of their robustness and efficiency benefits.}\n%while having \\tcr{established} %reached in this section a thorough and complete understanding \\tcr{of} the robustness and efficiency of our estimators. \\tcr{During the final stages of this work, we did learn (via personal communication) from the authors that in a revised (as-yet-unpublished) version of their paper, they also include a few other choices of nuisance estimators, e.g., random forest and XGBoost, in their numericals. However,}  although employing random forest and XGBoost, besides linear regression, to construct their estimators in the numerical study, \\citet{zhang2019high} did not justify these methods theoretically.\n\\tcr{In Section \\ref{secnf} later,} %Below,\nwe \\tcr{also} provide a careful study of a  \\tcr{family of} outcome model estimators based on kernel smoothing, inverse probability weighting and dimension reduction, %\\tcr{allowing for high dimensional settings and}\nestablishing novel results \\tcr{on} %of\ntheir uniform convergence rates, which verify the high-level conditions required in Corollary \\ref{corate} and ensure the efficiency superiority of our method discussed in Remark \\ref{remark_ate_efficiency}; see Section \\ref{sec_nf_ate} for \\tcr{more}\ndetail\\tcr{s}. 
\\tcr{In general, we believe the SS ATE estimation problem warranted a more detailed and thorough analysis in its own right, as we attempt to do in this paper.} \\tcr{Moreover,} %More importantly,\nwe also consider, \\tcr{as in the next section, the QTE estimation problem, which to our knowledge is an entirely novel contribution in the area of SS (causal) inference}.\n\\end{remark}\n\n\\section{SS estimation for the QTE}\\label{secqte}\nWe now study SS estimation of the QTE \\tcr{in \\eqref{qte}}. As before \\tcr{in Section \\ref{secos}}, we will simply focus \\tcr{here} on \\tcr{SS estimation of the} $\\tau$\\tcr{-}quantile $\\vt\\equiv\\vt(1,\\tau)\\in\\Theta\\subset\\rR$ of $Y\\equiv Y(1) $\\tcr{, as in \\eqref{generic_notation},} with some fixed and known $\\tau\\in(0,1) $. \\tcr{This will be our goal in Sections \\ref{sec_qte_general}--\\ref{sec_qte_efficiency_comparison}, after which we finally address SS inference for the QTE in Section \\ref{sec_qte_difference}.}\n\n\n\\begin{remark}[Technical difficulties \\tcr{with} %of the\nQTE estimation]\\label{qte_challenges}\nWhile the basic ideas \\tcr{underlying the SS estimation of the QTE} %estimation\nare similar in spirit to those in Section \\ref{secos} for the ATE, the inherent inseparability of $Y$ and $\\theta$ in the quantile estimating equation \\eqref{defqte} poses significantly more challenges in both implementation and theory. 
To overcome these difficulties, we use the strategy of one-step update in the construction of our QTE estimators, and \\tcr{also}\ndevelop technical novelties of empirical process theory in the proof of their properties; see Section \\ref{sec_qte_general} as well as Lemma \\ref{1v2} \\tcr{(}in %Section\n\\tcr{Appendix} \\ref{sm_lemmas} of the Supplementary Material\\tcr{) } for \\tcr{more} details.\n\\end{remark}\n\n\\begin{remark}[Semantic clarification for Sections \\ref{sec_qte_general}--\\ref{sec_qte_efficiency_comparison}]\\label{remark_semantics}\n\\tcr{%Before we begin describing our SS QTE estimation approach, we would like to make a clarification here.\nAs mentioned above, our estimand in Sections \\ref{sec_qte_general}--\\ref{sec_qte_efficiency_comparison} is the quantile $\\vt$ of $Y(1) $, not QTE, per se. However, for semantic convenience,\nwe will occasionally refer to it as ``QTE\'\' itself (and the estimators as ``QTE estimators\'\') while presenting our results and discussions in these sections. We hope this %semantic aberration, used mainly for expository convenience, %\nslight abuse of terminology\nis not a distraction, as the true estimand should be clear from context.}\n\\end{remark}\n\n\\subsection[SS estimators for theta0: general construction and properties]{ \\tcr{SS estimators for $\\vt$: g}eneral construction and properties }\\label{sec_qte_general}\n\\tcr{Let us define} %Denote\n$\\phi(\\X,\\theta):=\\E\\{\\psi(Y,\\theta)\\mid\\X\\}$. 
Analogous to \\tcr{the construction} \\eqref{ate_dr_representation} for \\tcr{the mean} $\\mu_0$, we observe that, for arbitrary functions $\\pis(\\cdot) $ and $\\phis(\\cdot,\\cdot) $, the equation \\eqref{defqte} \\tcr{for $\\vt$} satisfies the DR type representation\\tcr{:}\n\\be\n0~=~\\E\\{\\psi(Y,\\vt)\\} ~=~ \\E\\{  \\phis(\\X,\\vt)\\}+ \\E[\\{\\pis(\\X)\\}^{-1} T\\{\\psi(Y,\\vt) -  \\phis(\\X,\\vt)\\}]\\tcr{,}\n\\label{qte_dr_representation}\n\\ee\ngiven either $\\pis(\\X)=\\pi(\\X) $ or $\\phis(\\X,\\theta)=\\phi(\\X,\\theta) $ but {\\it not} necessarily both.\n\n\\tcr{To} clarify the \\tcr{basic} logic behind the construction of our \\tcr{SS} estimators, suppose momentarily that $\\{\\pis(\\cdot),\\phis(\\cdot,\\cdot)\\}$ are known and equal to $\\{\\pi(\\cdot),\\phi(\\cdot,\\cdot)\\}$. One may then expect to obtain a supervised estimator of $\\vt$ by solving the empirical version of \\eqref{qte_dr_representation} based on $\\cl$, i.e., \\tcr{solve}\n\\be\n\\E_n\\{  \\phi(\\X,\\theta)\\}+ \\E_n[\\{\\pi(\\X)\\}^{-1} T\\{\\psi(Y,\\theta) -  \\phi(\\X,\\theta)\\}] ~=~0,\n\\label{sv}\n\\ee\nwith respect to $\\theta$. However, solving \\eqref{sv} directly is not a simple task due to its \\tcr{inherent} non-smoothness and non-linearity \\tcr{in $\\theta$}. 
%We therefore adopt the strategy of \\emph{one-step update} \\citep{van2000asymptotic, tsiatis2007semiparametric} instead,\n\\tcr{A reasonable strategy to adopt instead is a} \\emph{one-step update} \\tcr{approach} \\citep{van2000asymptotic, tsiatis2007semiparametric}\\tcr{,} using %based on}\nthe corresponding \\emph{influence function} %\\tcr{:}\n\\tcr{(a term used a bit loosely here to denote the expected influence function in the supervised case):}\n\\be\n\\{f(\\vt)\\}^{-1} (\\E[\\{\\pi(\\X)\\}^{-1} T\\{\\phi(\\X,\\vt)-\\psi(Y,\\vt)\\}]-\\E\\{  \\phi(\\X,\\vt)\\}).\n\\label{qte_influence_function}\n\\ee\nSpecifically, by replacing the unknown functions $\\{\\pi(\\cdot),~\\phi(\\cdot,\\cdot)\\}$ in \\eqref{qte_influence_function} with \\tcr{\\it some} estimators $\\{\\pihatn(\\cdot),~ \\phihatn(\\cdot,\\cdot)\\}$ based on $\\cl$ that \\tcr{may} target possibly misspecified limits $\\{\\pis(\\cdot),~\\phis(\\cdot,\\cdot)\\}$, %\\tcr{and given some initial estimator of $\\vt$,}\nwe immediately obtain a {\\it supervised estimator} \\tcr{of $\\vt$ \\tcr{via a one-step update approach as follows:}}\n\\be\n&&\\thetahatsup~:=~ \\thetahatinit +\\{\\hf(\\thetahatinit)\\}^{-1}(\\E_n[\\{\\pihatn(\\X)\\}^{-1}T\\{\\phihatn(\\X,\\thetahatinit) - \\psi(Y,\\thetahatinit)\\}]- \\label{sup_qte} \\\\\n&&\\phantom{\\thetahatsup~:=~ \\thetahatinit +\\{\\hf(\\thetahatinit)\\}^{-1}(}\\E_{n}\\{\\phihatn(\\X,\\thetahatinit)\\})\\tcr{,}  \\nonumber\n\\ee\nwith $\\thetahatinit$ an initial estimator for $\\vt$ and $\\hf(\\cdot) $ an estimator for the density function $f(\\cdot) $ of $Y$.\n\n\\paragraph*{SS estimators \\tcr{of $\\vt$}} \\tcr{With the above motivation for a one-step update approach, and recalling the basic principles of our SS approach in Section \\ref{sec_ate_ss}, %adapting it suitably\nwe now formalize the details of our SS estimators of $\\vt$.} %, based on a similar principle as in Section \\ref{sec_ate_ss}, along with appropriate modifications to adapt to a one-step 
update approach as above.}\nSimilar to the \\tcr{rationale used in the} construction of\n\\tcr{\\eqref{ss_ate}} for \\tcr{estimating $\\mu_0$ in context of} the ATE, replacing $\\E_{n}\\{\\phihatn(\\X,\\thetahatinit)\\}$ and $\\pihatn(\\X) $ in \\eqref{sup_qte} by $\\E_{n+N}\\{\\phihatn(\\X,\\thetahatinit)\\}$ and $\\pihatN(\\X) $, respectively\\tcr{, now} \\tcg{produces a family of \\emph{SS estimators} $\\thetahatss$ for $\\vt$, given by:} %respectively \\tcr{now} \\tcg{produces a family of \\emph{SS estimators} $\\thetahatss$ for $\\vt$, given by:}\n\\be\n&&\\thetahatss~:=~  \\thetahatinit +\\{\\hf(\\thetahatinit)\\}^{-1}(\\E_n[\\{\\pihatN(\\X)\\}^{-1}T\\{\\phihatn(\\X,\\thetahatinit) - \\psi(Y,\\thetahatinit)\\}]- \\label{ss_qte}\\\\\n&&\\phantom{\\thetahatss~:=~  \\thetahatinit +\\{\\hf(\\thetahatinit)\\}^{-1}(}\\E_{n+N}\\{\\phihatn(\\X,\\thetahatinit)\\}). \\nonumber\n\\ee\nHere\\tcr{, a} %the\ncross fitting technique \\tcr{similar to \\eqref{ds1}--\\eqref{ds2}} is applied %applied to $\\phihatn(\\cdot,\\cdot) $\nto \\tcr{obtain the estimates $\\phihatn(\\X_i,\\cdot) $:} %\\tcr{, i.e.,}\n%in the same fashion as \\eqref{ds1} and \\eqref{ds2}:\n\\be\n\\phihatn(\\X_i,\\theta)&~:=~&\\kK^{-1}\\sk\\phihatnk(\\X_i,\\theta)\\quad (i=n+1,\\ldots,n+N), \\quad \\tcr{\\mbox{and}} \\label{ds3}\\\\ \\phihatn(\\X_i,\\theta)&~:=~&\\phihatnk(\\X_i,\\theta)\\quad (i\\in\\I_k\\tcr{;\\ k=1,\\ldots,\\kK}), \\label{ds4}\n\\ee\nwhere $\\phihatnk(\\cdot,\\cdot) $ is an estimator for $\\phis(\\cdot,\\cdot) $ based \\tcr{only} on the data set $\\cl_k^-$  $(k=1,\\ldots,\\kK) $.\n\n\\vskip0.05in\nWe now have a family of SS estimators for $\\vt$ indexed by $\\{\\pihatN(\\cdot),\\phihatn(\\cdot,\\cdot)\\}$ from \\eqref{ss_qte}. 
\\tcr{To establish their theoretical properties, we will require the following (high-level) assumptions.} %The assumptions for establishing their properties are summarized as follows.\n\n\\begin{assumption}\n\\label{adensity}\nThe quantile $\\vt$ is in the interior of its parameter space $\\Theta$. The density function $f(\\cdot) $ of $Y$ is positive and has a bounded derivative in $\\mbtv$ \\tcr{for some $\\epsilon > 0$}.\n\\end{assumption}\n\n\n\\begin{assumption}\n\\label{ainit}\n\\tcr{The initial estimator $\\thetahatinit$ and the density estimator $\\hf(\\cdot) $} %The estimators $\\thetahatinit$ and $\\hf(\\cdot) $\nsatisfy that, for some positive sequences $u_n=o(1) $ and $v_n=o(1) $,\n\\be\n&&\\thetahatinit-\\vt~=~O_p(u_{n}), ~~ \\tcr{\\mbox{and}} \\label{hvti}\\\\\n&&\\hf(\\thetahatinit)-f(\\vt)~=~O_p(v_n). \\label{hf}\n\\ee\n\\end{assumption}\n\n\\begin{assumption}\\label{api}\nRecall \\tcr{that}\n$\\pis(\\cdot) $ is some function such that $\\pis(\\x)\\in(c,1-c) $ for any $\\x\\in\\mx$ and some $c\\in(0,1) $. Then\\tcr{,}\nthe function $\\hD(\\x)\\equiv\\{\\pihatN(\\x)\\}^{-1}-\\{\\pis(\\x)\\}^{-1}$ satisfies\\tcr{:}\n\\be\n&&(\\E_\\X[\\{\\hD(\\X)\\}^2])^{1/2}~=~O_p(s_N), ~~ \\tcr{\\mbox{and}} \\label{d2} \\\\\n&&\\sx|\\hD(\\x)|~=~O_p(1)\\tcr{,} \\label{dsup}\n\\ee\nfor some positive sequence $s_N$ that is possibly divergent.\n\\end{assumption}\n\n\\begin{assumption}\\label{abound}\nThe function $\\phis(\\cdot,\\cdot) $ \\tcr{-- the (possibly misspecified) target of $\\phihatn(\\cdot,\\cdot) $ --} is bounded. \\tcr{Further, t}he set $\\mm:=\\{\\phis(\\X,\\theta):\\theta\\in\\mbtv\\}$ \\tcr{for some $\\epsilon > 0$,} satisfies\\tcr{:}\n\\be\nN_{[\\,]}\\{\\eta,\\mm,L_2(\\P_\\X)\\}~\\leq~ c_1\\,\\eta^{-c_2},\n\\label{bmm}\n\\ee\nwhere the symbol $N_{[\\,]}(\\cdot,\\cdot,\\cdot) $ refers to the \\tcr{\\it bracketing number}\\tcr{, as} defined in \\citet{van1996weak} and \\citet{van2000asymptotic}. 
In addition, for any sequence $\\tvt\\to\\vt$ in probability,\n\\be\n&&\\mbG_n[\\{\\pis(\\X)\\}^{-1}T\\{\\phis(\\X,\\tvt)-\\phis(\\X,\\vt)\\}]~=~o_p(1), ~~\\tcr{\\mbox{and}} \\label{unipi1}\\\\ &&\\mbG_{n+N}\\{\\phis(\\X,\\tvt)-\\phis(\\X,\\vt)\\}~=~o_p(1).\\label{unipi2}\n\\ee\n\\end{assumption}\n\n\n\\begin{assumption}\\label{aest}\nDenote\n\\be\n&&\\hpsi(\\X,\\theta)~:=~\\phihatnk(\\X,\\theta)-\\phis(\\X,\\theta), ~~\\tcr{\\mbox{and}} \\label{error}\\\\ &&\\Delta_k(\\cl)~:=~(\\sb\\E_\\X[\\{\\hpsi(\\X,\\theta)\\}^2])^{1/2} \\quad (k=1,\\ldots,\\kK).\\nonumber\n\\ee\nThen\\tcr{,} \\tcr{for some $\\epsilon > 0$,} the set\\tcr{:}\n\\be\n\\mp_{n,k}~:=~\\{\\hpsi(\\X,\\theta):\\theta\\in\\mbtv\\}\n\\label{pnk}\n\\ee\nsatisfies that, for any $\\eta\\in(0,\\Delta_k(\\cl)+c\\,]$ \\tcr{for some $c > 0$},\n\\be\nN_{[\\,]}\\{\\eta,\\mp_{n,k}\\mid\\cl,L_2(\\P_\\X)\\}~\\leq~ H(\\cl) \\eta^{-c} \\quad (k=1,\\ldots,\\kK)\n\\label{vc}\n\\ee\nwith some function $H(\\cl)>0$ such that $H(\\cl)=O_p(a_n) $ for some positive sequence $a_n$ that is possibly divergent. Here\\tcr{,} $\\mp_{n,k}$ is indexed by $\\theta$ \\tcr{\\it only} and treats $\\hpsi(\\cdot,\\theta) $ as a non\\tcr{-}random function $(k=1,\\ldots,\\kK) $. Moreover, we assume \\tcr{that:}\n\\be\n&&\\sb\\E_\\X\\{|\\hpsi(\\X,\\theta)|\\}~=~O_p(d_{n,1}),~~~ \\Delta_k(\\cl)~=~O_p(d_{n,2}), ~~~\\tcr{\\mbox{and}} \\nonumber\\\\\n&& \\sbx|\\hpsi(\\X,\\theta)|~=~O_p(d_{n,\\infty}) \\quad (k=1,\\ldots,\\kK), \\nonumber\n\\ee\nwhere $d_{n,1}$, $d_{n,2}$ and $d_{n,\\infty}$ are some positive sequences that are possibly divergent.\n\\end{assumption}\n\n\\begin{remark}\\label{remark_qte_assumptions}\nThe basic conditions in Assumption \\ref{adensity} ensure the identifiability and estimability of $\\vt$. Assumption \\ref{ainit} is standard for one-step estimators, regulating the behavior of $\\thetahatinit$ and $\\hf(\\cdot) $. 
Assumption \\ref{api} is an analogue of Assumption \\ref{api4}, adapted \\tcr{suitably} for the technical proofs of\nthe QTE estimators. Assumption \\ref{abound} outlines the features of a suitable working outcome model $\\phis(\\cdot,\\cdot) $. According to Example 19.7 and Lemma 19.24 of \\citet{van2000asymptotic}, the conditions \\eqref{bmm}--\\eqref{unipi2} hold as long as $\\phi^{\\tcr{*}}(\\X,\\theta) $ is Lipschitz continuous in $\\theta$. Lastly, Assumption \\ref{aest} imposes restrictions on the bracketing number and norms of the error term \\eqref{error}. The requirements in Assumptions \\ref{abound} and \\ref{aest} should be expected to hold for most reasonable choices of $\\{\\phi^{\\tcr{*}}(\\cdot,\\cdot),\\phihatnk(\\cdot,\\cdot)\\}$ using standard results from empirical process theory \\citep{van1996weak, van2000asymptotic}. Again, all the positive sequences in Assumptions \\ref{api} and \\ref{aest} are possibly divergent, so the relevant restrictions are actually fairly mild and weaker than \\tcr{requiring} $L_\\infty$ convergence. The validity of these assumptions for some choices of the nuisance functions and their estimators will be di\\tcr{s}cussed in Section \\ref{secnf}.\n\\end{remark}\n\n\\tcr{We now} %Now we state\n\\tcr{present the asymptotic properties of $\\thetahatss$ in Theorem \\ref{thqte} and Corollary \\ref{corqte} below.}\n%the limiting behavior of $\\thetahatss$ in Theorem \\ref{thqte} as well as Corollary \\ref{corqte} \\tcr{below}.\n\n\\begin{theorem}\\label{thqte}\nSuppose that Assumptions \\ref{ass_equally_distributed} and \\ref{adensity}--\\ref{aest} hold, and that\neither $\\pis(\\X)=\\pi(\\X) $ or $\\phis(\\X,\\theta)=\\phi(\\X,\\theta) $ but not necessarily both. 
Then\\tcr{, it holds that:} $\\thetahatss-\\vt=$\n\\bse\n&&\\{nf(\\vt)\\}^{-1}\\sl\\omega_{n,N}(\\Z_i,\\vt)~+~O_p\\{u_n^2+u_nv_n+n^{-1/2}(r_n+z_{n,N})+s_N d_{n,2}\\}~+ \\\\\n&&~I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(d_{n,1})+I\\{\\phis(\\X,\\theta)\\neq\\phi(\\X,\\theta)\\}O_p(s_N)+o_p(n^{-1/2})\\tcr{,}\n\\ese\nwhen $\\nu\\geq 0$, where\n\\bse\n&&r_n~:=~d_{n,2}\\{\\log\\,a_n+\\log(d_{n,2}^{-1})\\}~+~n_{\\kK}^{-1/2}d_{n,\\infty}\\{(\\log\\,a_n)^2+(\\log\\,d_{n,2})^2\\},\\\\\n&&z_{n,N}~:=~s_N\\log\\, (s_N^{-1})~+~n^{-1/2}(\\log\\,s_N)^2, ~~\\tcr{\\mbox{and}}\\\\\n&&\\omega_{n,N}(\\Z,\\theta)~:=~\\{\\pis(\\X)\\}^{-1}T\\{\\phis(\\X,\\theta)-\\psi(Y,\\theta)\\}-\\E_{n+N}\\{\\phis(\\X,\\theta)\\} \\tcr{,}\n\\ese\nsatisfying $\\E\\{\\omega_{n,N}(\\Z,\\vt)\\}=0$ \\tcr{if either $\\phis(\\cdot) = \\phi(\\cdot) $ or $\\pis(\\cdot) = \\pi(\\cdot) $ but not necessarily both.}\n\\end{theorem}\n\n\\begin{corollary}\\label{corqte}\nSuppose that the conditions in Theorem \\ref{thqte} hold true, that $\\nu=0$ \\tcr{as in \\eqref{disproportion},} and that $\\pis(\\X)=\\pi(\\X) $. 
Then\\tcr{,} the stochastic expansion of $\\thetahatss$ is \\tcr{given by:} $\\thetahatss-\\vt=$\n\\bse\n&&\\{nf(\\vt)\\}^{-1}\\sl\\omss(\\Z_i,\\vt)~+~O_p\\{u_n^2+u_nv_n+n^{-1/2}(r_n+z_{n,N})+s_N d_{n,2}\\}~+ \\\\\n&&~I\\{\\phis(\\X,\\theta)\\neq\\phi(\\X,\\theta)\\}O_p(s_N)~+~o_p(n^{-1/2}),\n\\ese\nwhere\n\\bse\n\\omss(\\Z,\\theta)~:=~\\{\\pi(\\X)\\}^{-1}T\\{\\phis(\\X,\\theta)-\\psi(Y,\\theta)\\}-\\E\\{\\phis(\\X,\\theta)\\} \\tcr{,}\n\\ese\nsatisfying $\\E\\{\\omss(\\Z,\\vt)\\}=0$\\tcr{, and $\\phis(\\X,\\theta) $ is arbitrary, i.e., not necessarily equal to $\\phi(\\X,\\theta) $.}\n\\vskip0.1in\nFurther, if either $s_N=o(n^{-1/2}) $ or $\\phis(\\X,\\theta)=\\phi(\\X,\\theta) $ but not necessarily both, and\n\\be\nu_n^2+u_nv_n+n^{-1/2}(r_n+z_{n,N})+s_N d_{n,2}~=~o(n^{-1/2}),\n\\label{srn}\n\\ee\n\\tcr{then} the limiting distribution of $\\thetahatss$ is\\tcr{:}\n\\be\nn^{1/2}f(\\vt)\\sigss^{-1}(\\thetahatss-\\vt)~\\xrightarrow{d}~\\mn(0,1)\\quad (n, \\tcr{N}\\to\\infty)\\tcr{,}\n\\label{qte_normality}\n\\ee\nwith $\\sigss^2:=\\E[\\{\\omss(\\Z,\\vt)\\}^2]=\\var[\\{\\pi(\\X)\\}^{-1}T\\{\\psi(Y,\\vt)-\\phis(\\X,\\vt)\\}]$\\tcr{, and t}he asymptotic variance $\\{f(\\vt)\\}^{-2}\\sigss^2$ can be estimated \\tcr{as:}\n\\bse\n\\{\\hf(\\thetahatss)\\}^{-2}\\var_n[\\{\\pihatN(\\X)\\}^{-1}T\\{\\psi(Y,\\thetahatss)-\\phihatn(\\X,\\thetahatss)\\}].\n\\ese\n\\end{corollary}\n\n\\begin{remark}[Robustness and first-order insensitivity of $\\thetahatss$]\\label{remark_qte_property}\n%Results in the same spirit as those \\tcg{of} $\\muhatss$ in Section \\ref{sec_ate_ss} have been established for $\\thetahatss$ by Theorem \\ref{thqte} along with Corollary \\ref{corqte}. %\n\\tcr{Theorem \\ref{thqte} and Corollary \\ref{corqte} establish the general properties of $\\thetahatss$, in the same spirit as\n%similar in spirit to\nthose \\tcg{of} $\\muhatss$ in Section \\ref{sec_ate_ss}. 
%In particular, the results show u}nder suitable conditions %that our \\tcr{SS %QTE \\tcr{(quantile) } estimators}\nThe results show, in particular, that $\\thetahatss$} are always DR, while enjoying first-order insensitivity, \\tcr{and} $n^{1/2}$-consistency and asymptotic normality\\tcr{, {\\it regardless} of whether $\\phi(\\cdot,\\cdot) $ is misspecified,} as long as we can correctly estimate $\\pi(\\X) $ at a\\tcr{n} $L_2$\\tcr{-}rate faster than $n^{-1/2}$ \\tcr{by exploiting the plentiful observations in $\\cu$}. In contrast, \\tcr{such} $n^{1/2}$-consistency and asymptotic normality are %generally\nunachievable \\tcr{(in general) } for purely supervised QTE estimators \\tcr{if} %given\n$\\phi(\\cdot,\\cdot) $ is misspecified. This is\nanalogous %\\tcr{in spirit}\nto the case of the ATE\\tcr{; see} %. See\nRemark \\ref{remark_ate_robustness} for more discussions on these properties.\n\\end{remark}\n\n\\begin{remark}[Choice\\tcr{s} of $\\{\\thetahatinit,\\hf(\\cdot)\\}$]\\label{remark_qte_initial_estimator}\nWhile the general conclusions in Theorem \\ref{thqte} and Corollary \\ref{corqte} hold true for \\tcr{\\it any} estimators $\\{\\thetahatinit,\\hf(\\cdot)\\}$ satisfying Assumption \\ref{ainit}, a reasonable choice in practice \\tcr{for both would be {\\it IPW type estimators}.} %is the IPW type estimators for both.\nSpecifically, the initial estimator $\\thetahatinit$ can be obtained by solving\\tcr{:} $\\E_n[\\{\\pihatN(\\X)\\}^{-1}T\\psi(Y,\\thetahatinit)]=0$, while $\\hf(\\cdot) $ may be defined as a kernel density estimator based on the weighted sample\\tcr{:} $\\{\\{\\pihatN(\\X_i)\\}^{-1}T_iY_i:i=1,\\ldots,n\\}$. 
Under the conditions in Corollary \\ref{corqte}, it is not hard to show that Assumption \\ref{ainit} as well as the part of \\eqref{srn} related to $\\{u_n,v_n\\}$ \\tcr{are} %is\nindeed satisfied by such $\\{\\thetahatinit,\\hf(\\cdot)\\}$, using the basic proof techniques of quantile \\tcr{method}s \\citep{k2005} and kernel-based approaches \\citep{hansen2008uniform}, \\tcr{along} with suitable modifications \\tcr{used to incorporate the IPW weights.} %corresponding to the weighting scheme.\n\\end{remark}\n\n\\subsection{Efficiency comparison}\\label{sec_qte_efficiency_comparison}\n\nFor efficiency comparison among QTE estimators, %\\tcr{(SS vs. supervised) },\nsimilar to $\\muhatsup^*$ in Section \\ref{secos} \\tcr{for the ATE}, %\\tcr{\\ref{sec_ate_efficiency_comparison},}\nwe now consider the\n{\\it pseudo-supervised estimator\\tcr{(s) }} \\tcr{of $\\vt$:}\n\\be\n&&\\thetahatsup^*~:=~  \\thetahatinit +\\{\\hf(\\thetahatinit)\\}^{-1}(\\E_n[\\{\\hat{\\pi}_N(\\X)\\}^{-1}T\\{\\phihatn(\\X,\\thetahatinit) - \\psi(Y,\\thetahatinit)\\}]- \\label{pseudo_sup_qte}\\\\\n&&\\phantom{\\thetahatsup^*~:=~  \\thetahatinit +\\{\\hf(\\thetahatinit)\\}^{-1}(}\\E_{n}\\{\\phihatn(\\X,\\thetahatinit)\\}),  \\nonumber\n\\ee\n\\tcr{i.e., the version of the purely supervised estimator $\\thetahatsup$ in \\eqref{sup_qte} with $\\pihatn(\\cdot) $ therein replaced by $\\pihatN(\\cdot) $ from $\\cu$. $\\thetahatsup^*$ thus has the same robustness as $\\thetahatss$ and is considered solely for efficiency comparison -- among SS and supervised estimators of $\\vt$ (setting aside any robustness benefits the former already enjoys). This is based on the same motivation and rationale as those discussed in detail in Section \\ref{sec_ate_efficiency_comparison} in the context of ATE estimation; so we do not repeat those here for brevity. 
We now present the properties of $\\thetahatsup^*$ followed by the efficiency comparison.}\n%whose robustness is the same as that of $\\thetahatss$ \\tcr{(similar to the case of $\\muhatsup^*$ vs. $\\muhatss$ in Section \\ref{sec_ate_efficiency_comparison}) }.\n\n\\begin{corollary}\\label{corsup}\nUnder the conditions in Corollary \\ref{corqte}, the pseudo-supervised estimators $\\thetahatsup^*$ given by \\eqref{pseudo_sup_qte} \\tcr{satisfy the following expansion:} %is such that\n$\\thetahatsup^*-\\vt=$\n\\be\n&&\\quad\\{nf(\\vt)\\}^{-1}\\sl\\oms(\\Z_i,\\vt)~+~O_p\\{u_n^2+u_nv_n+n^{-1/2}(r_n+z_{n,N})+s_N d_{n,2}\\}~+ \\nonumber\\\\\n&&\\quad ~I\\{\\phis(\\X,\\theta)\\neq\\phi(\\X,\\theta)\\}O_p(s_N)~+~o_p(n^{-1/2}), ~~\\tcr{\\mbox{and}} \\nonumber\\\\\n&&\\quad n^{1/2}f(\\vt)\\sigsup^{-1}(\\thetahatsup^*-\\vt)~\\xrightarrow{d}~\\mn(0,1)\\quad (n, \\tcr{N}\\to\\infty), ~~\\tcr{\\mbox{where}}\n\\label{qte_sup_normality}\n\\ee\n%{\\cred %$\n\\bse\n\\oms(\\Z,\\theta)~:=~\\{\\pi(\\X)\\}^{-1}T\\{\\phis(\\X,\\theta)-\\psi(Y,\\theta)\\}-\\phis(\\X,\\theta)\\tcr{,}\n%$\n\\ese\n%}\nsatisfying $\\E\\{\\oms(\\Z,\\vt)\\}=0$, and $\\sigsup^2:=\\E[\\{\\oms(\\Z,\\vt)\\}^2]=$\n{%\\cred $\n\\bse\n\\var[\\{\\pi(\\X)\\}^{-1}T\\{\\psi(Y,\\vt)-\\phis(\\X,\\vt)\\}]-\\var\\{\\phis(\\X,\\vt)\\}+ 2\\,\\E\\{\\phis(\\X,\\vt)\\psi(Y,\\vt)\\}.\n%$\n\\ese\n}\n\\end{corollary}\n\\begin{remark}[Efficiency improvement \\tcr{of $\\thetahatss$ and optimality}]\\label{remark_qte_efficiency}\nInspecting the asymptotic variances in Corollaries \\ref{corqte} and \\ref{corsup}, we see \\tcr{that} $\\sigss^2\\leq\\sigsup^2$ %if\n\\tcr{with {\\it any} choice of $\\phis(\\X,\\theta) $ such that}\n$\\phis(\\X,\\theta)=\\E\\{\\psi(Y,\\theta)\\mid \\bfg(\\X)\\}$ for some \\tcr{(possibly) } unknown function $\\bfg(\\cdot) $, since\n\\bse\n\\sigsup^2-\\sigss^2~=~2\\,\\E\\{\\phis(\\X,\\vt)\\psi(Y,\\vt)\\}-\\var\\{\\phis(\\X,\\vt)\\}~=~\\E[\\{\\phis(\\X,\\vt)\\}^2]~\\geq~ 0.\n\\ese\nSuch a comparison 
reveals the %efficiency\n\\tcr{\\it superiority} \\tcr{in} efficiency of our SS estimators $\\thetahatss$ over the \\tcr{corresponding} ``best\'\' achievable ones in supervised settings \\tcr{\\it even if} the difference \\tcr{(i.e.,  improvement) } in robustness is ignored. Further, when $\\phis(\\X,\\theta)=\\E\\{\\psi(Y,\\theta)\\mid\\X\\}$, the SS variance\\tcr{:}\n\\be\n\\sigss^2&~=~&\\var(\\{\\pi(\\X)\\}^{-1}T[\\psi(Y,\\vt)-\\E\\{\\psi(Y,\\vt)\\mid\\X\\}]) \\nonumber\\\\\n&~=~&\\E(\\{\\pi(\\X)\\}^{-2}T[\\psi(Y,\\vt)-\\E\\{\\psi(Y,\\vt)\\mid\\X\\}]^2) \\label{qte_eff}\\\\\n&~\\leq~&\\E[\\{\\pi(\\X)\\}^{-2}T\\{\\psi(Y,\\vt)-g(\\X)\\}^2]\\tcr{,} \\nonumber\n\\ee\nfor any function $g(\\cdot) $ while the equality holds only if $g(\\X)=\\E\\{\\psi(Y,\\vt)\\mid\\X\\}$ almost surely. In this sense $\\thetahatss$ is asymptotically \\tcr{\\it optimal} among all regular and asymptotically linear estimators of $\\vt$, whose influence functions have the form $\\{f(\\vt)\\pi(\\X)\\}^{-1}T\\{g(\\X)-\\psi(Y,\\vt)\\}$ for some function $g(\\cdot) $. \\tcg{Under the semi-parametric model \\eqref{semiparametric_model}}, one can show that, \\tcg{if Assumption \\ref{adensity} holds true}, the representation \\eqref{qte_eff} equals the efficient asymptotic variance for estimating $\\vt$, that is, the \\tcr{SS} estimator $\\thetahatss$ achieves the \\tcr{\\it semi-parametric efficiency bound}. 
In Section \\ref{sec_nf_qte}, we will \\tcr{also} detail the above choices of $\\phis(\\cdot,\\cdot) $ and some corresponding estimators $\\phihatnk(\\cdot,\\cdot) $.\n\\end{remark}\n\n\\subsection{Final \\tcr{SS} estimator for the QTE}\\label{sec_qte_difference}\nSimilar to the arguments \\tcr{used in \\tcr{Section \\ref{sec_ate_difference} for}\nthe case} of $\\{\\muhatss(1),\\muhatss(0)\\}$ \\tcr{to obtain the ATE estimator,} %in Section \\ref{sec_ate_difference},\nsubstituting $\\{Y(0),1-T\\}$ for $\\{Y,T\\}$ in the aforementioned discussions concerning $\\thetahatss\\equiv\\thetahatss(1) $ and $\\vt\\equiv\\vt(1) $ immediately gives \\tcg{\\tcr{us} a family of \\tcr{SS} estimators} $\\thetahatss(0) $ for $\\vt(0) $ as well as their \\tcr{corresponding} properties \\tcr{(as the counterparts of the properties established for $\\thetahatss(1) $ so far) }. \\tcr{Subsequently, we may obtain our final SS estimator(s) for the QTE, i.e., the difference $\\vt(1)-\\vt(0) $ in \\eqref{defqte}, simply as:  $\\thetahatss(1)-\\thetahatss(0) $.} Then we know that, if the conditions in Corollary \\ref{corqte} for $\\thetahatss(1) $ and their counterparts for $\\thetahatss(0) $ hold, the asymptotic distribution of \\tcg{our \\emph{final SS \\tcr{QTE} estimators} $\\thetahatss(1)-\\thetahatss(0) $ %for the QTE $\\vt(1)-\\vt(0) $\nis\\tcr{:}} %\\tcr{**TO BE EDITED FROM HERE (12/27) -- AC.**}\n\\be\nn^{1/2}\\sigqte^{-1}[\\{\\thetahatss(1)-\\thetahatss(0)\\}-\\{\\vt(1)-\\vt(0)\\}]~\\xrightarrow{d}~\\mn(0,1)\\quad (n, \\tcr{N}\\to\\infty),\n\\label{qte_difference_distribution}\n\\ee\nwhere the asymptotic variance\\tcr{:}\n\\bse\n&&\\sigqte^2~:=~\\var(\\{f(\\vt)\\pi(\\X)\\}^{-1}T\\{\\psi(Y,\\vt)- \\phis(\\X,\\vt) \\}- \\\\\n&&\\phantom{\\sigqte^2~:=~\\var(}[f\\{\\vt(0),0\\} \\{1-\\pi(\\X)\\}]^{-1}(1-T)[\\psi\\{Y(0),\\vt(0)\\}- \\phis\\{\\X,\\vt(0),0\\} ])\n\\ese\ncan be estimated by\\tcr{:}\n\\bse\n&&\\var_n(\\{\\hf(\\thetahatss)\\pihatN(\\X)\\}^{-1}T\\{\\psi(Y,\\thetahatss)- 
\\phihatn(\\X,\\thetahatss) \\}- \\\\\n&&\\phantom{\\var_n(}[\\hf\\{\\thetahatss(0),0\\} \\{1-\\pihatN(\\X)\\}]^{-1}(1-T)[\\psi\\{Y(0),\\thetahatss(0)\\}- \\phihatn\\{\\X,\\thetahatss(0),0\\} ]).\n\\ese\nIn the above\\tcr{,} $\\hf(\\cdot,0) $  and $\\phihatn(\\X,\\theta,0) $ are \\tcr{\\it some} estimators for the density function $f(\\cdot,0) $ of $Y(0) $ and the working model $\\phis(\\X,\\theta,0) $ of $\\E[\\psi\\{Y(0),\\theta\\} \\mid\\X]$, respectively. We will use \\eqref{qte_difference_distribution} to construct confidence intervals for the QTE in the data analysis of Section \\ref{sec_data_analysis}.\n\n\\section{Choice and estimation of \\tcg{the} nuisance functions}\\label{secnf}\nIn this section, we study some reasonable choices and estimators of the nuisance functions \\tcr{involved} in the SS estimators $\\muhatss$ and $\\thetahatss$ from Sections \\ref{secos} and \\ref{secqte}, which form a critical component in the implementation of \\tcr{all} our approaches. The results claimed in the last two sections, however, are completely general and allow for any choices as long as they satisfy the high-level conditions therein. \\tcr{In Sections \\ref{sec_PS}--\\ref{sec_nf_qte} below, we discuss some choices of $\\pi(\\cdot) $ and the outcome models for ATE and QTE.}\n\n\\subsection{Propensity score}\\label{sec_PS}\n\nUnder the assumption \\eqref{disproportion}, the specification and estimation of $\\pi(\\cdot) $ is a relatively easier task and can be done through applying any reasonable and flexible enough regression method (parametric, semi-parametric or non-parametric) to the plentiful observations for $(T,\\X\\trans)\\trans$ in $\\cu$. 
For instance, one can use the {\\it ``extended\'\' parametric families} %\\tcr{(or series estimators) }\n$\\pis(\\x)\\equiv h\\{\\bbeta_0\\trans\\bPsi(\\x)\\}$ as the working model for the propensity score $\\pi(\\cdot) $, where $h(\\cdot)\\in(0,1) $ is a {\\it known} link function, the components of $\\bPsi(\\cdot):\\rR^p\\mapsto\\rR^{p^*}$ are (known) basis functions of $\\x$ with $p^*\\equiv p^*_n$ allowed to diverge and exceed  $n$, and $\\bbeta_0\\in\\rR^{p^*}$ is an {\\it unknown} parameter vector. Such a $\\pis(\\x) $ can be estimated by $\\pihatN(\\x)\\equiv h\\{\\bbetahat\\trans\\bPsi(\\x)\\}$ with $\\bbetahat$ obtained from the corresponding parametric regression %\\tcr{procedure}\nprocess of $T$ vs. $\\bPsi(\\X) $ using $\\cu$. Regularization may be applied \\tcr{here} via, for example, the $L_1$ penalty if necessary \\tcr{(e.g., in high dimensional settings) }.\n\nThe families above include, as a special case, the logistic regression models with\n\\bse\nh(x)~\\equiv~\\{1+\\exp(-x)\\}^{-1}\\hbox{ and } \\bPsi(\\x)~\\equiv~\\{1,\\bPsi_1\\trans(\\x),\\bPsi_2\\trans(\\x),\\ldots,\\bPsi_M\\trans(\\x)\\} \\trans\\tcr{,}\n\\ese\nfor $\\bPsi_m(\\x):=(\\x_{[1]}^m,\\x_{[2]}^m,\\ldots,\\x_{[p]}^m)\\trans$ $(m=1,\\ldots,M) $ and some positive integer $M$. Section 5.1 of \\citet{chakrabortty2019high} along with Section B.1 in the supplementary material of that article provided a detailed discussion on these ``extended\'\' parametric families and established their (non-asymptotic) properties, sufficient for the high-level conditions on $\\{\\pis(\\cdot),\\pihatN(\\cdot)\\}$ in Sections \\ref{secos} and \\ref{secqte}. In addition, it is noteworthy that, in high dimensional scenarios \\tcr{in our setup,} where $n\\ll p^*\\ll N$, {\\it the parameter vector $\\bbeta_0$ is totally free of sparsity} and can be estimated by unregularized methods based on $\\cu$. 
Such a relaxation of assumptions is incurred\nby the usage of massive unlabeled data and \\tcr{is} generally unachievable in purely supervised settings.\n\n\\subsection{Outcome model for the ATE}\\label{sec_nf_ate}\nWe now consider the working outcome model $m^*(\\cdot) $ \\tcr{involved} in our ATE estimators. As discussed in Remark \\ref{remark_ate_efficiency}, one may expect to achieve semi-parametric optimality by letting $m^*(\\X)\\equiv\\E(Y\\mid\\X) $. However, specifying \\tcr{the} $\\E(Y\\mid\\X) $ correctly in high dimensional scenarios is usually unrealistic while approximating it fully non-parametrically would typically bring \\tcr{in}\nundesirable issues such as\nunder-smoothing \\citep{newey1998undersmoothing} even if there are only a moderate number of covariates. We therefore adopt a principled and flexible semi-parametric strategy, \\tcr{via}\nconducting dimension reduction followed by non-parametric calibrations and targeting $\\E(Y\\mid\\S) $ instead of $\\E(Y\\mid\\X) $, where $\\S:=\\mbP_0\\trans\\X\\in\\ms\\subset\\rR^r$ and $\\mbP_0$ is a $r\\times p$ {\\it transformation matrix} with some fixed and known $r\\leq p$. \\tcr{(The choice $r = p$ of course leads to a trivial case with $\\mbP_0 = I_p$.) } It is noteworthy that we \\tcr{\\it always} allow the dimension reduction to be \\emph{insufficient} and do {\\it not} assume anywhere \\tcr{that}\n\\be\n\\E(Y\\mid\\S)~=~\\E(Y\\mid\\X).\n\\label{sufficient_dimension_reduction}\n\\ee\nThe efficiency comparison in Remark \\ref{remark_ate_efficiency} shows that, when\\tcr{ever}\n$\\pihatN(\\cdot) $ converges to $\\pi(\\cdot) $ fast enough, setting $m^*(\\X)\\equiv\\E(Y\\mid\\mbP_0\\trans\\X) $ \\tcr{\\it always} guarantees our SS estimators $\\muhatss$ \\tcr{to} dominate any supervised competitors using the same working model $m^*(\\cdot) $ \\tcr{--} \\emph{no matter} whether \\eqref{sufficient_dimension_reduction} holds or not. 
Hence\\tcr{,} one is free to let $\\mbP_0$ equal \\emph{any} user-defined and data-dependent matrix. As long as $\\mbP_0$ is completely determined by the distribution of $\\X$, its estimation error is very likely to be negligible owing to the large number of observations for $\\X$ provided by $\\cu$. An instance \\tcr{of such a choice is} %is\nthe $r$ leading principal component directions of $\\X$. Nevertheless, to make the dimension reduction as ``sufficient\'\' as possible, one may prefer to use a transformation matrix $\\mbP_0$ which depends on the joint distribution of $(Y,\\X\\trans)\\trans$\\tcr{, and thus} needs to be estimated with significant errors. We will give some examples of such $\\mbP_0$ in Remark \\ref{remark_choice_of_P0}.\n\nTo \\tcr{estimate} %approximate\nthe conditional mean $m^*(\\x)\\equiv\\E(Y\\mid \\mbP_0\\trans\\X=\\mbP_0\\trans\\x) $, we may employ any \\tcr{suitable} smoothing technique, such as kernel smoothing, kernel machine regression or smoothing splines. 
For illustration, we focus  on the \\tcr{following} \\tcr{\\it IPW type kernel smoothing estimator(s):}\n\\be\n\\mhatnk(\\x)~\\equiv~\\mhatnk(\\x,\\hmbP)~:=~\\{\\hlz(\\x,\\hmbP)\\}^{-1}\\hlo(\\x,\\hmbP)\\quad (k=1,\\ldots,\\kK),\n\\label{ks_ate}\n\\ee\nwhere\n\\bse\n\\hlt(\\x,\\mbP)~:=~ h_n^{-r} \\Enk[\\{\\pihatN(\\X)\\}^{-1}T Y^tK_h\\{\\mbP\\trans(\\x-\\X)\\}]\\quad (t=0,1),\n\\ese\nwith the notation $\\E_{n,k}\\{\\ghat(\\bfZ)\\}:=n_{\\kK^-}^{-1}\\slk \\ghat(\\bfZ_i) $ for any possibly random function $\\ghat(\\cdot) $, \\tcr{and with} $\\hmbP$ \\tcr{being \\emph{any}} estimator of $\\mbP_0$ using $\\cl_k^-$, $K_h(\\s):=K(h_n^{-1}\\s) $, $K(\\cdot) $ a kernel function \\tcr{(e.g., the standard Gaussian kernel) } and $ h_n\\to 0$ \\tcr{denoting} a bandwidth sequence.\n\n\\begin{remark}[Subtlety and benefit\\tcr{s} of the inverse probability weighting scheme]\\label{remark_ks_ate_weight}\nThe \\tcr{IPW based}\nweight\\tcr{s} $\\{\\pihatN(\\X)\\}^{-1}$ \\tcr{involved} in $\\mhatnk(\\x) $ \\tcr{in \\eqref{ks_ate}} \\tcr{play} %plays\na key role in \\tcr{its} achieving \\tcr{an \\emph{important}} {\\it DR \\tcr{property}}, which means $\\mhatnk(\\x) $ has the limit $\\E(Y\\mid\\S=\\s) $ whenever either \\eqref{sufficient_dimension_reduction} is true or $\\pis(\\cdot)=\\pi(\\cdot) $\\tcr{,}\nbut \\emph{not} necessarily both. This property will be proved in Theorem \\ref{theorem_ks_ate}\\tcr{,}\nand formally stated \\tcr{and discussed} in Remark \\ref{remark_ks_ate_DR}. In contrast, the (standard) complete-case version without the \\tcr{IPW} weights $\\{\\pihatN(\\X)\\}^{-1}$ actually targets $\\E(Y\\mid\\S=\\s,T=1) $ that equals $\\E(Y\\mid\\S=\\s) $ \\emph{only if} \\eqref{sufficient_dimension_reduction} holds. 
Recalling the clarification in Remark \\ref{remark_ate_efficiency}, we can see that such a subtlety \\tcr{(enabled by the involvement of the weights) in the construction} of $\\mhatnk(\\cdot) $ ensures the efficiency advantage of our SS estimators $\\muhatss$ over any supervised competitors constructed with the same $\\mhatnk(\\cdot) $,  when $\\pi(\\cdot) $ is correctly specified but $m(\\cdot) $ is not.\n\n\\tcr{Lastly, a}lthough $\\mhatnk(\\cdot) $ contains $\\pihatN(\\cdot) $ and thereby involves the unlabeled data $\\cu$, we suppress the subscript $N$ \\tcr{in $\\mhatnk(\\cdot) $}\nfor brevity considering its convergence rate mainly relies on $n$; see Theorem \\ref{theorem_ks_ate}. In principle, cross fitting procedures analogous to (\\ref{ds1}) and (\\ref{ds2}) should be conducted for $\\cu$ as well to guarantee the independence of $\\mhatnk(\\cdot) $ and $\\X_i$ in $\\mhatnk(\\X_i) $ $(i=n+1,\\ldots,n+N) $. However, from our experience, such extra cross fitting \\tcg{procedures} bring only marginal benefits in practice while making the implementation considerably more laborious. 
We hence stick to estimating $\\pis(\\cdot) $ using the whole $\\cu$ in \\tcr{our} numerical studies.\n\\end{remark}\n\n\\vskip-0.02in\n%\\tcr{**Need to change/improve content here -- AC (12/30).**}\n\\tcr{There is substantial literature on kernel smoothing estimators with unknown estimated covariate transformations, but mostly in low (fixed) dimensional settings  \\citep{mammen2012nonparametric,mammen_rothe_schienle_2016, escanciano2014uniform}.} Considering\\tcr{, however,} that \\tcr{in our setting,}\nthe dimension $p$ of $\\X$ can be \\tcr{\\it divergent} \\tcr{(possibly exceeding $n$),}\n%and greater than $n$,\nand that the transformation matrix $\\mbP_0$ as well as the weights $\\{\\pis(\\X)\\}^{-1}$ nee\\tcr{d} %needs\nto be \\tcr{\\it estimated} \\tcr{as well},\nestablishing the uniform convergence property of $\\mhatnk(\\x,\\hmbP) $ \\tcr{in \\eqref{ks_ate},} in fact\\tcr{,} poses substantial technical challenges and has not been studied in the literature yet. \\tcr{Our results here are thus {\\it novel} to the best of our knowledge.} To derive \\tcr{the results} %its uniform convergence rate,\nwe impose the following \\tcr{conditions.}\n%assumptions.\n\n\n\\begin{assumption}\\label{al1}\nThe estimator $\\hmbP$ satisfies $\\|\\hmbP-\\mbP_0\\|_1=O_p(\\alpha_n) $ for some \\tcr{$\\alpha_n \\geq 0$}. %positive sequence $\\alpha_n$.\n\\end{assumption}\n\n\\begin{assumption}[\\tcr{Smoothness conditions}]\\label{akernel}%(Smoothness conditions)\n(i) The function $K(\\cdot):\\rR^r\\mapsto\\rR$ is a symmetric kernel of order $d\\geq 2$ with a finite $d$th moment. Moreover, it  is bounded, square integrable and continuously differentiable with a derivative $\\nabla K(\\s):=\\partial K(\\s)/\\partial \\s$ such that $\\|\\nabla K(\\s)\\|\\leq c_1\\,\\|\\s\\|^{-v_1}$ for some constant $v_1>1$ and any $\\|\\s\\|>c_2$. (ii) The support $\\ms$ of $\\S\\equiv\\mbP_0\\trans\\X$ is compact. 
The density function $f_{\\S}(\\cdot) $ of $\\S$ is bounded and bounded away from zero on $\\ms$. In addition, it is $d$ times continuously differentiable with a bounded $d$th derivative on some open set $\\ms_0\\supset\\ms$. (iii) For some constant $u>2$, the response $Y$ satisfies $\\ss\\E(Y^{2u}\\mid\\S=\\s)<\\infty$. (iv) The function $\\kappa_t(\\s):=\\E[\\{\\pis(\\X)\\}^{-1}TY^t\\mid \\S=\\s]$ $(t=0,1) $\nis $d$ times continuously differentiable and has %a\n\\tcr{b}ounded $d$th \\tcr{order} derivatives on $\\ms_0$.\n\\end{assumption}\n\n\n\\begin{assumption}[\\tcr{Required only when $\\mbP_0$ needs to be estimated}] \\label{ahbey}%(Required only when $\\mbP_0$ needs to be estimated)\n(i) The support $\\mx$ of $\\X$ is such that $\\sx\\|\\x\\|_\\infty<\\infty$. (ii) The function $\\nabla K(\\cdot) $ has a bounded derivative satisfying $\\|\\partial \\{\\nabla K(\\s)\\}/\\partial \\s\\|\\leq c_1\\,\\|\\s\\|^{-v_2}$ for some constant $v_2>1$ and any $\\|\\s\\|>c_2$. Further, it is locally Lipschitz continuous, i.e., $\\|\\nabla K(\\s_1)-\\nabla K(\\s_2)\\|\\leq \\|\\s_1-\\s_2\\|\\rho(\\s_2) $ for any $\\|\\s_1-\\s_2\\|\\leq c$, where $\\rho(\\cdot) $ is some bounded, square integrable and differentiable function with a bounded derivative $\\nabla\\rho(\\cdot) $ such that $\\|\\nabla\\rho(\\s)\\|\\leq c_1\\|\\s\\|^{-v_3}$ for some constant $v_3>1$ and any $\\|\\s\\|>c_2$. (iii) Let $\\bchi_{t[j]}(\\s) $ be the $j$th component of $\\bchi_{t}(\\s):=\\E[\\X \\{\\pis(\\X)\\}^{-1}TY^t\\mid \\S=\\s]$. Then\\tcr{,} $\\bchi_{t[j]}(\\s) $ is continuously differentiable and has a bounded first derivative on $\\ms_0$\\tcr{, for each $t=0,1$ and $j=1,\\ldots,p$.}%$(t=0,1;\\ j=1,\\ldots,p) $.\n\\end{assumption}\n\n\\vskip-0.02in\nIn the above, Assumption \\ref{al1} regulates the behavior of $\\hmbP$ as an estimator of the transformation matrix $\\mbP_0$. 
Moreover, the smoothness and moment conditions in Assumption \\ref{akernel} are almost adopted from \\citet{hansen2008uniform} and are fairly standard in the literature of kernel-based approaches \\citep{newey1994large, andrews1995nonparametric, masry1996multivariate}. Further, we require Assumption \\ref{ahbey} to control \\tcr{the} errors from approximating $\\mbP_0$ by $\\hmbP$\\tcr{,} while Assumption \\ref{ahbey} (ii) \\tcr{in particular} is satisfied by the second-order Gaussian kernel, among others. Similar conditions were imposed by \\citet{chakrabortty2018efficient} to study unweighted kernel smoothing estimators with dimension reduction  \\tcr{in} %for\nlow \\tcr{(fixed) } dimensional \\tcr{settings.} %data.\nBased on these conditions, we now \\tcr{provide} %give\nthe uniform convergence rate of $\\mhatnk(\\x,\\hmbP) $ \\tcr{in the following result.} %as follows.\n\n\\begin{theorem}[\\tcr{Uniform consistency of $\\mhatnk(\\cdot) $}]\\label{theorem_ks_ate}\nSet $\\xi_n:=\\{(nh_n^r)^{-1}\\log\\,n\\}^{1/2}$, $b_n^{(1) }:=\\xi_n+h_n^d$ and $b_{n,N}^{(2) }:=h_n^{-2}\\alpha_n^2+h_n^{-1}\\xi_n\\alpha_n+\\alpha_n+h_n^{-r/2}s_N$. Suppose that Assumptions \\ref{ass_equally_distributed}, \\ref{api4} and \\ref{al1}--\\ref{ahbey} hold true and that $b_n^{(1) }+b_{n,N}^{(2) }=o(1) $. 
Then\\tcr{,}\n\\bse\n\\sx|\\mhatnk(\\x,\\hmbP)-\\tmu(\\x,\\mbP_0)| ~=~O_p\\{b_n^{(1) }+b_{n,N}^{(2) }\\} \\quad (k=1,\\ldots,\\kK),\n\\ese\nwhere $\\tmu(\\x,\\mbP):=\\{\\kappa_0(\\mbP\\trans\\x)\\}^{-1}\\kappa_1(\\mbP\\trans\\x) $\\tcr{, with $\\kappa_0(\\cdot) $ and $\\kappa_1(\\cdot) $ as given in Assumption \\ref{akernel}.}\n\\end{theorem}\n\n\\begin{remark}[Double robustness \\tcr{of $\\mhatnk$}]\\label{remark_ks_ate_DR}\nAs long as either $\\pis(\\x)=\\pi(\\x) $ or $m^*(\\x)\\equiv \\E(Y\\mid \\S=\\s)=\\E(Y\\mid\\X=\\x)\\equiv m(\\x) $ but {\\it not} necessarily both, we have\\tcr{:}\n\\bse\n\\tmu(\\x,\\mbP_0)&~=~&(\\E[\\{\\pis(\\X)\\}^{-1}\\pi(\\X)\\mid \\S=\\s])^{-1}\\E[\\{\\pis(\\X)\\}^{-1}\\pi(\\X)m(\\X)\\mid \\S=\\s] \\\\\n&~=~&\\E(Y\\mid\\S=\\s)~\\equiv~ m^*(\\x).\n\\ese\nTheorem \\ref{theorem_ks_ate} therefore shows that $\\mhatnk(\\x,\\hmbP) $ is a \\tcr{\\it DR estimator} of $m^*(\\x) $.\n%\\tcr{This is an important consequence (and benefit) of considering the IPW scheme in the construction of $\\mhatnk(\\cdot) $, and ensures that it continues to target a function of the form \\eqref{mstarX}, that guarantees efficiency improvement for our final SS ATE estimator, as discussed in Remarks \\ref{remark_ate_efficiency} and \\ref{remark_ks_ate_weight}. 
It is worth noting that the corresponding unweighted version (without IPW) does not have this property.}\n\\tcr{This is an important consequence of the IPW scheme used in the construction of $\\mhatnk(\\cdot) $, and its benefits (in the bigger context of our final SS estimator) were discussed in detail in Remark \\ref{remark_ks_ate_weight}.\n}\n\n\\end{remark}\n\n\\begin{remark}[Uniform convergence \\tcr{-- some examples}]\\label{remark_choice_of_P0}\nAccording to the result in Theorem \\ref{theorem_ks_ate}, the uniform consistency of $\\mhatnk(\\x,\\hmbP) $ as an estimator of $\\tmu(\\x,\\mbP_0) $ holds at the \\tcr{\\it optimal bandwidth} \\tcr{order} %rate\n$h_{\\hbox{\\tiny opt}}=O\\{n^{-1/(2d+r) }\\}$ for any kernel order $d\\geq 2$ and \\tcr{a} fixed $r$, given\n\\be\ns_N~=~o \\{n^{-r/(4d+2r) }\\} \\quad \\hbox{and} \\quad \\alpha_n~=~o\\{n^{-1/(2d+r) }\\}.\n\\label{s_N_alpha_n}\n\\ee\nThe first part of \\eqref{s_N_alpha_n} is actually weaker than the assumption \\tcr{$s_N=o(n^{-1/2}) $ used}\nin Corollary \\ref{corate} %, that $s_N=o(n^{-1/2}) $\nand thus \\tcr{should be} easy to be ensured in the SS setting \\eqref{disproportion}. As regards the validity of the second part, we consider it for \\tcr{some} frequently used {\\it choices of $\\mbP_0$} including, for instance, the least square regression parameter $(r=1) $ satisfying $\\E\\{\\X(Y-\\mbP_0\\trans\\X)\\}=\\bze_p$, and the $r$ leading eigenvectors of the matrix $\\var\\{\\E(\\X\\mid Y)\\}$, which can be estimated by %the\n\\tcr{sliced} inverse regression \\citep{li1991sliced}. When $p$ is fixed, there typically exist $n^{1/2}$-consistent estimators $\\hmbP$ for $\\mbP_0$\\tcr{,} so the second part of \\eqref{s_N_alpha_n} is satisfied by the fact that $\\alpha_n=O(n^{-1/2}) $. 
In high dimensional scenarios where $p$ is divergent and greater than $n$, one can obtain $\\hmbP$ from the \\tcr{$L_1$-}regularized version\\tcr{(s) } of linear regression  or %the\n\\tcr{sliced} inverse regression \\citep{lin2019sparse}. The sequence $\\alpha_n=O\\{q(\\log\\,p/n)^{1/2}\\}$ when the $L_1$ penalty is applied under some suitable conditions \\citep{buhlmann2011statistics, negahban2012unified, wainwright2019high}, where $q:=\\|\\mbP_0\\|_0$ represents the sparsity level of $\\mbP_0$. Thus\\tcr{,} the second part of \\eqref{s_N_alpha_n} holds as long as\n\\bse\nq(\\log\\,p)^{1/2}~=~o\\{n^{(2d+r-2)/(4d+2r) }\\}.\n\\ese\n\\end{remark}\n\n\\subsection{Outcome model for the QTE}\\label{sec_nf_qte}\n\nAs regards the outcome model $\\phis(\\cdot,\\cdot) $ for the QTE, we adopt the same strategy as in Section \\ref{sec_nf_ate}. Specifically, \\tcr{with $\\mbP_0$ similar as before,} we set\n\\be\n\\phis(\\x,\\theta)~\\equiv~\\E\\{\\psi(Y,\\theta)\\mid \\mbP_0\\trans\\X=\\mbP_0\\trans\\x\\}~\\equiv~\\E\\{\\psi(Y,\\theta)\\mid \\S=\\s\\} \\tcr{,}\n\\label{phis}\n\\ee\nand estimate it by the IPW type kernel smoothing estimator\\tcr{:}\n\\be\n\\phihatnk(\\x,\\theta)\\equiv\\phihatnk(\\x,\\theta,\\hmbP):=\\{\\hatez(\\x,\\theta,\\hmbP)\\}^{-1}\\hateo(\\x,\\theta,\\hmbP)\\quad (k=1,\\ldots,\\kK),\n\\label{ks_qte}\n\\ee\nwhere\\tcr{, with $K(\\cdot) $, $h_n$ and $K_h(\\cdot) $ similarly defined as in Section \\ref{sec_nf_ate},}\n\\bse\n\\hatet(\\x,\\theta,\\mbP)~:=~h_n^{-r} \\Enk[\\{\\pihatN(\\X)\\}^{-1}T \\{\\psi(Y,\\theta)\\}^tK_h\\{\\mbP\\trans(\\x-\\X)\\}]\\quad (t=0,1).\n\\ese\nWe first verify Assumption \\ref{abound} for \\tcr{a choice of} $\\phis(\\x,\\theta) $ \\tcr{as} in \\eqref{phis}\\tcr{, via the following result.}\n\n\\begin{proposition}\\label{thphi}\nIf the conditional density $f(\\cdot\\mid\\s) $ of $Y$ given $\\S=\\s$ is such that\n\\be\n\\E[\\{\\sb f(\\theta\\mid\\S)\\}^2]~<~\\infty,\n\\label{conditional_density}\n\\ee\nthen Assumption \\ref{abound} 
is satisfied by setting $\\phis(\\X,\\theta)\\equiv\\E\\{\\psi(Y,\\theta)\\mid\\S\\}$.\n\\end{proposition}\n\nWe now study the uniform convergence of the estimator $\\phihatnk(\\x,\\theta) $. It is noteworthy that establishing properties of $\\phihatnk(\\x,\\theta) $ is \\tcr{\\it even more} technically involved compared to the case of $\\mhatnk(\\x) $ in Section \\ref{sec_nf_ate}, since handling the function class $\\{\\psi(Y,\\theta):\\theta\\in\\mb(\\vt,\\v)\\}$ inevitably needs tools from empirical process theory. We itemize the relevant assumptions as follows.\n\n\n\\begin{assumption}[\\tcr{Smoothness conditions}]\\label{akernel_qte} %(Smoothness conditions)\n(i) Assumption \\ref{akernel} (i) \\tcr{holds}. (ii) Assumption \\ref{akernel} (ii) \\tcr{holds}. (iii) %With respect to $\\s$,\n\\tcr{T}he function $\\varphi_t(\\s,\\theta):=\\E[\\{\\pis(\\X)\\}^{-1}T\\{\\psi(Y,\\theta)\\}^t\\mid \\S=\\s]$ $(t=0,1) $ is $d$ times continuously differentiable \\tcr{with respect to $\\s$,} and has %a\n\\tcr{b}ounded $d$th \\tcr{order} derivatives\non $\\ms_0\\times\\mbtv$ \\tcr{for some $\\epsilon > 0$}.\n\\end{assumption}\n\n\n\\begin{assumption}[\\tcr{Required only %when\n\\tcr{if} $\\mbP_0$ needs to be estimated}] \\label{ahbe}%(Required only when $\\mbP_0$ needs to be estimated)\n(i) Assumption \\ref{ahbey} (i) \\tcr{holds}. (ii) The function $\\nabla K(\\cdot) $ is continuously differentiable and satisfies $\\|\\partial \\{\\nabla K(\\s)\\}/\\partial \\s\\|$ \\tcr{$\\leq c_1\\,\\|\\s\\|^{-v_2}$} for some constant $v_2>1$ and any $\\|\\s\\|>c_2$. Further, it is locally Lipschitz continuous, i.e., $\\|\\nabla K(\\s_1)-\\nabla K(\\s_2)\\|\\leq \\|\\s_1-\\s_2\\|\\rho(\\s_2) $ for any $\\|\\s_1-\\s_2\\|\\leq c$, where $\\rho(\\cdot) $ is some bounded and square integrable function with a bounded derivative $\\nabla\\rho(\\cdot) $. 
(iii) Let $\\bfeta_{t[j]}(\\s,\\theta) $ be the $j$th component of $\\bfeta_{t}(\\s,\\theta):=\\E[\\X \\{\\pis(\\X)\\}^{-1}T\\{\\psi(Y,\\theta)\\}^t\\mid \\S=\\s]$. Then, with respect to $\\s$, the function $\\bfeta_{t[j]}(\\s,\\theta) $ is continuously differentiable and has a bounded first derivative on $\\ms_0\\times\\mbtv$ \\tcr{ for some $\\epsilon > 0$,} \\tcr{for each $t=0,1$ and $j=1,\\ldots,p$.}\n\\end{assumption}\n\nThe above two assumptions can be viewed as %a\n\\tcr{the natural} variant\\tcr{s} of Assumptions \\ref{akernel}--\\ref{ahbey} adapted \\tcr{suitably} for the case of the QTE. We now propose the following result \\tcr{for $\\phihatnk(\\cdot,\\cdot) $}.\n\n\\begin{theorem}[\\tcr{Uniform convergence rate of $\\phihatnk(\\cdot,\\cdot) $}]\\label{thhd}\nSet $\\gamma_n:=[(nh_n^r)^{-1}\\{\\log(h_n^{-r})+\\log(\\log\\,n)\\}]^{1/2}$, $a_{n}^{(1) }:=\\gamma_n+h_n^d$ and $a_{n,N}^{(2) }:=h_n^{-2}\\alpha_n^2+h_n^{-1}\\gamma_n\\alpha_n+\\alpha_n+h_n^{-r/2}s_N$. Suppose that Assumptions \\ref{ass_equally_distributed}, \\ref{api}, \\ref{al1}, \\ref{akernel_qte} and \\ref{ahbe} hold true and that $a_{n}^{(1) }+a_{n,N}^{(2) }=o(1) $. 
Then\n\\bse\n\\sbx|\\phihatnk(\\x,\\theta,\\hmbP)-\\tphi(\\x,\\theta,\\mbP_0)| ~=~O_p\\{a_{n}^{(1) }+a_{n,N}^{(2) }\\} \\quad (k=1,\\ldots,\\kK),\n\\ese\nwhere $\\tphi(\\x,\\theta,\\mbP):=\\{\\varphi_0(\\mbP\\trans\\x,\\theta)\\}^{-1}\\varphi_1(\\mbP\\trans\\x,\\theta) $ \\tcr{with $\\varphi_0(\\cdot) $ and $\\varphi_1(\\cdot) $ as in Assumption \\ref{akernel_qte}.}\n\\end{theorem}\n\n\\begin{remark}[Double robustness and uniform convergence \\tcr{of $\\phihatnk(\\cdot,\\cdot) $}]\nWhenever either $\\pis(\\x)=\\pi(\\x) $ or $\\phis(\\x,\\theta)\\equiv \\E\\{\\psi(Y,\\theta)\\mid \\S=\\s\\}=\\E\\{\\psi(Y,\\theta)\\mid \\X=\\x\\} \\equiv\\phi(\\x,\\theta) $\\tcr{,} but {\\it not} necessarily both, we can see \\tcr{that:}\n\\bse\n\\tphi(\\x,\\theta,\\mbP_0)&~=~&(\\E[\\{\\pis(\\X)\\}^{-1}\\pi(\\X)\\mid \\S=\\s])^{-1}\\E[\\{\\pis(\\X)\\}^{-1}\\pi(\\X)\\phi(\\X,\\theta)\\mid \\S=\\s] \\\\\n&~=~&\\E\\{\\psi(Y,\\theta)\\mid\\S=\\s\\}~\\equiv~\\phis(\\x,\\theta).\n\\ese\nIn this sense\\tcr{,} $\\phihatnk(\\x,\\theta,\\hmbP) $ is a \\tcr{\\it DR estimator} of $\\phis(\\x,\\theta) $. 
Moreover, it is straightforward to show $\\phihatnk(\\x,\\theta,\\hmbP) $ is uniformly consistent for $\\tphi(\\x,\\theta,\\mbP_0) $ at the optimal bandwidth rate under the same conditions on $\\{s_N,\\alpha_n\\}$ as those in Remark \\ref{remark_choice_of_P0}, while the choices of $\\{\\mbP_0,\\hmbP\\}$ therein also apply to the case of $\\phihatnk(\\x,\\theta,\\hmbP) $\\tcr{; s}ee the discussion in Remark \\ref{remark_choice_of_P0} for details.\n\\end{remark}\n\nTheorem \\ref{thhd} \\tcr{therefore} has shown \\tcr{(among other things) that} the sequences $\\{d_{n,1}, d_{n,2}, d_{n,\\infty}\\}$ in \\tcr{our high-level} Assumption \\ref{aest} \\tcr{on $\\phihatnk(\\cdot,\\cdot) $} are all of \\tcr{order} $o(1) $ when one sets\\tcr{:}\n\\be\n\\hpsi(\\X,\\theta)~\\equiv~\\phihatnk(\\X,\\theta,\\hmbP)-\\phis(\\X,\\theta),\n\\label{psihat}\n\\ee\nwhere $\\phis(\\x,\\theta) $ and $\\phihatnk(\\x,\\theta,\\mbP) $ are as defined in (\\ref{phis}) and (\\ref{ks_qte}), respectively. %Lastly,\n\\tcr{Furthermore, as a final verification of our high-level conditions in Assumption \\ref{aest},} we validate the condition \\eqref{vc} \\tcr{therein} on the bracketing number \\tcr{via the following proposition}.\n\\begin{proposition}\\label{thbn}\nUnder the condition \\eqref{conditional_density}, the function $\\hpsi(\\X,\\theta) $ in \\eqref{psihat} satisfies\\tcr{:}\n\\bse\nN_{[\\,]}\\{\\eta,\\mp_{n,k}\\mid\\cl,L_2(\\P_\\X)\\}~\\leq~ c\\,(n+1)\\eta^{-1},\n\\ese\nwhere the set $\\mp_{n,k}$ is as defined in \\eqref{pnk}. Therefore\\tcr{,} the sequence $a_n$ \\tcr{characterizing the growth of} %for\nthe function $H(\\cl) $ in the condition \\eqref{vc} \\tcr{of Assumption \\ref{aest}} is of \\tcr{order} $O(n) $.\n\\end{proposition}\n\n\\begin{remark}[Other outcome model estimators]\\label{remark_other_nuisance_functions}\n%\\tcr{ammenm} \\tcg{**Abhishek, do you want to say something here?**}\\tcr{**Yeah, I don\'t know what happened here. Maybe some accident with keyboard while typing. 
Anyway, I will double check to see if/what I wanted to add anything here -- AC**}\\tcr{**Also check my comment on pg 6 just before Sec. 1.3. I had this before as well, but maybe you missed it (so I have included the phrase "new (date)" there this time -- AC**}\n\\tcr{Finally, as we conclude our discussion on the nuisance functions\' estimation, it is worth pointing out that i}n %In\naddition to the IPW type kernel smoothing estimators with necessary dimension reduction, which have been investigated thoroughly in Sections \\ref{sec_nf_ate}--\\ref{sec_nf_qte}, one may also employ \\tcr{\\it any} other reasonable choices of $\\mhatnk(\\cdot) $ and $\\phihatnk(\\cdot,\\cdot) $ to construct $\\muhatss$ and $\\thetahatss$, as long as they satisfy the high-level conditions in Sections \\ref{secos}--\\ref{secqte}. Examples include estimators generated by parametric (\\tcr{e.g.,} linear$/$logistic) regression \\tcr{methods, possibly with penalization in high dimensional settings \\citep{farrell2015robust},} and random forest \\citep{breiman2001random} without use of dimension reduction, as well as many \\tcr{other} popular non-parametric machine learning approaches %\\tcr{(e.g., neural networks) }\nthat have been advocated by some\nrecent works for other\nrelated problems in analogous settings %\\citep{farrell2015robust,\n\\citep{chernozhukov2018double, farrell2021deep}. We will consider some of these methods in \\tcr{our} %the following\nsimulations and data analysis\\tcr{,}\nwhile omitting their theoretical study, which is not of our primary interest in this article\\tcr{; see} %. 
See\nSections \\ref{sec_simulations} and \\ref{sec_data_analysis} for their implementation details and numerical performance.\n\\end{remark}\n\n\\section{Simulations}\\label{sec_simulations}\nWe now investigate the numerical performance of our \\tcr{SS ATE and QTE estimators} $\\muhatss$ and $\\thetahatss$ on simulated data %\\tcr{**NEW - Add clarification here regarding mean/quantile being the targets, not ATE/QTE -- AC.**}\n\\tcr{under a variety of data generating mechanisms}. \\tcr{(We clarify here that without loss of generality we focus on $\\mu_0$ and $\\vt$ in \\eqref{generic_notation} as our targets, though with some abuse of terminology, we occasionally refer to them as ATE and QTE respectively.) } %, similar to Remark \\ref{remark_semantics}.) }\nWe set the sample sizes $n\\in\\{200,500\\}$ and $N=10,000$ throughout. The covariates $\\X$ are drawn from a $p$-dimensional normal distribution with a zero mean and an identity covariance matrix, where $p\\in\\{10,200\\}$ \\tcr{denotes low and high dimensional choices, respectively}. For any kernel smoothing steps involved, we always use the second order Gaussian kernel and select the bandwidths %by\n\\tcr{using}\ncross validation. Regularization is applied to all regression procedures via\nthe $L_1$ penalty when $p=200$, while the tuning parameters are chosen \\tcr{using} %by\nten-fold cross validation. The number of folds in the cross fitting steps \\eqref{ds1}--\\eqref{ds2} and \\eqref{ds3}--\\eqref{ds4} is $\\kK=10$. 
By the term ``complete-case\'\', we refer to conducting a process on $\\{(Y_i,T_i=1,\\X_i\\trans)\\trans:i\\in\\I^*\\}$ without weighting, where $\\I^*\\equiv\\I_k^-$ if cross fitting is involved while $\\I^*\\equiv\\I$ otherwise.\n\n\\subsection{\\tcr{Data generating mechanisms and nuisance estimator choices}} \\tcr{We use the following choices as the \\emph{true} data generating models for $T \\mid \\X$ and $Y \\mid \\X $.} Let $\\X_q:=(\\X_{[1]},\\ldots,\\X_{[q]})\\trans$  where $q=p$ when $p=10$, and $q\\in\\{5,\\ceil{p^{1/2}}\\}$ when $p=200$, \\tcr{representing the (effective) \\emph{sparsity} (fully dense for $p = 10$, and sparse or moderately dense for $p = 200$, respectively) of the true data generating models for the nuisance functions, as described below}.\n\n\\vskip0.05in\n\\tcr{For the \\emph{propensity score} $\\pi(\\X) $}, \\tcr{and with $T \\mid \\X \\sim \\mbox{Bernoulli} \\{\\pi(\\X)\\}$,} we set \\tcr{the choices:}\n\\begin{enumerate}[(i)]\n\\item $\\pi(\\X)\\equiv h(\\bon_q\\trans\\X_q/q^{1/2}) $, a {\\it linear} model;\n\n\\item $\\pi(\\X)\\equiv h\\{\\bon_q\\trans\\X_q/q^{1/2}+(\\bon_q\\trans\\X_q)^2/(2q)\\}$, a {\\it single index} model;\n\n\\item $\\pi(\\X)\\equiv h\\{\\bon_q\\trans\\X_q/q^{1/2}+\\|\\X_q\\|^2/(2q)\\}$, a {\\it quadratic} model.\n\\end{enumerate}\nIn the above, $h(x)\\equiv\\{1+\\exp(-x)\\}^{-1}$ \\tcr{denotes the usual ``expit\'\' link function for a logistic model}. To approximate $\\pi(\\X) $ using the data $\\cu$, we obtain the \\emph{estimator} $\\pihatN(\\x) $ from\\tcr{:}\n\\begin{enumerate}[I.]\n\\item unregularized or regularized \\tcr{(linear) } logistic regression of $T$ vs. $\\X$ (Lin), \\tcg{which correctly specifies the propensity score (i) but misspecifies (ii) and (iii) }; ~~\\tcr{or}\n\n\\item unregularized or regularized \\tcr{(quadratic) } logistic regression of $T$ vs. 
$(\\X\\trans,\\X_{[1]}^2,\\ldots,\\X_{[p]}^2)\\trans$ (Quad), \\tcg{which correctly specifies the propensity scores (i) and (iii) but misspecifies (ii) }.\n\\end{enumerate}\n\n%Further,\n\\tcr{T}he \\emph{conditional outcome model} is $Y\\mid\\X\\sim\\mn\\{m(\\X),1\\}$ with \\tcr{%the following\nchoices of $m(\\cdot) $ as follows:}\n\\begin{enumerate}[(a)]\n\\item $m(\\X)\\equiv \\bon_q\\trans\\X_q$, a {\\it linear} model;\n\n\\item $m(\\X)\\equiv \\bon_q\\trans\\X_q+(\\bon_q\\trans\\X_q)^2/q$, a {\\it single index} model;\n\n\\item $m(\\X)\\equiv \\bon_q\\trans\\X_q+\\|\\X_q\\|^2/3$, a {\\it quadratic} model;\n\n\\item $m(\\X)\\equiv 0$, a {\\it null} model;\n\n\\item $m(\\X)\\equiv \\bon_p\\trans\\X\\{1+2(\\bze_{p/2}\\trans,\\bon_{p/2}\\trans)\\X/p\\}$, a {\\it double index} model.\n\\end{enumerate}\nThe outcome models (d) and (e) are considered for cases with $p=10$ only and their results are summarized in %Section \\ref{sm_simulations}\n\\tcr{Appendix \\ref{sm_simulations}} of the Supplementary Material. The following discussions mainly focus on the outcome models (a)--(c).\n\n\\tcr{T}he \\tcr{\\emph{estimators}} $\\mhatnk(\\x) $ and $\\phihatnk(\\x,\\thetahatinit) $ are constructed based on the data $\\cl_k^-$ through\\tcr{:}\n\n\\begin{enumerate}[I.]\n\\item kernel smoothing (KS), \\tcr{in} \\eqref{ks_ate} and \\eqref{ks_qte}, where \\tcr{the $p \\times r$ transformation} $\\hmbP$ is \\tcr{chosen as:}\n\n\\vskip0.04in\n\\begin{enumerate}[1.]\n\\item the slope vector ($r=1$) from the complete-case version of unregularized or regularized linear regression of $Y$ vs. $\\X$ (KS$_1$), \\tcg{which correctly specifies the outcome models (a), (b) and (d) but misspecifies (c) and (e) }; ~~\\tcr{or}\n\n\\item the first two directions ($r=2$) selected by the complete-case version of the unregularized (with $\\ceil{n/5}$ slices of equal width) or regularized (with $4$ slices of equal size) sliced inverse regression \\citep{li1991sliced, lin2019sparse} of $Y$ vs. 
$\\X$ (KS$_2$), \\tcg{which correctly specifies the outcome models (a), (b), (d) and (e) but misspecifies (c) }; ~~\\tcr{or}\n\\end{enumerate}\n\n\\vskip0.04in\n\\item parametric regression (PR), giving\n\\bse\n\\mhatnk(\\x)~\\equiv~(1,\\x\\trans)\\trans\\bxihat_k \\tcr{\\quad \\hbox{and} \\quad} \\phihatnk(\\x,\\thetahatinit)~\\equiv~ h\\{(1,\\x\\trans)\\trans\\bgammahat_k\\}-\\tau\\tcr{,}\n\\ese\nwith $\\bxihat_k/\\bgammahat_k$ \\tcr{respectively being} the slope vector from the complete-case version of unregularized or regularized linear$/$logistic regression of $Y/I(Y<\\thetahatinit) $ vs. $\\X$ using $\\cl_k^-$, \\tcg{which correctly specifies the outcome models \\{(a), (d)\\}  and (d) for the ATE and QTE estimation, respectively, while misspecifying the others.}\n%where $\\bbetahat_k$ is the slope vector from the complete-case version of unregularized (Lin) or regularized (Lin) linear regression of $Y$ vs. $\\X$, while $\\bgammahat_k$ is the slope vector from the complete-case version of unregularized (Lin) or regularized (Lin) logistic regression of $Y$ vs. $\\X$.\n\\end{enumerate}\n%\\tcr{(**TO GUORONG: NEED TO \\emph{EXPLAIN} A LITTLE MORE THE \\emph{RATIONALE} BEHIND THE ESTIMATOR AND TRUE DGP CHOICES. E.g., WHY ARE WE DOING THEM THIS WAY, AND HOW SOMETIMES THEY ARE CORRECTLY SPECIFIED, WHILE SOMETIMES THEY ARE NOT ETC. COULD YOU PLEASE ADD HERE?) } \t\n\\tcg{In general, our choices of $\\{\\pi(\\x),m(\\x)\\}$ incorporate \\tcr{both} linear \\tcr{and non-linear effects, including} quadratic and interaction effects, that are commonly encountered in practice. 
Also, our approaches to constructing $\\{\\pihatN(\\x), \\mhatnk(\\x), \\phihatnk(\\x,\\theta)\\}$ represent a broad class of flexible and user-friendly (parametric or semi-parametric) strategies \\tcr{often adopted} for modeling the relation between a continuous or binary response and a set of (possibly high dimensional) covariates.} \\tcr{They also allow for a variety of scenarios in terms of correct/incorrect specifications of the (working) nuisance models.} %\\tcr{, as detailed above for each of the estimator choices.}}\n%\n\\tcr{B}ased on the various $\\mhatnk(\\cdot) $ and $\\phihatnk(\\cdot,\\cdot) $ described above, we obtain $\\mhatn(\\cdot) $ and $\\phihatn(\\cdot,\\cdot) $ via the cross fitting procedures \\eqref{ds1}--\\eqref{ds2} and \\eqref{ds3}--\\eqref{ds4}. In addition, for the QTE estimation, we plug $\\thetahatinit$ and $\\hf(\\cdot) $ from Remark \\ref{remark_qte_initial_estimator} into $\\thetahatss$ defined by \\eqref{ss_qte}, while obtaining the initial estimator and estimated density for $\\thetahatsup$ in \\eqref{sup_qte} through the same IPW approach but with $\\pihatn(\\cdot) $ instead of $\\pihatN(\\cdot) $ \\tcr{(i.e., the version based on $\\cl$ instead of $\\cu$). The same $\\pihatn(\\cdot) $ is also used for constructing the supervised ATE estimator $\\muhatsup$ in \\eqref{sup_ate}.}\n\n\\tcr{For all combinations of the true data generating models, and for \\emph{any} of the choices of the nuisance function estimators as listed above, we implement our SS ATE and QTE estimators, evaluate their performances for both estimation \\tcr{(see Section \\ref{sec_sim_estimation}) } and inference \\tcr{(see Section \\ref{sec_sim_inference}) }, and also compare their estimation efficiency with respect to a variety of corresponding supervised estimators, \\eqref{sup_ate} and \\eqref{sup_qte}, as well as their oracle versions} \\tcg{(see the\\tcr{ir formal} descriptions \\tcr{in Section \\ref{sec_sim_estimation}} below) }.  
All the results below are summarized from 500 replications.\n\n\\begin{table}\n\\def~{\\hphantom{0}}\n\\caption{Efficiencies of the ATE estimators relative to the corresponding oracle supervised estimators; \\tcg{see Remark \\ref{remark_interpretation_RE} for interpretations of these relative efficiencies.} Here\\tcr{,} $n$ denotes the labeled data size, $p$ the number of covariates, $q$ the model sparsity, $m(\\X)\\equiv\\E(Y\\mid\\X) $, $\\pi(\\X)\\equiv\\E(T\\mid\\X) $, $\\hat{\\pi}(\\X) $ \\tcr{--} the estimated propensity score, Lin \\tcr{--} logistic regression of $T$ vs. $\\X$\\tcr{,} and Quad \\tcr{--} logistic regression of $T$ vs. $(\\X\\trans,\\X_{[1]}^2,\\ldots,\\X_{[p]}^2)\\trans$; KS$_1/$KS$_2$ represents kernel smoothing on the one$/$two direction(s) selected by linear regression$/$%the\n\\tcr{sliced} inverse regression; PR \\tcr{denotes} parametric regression\\tcr{,} and ORE oracle relative efficiency. The \\textbf{\\tcn{blue}} color implies\nthe best efficiency in each case.}{\n\\resizebox{\\textwidth}{!}{\n\\begin{tabular}{ccc||ccc|ccc||ccc|ccc||c}\n\\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{$p=10$}}  & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n&  &  & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(a) } & (i)   & Lin  & 0.87 & 0.86 & 0.96 & 2.99 & 2.74 & \\tcn{\\bf 3.72} & 0.99 & 0.98 & 0.99 & 3.35 & 3.19 & \\tcn{\\bf 3.70} & 4.37 \\\\\n&       & Quad & 0.79 & 0.63 & 0.91 & 3.00 & 2.74 & \\tcn{\\bf 3.74} & 0.97 & 0.96 & 0.98 & 3.34 & 3.20 & \\tcn{\\bf 3.69} & 4.37 \\\\\n& (ii)  & Lin  
& 0.93 & 0.91 & 0.99 & 3.37 & 3.10 & \\tcn{\\bf 4.05} & 1.00 & 1.00 & 0.99 & 3.64 & 3.55 & \\tcn{\\bf 3.93} & 4.78 \\\\\n&       & Quad & 0.88 & 0.85 & 0.91 & 3.43 & 3.19 & \\tcn{\\bf 4.07} & 0.99 & 1.00 & 0.98 & 3.68 & 3.59 & \\tcn{\\bf 3.96} & 4.78 \\\\\n& (iii) & Lin  & 0.87 & 0.84 & 0.95 & 2.89 & 2.53 & \\tcn{\\bf 4.05} & 0.96 & 0.95 & 0.99 & 3.21 & 3.08 & \\tcn{\\bf 3.88} & 4.99 \\\\\n&       & Quad & 0.86 & 0.81 & 0.91 & 3.08 & 2.70 & \\tcn{\\bf 4.13} & 0.98 & 0.98 & 1.00 & 3.44 & 3.31 & \\tcn{\\bf 3.92} & 4.99 \\\\ \\hline\n\\multirow{6}{*}{(b) } & (i)   & Lin  & 0.93 & 0.92 & 0.51 & \\tcn{\\bf 3.62} & 3.42 & 1.03 & 0.99 & 0.98 & 0.67 & \\tcn{\\bf 3.73} & 3.61 & 1.17 & 5.07 \\\\\n&       & Quad & 0.92 & 0.77 & 0.40 & \\tcn{\\bf 3.64} & 3.49 & 1.02 & 0.98 & 0.98 & 0.61 & \\tcn{\\bf 3.74} & 3.59 & 1.16 & 5.07 \\\\\n& (ii)  & Lin  & 0.94 & 0.86 & 0.26 & \\tcn{\\bf 2.29} & 1.69 & 0.36 & 0.92 & 0.91 & 0.15 & \\tcn{\\bf 2.29} & 2.16 & 0.18 & 3.55 \\\\\n&       & Quad & 0.85 & 0.81 & 0.28 & \\tcn{\\bf 2.35} & 1.76 & 0.41 & 0.91 & 0.90 & 0.17 & \\tcn{\\bf 2.34} & 2.20 & 0.21 & 3.55 \\\\\n& (iii) & Lin  & 0.90 & 0.89 & 0.51 & \\tcn{\\bf 3.10} & 2.83 & 0.88 & 0.97 & 0.97 & 0.60 & \\tcn{\\bf 3.05} & 3.00 & 0.84 & 4.39 \\\\\n&       & Quad & 0.87 & 0.84 & 0.56 & \\tcn{\\bf 3.20} & 2.90 & 1.08 & 0.98 & 0.96 & 0.63 & \\tcn{\\bf 3.11} & 3.04 & 1.07 & 4.39 \\\\ \\hline\n\\multirow{6}{*}{(c) } & (i)   & Lin  & 0.62 & 0.61 & 0.67 & \\tcn{\\bf 1.23} & 1.21 & 1.17 & 0.78 & 0.79 & 0.74 & 1.52 & \\tcn{\\bf 1.58} & 1.45 & 9.52 \\\\\n&       & Quad & 0.61 & 0.54 & 0.60 & \\tcn{\\bf 1.21} & 1.21 & 1.15 & 0.84 & 0.85 & 0.80 & 1.50 & \\tcn{\\bf 1.56} & 1.41 & 9.52 \\\\\n& (ii)  & Lin  & 0.70 & 0.66 & 0.56 & \\tcn{\\bf 1.32} & 1.17 & 1.01 & 0.85 & 0.84 & 0.55 & \\tcn{\\bf 1.58} & 1.52 & 0.96 & 8.71 \\\\\n&       & Quad & 0.79 & 0.75 & 0.83 & \\tcn{\\bf 1.35} & 1.19 & 1.32 & 0.90 & 0.89 & 0.83 & 1.47 & 1.46 & \\tcn{\\bf 1.49} & 8.71 \\\\\n& (iii) & Lin  & 0.57 & 0.58 & 0.53 & 0.92 & 
\\tcn{\\bf 0.95} & 0.87 & 0.48 & 0.49 & 0.43 & 0.70 &\\tcn{\\bf  0.72} & 0.61 & 9.42 \\\\\n&       & Quad & 0.78 & 0.74 & 0.83 & \\tcn{\\bf 1.42} & 1.40 & 1.51 & 0.94 & 0.92 & 0.92 & 1.59 & \\tcn{\\bf 1.60} & 1.55 & 9.42\\\\\n\\hline\n\\multicolumn{16}{c}{ } \\\\\\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{$p=200,q=5$}} & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n\\multicolumn{3}{c||}{} & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(a) }   & (i)         & Lin              & 0.72    & 0.22   & 0.46 & \\tcn{\\bf 1.60}   & 0.67   & 1.43 & 0.94    & 0.85   & 0.73 & \\tcn{\\bf 1.88}   & 1.62   & 1.73 & 2.68                 \\\\\n&             & Quad             & 0.70    & 0.20   & 0.43 & \\tcn{\\bf 1.61}   & 0.67   & 1.42 & 0.94    & 0.83   & 0.68 & \\tcn{\\bf 1.89}   & 1.62   & 1.72 & 2.68                 \\\\\n& (ii)        & Lin              & 0.87    & 0.45   & 0.70 & \\tcn{\\bf 1.89}   & 0.91   & 1.73 & 0.97    & 0.88   & 0.80 & \\tcn{\\bf 2.15}   & 2.00   & 2.05 & 2.89                 \\\\\n&             & Quad             & 0.86    & 0.44   & 0.69 & \\tcn{\\bf 1.91}   & 0.92   & 1.75 & 0.97    & 0.88   & 0.78 & \\tcn{\\bf 2.15}   & 1.99   & 2.07 & 2.89                 \\\\\n& (iii)       & Lin              & 0.82    & 0.34   & 0.57 & \\tcn{\\bf 1.74}   & 0.79   & 1.64 & 0.95    & 0.89   & 0.76 & \\tcn{\\bf 2.35}   & 2.06   & 2.17 & 3.00                 \\\\\n&             & Quad             & 0.80    & 0.32   & 0.55 & \\tcn{\\bf 1.79}   & 0.84   & 1.68 & 0.95    & 0.86   & 0.72 & \\tcn{\\bf 2.45}   & 
2.13   & 2.19 & 3.00                 \\\\ \\hline\n\\multirow{6}{*}{(b) }   & (i)         & Lin              & 0.86    & 0.35   & 0.76 & \\tcn{\\bf 1.60}   & 0.94   & 1.06 & 0.95    & 0.95   & 0.65 & \\tcn{\\bf 2.04}   & 1.97   & 1.04 & 3.37                 \\\\\n&             & Quad             & 0.83    & 0.31   & 0.74 & \\tcn{\\bf 1.61}   & 0.93   & 1.08 & 0.95    & 0.95   & 0.65 & \\tcn{\\bf 2.04}   & 1.97   & 1.03 & 3.37                 \\\\\n& (ii)        & Lin              & 0.35    & 0.23   & 0.22 & \\tcn{\\bf 0.44}   & 0.40   & 0.35 & 0.55    & 0.35   & 0.14 & \\tcn{\\bf 0.73}   & 0.49   & 0.15 & 2.29                 \\\\\n&             & Quad             & 0.35    & 0.22   & 0.22 & \\tcn{\\bf 0.45}   & 0.42   & 0.37 & 0.54    & 0.34   & 0.14 & \\tcn{\\bf 0.75}   & 0.51   & 0.16 & 2.29                 \\\\\n& (iii)       & Lin              & 0.82    & 0.49   & 0.66 & \\tcn{\\bf 0.99}   & 0.72   & 0.68 & 0.88    & 0.85   & 0.68 & \\tcn{\\bf 1.48}   & 1.35   & 0.60 & 2.74                 \\\\\n&             & Quad             & 0.80    & 0.45   & 0.64 & \\tcn{\\bf 1.13}   & 0.78   & 0.80 & 0.90    & 0.86   & 0.71 & \\tcn{\\bf 1.66}   & 1.55   & 0.84 & 2.74                 \\\\ \\hline\n\\multirow{6}{*}{(c) }   & (i)         & Lin              & 0.59    & 0.23   & 0.39 & \\tcn{\\bf 1.00}   & 0.65   & 0.93 & 0.75    & 0.71   & 0.72 & 1.16   & 1.10   & \\tcn{\\bf 1.20} & 4.13                 \\\\\n&             & Quad             & 0.57    & 0.20   & 0.36 & \\tcn{\\bf 1.00}   & 0.64   & 0.92 & 0.76    & 0.70   & 0.71 & 1.17   & 1.10   & \\tcn{\\bf 1.20} & 4.13                 \\\\\n& (ii)        & Lin              & 0.64    & 0.35   & 0.43 & \\tcn{\\bf 0.99}   & 0.63   & 0.90 & 0.74    & 0.64   & 0.38 & \\tcn{\\bf 1.14}   & 1.05   & 0.79 & 3.63                 \\\\\n&             & Quad             & 0.64    & 0.34   & 0.42 & \\tcn{\\bf 1.02}   & 0.64   & 0.94 & 0.74    & 0.64   & 0.37 & \\tcn{\\bf 1.21}   & 1.12   & 0.91 & 3.63                 \\\\\n& (iii)    
   & Lin              & 0.39    & 0.19   & 0.25 & \\tcn{\\bf 0.68}   & 0.47   & 0.60 & 0.38    & 0.32   & 0.26 & \\tcn{\\bf 0.50}   & 0.47   & 0.43 & 3.78                 \\\\\n&             & Quad             & 0.39    & 0.18   & 0.24 & \\tcn{\\bf 0.95}   & 0.59   & 0.82 & 0.40    & 0.33   & 0.26 & \\tcn{\\bf 1.33}   & 1.15   & 1.04 & 3.78                 \\\\\n\\hline\n\\multicolumn{16}{c}{ } \\\\ \\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{$p=200,q=\\ceil{p^{1/2}}$}} & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n\\multicolumn{3}{c||}{} & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(a) }   & (i)         & Lin              & 0.35    & 0.09   & 0.29 & \\tcn{\\bf 1.38}   & 0.46   & 1.20 & 0.83    & 0.60   & 0.60 & \\tcn{\\bf 3.59}   & 2.04   & 2.96 & 6.05                 \\\\\n&             & Quad             & 0.34    & 0.09   & 0.28 & \\tcn{\\bf 1.36}   & 0.43   & 1.17 & 0.81    & 0.55   & 0.55 & \\tcn{\\bf 3.57}   & 2.01   & 2.87 & 6.05                 \\\\\n& (ii)        & Lin              & 0.68    & 0.23   & 0.61 & \\tcn{\\bf 1.74}   & 0.51   & 1.64 & 0.97    & 0.73   & 0.80 & \\tcn{\\bf 3.90}   & 2.55   & 3.71 & 6.65                 \\\\\n&             & Quad             & 0.67    & 0.23   & 0.60 & \\tcn{\\bf 1.78}   & 0.52   & 1.66 & 0.97    & 0.72   & 0.79 & \\tcn{\\bf 3.91}   & 2.51   & 3.72 & 6.65                 \\\\\n& (iii)       & Lin              & 0.62    & 0.14   & 0.49 & \\tcn{\\bf 2.07}   & 0.60   & 1.91 & 0.91    & 0.74   & 0.70 & \\tcn{\\bf 3.77}   & 2.65   & 3.54 & 6.99              
   \\\\\n&             & Quad             & 0.60    & 0.13   & 0.48 & \\tcn{\\bf 2.13}   & 0.60   & 1.94 & 0.90    & 0.69   & 0.66 & \\tcn{\\bf 3.80}   & 2.67   & 3.50 & 6.99                 \\\\ \\hline\n\\multirow{6}{*}{(b) }   & (i)         & Lin              & 0.40    & 0.11   & 0.34 & \\tcn{\\bf 1.29}   & 0.55   & 1.16 & 0.91    & 0.77   & 0.89 & \\tcn{\\bf 3.89}   & 2.96   & 2.27 & 6.78                 \\\\\n&             & Quad             & 0.38    & 0.11   & 0.33 & \\tcn{\\bf 1.29}   & 0.52   & 1.16 & 0.88    & 0.70   & 0.89 & \\tcn{\\bf 3.91}   & 2.92   & 2.29 & 6.78                 \\\\\n& (ii)        & Lin              & 0.31    & 0.18   & 0.24 & \\tcn{\\bf 0.68}   & 0.44   & 0.56 & 0.60    & 0.53   & 0.21 & \\tcn{\\bf 1.55}   & 1.43   & 0.34 & 4.97                 \\\\\n&             & Quad             & 0.31    & 0.17   & 0.23 & \\tcn{\\bf 0.65}   & 0.42   & 0.54 & 0.59    & 0.52   & 0.21 & \\tcn{\\bf 1.52}   & 1.39   & 0.34 & 4.97                 \\\\\n& (iii)       & Lin              & 0.63    & 0.18   & 0.54 & \\tcn{\\bf 1.64}   & 0.75   & 1.33 & 0.96    & 0.82   & 0.93 & \\tcn{\\bf 3.43}   & 2.71   & 2.09 & 6.14                 \\\\\n&             & Quad             & 0.61    & 0.17   & 0.53 & \\tcn{\\bf 1.68}   & 0.77   & 1.36 & 0.94    & 0.78   & 0.93 & \\tcn{\\bf 3.45}   & 2.72   & 2.15 & 6.14                 \\\\ \\hline\n\\multirow{6}{*}{(c) }   & (i)         & Lin              & 0.16    & 0.10   & 0.13 & \\tcn{\\bf 0.56}   & 0.41   & 0.52 & 0.61    & 0.36   & 0.38 & \\tcn{\\bf 1.27}   & 0.93   & 1.15 & 17.23                \\\\\n&             & Quad             & 0.16    & 0.09   & 0.12 & \\tcn{\\bf 0.56}   & 0.39   & 0.51 & 0.59    & 0.32   & 0.34 & \\tcn{\\bf 1.26}   & 0.91   & 1.13 & 17.23                \\\\\n& (ii)        & Lin              & 0.31    & 0.22   & 0.26 & 0.65   & 0.49   & \\tcn{\\bf 0.67} & 0.63    & 0.48   & 0.36 & \\tcn{\\bf 1.23}   & 1.07   & 1.06 & 16.30                \\\\\n&             & Quad             & 0.30    & 
0.22   & 0.25 & 0.65   & 0.48   & \\tcn{\\bf 0.65} & 0.63    & 0.49   & 0.35 & \\tcn{\\bf 1.24}   & 1.07   & 1.05 & 16.30                \\\\\n& (iii)       & Lin              & 0.16    & 0.10   & 0.13 & \\tcn{\\bf 0.54}   & 0.40   & 0.48 & 0.39    & 0.26   & 0.22 & \\tcn{\\bf 0.72}   & 0.59   & 0.59 & 17.82                \\\\\n&             & Quad             & 0.16    & 0.10   & 0.12 & \\tcn{\\bf 0.68}   & 0.52   & 0.53 & 0.38    & 0.24   & 0.21 & \\tcn{\\bf 1.27}   & 0.94   & 0.96 & 17.82 \\\\ \\hline\n\\end{tabular}\n}}\n\\label{table_ate_efficiency}\n\\end{table}\n\n\\begin{table}\n\\def~{\\hphantom{0}}\n\\caption{\\tcr{Efficiencies of QTE estimators.}\nWe consider the same scenario\\tcr{(s) } as \\tcr{in} Table \\ref{table_ate_efficiency}, but now the estimand is the QTE.} %(All other details remain the same as in the caption of Table \\ref{table_ate_efficiency}.) }\n{\n\\resizebox{\\textwidth}{!}{\n\\begin{tabular}{ccc||ccc|ccc||ccc|ccc||c}\n\\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{$p=10$}}  & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n&  &  & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(a) } & (i)                  & Lin                  & 0.96    & 0.90   & 0.79 & \\tcn{\\bf 1.98}   & 1.88   & 1.34 & 0.99    & 0.98   & 0.93 & 1.85   & 1.80   & \\tcn{\\bf 1.90} & 2.24                 \\\\\n&                      & Quad                 & 0.74    & 0.69   & 0.65 & \\tcn{\\bf 2.05}   & 1.93   & 1.36 & 0.99    & 0.98   & 0.91 & 1.86   & 1.82   & \\tcn{\\bf 1.89} & 2.24                 \\\\\n& 
(ii)                 & Lin                  & 0.86    & 0.85   & 0.82 & \\tcn{\\bf 1.56}   & 1.44   & 0.98 & 0.99    & 0.97   & 0.97 & 1.55   & 1.51   & \\tcn{\\bf 1.59} & 2.12                 \\\\\n&                      & Quad                 & 0.79    & 0.77   & 0.73 & \\tcn{\\bf 1.56}   & 1.48   & 1.00 & 0.99    & 0.97   & 0.95 & 1.57   & 1.50   & \\tcn{\\bf 1.61} & 2.12                 \\\\\n& (iii)                & Lin                  & 0.94    & 0.90   & 0.93 & 1.77   & 1.61   & \\tcn{\\bf 1.96} & 1.01    & 1.01   & 1.02 & \\tcn{\\bf 2.26}   & 2.24   & 2.18 & 2.42                 \\\\\n&                      & Quad                 & 0.88    & 0.80   & 0.93 & 1.85   & 1.69   & \\tcn{\\bf 1.89} & 0.96    & 0.97   & 0.99 & \\tcn{\\bf 2.29}   & 2.27   & 2.15 & 2.42                 \\\\ \\hline\n\\multirow{6}{*}{(b) } & (i)                  & Lin                  & 0.93    & 0.90   & 0.85 & \\tcn{\\bf 1.82}   & 1.70   & 1.42 & 0.95    & 0.93   & 0.92 & 1.78   & 1.73   & \\tcn{\\bf 1.84} & 2.13                 \\\\\n&                      & Quad                 & 0.77    & 0.74   & 0.72 & \\tcn{\\bf 1.86}   & 1.73   & 1.45 & 0.96    & 0.95   & 0.91 & 1.78   & 1.72   & \\tcn{\\bf 1.81} & 2.13                 \\\\\n& (ii)                 & Lin                  & 0.78    & 0.73   & 0.80 & \\tcn{\\bf 1.22}   & 1.10   & 1.08 & 0.82    & 0.75   & 0.78 & \\tcn{\\bf 1.38}   & 1.19   & 1.19 & 1.92                 \\\\\n&                      & Quad                 & 0.66    & 0.65   & 0.74 & \\tcn{\\bf 1.28}   & 1.15   & 1.11 & 0.84    & 0.78   & 0.80 & \\tcn{\\bf 1.44}   & 1.26   & 1.24 & 1.92                 \\\\\n& (iii)                & Lin                  & 0.90    & 0.88   & 0.89 & 1.57   & 1.45   & \\tcn{\\bf 1.79} & 0.93    & 0.93   & 0.95 & 1.82   & 1.84   & \\tcn{\\bf 1.92} & 2.16                 \\\\\n&                      & Quad                 & 0.85    & 0.83   & 0.90 & 1.74   & 1.60   & \\tcn{\\bf 1.89} & 0.92    & 0.91   & 0.96 & 1.89   & 1.93   & 
\\tcn{\\bf 1.97} & 2.16                 \\\\ \\hline\n\\multirow{6}{*}{(c) } & (i)                  & Lin                  & 0.71    & 0.70   & 0.69 & \\tcn{\\bf 1.12}   & 1.06   & 1.02 & 0.77    & 0.77   & 0.83 & 1.22   & 1.19   & \\tcn{\\bf 1.33} & 2.35                 \\\\\n&                      & Quad                 & 0.69    & 0.69   & 0.60 & \\tcn{\\bf 1.11}   & 1.05   & 1.01 & 0.83    & 0.83   & 0.87 & 1.18   & 1.15   & \\tcn{\\bf 1.26} & 2.35                 \\\\\n& (ii)                 & Lin                  & 0.70    & 0.70   & 0.66 & \\tcn{\\bf 0.99}   & 0.93   & 0.87 & 0.74    & 0.74   & 0.78 & 1.00   & 1.02   & \\tcn{\\bf 1.02} & 2.25                 \\\\\n&                      & Quad                 & 0.82    & 0.79   & 0.74 & \\tcn{\\bf 1.08}   & 1.02   & 0.94 & 0.84    & 0.84   & 0.87 & 1.16   & \\tcn{\\bf 1.19}   & 1.09 & 2.25                 \\\\\n& (iii)                & Lin                  & 0.61    & 0.63   & 0.65 & 0.82   & 0.80   & \\tcn{\\bf 0.96} & 0.58    & 0.58   & 0.63 & 0.77   & 0.77   & \\tcn{\\bf 0.88} & 2.55                 \\\\\n&                      & Quad                 & 0.86    & 0.85   & 0.86 & 1.16   & 1.12   & \\tcn{\\bf 1.25} & 0.95    & 0.93   & 0.92 & \\tcn{\\bf 1.28}   & 1.25   & 1.26 & 2.55 \\\\\\hline\n\\multicolumn{16}{c}{ } \\\\\n\\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{$p=200,q=5$}} & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n\\multicolumn{3}{c||}{} & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(a) }   & (i)         & Lin              & 0.73    
& 0.39   & 0.35 & \\tcn{\\bf 1.29}   & 0.72   & 0.81 & 0.92    & 0.93   & 0.71 & \\tcn{\\bf 1.45}   & 1.40   & 1.22 & 1.78                 \\\\\n&             & Quad             & 0.71    & 0.36   & 0.32 & \\tcn{\\bf 1.28}   & 0.70   & 0.80 & 0.90    & 0.91   & 0.69 & \\tcn{\\bf 1.45}   & 1.40   & 1.21 & 1.78                 \\\\\n& (ii)        & Lin              & 0.88    & 0.44   & 0.35 & \\tcn{\\bf 1.03}   & 0.67   & 0.70 & 0.96    & 0.92   & 0.60 & \\tcn{\\bf 1.45}   & 1.35   & 1.05 & 1.69                 \\\\\n&             & Quad             & 0.87    & 0.44   & 0.35 & \\tcn{\\bf 1.04}   & 0.69   & 0.69 & 0.95    & 0.91   & 0.57 & \\tcn{\\bf 1.46}   & 1.37   & 1.07 & 1.69                 \\\\\n& (iii)       & Lin              & 0.91    & 0.47   & 0.43 & \\tcn{\\bf 1.31}   & 0.81   & 0.96 & 0.94    & 0.94   & 0.72 & \\tcn{\\bf 1.57}   & 1.55   & 1.33 & 1.86                 \\\\\n&             & Quad             & 0.88    & 0.43   & 0.39 & \\tcn{\\bf 1.41}   & 0.83   & 1.00 & 0.96    & 0.95   & 0.71 & \\tcn{\\bf 1.61}   & 1.59   & 1.36 & 1.86                 \\\\ \\hline\n\\multirow{6}{*}{(b) }   & (i)         & Lin              & 0.59    & 0.38   & 0.42 & \\tcn{\\bf 1.05}   & 0.73   & 0.79 & 0.89    & 0.90   & 0.96 & \\tcn{\\bf 1.29}   & 1.24   & 1.17 & 1.50                 \\\\\n&             & Quad             & 0.55    & 0.36   & 0.39 & \\tcn{\\bf 1.06}   & 0.73   & 0.78 & 0.81    & 0.80   & 0.91 & \\tcn{\\bf 1.30}   & 1.26   & 1.19 & 1.50                 \\\\\n& (ii)        & Lin              & 0.38    & 0.21   & 0.20 & \\tcn{\\bf 0.41}   & 0.33   & 0.35 & 0.77    & 0.70   & 0.22 & \\tcn{\\bf 0.81}   & 0.67   & 0.25 & 1.45                 \\\\\n&             & Quad             & 0.38    & 0.21   & 0.20 & \\tcn{\\bf 0.43}   & 0.34   & 0.35 & 0.75    & 0.68   & 0.21 & \\tcn{\\bf 0.81}   & 0.69   & 0.26 & 1.45                 \\\\\n& (iii)       & Lin              & 0.69    & 0.45   & 0.41 & \\tcn{\\bf 0.76}   & 0.64   & 0.67 & 0.95    & 0.93   & 0.88 & 
\\tcn{\\bf 1.08}   & 1.04   & 0.82 & 1.50                 \\\\\n&             & Quad             & 0.67    & 0.40   & 0.38 & \\tcn{\\bf 0.83}   & 0.69   & 0.74 & 0.90    & 0.89   & 0.87 & \\tcn{\\bf 1.14}   & 1.11   & 0.95 & 1.50                 \\\\ \\hline\n\\multirow{6}{*}{(c) }   & (i)         & Lin              & 0.67    & 0.35   & 0.30 & \\tcn{\\bf 0.91}   & 0.66   & 0.72 & 0.81    & 0.77   & 0.56 & \\tcn{\\bf 1.09}   & 1.05   & 0.91 & 1.81                 \\\\\n&             & Quad             & 0.63    & 0.33   & 0.28 & \\tcn{\\bf 0.91}   & 0.67   & 0.71 & 0.81    & 0.77   & 0.55 & \\tcn{\\bf 1.08}   & 1.03   & 0.87 & 1.81                 \\\\\n& (ii)        & Lin              & 0.66    & 0.34   & 0.30 & \\tcn{\\bf 0.77}   & 0.51   & 0.61 & 0.77    & 0.75   & 0.44 & 1.03   & \\tcn{\\bf 1.03}   & 0.75 & 1.74                 \\\\\n&             & Quad             & 0.67    & 0.34   & 0.30 & \\tcn{\\bf 0.79}   & 0.52   & 0.62 & 0.75    & 0.73   & 0.42 & 1.08   & \\tcn{\\bf 1.09}   & 0.82 & 1.74                 \\\\\n& (iii)       & Lin              & 0.55    & 0.24   & 0.22 & \\tcn{\\bf 0.62}   & 0.46   & 0.52 & 0.51    & 0.50   & 0.29 & \\tcn{\\bf 0.59}   & 0.57   & 0.49 & 1.91                 \\\\\n&             & Quad             & 0.54    & 0.23   & 0.21 & \\tcn{\\bf 0.86}   & 0.55   & 0.68 & 0.55    & 0.53   & 0.29 & \\tcn{\\bf 0.97}   & 0.93   & 0.80 & 1.91                 \\\\ \\hline\n\\multicolumn{16}{c}{ } \\\\ \\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{$p=200,q=\\ceil{p^{1/2}}$}} & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n\\multicolumn{3}{c||}{} & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & 
PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(a) }   & (i)         & Lin              & 0.53    & 0.14   & 0.09 & \\tcn{\\bf 0.89}   & 0.44   & 0.43 & 0.85    & 0.80   & 0.45 & \\tcn{\\bf 2.06}   & 1.74   & 1.16 & 2.62                 \\\\\n&             & Quad             & 0.53    & 0.14   & 0.09 & \\tcn{\\bf 0.92}   & 0.42   & 0.42 & 0.80    & 0.73   & 0.37 & \\tcn{\\bf 2.05}   & 1.73   & 1.12 & 2.62                 \\\\\n& (ii)        & Lin              & 0.68    & 0.21   & 0.15 & \\tcn{\\bf 0.99}   & 0.40   & 0.41 & 0.79    & 0.71   & 0.33 & \\tcn{\\bf 1.63}   & 1.40   & 0.79 & 2.45                 \\\\\n&             & Quad             & 0.67    & 0.21   & 0.15 & \\tcn{\\bf 1.01}   & 0.39   & 0.39 & 0.80    & 0.71   & 0.32 & \\tcn{\\bf 1.66}   & 1.43   & 0.75 & 2.45                 \\\\\n& (iii)       & Lin              & 0.77    & 0.21   & 0.14 & \\tcn{\\bf 1.42}   & 0.58   & 0.62 & 0.85    & 0.80   & 0.50 & \\tcn{\\bf 2.21}   & 1.69   & 1.31 & 2.87                 \\\\\n&             & Quad             & 0.76    & 0.20   & 0.14 & \\tcn{\\bf 1.40}   & 0.58   & 0.61 & 0.81    & 0.74   & 0.43 & \\tcn{\\bf 2.14}   & 1.68   & 1.32 & 2.87                 \\\\ \\hline\n\\multirow{6}{*}{(b) }   & (i)         & Lin              & 0.46    & 0.12   & 0.08 & \\tcn{\\bf 0.73}   & 0.43   & 0.42 & 0.76    & 0.77   & 0.48 & \\tcn{\\bf 1.85}   & 1.62   & 1.10 & 2.59                 \\\\\n&             & Quad             & 0.45    & 0.12   & 0.08 & \\tcn{\\bf 0.73}   & 0.41   & 0.39 & 0.70    & 0.70   & 0.40 & \\tcn{\\bf 1.82}   & 1.61   & 1.07 & 2.59                 \\\\\n& (ii)        & Lin              & 0.38    & 0.18   & 0.13 & \\tcn{\\bf 0.56}   & 0.38   & 0.40 & 0.67    & 0.63   & 0.33 & \\tcn{\\bf 1.21}   & 1.16   & 0.72 & 2.29                 \\\\\n&             & Quad             & 0.37    & 0.17   & 0.13 & \\tcn{\\bf 0.56}   & 0.35   & 0.37 & 0.69    & 0.64   & 0.32 & \\tcn{\\bf 1.15}   & 1.14   & 
0.70 & 2.29                 \\\\\n& (iii)       & Lin              & 0.68    & 0.19   & 0.13 & \\tcn{\\bf 0.97}   & 0.62   & 0.61 & 0.82    & 0.74   & 0.50 & \\tcn{\\bf 2.06}   & 1.66   & 1.37 & 2.73                 \\\\\n&             & Quad             & 0.66    & 0.18   & 0.12 & \\tcn{\\bf 0.98}   & 0.63   & 0.61 & 0.80    & 0.72   & 0.46 & \\tcn{\\bf 1.99}   & 1.60   & 1.35 & 2.73                 \\\\ \\hline\n\\multirow{6}{*}{(c) }   & (i)         & Lin              & 0.27    & 0.13   & 0.10 & \\tcn{\\bf 0.55}   & 0.42   & 0.45 & 0.72    & 0.67   & 0.27 & \\tcn{\\bf 1.11}   & 0.97   & 0.73 & 2.72                 \\\\\n&             & Quad             & 0.27    & 0.13   & 0.09 & \\tcn{\\bf 0.53}   & 0.41   & 0.43 & 0.67    & 0.61   & 0.23 & \\tcn{\\bf 1.09}   & 0.95   & 0.69 & 2.72                 \\\\\n& (ii)        & Lin              & 0.37    & 0.22   & 0.17 & \\tcn{\\bf 0.54}   & 0.42   & 0.47 & 0.67    & 0.57   & 0.21 & \\tcn{\\bf 0.94}   & 0.80   & 0.51 & 2.58                 \\\\\n&             & Quad             & 0.37    & 0.22   & 0.17 & \\tcn{\\bf 0.54}   & 0.41   & 0.46 & 0.67    & 0.56   & 0.21 & \\tcn{\\bf 0.94}   & 0.81   & 0.49 & 2.58                 \\\\\n& (iii)       & Lin              & 0.26    & 0.14   & 0.12 & \\tcn{\\bf 0.56}   & 0.42   & 0.45 & 0.62    & 0.49   & 0.23 & \\tcn{\\bf 0.87}   & 0.75   & 0.60 & 3.04                 \\\\\n&             & Quad             & 0.26    & 0.14   & 0.11 & \\tcn{\\bf 0.59}   & 0.46   & 0.47 & 0.59    & 0.46   & 0.21 & \\tcn{\\bf 1.06}   & 0.89   & 0.71 & 3.04 \\\\ \\hline\n\\end{tabular}\n}}\n\\label{table_qte_efficiency}\n\\end{table}\n\n\\begin{table}\n\\def~{\\hphantom{0}}\n    \\caption{Inference based on the SS estimators %using\n    \\underline{\\tcr{using} kernel smoothing on the direction selected by linear regression \\tcr{(KS$_1$) }} \\tcr{as the choice of the working outcome model, for the ATE and the QTE,} when $n=500$. 
%\\tcr{Results are presented for both the ATE and the QTE}.\n    Here\\tcr{,} ESE is the empirical standard error, \\tcr{Bias is the empirical bias,} ASE \\tcr{is} the average of the estimated standard errors\\tcr{,} and CR \\tcr{is} the \\tcr{empirical} coverage rate of the 95\\% confidence intervals. \\tcr{All o}ther notations are the same as in Table \\ref{table_ate_efficiency}. The \\textbf{{\\color{navyblue} blue}} color \\tcr{highlights settings where} %implies both\nthe propensity scores and the outcome models are \\tcr{both} correctly specified, while the \\textbf{boldfaces} \\tcr{indicate ones where} %mean\nthe propensity scores are correctly specified but the outcome models are not.}{\n\\resizebox{\\textwidth}{!}{\n\\begin{tabular}{ccc|cccc|cccc|cccc}\n\\hline\n\\multicolumn{3}{c|}{ATE} & \\multicolumn{4}{c|}{$p=10$} & \\multicolumn{4}{c|}{$p=200,q=5$} & \\multicolumn{4}{c}{$p=200,q=\\ceil{p^{1/2}}$} \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & ESE  & Bias  & ASE  & CR   & ESE    & Bias   & ASE   & CR    & ESE       & Bias       & ASE      & CR       \\\\ \\hline\n& (i)   & {\\color{navyblue} \\textbf{Lin}}  & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.93}} \\\\\n&       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} 
\\textbf{0.09}} & {\\color{navyblue} \\textbf{0.02}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.93}} \\\\\n& (ii)  & Lin                                  & 0.07                                 & 0.00                                 & 0.08                                 & 0.95                                 & 0.07                                 & 0.00                                 & 0.07                                 & 0.97                                 & 0.08                                 & 0.00                                 & 0.08                                 & 0.95                                 \\\\\n&       & Quad                                 & 0.07                                 & 0.00                                 & 0.07                                 & 0.96                                 & 0.07                                 & 0.00                                 & 0.07                                 & 0.96                                 & 0.08                                 & 0.00                                 & 0.08                                 & 0.95                                 \\\\\n& (iii) & Lin                                  & 0.08                                 & 0.00                                 & 0.08                                 & 0.93                                 & 0.07                                 & 0.01                                 & 0.07                                 & 0.94                                 & 0.08                                 & 0.01                                 & 0.08                                 & 0.94                                 \\\\\n\\multirow{-6}{*}{(a) } &       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.01}} & 
{\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.94}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.94}} \\\\ \\hline\n& (i)   & {\\color{navyblue} \\textbf{Lin}}  & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.94}} \\\\\n&       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.94}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.94}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.94}} \\\\\n& (ii)  & Lin                                  & 0.07                                 & 0.02                                 & 0.08                                 & 0.94                                 & 0.08                                 & 0.06                                 & 0.08                                 & 0.87                                 & 0.09                                 & 0.07                                 & 0.09                                 & 0.90                                 \\\\\n&       & Quad                                 & 0.07                                 & 0.02                                 & 0.07                                 & 0.95                                 & 0.08                                 & 0.06       
                          & 0.08                                 & 0.87                                 & 0.09                                 & 0.07                                 & 0.09                                 & 0.89                                 \\\\\n& (iii) & Lin                                  & 0.08                                 & 0.00                                 & 0.07                                 & 0.93                                 & 0.08                                 & 0.01                                 & 0.08                                 & 0.96                                 & 0.08                                 & 0.01                                 & 0.08                                 & 0.95                                 \\\\\n\\multirow{-6}{*}{(b) } &       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.96}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.95}} \\\\ \\hline\n& (i)   & \\textbf{Lin}                         & \\textbf{0.13}                        & \\textbf{0.00}                        & \\textbf{0.13}                        & \\textbf{0.96}                        & \\textbf{0.11}                        & \\textbf{0.01}                        & \\textbf{0.10}                        & \\textbf{0.92}                        & \\textbf{0.17}                        & \\textbf{0.02}                        & \\textbf{0.16}                        & \\textbf{0.93}                        \\\\\n&       & \\textbf{Quad}                        & \\textbf{0.13}                        & \\textbf{0.00}                        & \\textbf{0.13}        
                & \\textbf{0.95}                        & \\textbf{0.11}                        & \\textbf{0.01}                        & \\textbf{0.10}                        & \\textbf{0.92}                        & \\textbf{0.17}                        & \\textbf{0.03}                        & \\textbf{0.16}                        & \\textbf{0.92}                        \\\\\n& (ii)  & Lin                                  & 0.11                                 & 0.01                                 & 0.12                                 & 0.97                                 & 0.09                                 & 0.02                                 & 0.09                                 & 0.95                                 & 0.15                                 & 0.04                                 & 0.15                                 & 0.94                                 \\\\\n&       & Quad                                 & 0.11                                 & -0.04                                & 0.12                                 & 0.96                                 & 0.09                                 & 0.01                                 & 0.09                                 & 0.96                                 & 0.15                                 & 0.04                                 & 0.15                                 & 0.94                                 \\\\\n& (iii) & Lin                                  & 0.12                                 & 0.13                                 & 0.12                                 & 0.83                                 & 0.09                                 & 0.11                                 & 0.09                                 & 0.78                                 & 0.15                                 & 0.15                                 & 0.15                                 & 0.83                                 \\\\\n\\multirow{-6}{*}{(c) } &       & \\textbf{Quad}       
                 & \\textbf{0.12}                        & \\textbf{0.01}                        & \\textbf{0.12}                        & \\textbf{0.95}                        & \\textbf{0.09}                        & \\textbf{-0.01}                       & \\textbf{0.10}                        & \\textbf{0.97}                        & \\textbf{0.16}                        & \\textbf{-0.02}                       & \\textbf{0.17}                        & \\textbf{0.96}\n\\\\ \\hline\n\\multicolumn{15}{c}{ } \\\\ \\hline\n\\multicolumn{3}{c|}{QTE}& \\multicolumn{4}{c|}{$p=10$} & \\multicolumn{4}{c|}{$p=200,q=5$} & \\multicolumn{4}{c}{$p=200,q=\\ceil{p^{1/2}}$} \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & ESE  & Bias  & ASE  & CR   & ESE    & Bias   & ASE   & CR    & ESE       & Bias       & ASE      & CR       \\\\ \\hline\n& (i)   & {\\color{navyblue} \\textbf{Lin}}  & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.04}} & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.92}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.17}} & {\\color{navyblue} \\textbf{-0.01}} & {\\color{navyblue} \\textbf{0.17}} & {\\color{navyblue} \\textbf{0.94}} \\\\\n&       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.04}} & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.17}} & {\\color{navyblue} \\textbf{-0.01}} & {\\color{navyblue} \\textbf{0.17}} & {\\color{navyblue} \\textbf{0.94}} \\\\\n& (ii)  & Lin                                  & 0.15                                 & 0.04                                 & 0.14   
                              & 0.91                                 & 0.13                                 & 0.01                                 & 0.12                                 & 0.94                                 & 0.18                                 & -0.01                                 & 0.16                                 & 0.92                                 \\\\\n&       & Quad                                 & 0.15                                 & 0.04                                 & 0.14                                 & 0.91                                 & 0.13                                 & 0.01                                 & 0.12                                 & 0.94                                 & 0.18                                 & -0.01                                 & 0.16                                 & 0.93                                 \\\\\n& (iii) & Lin                                  & 0.13                                 & 0.02                                 & 0.13                                 & 0.94                                 & 0.11                                 & 0.01                                 & 0.12                                 & 0.96                                 & 0.15                                 & 0.01                                  & 0.15                                 & 0.95                                 \\\\\n\\multirow{-6}{*}{(a) } &       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.02}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.94}} & {\\color{navyblue} \\textbf{0.11}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.12}} & {\\color{navyblue} \\textbf{0.96}} & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.01}}  & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.95}} \\\\ \\hline\n& (i)   & {\\color{navyblue} 
\\textbf{Lin}}  & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.02}} & {\\color{navyblue} \\textbf{0.14}} & {\\color{navyblue} \\textbf{0.92}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.18}} & {\\color{navyblue} \\textbf{0.00}}  & {\\color{navyblue} \\textbf{0.17}} & {\\color{navyblue} \\textbf{0.93}} \\\\\n&       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.02}} & {\\color{navyblue} \\textbf{0.14}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.18}} & {\\color{navyblue} \\textbf{0.00}}  & {\\color{navyblue} \\textbf{0.17}} & {\\color{navyblue} \\textbf{0.94}} \\\\\n& (ii)  & Lin                                  & 0.14                                 & 0.05                                 & 0.14                                 & 0.94                                 & 0.12                                 & 0.07                                 & 0.12                                 & 0.94                                 & 0.19                                 & 0.05                                  & 0.17                                 & 0.92                                 \\\\\n&       & Quad                                 & 0.14                                 & 0.05                                 & 0.14                                 & 0.95                                 & 0.12                                 & 0.07                                 & 0.12                                 & 0.93                                 & 0.19                                 & 0.04                                  & 0.17                                 & 0.92                                
 \\\\\n& (iii) & Lin                                  & 0.13                                 & 0.02                                 & 0.13                                 & 0.95                                 & 0.12                                 & 0.02                                 & 0.12                                 & 0.94                                 & 0.15                                 & 0.00                                  & 0.15                                 & 0.95                                 \\\\\n\\multirow{-6}{*}{(b) } &       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.02}} & {\\color{navyblue} \\textbf{0.13}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.12}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.12}} & {\\color{navyblue} \\textbf{0.95}} & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.00}}  & {\\color{navyblue} \\textbf{0.15}} & {\\color{navyblue} \\textbf{0.95}} \\\\ \\hline\n& (i)   & \\textbf{Lin}                         & \\textbf{0.19}                        & \\textbf{0.01}                        & \\textbf{0.21}                        & \\textbf{0.96}                        & \\textbf{0.16}                        & \\textbf{0.02}                        & \\textbf{0.16}                        & \\textbf{0.97}                        & \\textbf{0.26}                        & \\textbf{0.00}                         & \\textbf{0.27}                        & \\textbf{0.95}                        \\\\\n&       & \\textbf{Quad}                        & \\textbf{0.20}                        & \\textbf{0.01}                        & \\textbf{0.21}                        & \\textbf{0.95}                        & \\textbf{0.16}                        & \\textbf{0.03}                        & \\textbf{0.16}                        & \\textbf{0.97}                        & \\textbf{0.26}                        
& \\textbf{0.00}                         & \\textbf{0.27}                        & \\textbf{0.95}                        \\\\\n& (ii)  & Lin                                  & 0.20                                 & 0.07                                 & 0.19                                 & 0.92                                 & 0.14                                 & 0.04                                 & 0.15                                 & 0.94                                 & 0.24                                 & 0.05                                  & 0.24                                 & 0.95                                 \\\\\n&       & Quad                                 & 0.19                                 & 0.01                                 & 0.19                                 & 0.95                                 & 0.14                                 & 0.02                                 & 0.15                                 & 0.95                                 & 0.24                                 & 0.04                                  & 0.24                                 & 0.96                                 \\\\\n& (iii) & Lin                                  & 0.18                                 & 0.15                                 & 0.18                                 & 0.88                                 & 0.15                                 & 0.13                                 & 0.15                                 & 0.86                                 & 0.22                                 & 0.15                                  & 0.23                                 & 0.91                                 \\\\\n\\multirow{-6}{*}{(c) } &       & \\textbf{Quad}                        & \\textbf{0.18}                        & \\textbf{0.01}                        & \\textbf{0.18}                        & \\textbf{0.95}                        & \\textbf{0.14}                        & \\textbf{0.05}                   
     & \\textbf{0.14}                        & \\textbf{0.93}                        & \\textbf{0.22}                        & \\textbf{0.11}                         & \\textbf{0.23}                        & \\textbf{0.93}\n\\\\ \\hline\n\\end{tabular}\n}}\n\\label{table_inferece}\n\\end{table}\n\n\\subsection{\\tcr{Results on estimation efficiency} %\\tcg{Estimation results}\n}\\label{sec_sim_estimation}\nIn Tables \\ref{table_ate_efficiency}--\\ref{table_qte_efficiency}, we report the efficiencies, measured by mean squared errors, of various supervised and SS estimators relative to the corresponding ``oracle\'\' supervised estimators $\\muhatora$ and $\\thetahatora$, constructed via substituting $\\{\\pi(\\cdot),m(\\cdot),\\phi(\\cdot,\\cdot)\\}$ for $\\{\\pihatn(\\cdot),\\mhatn(\\cdot),\\phihatn(\\cdot,\\cdot)\\}$ in \\eqref{sup_ate} and \\eqref{sup_qte}. The supervised ``oracle\'\' estimators of the QTE use the initial estimators and estimated densities from the IPW approach described in Remark \\ref{remark_qte_initial_estimator} with $\\pihatN(\\cdot) $ replaced by $\\pi(\\cdot) $. \\tcg{\\tcr{We clarify here that s}uch ``oracle\'\' estimators \\tcr{(for both the ATE and the QTE) } are obviously \\emph{unrealistic}\\tcr{,}\nand \\tcr{are used here} just \\tcr{to} serve as suitable benchmarks that are always consistent. 
Specifically, the relative efficiencies in Table \\ref{table_ate_efficiency} are calculated by\\tcr{:}}\n\\bse\n\\E\\{(\\muhatora-\\mu_0)^2\\}/\\E\\{(\\muhatsup-\\mu_0)^2\\} \\hbox{ and } \\E\\{(\\muhatora-\\mu_0)^2\\}/\\E\\{(\\muhatss-\\mu_0)^2\\},\n\\ese\n\\tcg{while those in Table \\ref{table_qte_efficiency} are given by\\tcr{:}}\n\\bse\n\\E\\{(\\thetahatora-\\theta_0)^2\\}/\\E\\{(\\thetahatsup-\\theta_0)^2\\} \\hbox{ and } \\E\\{(\\thetahatora-\\theta_0)^2\\}/\\E\\{(\\thetahatss-\\theta_0)^2\\}.\n\\ese\nFor reference, we provide the ``oracle\'\' relative efficiencies \\tcr{(denoted \\tcr{as} ``ORE\'\' in the tables) } given by\\tcr{:} $\\lams^2/\\lamss^2$ and $\\sigsup^2/\\sigss^2$ with $\\{m^*(\\cdot),\\phis(\\cdot,\\cdot)\\}=\\{m(\\cdot),\\phi(\\cdot,\\cdot)\\}$ as well, where $\\lams^2$, $\\lamss^2$, $\\sigsup^2$ and $\\sigss^2$  are the asymptotic variances in \\eqref{ate_normality}, \\eqref{ate_sup_normality}, \\eqref{qte_normality} and \\eqref{qte_sup_normality}, respectively. The unknown quantities therein as well as the true values of $\\mu_0$ and $\\vt$ are approximated by Monte Carlo based on $100,000$ realizations of $(Y,T,\\X\\trans)\\trans$ independent of $\\cl\\cup\\cu$. It is noteworthy \\tcr{here} that these ``oracle\'\' relative efficiencies can be achieved only asymptotically, \\tcr{and that too \\emph{only}} when $\\{\\pi(\\cdot),m(\\cdot),\\phi(\\cdot,\\cdot)\\}$ are all correctly specified and estimated at fast enough rates.\n\n\\vskip0.08in\n\\tcg{Generally speaking, the results in Tables \\ref{table_ate_efficiency}--\\ref{table_qte_efficiency} clearly show that our SS estimators uniformly outperform their supervised competitors\\tcr{,} and even yield better efficiency than the supervised ``oracle\'\' estimators in most of the cases, indicated by numbers greater than one in the tables. 
Specifically, inspecting the two tables reveals that, among all the settings, our SS estimators make the most significant efficiency improvement when all the nuisance models are correctly specified. For instance, when $\\{m(\\X),\\pi(\\X)\\}=\\{(a),(i)\\}$, the combination of Lin and PR correctly estimates the nuisance functions and gives fairly impressive results for the ATE case.}\n\n\\tcg{\\tcr{Moreover,} when both correctly approximate $\\pi(\\X) $, Lin and Quad yield similar results. However, under the setups with \\tcr{$\\{m(\\X), \\pi(\\X)\\}=\\{(c),(iii)\\}$}, for example, where Quad produces estimators converging to the true $\\pi(\\X) $ but Lin does not, and all the working outcome models misspecify the underlying relation between $Y/I(Y<\\vt) $ vs. $\\X$, Quad shows notable advantages over Lin. This substantiates the importance of the propensity score estimators $\\pihatN(\\X) $ in our methods, which has been\nstated in Corollaries \\ref{corate} and \\ref{corqte}. As regards the choices of $\\mhatnk(\\X) $ and $\\phihatnk(\\X, \\theta) $, KS$_1$ gives the best efficiency for most of the cases, justifying the approach combining kernel smoothing and dimension reduction to estimating the outcome models, as demonstrated in Sections \\ref{sec_nf_ate}--\\ref{sec_nf_qte}. Further, we observe that, as the labeled data size increases, the relative efficiencies of our SS estimators rise substantially, except for a few cases\\tcr{,} such as the ATE %estimation\n\\tcr{estimator} with the PR outcome model estimators when $p=10$.} %NOTE: leaving a space here makes a difference to formatting when breaking paragraphs!! 
%\n\n\\tcg{\\tcr{The} improvement verifies the asymptotic properties claimed in Sections \\ref{sec_ate_ss} and \\ref{sec_qte_general}, while \\tcr{any of} the\nexceptions could be explained by the fact that the performance of the benchmarks for calculating the relative efficiencies, i.e., the ``oracle\'\' supervised estimators, is improved by more labeled data as well. Considering \\tcr{that} the ``oracle\'\' supervised estimators are always constructed with the true nuisance functions without \\emph{any} estimation errors, the positive effect of increasing $n$ on them is very likely to be more significant than that on our SS estimators. }\n\n\\tcg{\\tcr{In} addition, another interesting finding is that, in the scenario $(n,p,q)=(200,200,\\ceil{p^{1/2}}) $ where $q=O(n^{1/2}) $, our SS estimators still beat their supervised counterparts under all the settings, and possess efficiencies close to or even \\emph{better} than those of the supervised ``oracle\'\' estimators, which use the knowledge of the true data generating mechanisms, when all the nuisance models are correctly specified. 
This \\tcr{(pleasantly) } surprising fact implies \\tcr{that} the performance of our methods is \\tcr{somewhat}\n\\emph{insensitive} to the sparsity condition $q=o(n^{1/2}) $, which is often required in the high dimensional \\tcr{inference} literature \\citep{buhlmann2011statistics, negahban2012unified, wainwright2019high} to ensure the $L_1$\\tcr{--}consistency assumed in Assumption \\ref{al1} for the nuisance estimators; see the relevant discussion in Remark \\ref{remark_choice_of_P0} also.}\n\n%\\tcr{**Need to change the language a bit in the remark below -- AC.**}\n\\begin{remark}[Interpretations of the relative efficiencies in Tables \\ref{table_ate_efficiency}--\\ref{table_qte_efficiency}]\\label{remark_interpretation_RE}\n\\tcg{One may notice that the relative efficiencies of our SS estimators are sometimes quite different from the corresponding oracle quantities (ORE) in the tables. We attribute the differences to two reasons: (a) possible misspecification of the nuisance models, which obviously makes the oracle efficiencies unachievable, and (b) finite sample errors, from which \\emph{any} practical method has to suffer, especially in high dimensional scenarios. In contrast, the oracle relative efficiencies are calculated presuming all the nuisance models are known and the sample sizes are infinite.}\n\n\\tcg{\\tcr{Lastly,} it is also \\tcr{worth} %noteworthy to\npoint\\tcr{ing} out that the quantities in Tables \\ref{table_ate_efficiency}--\\ref{table_qte_efficiency} somewhat ``understate\'\' the efficiency gain of our methods in the sense that the benchmarks, i.e., the ``oracle\'\' supervised estimators, are \\emph{unrealistic} due to requiring the knowledge of the underlying data generating mechanisms. When compared with the \\emph{feasible} supervised estimators, the advantage of our methods is even \\emph{more significant}. 
For example, when $(n,p,q)=(200,200,\\ceil{p^{1/2}}) $, $\\{m(\\X),\\pi(\\X)\\}=\\{(c),(i)\\}$ and the nuisance functions are estimated by the combination of Lin and KS$_1$, the efficiencies of our SS estimators relative to the supervised competitors are $0.56/0.16=3.50$ and $0.55/0.27=2.04$ for the cases of the ATE and the QTE, respectively. Relative to the original numbers $0.56$ and $0.55$ in the tables, the ratios $3.50$ and $2.04$ \\tcr{indeed} %apparently\nprovide \\tcr{a} more \\tcr{direct and overwhelming} %intuitive and straightforward\nevidence \\tcr{of} %for\nthe efficiency superiority of our methods, while we choose the ``oracle\'\' supervised estimators as suitable \\tcr{(common) } benchmarks \\tcr{(for comparing all estimators -- supervised and semi-supervised) } just because they are always consistent, \\tcr{and more importantly, are the \\emph{best} achievable supervised estimators (and yet \\tcr{are} idealized/infeasible, with both nuisance functions $\\pi(\\cdot) $ and $m(\\cdot)/\\phi(\\cdot,\\cdot) $ presumed known) }.}\n\\end{remark}\n\n%\\par\\smallskip\n%\\tcr{(**TO GUORONG: THE DESCRIPTION OF TABLE 2-3 RESULTS ABOVE NEEDS TO IMPROVE. INCLUDE MORE \\emph{DETAILS OR EXAMPLES} TO SUBSTANTIATE YOUR CLAIMS! ALSO, THE TABLES ARE HARD TO INTERPRET (SPECIFICALLY THE EFFICIENCY GAINS AND HOW THEY ARE CALCULATED). MY MAIN CONCERN IS WE ARE ``UNDER-SELLING\'\' THE EFFICIENCY IMPROVEMENTS FOR OUR SS ESTIMATOR -- WHEN COMPARED TO ONE OF THE "ACTUAL" SUP ESTIMATORS (NOT THE ORACLE ONE) -- THESE NUMBERS ARE ACTUALLY MUCH HIGHER THAN WHAT MIGHT BE APPARENT TO THE READER (AT A FIRST GLANCE OF THE TABLE)! COULD YOU CLARIFY THIS IN MORE DETAIL IN THE WRITEUP HERE? AND I THINK IT WOULD ALSO HELP TO INCLUDE AN EXAMPLE (OR A FEW) OF HOW TO CALCULATE THE "ACTUAL" EFFICIENCY IMPROVEMENT OF ONE OF OUR SS ESTIMATORS (COMPARED TO THE CORRESPONDING SUP ESTIMATORS IN THE TABLE.. I AM NOT TALKING ABOUT THE ORACLE ONE HERE). 
AND IT MIGHT HELP TO MENTION THIS (BRIEFLY) IN THE CAPTIONS AS WELL. }\n\n%\\tcr{**To Guorong (FURTHER COMMENT 1): ALSO NEED TO DISCUSS IN MORE DETAIL ABOUT OUR OWN ESTIMATOR\' PERFORMANCES -- COMPARISON AMONG THE VARIOUS CHOICES (OF NUISANCE ESTIMATORS) -- AND THE RESULTS.}\n\n%\\par\\smallskip\n%**\\tcr{\\underline{FURTHER/UPDATED COMMENTS (UPDATED AS OF 8/29/2021) -- AC}}\n\n%\\par\\smallskip\n%\\tcr{**To GUORONG (FURTHER COMMENT 2): NEED TO ALSO DISCUSS THE \\emph{HIGH DIMENSIONAL CASE RESULTS} -- AND THEIR IMP. FEATURES/BENEFITS -- SEPARATELY IN MORE DETAIL. IN PARTICULAR, I THINK IT IS IMP. TO HIGHLIGHT THE \\emph{ROBUSTNESS TO THE SPARSITY} $q$\'s  CHOICES WHEN YOU WRITE THE COMMENTS ON THE RESULTS -- AND THE FACT THAT THE RESULTS ARE GOOD (EXPLAIN WHAT WE MEAN BY ``GOOD\'\' HERE) -- IN ESTIMATION AND INFERENCE -- UNDER BOTH SPARSE AND MODERATELY DENSE SETTINGS (\\emph{EVEN WHEN} $p$ is \\emph{NOT} $o(\\sqrt{n}$), A CONDITION OFTEN REQUIRED IN HD INFERENCE LITERATURE).}\n\n%\\tcr{MOREOVER, YOU SHOULD ALSO TRY TO EXPLAIN A COUPLE OF THINGS:}\n\n%\\tcr{(A) WHY THE \\emph{ACHIEVED} RELATIVE EFFS. APPEAR \\emph{SUBSTANTIALLY LESS} THAN THE OREs HERE (I THINK THERE ARE TWO REASONS FOR THAT: (a) POSSIBLY MISSPECIFIED MODELS (AND THIS IS TRUE EVEN FOR THE LOW-DIMENSIONAL CASES) WHEN OBVIOUSLY THE BEST EFFICIENCY WON\'T BE ACHIEVED, and (b) SUBSTANTIAL FINITE SAMPLE ERRORS CREEPING IN THE HD SETTINGS FOR THE NUISANCE FUNCTIONS (EVEN IF BOTH ARE CORRECTLY SPECIFIED), AND SO THE DIFFERENCE WITH THEORETICAL ORE (with $n = \\infty$ basically) BECOMES EVEN MORE APPARENT). FOR BOTH POINTS, USE EXAMPLES TO CLARIFY.}\n\n%\\tcr{(B) I THINK YOU SHOULD CLARIFY, WHY THE REL. EFFS. IN SOME CASES TEND TO \\emph{DECREASE} WITH SAMPLE SIZE! (THIS PROBABLY COULD BE BECAUSE OF THE WAY WE ARE DEFINING THE REL. EFF.). 
UNLESS YOU CLARIFY THIS (AND SOMEHOW ARGUE THIS DOES \\emph{NOT} LEAD TO ANY CONTRADICTION), REVIEWERS ARE BOUND TO RAISE QUESTIONS ABOUT THIS!}\n\n%\\par\\smallskip\n%\\tcr{**To GUORONG (FURTHER COMMENT 3): \\emph{NOTE:} I tried addressing most of these already. Hence, I am commenting it out of the tex file (updated as of 8/29/2021) -- AC.}\n\n%\\par\\smallskip\n%\\tcr{(i) I THINK IT WOULD BE HELPFUL TO CREATE MORE VISIBLE ENVIRONMENTS (e.g. paragraph* or subsection) FOR DISCUSSING THE SETTINGS (DGPs, ESTIMATOR CHOICES ETC.), AND THE RESULTS (ESTIMATION AND INFERENCE).}\n\n%\\tcr{(ii) TOWARDS THE END OF THE SETTINGS, YOU SHOULD ALSO MENTION WHAT YOU ARE GOING TO DO -- FOR EACH DGP, and for ALL CHOICES of NUISANCE ESTIMATORS, WE IMPLEMENT OUR SS ESTIMATORS, and  CORRESPONDING SUP ESTIMATORS, and COMPARE THESE IN TERMS OF ESTIMATION EFFICIENCY, and ALSO INVESTIGATE/VALIDATE THE INFERENCE PROCEDURES (FOR OUR OWN SS ESTIMATORS) AS POSTULATED BY OUR ASYMPTOTIC THEORY -- IT IS HELPFUL TO MENTION THESE FIRST BEFORE GIVING THE RESULTS.} %WHAT YOU ARE DOING - estimators compared and measures used! ONLY THEN YOU SHOULD START TALKING ABOUT RESULTS.}\n\n\n%\\tcr{(iii) ALSO, FOR JUSTIFYING THE ORE MEASURE, YOU SHOULD MENTION THAT THE BENCHMARK (``ORACLE" SUP) ESTIMATOR BEING USED IS NOT \\emph{JUST} CONSISTENT, IT IS \\emph{ALSO THE BEST ACHIEVABLE (IDEALIZED) SUP ESTIMATOR.}} %(WITH NUISANCES KNOWN).} (NOTE: For addressing (i)--(iii), I tried editing a bit myself.) }\n\n%\\tcr{(iv) I THINK A LITTLE MORE DETAIL IN THE INFERENCE RESULTS COMMENTS WOULD BE HELPFUL HERE. AND YOU SHOULD MENTION YOU ARE CONSIDERING ONLY 1 ESTIMATOR AS A \\emph{REPRESENTATIVE CASE} for sake of brevity.}\n\n%\\tcr{(v) FINALLY, FOR THE DATA ANALYSIS, I THINK YOU SHOULD MENTION ABOUT ZHANG \\& BRADIC (2019) USING THE SAME DATASET FOR ILLUSTRATING THEIR APPROACH (AND YOU SHOULD ALSO ACKNOWLEDGE THEM.) 
} %-- perhaps in a footnote -- the first author for sharing some of the details on data preprocessing etc.) }\n\n%\\par\\smallskip\n\\subsection{\\tcr{Results on inference} %\\tcg{Inference results}\n}\\label{sec_sim_inference}\n Next, Table \\ref{table_inferece} presents the results of inference based on our SS estimators using KS$_1$ \\tcr{(as a representative case) } to calculate $\\mhatn(\\cdot) $ and $\\phihatn(\\cdot,\\cdot) $ when $n=500$. We report the bias, the empirical standard error (ESE), the average of the estimated standard errors (ASE), and the coverage rate (CR) of the 95\\% confidence intervals. As expected, the biases are negligible as long as either the propensity score or the outcome model  is correctly specified, which \\tcr{\\emph{verifies}} the DR property of our methods. Moreover, we can see \\tcr{that} whenever $\\pis(\\cdot)=\\pi(\\cdot) $, the ASEs are fairly close to the corresponding ESEs and the CRs are all around the nominal level \\tcr{of} 0.95\\tcr{,} \\tcr{\\emph{even if}} $m^*(\\cdot)\\neq m(\\cdot) $ and $\\phis(\\cdot,\\cdot)\\neq\\phi(\\cdot,\\cdot) $. See, for example, the results of the configurations marked in bold, where $\\pis(\\cdot)=\\pi(\\cdot) $ but the outcome model estimators based on KS$_1$ do {\\it not} converge to $m(\\cdot) $ (for the ATE) or $\\phi(\\cdot,\\cdot) $ (for the QTE). Such an observation confirms that, owing to the use of the massive unlabeled data, the \\tcr{\\emph{$n^{1/2}$-consistency and asymptotic normality}} of our \\tcr{SS} ATE and QTE estimators \\tcr{\\emph{only}} require correct specifications of $\\pi(\\cdot) $ as claimed in Corollaries \\ref{corate} and \\ref{corqte}. Also, it justifies the limiting distributions and variance estimations proposed in the two corollaries. 
\\tcr{Lastly, as mentioned before, we only present results of inference for one case as an illustration.}  When we set $n=200$ or take other choices of $\\{\\mhatn(\\cdot),\\phihatn(\\cdot,\\cdot)\\}$, our estimators still give satisfactory inference results similar \\tcr{in flavor} to those in Table \\ref{table_inferece}. We %hence\n\\tcr{therefore} skip them \\tcr{here} for the sake of brevity.\n\n\\section{Real \\tcr{d}ata \\tcr{a}nalysis}\\label{sec_data_analysis}\nIn this section, we apply our proposed methods to a data set from \\citet{baxter2006genotypic} \\tcr{that is} available at the Stanford University HIV Drug Resistance\nDatabase \\citep{rhee2003human} (https://hivdb.stanford.edu/pages/genopheno.dataset.html). \\tcr{This data was also considered in \\citet{zhang2019high} for illustration of their SS mean estimator\\footnote{We are grateful to Yuqian Zhang for sharing details on data pre-processing used in \\citet{zhang2019high}.}.} In the data set, there is an observed outcome\\tcr{,} $\\mathbb{Y}$\\tcr{,} representing the drug resistance to %the\n\\tcr{lamivudine} (3TC), a nucleoside reverse transcriptase inhibitor, along with the indicators of mutations on $240$ positions of the HIV reverse transcriptase. \\tcr{Our goal was to investigate the causal effect(s) (ATE$/$QTE) of these mutations on drug resistance.} We set the treatment indicator $T$ to be the existence of mutations on the $m$th position while regarding the other $p=239$ indicators  as the covariates $\\X$. In the interest of space, we only take $m\\in\\{39,69,75,98,123,162,184,203\\}$, a randomly selected subset of $\\{1,\\ldots,240\\}$, for illustration. Analysis with other choices of $m$ can be conducted analogously. As regards \\tcr{the} sample sizes, the labeled and unlabeled data contain $n=423$ and $N=2458$ observations, respectively. 
To test if the labeled and unlabeled data are equally distributed \\tcg{and satisfy Assumption \\ref{ass_equally_distributed}}, we calculate the Pearson \\tcr{test} statistic and obtain the corresponding \\tcr{$p$}-value \\tcr{as} $0.18$ using a permutation distribution \\citep{agresti2005multivariate}, %\\tcr{thus}\nimplying that the labeling is indeed independent of $(T,\\X\\trans)\\trans$. In the following,  we will estimate  the ATE \\eqref{ate} and the QTE \\eqref{qte} (with $\\tau=0.5$) \\tcr{with this data,}\nbased on the limiting distributions \\eqref{ate_difference_distribution} and \\eqref{qte_difference_distribution}\\tcr{,} rather than focusing on $\\mu_0(1) $ and $\\vt(1) $ only.\n\n\\tcr{For implementing our estimators, i}n addition to the \\tcr{nuisance estimation}\napproaches leveraged in %\\tcr{our simulations in}\nSection \\ref{sec_simulations}, we also estimate the propensity score and outcome models using random forest \\tcr{here}, treating $T$, $Y$ or $I(Y<\\thetahatinit) $ as the response,  growing $500$ trees and randomly sampling $\\ceil{p^{1/2}}$ covariates as candidates at each split. In Figures \\ref{figure_ate} and \\ref{figure_qte}, we display the 95\\% confidence intervals of the ATE and the QTE, respectively, averaging over 10 replications to remove %\\tcr{any}\npotential randomness from cross fitting. %Besides,\n\\tcr{(The} confidence intervals are also presented numerically in %Section\n\\tcr{Appendix} \\ref{sm_data_analysis}\nof the Supplementary Material.\\tcr{) } \\tcr{From the plots, w}e observe that our SS approaches generally yield \\tcr{\\it shorter} confidence intervals than their supervised counterparts, confirming again the efficiency gain from the usage of unlabeled data. Moreover, we notice that, when $m=203$, all the SS confidence intervals of the QTE are strictly above zero, indicating significantly positive median treatment effect. 
This finding is, however, very\nlikely to be ignored in the supervised setting since zero is included by the confidence intervals constructed based on the\nlabeled data only. Such a contrast\n\\tcr{thereby reinforces the fact that}\n%highlights that, compared to the supervised competitors,\nour SS methods \\tcr{in comparison} are notably more powerful in detecting significant treatment effects.\n\n\\begin{figure}\n\\centering\n\\caption{\\tcr{Data analysis:} $95\\%$ confidence intervals for the ATE of \\tcr{the mutations on} the drug resistance to %the\n\\tcr{3TC} based on the supervised estimator \\eqref{sup_ate} (\\underline{\\tcr{undashed bars}}) and the SS estimator \\eqref{ss_ate} (\\underline{\\tcr{dashed bars}}). Here\\tcr{,} $m$ is the position of mutation regarded as the treatment indicator. We consider three different combinations to estimate the ``propensity score \\& outcome model\'\' \\tcr{(denoted by the three bar colors) }: $(\\mathrm{i}) $ regularized logistic regression \\& kernel smoothing on the first two directions selected by the regularized sliced inverse regression ({\\color{red} %darkpink\n\\textbf{red}} fill); $(\\mathrm{ii}) $ regularized logistic regression \\& regularized parametric regression ({\\color{darkpastelgreen} \\textbf{green}} fill); $(\\mathrm{iii}) $ random forest \\& random forest ({\\color{bleudefrance} \\textbf{blue}} fill).}\n\\includegraphics[scale=0.6]{ate}  %%NOTE: For arxiv submission (with Pdflatex), you MUST have the graphics file as a pdf! (eps file, or its converted version won\'t work!!)\n\\label{figure_ate}\n\\end{figure}\n\n\\begin{figure}\n\\centering\n%\\caption{$95\\%$ confidence intervals of the QTE $(\\tau=0.5) $ of the drug resistence to the 3TC based on the supervised estimator \\eqref{sup_qte} (undashed bars) and the SS estimator \\eqref{ss_qte} (dashed bars). Here $m$ is the position of mutation regarded as the treatment indicator. 
We consider three different combinations to estimate the ``outcome model \\& propensity score\'\': $(\\mathrm{i}) $ kernel smoothing with dimension reduction using the first two directions selected by the regularized sliced inverse regression \\& regularized logistic regression (red fill); $(\\mathrm{ii}) $ regularized logistic regression \\& regularized logistic regression (green fill); $(\\mathrm{iii}) $ random forest \\& random forest (blue fill).}\n\\caption{\\tcr{Data analysis:} We consider the same scenario as \\tcr{in} Figure \\ref{figure_ate}, but now the estimand is the QTE $(\\tau=0.5) $.}\n\\includegraphics[scale=0.6]{qte} %%NOTE: For arxiv submission (with Pdflatex), you MUST have the graphics file as a pdf! (eps file, or its converted version won\'t work!!)\n\\label{figure_qte}\n\\end{figure}\n\n\n\n\\section{\\tcr{Concluding discussion}} %Conclusion and discussion}\n\\label{sec_conclusion_discussion}\nWe have developed \\tcr{here} a family of SS estimators for (a) the ATE and (b) the QTE\\tcr{, in possibly high dimensional settings,}  %when the covariates are possibly high dimensional\n\\tcr{and more importantly, we have developed a unified understanding of SS causal inference and its benefits -- {\\it both} in robustness and efficiency -- something we feel has been missing in the literature}. In addition to the DR property in consistency that can be attained by purely supervised methods as well, we have proved our estimators also possess $n^{1/2}$-consistency and asymptotic normality whenever the propensity score $\\pi(\\cdot) $ is correctly specified. This property is useful for inference while generally unachievable in supervised settings. Even if \\tcr{this} %the\ndifference in robustness is ignored, our estimators are still guaranteed to be more efficient than their supervised counterparts. 
Further, as long as all the nuisance functions are correctly specified, our approaches have been shown to attain semi-parametric optimality \\tcr{as well.} %for estimating the ATE and the QTE.\n\\tcr{All our theoretical claims above have also been validated numerically via extensive simulation studies as well as an empirical data analysis.} %Numerically, all the above claims have been validated by results of simulations and an empirical data example.\n\n\\tcr{Further, a}s a principled and flexible choice for estimating the outcome models in our methods, \\tcr{we have studied thoroughly} IPW type kernel smoothing estimators \\tcr{in high dimensional settings} with \\tcr{possible use of} dimension reduction \\tcr{techniques}.\n%have been studied thoroughly\n%in high dimensional scenarios.\nWe have shown they uniformly converge in probability to $\\E(Y\\mid\\mbP_0\\trans\\X) $ (for the case of the ATE) or $\\E\\{\\psi(Y,\\theta)\\mid\\mbP_0\\trans\\X\\}$ (for the case of the QTE) with some transformation matrix $\\mbP_0$, given either the propensity score or the outcome model is correctly specified but {\\it not} necessarily both. The precise convergence rates have been derived as well. This DR property guarantees the efficiency advantage of our SS methods over their supervised competitors. We view these results \\tcr{also} as one of our major contributions. 
\\tcr{To the best of our knowledge, results of this flavor (especially, in high dimensions, with $p$ diverging) have not been established in the relevant existing literature.} %since they have never been established in the literature yet and\n\\tcr{They} can be applicable to many other problems \\tcr{as well and should therefore be of independent interest}.\n\n%\\tcr{**Need to change the language a bit here -- AC.**}\n\n\\paragraph*{Extensions}\n\\tcg{%Although only focusing on the ATE and the QTE,\n\\tcr{As mentioned in Section \\ref{sec:psetup}, while we focus on the ATE and QTE\n%as two representative cases,\nfor simplicity and clarity of %exposition,}\nthe main messages,}\nour \\tcr{SS} methods \\tcr{\\it can} be easily extended to \\tcr{other causal estimands, including} the\n\\tcr{\\it general $Z$-estimation problem}, targeting a parameter defined as the solution to an estimating equation. %\\tcr{, for SS causal inference under potential outcome framework.} %, when extra unlabeled data are available while the response is not always observed in the labeled data.\nAs long as the estimand has a closed form like $\\mu_0\\equiv\\E(Y) $, one can construct a family of SS estimators in the same manner as our ATE estimators \\eqref{ss_ate}. An example is the \\emph{linear regression parameter} $\\bbeta_0^{\\mbox{\\tiny LIN}}:=\\{\\E(\\Xarrow\\Xarrow\\trans)\\}^{-1}\\E(\\Xarrow Y) $ that solves the equation $\\E\\{\\Xarrow(Y-\\Xarrow\\trans\\bbeta_0^{\\mbox{\\tiny LIN}})\\}=\\bzero$, where $\\Xarrow:=(1,\\X\\trans)\\trans$. On the other hand, for estimating equations that cannot be solved straightforwardly, the one-step update strategy, used for our QTE estimators \\eqref{ss_qte}, allows for simple and flexible implementations of SS estimation and inference with various choices of nuisance estimators. 
For instance, our approach to constructing the SS QTE estimators can be adapted for the \\emph{quantile regression parameter} $\\bbeta_0^{\\mbox{\\tiny QUAN}}$, defined by the equation $\\E[\\Xarrow\\{I(Y<\\Xarrow\\trans\\bbeta_0^{\\mbox{\\tiny QUAN}})-\\tau\\}]=\\bzero$, with extra technical effort. These SS estimators for the general estimating equation problems are expected to possess desirable properties, such as improved robustness and efficiency relative to their supervised counterparts, which are similar in spirit to those stated in Sections \\ref{secos} and \\ref{secqte} for our SS ATE and QTE estimators. %Considering that the ATE and the QTE are fairly useful and representative parameters in the context of causal inference, we do not dig any further into other estimation problems, which are actually beyond the scope of our current work.\n\\tcr{However, a detailed analysis is beyond the scope (and the primary goals) of the current work, and therefore, we choose not to delve any further into these aspects here.}}\n\n\\vskip0.05in\n\\tcr{Lastly, i}n this article, we have only considered cases where the labeled and unlabeled data are equally distributed \\tcg{and thereby satisfy Assumption \\ref{ass_equally_distributed}}. However, the labeling mechanisms in some practical problems are in fact not determined by design and \\tcr{hence,} \\tcr{\\it labeling bias} \\tcr{can exist} %labeling bias hence exists\nbetween $\\cl$ and $\\cu$. It is %noteworthy\n\\tcr{important to note} that, due to the disproportion assumption \\eqref{disproportion}, one \\tcr{\\it cannot} simply analyze such settings by \\tcr{using} classical missing data theory \\citep{tsiatis2007semiparametric, little2019statistical}, which requires that the proportion of complete observations be bounded away from zero in the sample. 
Some recent attention has been paid to SS inference with labeling bias in the context of linear regression \\tcr{\\citep[Section II]{chakrabortty2018efficient}} %\\citep[Section II in the Supplementary Material]{chakrabortty2018efficient}\nand mean %response\n\\tcr{estimation} \\citep{zhang2021double_robust}. For treatment effect estimation, which is more technically complicated owing to the potential outcome framework, a primary challenge is that there exists no consistent supervised method when the labeled and unlabeled data follow different distributions\\tcr{; so}  the %\\tcr{usual}\ngoal of using unlabeled data to \\tcr{`improve\'} estimation accuracy compared to supervised approaches becomes somewhat ambiguous. With biased labeling mechanisms, we believe SS inference for treatment effect needs to be studied under a novel framework and thus poses an interesting problem for future research.\n\n%\\tcr{**(TO GUORONG): ADD ADDITIONAL CONTENT (\\emph{BRIEF}) LIKE WE DISCUSSED.}\n\n%\\bibliographystyle{imsart-nameyear} % Style BST file (imsart-number.bst or imsart-nameyear.bst)\n%\\bibliography{myreference-te}       % Bibliography file (usually \'*.bib\')\n\n\\begin{appendix}\n\\section{Technical details}\\label{sm_technical}\n\n\\subsection{Preliminary lemmas}\\label{sm_lemmas}\nThe following Lemma \\ref{1v2} would be useful in the proofs of the main theorems\\tcr{, in particular, the results in Section \\ref{secqte} regarding QTE estimation}.\n\\begin{lemma}\\label{1v2}\nSuppose there are two independent samples, $\\ms_1$ and $\\ms_2$, consisting of $n$ and $m$ independent copies of $(\\X\\trans,Y)\\trans$, respectively. 
For $\\bgamma\\in\\rR^d$ with some fixed $d$, let $\\hg_{n}(\\x,\\bgamma) $ be an estimator of a measurable function $g(\\x,\\bgamma)\\in\\rR$ based on $\\ms_1$ and \\tcr{define:}\n\\bse\n\\mbG_{m}\\{\\hg_{n}(\\X,\\bgamma)\\}~:=~ m^{1/2}[m^{-1}\\hbox{$\\sum_{(\\X_i\\trans,Y_i)\\trans\\in\\ms_2}$}\\hg_{n}(\\X_i,\\bgamma)-\\E_\\X\\{\\hg_{n}(\\X,\\bgamma)\\}].\n\\ese\nFor some set $\\ct\\subset\\rR^d$, denote\n\\bse\n\\Delta(\\ms_1)~:=~(\\sg\\E_\\X[\\{\\hg_n(\\X,\\bgamma)\\}^2])^{1/2},\\  M(\\ms_1):=\\sgx|\\hg_n(\\x,\\bgamma)|.\n\\ese\nFor any $\\eta\\in(0,\\Delta(\\ms_1)+c\\,]$, suppose $\\G_{n}:=\\{\\hg_{n}(\\X,\\bgamma):\\bgamma\\in\\ct\\}$ satisfies that\n\\be\nN_{[\\,]}\\{\\eta,\\G_{n}\\mid\\ms_1,L_2(\\P_\\X)\\}~\\leq~ H(\\ms_1)\\eta^{-c}\\tcr{,}\n\\label{bracket2}\n\\ee\nwith some function $H(\\ms_1)>0$. Here $\\G_n$ is indexed by $\\bgamma$ only and treats $\\hg_n(\\cdot,\\bgamma) $ as a nonrandom function. Assume $H(\\ms_1)=O_p(a_n) $, $\\Delta(\\ms_1)=O_p(d_{n,2}) $ and $M(\\ms_1)=O_p(d_{n,\\infty}) $  with some positive sequences $a_n$, $d_{n,2}$ and $d_{n,\\infty}$ allowed to diverge, then we have\\tcr{:}\n\\bse\n\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|~=~O_p(r_{n,m}),\n\\ese\nwhere $r_{n,m}=d_{n,2}\\{\\log\\,a_n+\\log\\,(d_{n,2}^{-1})\\}+m^{-1/2}d_{n,\\infty}\\{(\\log\\,a_n)^2+(\\log\\,d_{n,2})^2\\}$.\n\\end{lemma}\n\n\n\n\\subsection{Proof of Lemma \\ref{1v2}} For any $\\delta\\in(0,\\Delta(\\ms_1)+c\\,]$, we have that the bracketing integral\n\\bse\nJ_{[\\,]}\\{\\delta,\\G_n\\mid\\ms_1,L_2(\\P_\\X)\\}&~\\equiv~&\\hbox{$\\int_0^\\delta$}[1+\\log\\,N_{[\\,]}\\{\\eta,\\G_n\\mid\\ms_1,L_2(\\P_\\X)\\}]^{1/2}d\\eta \\\\\n&~\\leq~&\\hbox{$\\int_0^\\delta$}1+\\log \\,N_{[\\,]}\\{\\eta,\\G_n\\mid\\ms_1,L_2(\\P_\\X)\\}d\\eta  \\\\\n&~\\leq~&\\hbox{$\\int_0^\\delta$}1+\\log\\,H(\\ms_1)-c\\,\\log\\,\\eta\\, d\\eta \\\\\n&~=~&\\delta\\{1+\\log\\,H(\\ms_1)\\}+c\\,(\\delta-\\delta\\,\\log\\,\\delta),\n\\ese\nwhere the third step is due to (\\ref{bracket2}). 
This, combined with Lemma 19.36 of \\citet{van2000asymptotic}, implies\\tcr{:}\n\\bse\n&&\\phantom{~=~}\\E_\\X[\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|] \\\\\n&&~\\leq~ J_{[\\,]}\\{\\delta,\\G_n\\mid\\ms_1,L_2(\\P_\\X)\\}+[J_{[\\,]}\\{\\delta,\\G_n\\mid\\ms_1,L_2(\\P_\\X)\\}]^2M(\\ms_1)\\delta^{-2}m^{-1/2} \\\\\n&&~\\leq~ \\delta\\{1+\\log\\,H(\\ms_1)\\}+c\\,(\\delta-\\delta\\,\\log\\,\\delta)+\\{1+\\log\\,H(\\ms_1)+c\\,(1-\\log\\,\\delta)\\}^2M(\\ms_1)m^{-1/2}\n\\ese\nfor any $\\delta\\in(\\Delta(\\ms_1),\\Delta(\\ms_1)+c\\,]$. Therefore\\tcr{,}\n\\bse\n\\E_\\X[\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|] &~\\leq~& \\Delta(\\ms_1)\\{1+\\log\\,H(\\ms_1)\\}+c\\,\\{\\Delta(\\ms_1)-\\Delta(\\ms_1)\\,\\log\\,\\Delta(\\ms_1)\\}+\\\\\n&&~~[1+\\log\\,H(\\ms_1)+c\\,\\{1-\\log\\,\\Delta(\\ms_1)\\}]^2M(\\ms_1)m^{-1/2}.\n\\ese\nSince the right hand side in the above is $O_p(r_{n,m}) $, it gives that\n\\be\n\\E_\\X[\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|] ~=~O_p(r_{n,m}).\n\\label{ex}\n\\ee\nThen, for any positive sequence $t_n\\to\\infty$, we have\n\\bse\n&&\\phantom{=}\\P_{\\ms_2}[\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|>t_n r_{n,m}\\mid\\ms_1] \\\\\n&&~\\leq~ (t_n r_{n,m})^{-1}\\E_\\X[\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|] ~=~o_p(1),\n\\ese\nwhere the first step holds by Markov\'s inequality and the last step is due to (\\ref{ex}). This, combined with Lemma 6.1 of \\citet{chernozhukov2018double}, gives that\n\\bse\n\\P[\\sg|\\mbG_m\\{\\hg_n(\\X,\\bgamma)\\}|>t_n r_{n,m}]~\\to~ 0,\n\\ese\nwhich completes the proof.\n\n\n\n\\subsection{Proof of Theorem \\ref{thate}}\nDenote $\\Enk^*\\{\\hg(\\Z)\\}:=n_{\\kK}^{-1}\\sum_{i\\in\\I_k}\\hg(\\Z_i) $ for any random function $\\hg(\\cdot) $ $(k=1,\\ldots,\\kK) $. 
Write\n\\be\n\\muhatss-\\mu_0~=~S_1+S_2+S_3+S_4+S_5,\n\\label{date}\n\\ee\nwhere\n\\be\nS_1&~:=~&\\E_n[\\{\\pis(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}]+\\E_{n+N}\\{ m^*(\\X) \\}-\\mu_0, \\label{s1}\\\\\nS_2&~:=~&\\E_n([\\nu_{n,N}-\\{\\pis(\\X)\\}^{-1}T]\\{\\mhatn(\\X)- m^*(\\X) \\})=\\kK^{-1}\\sk S_{2,k} \\nonumber\\\\\n&~:=~&\\kK^{-1}\\sk\\Enk^*([\\nu_{n,N}-\\{\\pis(\\X)\\}^{-1}T]\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\}), \\nonumber\\\\\nS_3&~:=~&(1-\\nu_{n,N})\\E_{N}\\{\\mhatn(\\X)- m^*(\\X) \\}=\\kK^{-1}\\sk S_{3,k} \\nonumber\\\\\n&~:=~&\\kK^{-1}\\sk[(1-\\nu_{n,N})\\E_N\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\}], \\nonumber\\\\\nS_4&~:=~&\\E_n[\\hD(\\X) T\\{Y- m^*(\\X) \\}],\\ S_5:=\\E_n[\\hD(\\X) T\\{ m^*(\\X) -\\mhatn(\\X)\\}].\n\\nonumber\n\\ee\n\nWe first handle $S_2$ and $S_3$. \\tcr{To this end, w}e have\\tcr{:}\n\\bse\n&&\\phantom{~=~}\\E_\\Z\\{([\\nu_{n,N}-\\{\\pis(\\X)\\}^{-1}T]\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\})^2\\} \\\\\n&&~\\leq~ c\\,\\E_\\X[\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\}^2] ~=~O_p(w_{n,2}^2),\n\\ese\nwhere the first step uses the boundedness of $\\{\\pis(\\X)\\}^{-1}$ from Assumption \\ref{api4} and the last step is due to (\\ref{wn2}) of Assumption \\ref{ahmu}. 
It now follows that\n\\bse\n\\var(S_{2,k}\\mid\\cl_k^-)~=~O_p(n^{-1}w_{n,2}^2),\\ \\var(S_{3,k}\\mid\\cl_k^-)~=~O_p(N^{-1}w_{n,2}^2).\n\\ese\nThus\\tcr{,} Chebyshev\'s inequality gives that, for any positive sequence $t_n\\to\\infty$,\n\\bse\n&&\\P_{\\cl_k}(|S_{2,k}-\\E_\\Z(S_{2,k})|\\geq t_n n^{-1/2}w_{n,2}\\mid\\cl_k^-)~\\leq~ n(t_nw_{n,2})^{-2}\\var(S_{2,k}\\mid\\cl_k^-) ~=~o_p(1), \\\\\n&&\\P_{\\cu}(|S_{3,k}-\\E_\\Z(S_{3,k})|\\geq t_n N^{-1/2}w_{n,2}\\mid\\cl_k^-)~\\leq~ N(t_nw_{n,2})^{-2}\\var(S_{3,k}\\mid\\cl_k^-) ~=~o_p(1).\n\\ese\nThen\\tcr{,} Lemma 6.1 of \\citet{chernozhukov2018double} implies\n\\bse\n|S_{2,k}-\\E_\\Z(S_{2,k})|~=~O_p(n^{-1/2}w_{n,2}),\\ |S_{3,k}-\\E_\\Z(S_{3,k})|~=~O_p(N^{-1/2}w_{n,2}),\n\\ese\nwhich gives that\n\\be\n|S_{2,k}+S_{3,k}-\\E_\\Z(S_{2,k}+S_{3,k})|~=~O_p(n^{-1/2}w_{n,2}).\n\\label{s23e}\n\\ee\nIn addition, we know that\n\\bse\n|\\E_\\Z(S_{2,k}+S_{3,k})|&~=~&|\\E_\\Z([1-\\{\\pis(\\X)\\}^{-1}T]\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\})| \\\\\n&~\\leq~&c\\,I\\{\\pis(\\X)\\neq\\pi(\\X)\\} \\E\\{|\\hat{m}_{n,k}(\\X)- m^*(\\X) |\\} \\\\\n&~=~&I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(w_{n,1}),\n\\ese\nwhere the second step uses the boundedness of $\\{\\pis(\\X)\\}^{-1}$ from Assumption \\ref{api4} as well as the fact that\n\\bse\n\\E_\\Z([1-\\{\\pi(\\X)\\}^{-1}T]\\{\\hat{m}_{n,k}(\\X)- m^*(\\X) \\})~=~0,\n\\ese\nand the last step holds by (\\ref{wn1}) of Assumption \\ref{ahmu}. This, combined with (\\ref{s23e}), gives\n\\bse\n|S_{2,k}+S_{3,k}|~=~O_p(n^{-1/2}w_{n,2})+I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(w_{n,1}),\n\\ese\nwhich implies\\tcr{:}\n\\be\n|S_2+S_3|&~\\leq~&\\kK^{-1}\\sk|S_{2,k}+S_{3,k}| \\nonumber\\\\\n&~=~&O_p(n^{-1/2}w_{n,2})+I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(w_{n,1}).\n\\label{s23}\n\\ee\n\nNext, we control $S_4$. We know that\n\\bse\n\\E_\\Z([\\hD(\\X)T\\{Y- m^*(\\X) \\}]^2)~\\leq~\\E_\\Z([\\hD(\\X)\\{Y- m^*(\\X) \\}]^2)~=~O_p(b_N^2),\n\\ese\nwhere the last step holds by (\\ref{sn4}) of Assumption  \\ref{api4}. 
This implies\\tcr{:}\n\\bse\n\\var(S_{4}\\mid\\cu)~=~O_p(n^{-1}b_N^2).\n\\ese\nThus Chebyshev\'s inequality gives that, for any positive sequence $t_n\\to\\infty$,\n\\bse\n\\P_\\cl(|S_{4}-\\E_\\Z(S_4)|\\geq t_n n^{-1/2}b_N\\mid\\cu)~\\leq~ n(t_nb_N)^{-2}\\var(S_{4}\\mid\\cu) ~=~o_p(1).\n\\ese\nThen, by Lemma 6.1 of \\citet{chernozhukov2018double}, we have\n\\be\n|S_{4}-\\E_\\Z(S_4)|~=~O_p(n^{-1/2}b_N).\n\\label{s41}\n\\ee\nIn addition, if $ m^*(\\X) = m(\\X) $, then\n\\bse\n\\E_\\Z(S_4)~=~\\E(\\E[\\hD(\\X) T\\{Y- m(\\X) \\} \\mid\\cu,\\X]\\mid \\cu)~=~0.\n\\ese\nOtherwise, we have\n\\bse\n|\\E_\\Z(S_{4})|~\\leq~(\\E_\\X[\\{\\hD(\\X)\\}^2]\\E[\\{Y- m^*(\\X) \\}^2])^{1/2} ~=~O_p(s_N),\n\\ese\nwhere the first step uses H\\"older\'s inequality and the last step is due to (\\ref{sn2}) of Assumption \\ref{api4}. Therefore $|\\E_\\Z(S_4)|=I\\{ m^*(\\X) \\neq m(\\X) \\}O_p(s_N) $. This, combined with (\\ref{s41}), implies\\tcr{:}\n\\be\n|S_4|~=~O_p(n^{-1/2}b_N)+I\\{ m(\\X) \\neq m^*(\\X) \\}O_p(s_N).\n\\label{s4}\n\\ee\n\nNow, we consider $S_5$. Markov\'s inequality gives that, for any positive sequence $t_n\\to\\infty$,\n\\be\n&&\\phantom{~=~}\\P_\\cl(\\Enk^* [\\{\\hD(\\X)\\}^2]\\geq t_ns_N^2\\mid\\cu)~\\leq~ t_n^{-1}s_N^{-2}\\E_\\X [\\{\\hD(\\X)\\}^2]~=~o_p(1), \\label{pdn}\\\\\n&&\\phantom{~=~}\\P_{\\cl_k}(\\Enk^*[\\{ m^*(\\X) -\\hat{m}_{n,k}(\\X)\\}^2]\\geq t_nw_{n,2}^2\\mid\\cl_k^-) \\nonumber\\\\\n&&~\\leq~ t_n^{-1}w_{n,2}^{-2}\\E_\\X[\\{ m^*(\\X) -\\hat{m}_{n,k}(\\X)\\}^2]=o_p(1)\\quad (k=1,\\ldots,\\kK),\n\\label{pmun}\n\\ee\nwhere (\\ref{pdn}) uses (\\ref{sn2}) of Assumption \\ref{api4} and (\\ref{pmun}) holds by (\\ref{wn2}) of Assumption \\ref{ahmu}. 
Then, by Lemma 6.1 of \\citet{chernozhukov2018double}, we have\n\\be\n&&\\Enk^* [\\{\\hD(\\X)\\}^2]~=~O_p(s_N^2), \\label{sn}\\\\\n&&\\Enk^* [\\{ m^*(\\X) -\\hat{m}_{n,k}(\\X)\\}^2]~=~O_p(w_{n,2}^2)\\quad (k=1,\\ldots,\\kK).\n\\label{en}\n\\ee\nHence\\tcr{,} H\\"older\'s inequality implies\\tcr{:}\n\\be\n|S_5|&~\\leq~&\\kK^{-1}\\sk\\Enk^*[|\\hD(\\X)\\{ m^*(\\X) -\\hat{m}_{n,k}(\\X)\\}|]  \\nonumber\\\\\n&~\\leq~&\\kK^{-1}\\sk(\\Enk^*[\\{\\hD(\\X)\\}^2]\\Enk^*[\\{ m^*(\\X) -\\hat{m}_{n,k}(\\X)\\}^2])^{1/2} =O_p(s_N\\,w_{n,2}),\n\\label{s5}\n\\ee\nwhere the last step holds by (\\ref{sn}) and (\\ref{en}).\n\nSumming up, the equations (\\ref{date}), (\\ref{s1}), (\\ref{s23}), (\\ref{s4}) and (\\ref{s5}) conclude the result.\n\n\n\\subsection{Proof of Corollary \\ref{corate}}\nSince $\\nu=0$, we have\n\\bse\n\\E_{n+N}\\{ m^*(\\X) \\}~=~\\E\\{ m^*(\\X) \\}+O_p\\{(n+N)^{-1/2}\\}~=~\\E\\{ m^*(\\X) \\}+ o_p(n^{-1/2}),\n\\ese\nby the central limit theorem. Then the stochastic expansion directly follows from Theorem \\ref{thate} and the asymptotic normality is obvious.\n\n\\subsection{Proof of Corollary \\ref{coratesup}}\nWith $\\E_{n+N}\\{\\mhatn(\\X)\\}$ substituted by $\\E_n\\{\\mhatn(\\X)\\}$, the proof of Theorem \\ref{thate} directly gives the stochastic expansion followed by the asymptotic normality. 
Then\\tcr{,} we have\n\\bse\n&&\\phantom{~=~}\\cov[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}, m^*(\\X) ] \\\\\n&&~=~\\E\\{ m^*(\\X) Y\\}-\\E[\\{ m^*(\\X) \\}^2]-\\E\\{Y- m^*(\\X) \\} \\E\\{ m^*(\\X) \\} \\\\\n&&~=~\\E\\{ m^*(\\X) Y\\}-\\var\\{ m^*(\\X) \\}.\n\\ese\nTherefore\\tcr{,}\n\\bse\n&&\\lams^2~=~\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}]+\\var\\{ m^*(\\X) \\}+ \\\\\n&&\\phantom{\\lams^2=~~}2\\,\\cov[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}, m^*(\\X) ] \\\\\n&&\\phantom{\\lams^2}~=~\\var[\\{\\pi(\\X)\\}^{-1}T\\{Y- m^*(\\X) \\}]-\\var\\{ m^*(\\X) \\}+2\\,\\E\\{ m^*(\\X) (Y-\\mu_0)\\}.\n\\ese\n\n\\subsection{Proof of Corollary \\ref{corate_dagger}}\nThe stochastic expansion can be obtained from the proof of Theorem \\ref{thate} with $\\pihatN(\\cdot) $ replaced by $\\pihatn(\\cdot) $. The asymptotic normality directly follows.\n\n\n\\subsection{Proof of Theorem \\ref{thqte}} Write\n\\be\n\\thetahatss-\\vt~=~\\{T_1(\\thetahatinit)-\\vt\\}+\\{\\hf(\\thetahatinit)\\}^{-1}\\{T_2(\\thetahatinit)+T_3(\\thetahatinit)+T_4(\\thetahatinit)\\},\n\\label{dde}\n\\ee\nwhere\n\\bse\nT_1(\\theta)&~:=~&\\theta+\\{\\hf(\\theta)\\}^{-1}(\\E_n[\\{\\pis(\\X)\\}^{-1}T\\{\\phis(\\X,\\theta)-\\psi(Y,\\theta)\\}]-\\E_{n+N}\\{\\phis(\\X,\\theta)\\}), \\\\ T_2(\\theta)&~:=~&\\E_n([\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]\\{\\phihatn(\\X,\\theta)-\\phis(\\X,\\theta)\\})-\\\\\n&&~~(1-\\nu_{n,N})\\E_{N}\\{\\phihatn(\\X,\\theta)-\\phis(\\X,\\theta)\\}, \\\\\nT_3(\\theta)&~:=~&\\E_n[\\hD(\\X) T\\{\\phis(\\X,\\theta)-\\psi(Y,\\theta)\\}],\\\\\nT_4(\\theta)&~:=~&\\E_n[\\hD(\\X) T\\{\\phihatn(\\X,\\theta)-\\phis(\\X,\\theta)\\}].\n\\ese\n\nFirst, the conditions (\\ref{hvti}) and (\\ref{hf}) of Assumption \\ref{ainit} give\n\\be\n&&\\P\\{\\thetahatinit\\in\\mbtv\\}~\\to~ 1, \\label{belong}\\\\\n&&\\hL~:=~\\{\\hf(\\thetahatinit)\\}^{-1}-\\{f(\\vt)\\}^{-1}~=~O_p(v_n)~=~o_p(1).\n\\label{hl}\n\\ee\nAlso, we have\n\\be\n\\hf(\\thetahatinit)~=~O_p(1)\\tcr{,}\n\\label{hfo}\n\\ee\ndue to (\\ref{hf}) of 
Assumption \\ref{ainit} and the fact that $f(\\vt)>0$ from Assumption \\ref{adensity}.\n\nNow\\tcr{,} we consider $T_1(\\thetahatinit) $. According to (\\ref{hvti}) of Assumption \\ref{ainit} and (\\ref{unipi1}) of Assumption \\ref{abound}, we have\n\\bse\nn^{-1/2}\\mbG_n[\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\thetahatinit)]~=~n^{-1/2}\\mbG_n[\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\vt)]+o_p(n^{-1/2}),\n\\ese\nwhich implies that\n\\be\n&&\\phantom{~=~}\\E_n[\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\thetahatinit)]\\nonumber\\\\\n&&~=~\\E_\\Z[\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\thetahatinit)] +\\E_n[\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\vt)]- \\nonumber\\\\\n&&\\phantom{~=~}\\E_\\Z[\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\vt)]+o_p(n^{-1/2}).\n\\label{t11}\n\\ee\nConsidering that $\\{\\psi(Y,\\theta):\\theta\\in\\mbtv\\}$ is a $\\P$-Donsker class from Theorem 19.3 of \\citet{van2000asymptotic} and the permanence properties of $\\P$-Donsker classes \\citep{van1996weak}, Theorem 2.10.6 of \\citet{van1996weak} gives that $\\md^*=\\{\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta):\\theta\\in\\mbtv\\}$ is $\\P$-Donsker since $\\{\\pis(\\X)\\}^{-1}T$ and $\\psi(Y,\\theta) $ are bounded. Moreover, the convergence (\\ref{belong}) implies that $\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\thetahatinit) $ is in $\\md^*$ with probability tending to one. In addition, we have\n\\bse\n&&\\phantom{~=~}\\E_\\Z[\\{\\pis(\\X)\\}^{-2}T\\{\\psi(Y,\\thetahatinit)-\\psi(Y,\\vt)\\}^2] \\\\\n&&~\\leq~ c\\, \\E_\\bfZ[\\{I(Y<\\thetahatinit)-I(Y<\\vt)\\}^2] =c\\,[F(\\thetahatinit)+F(\\vt)-2F\\{\\min(\\thetahatinit,\\vt)\\}] \\to 0\n\\ese\nin probability\\tcr{,} because of the boundedness of $\\{\\pis(\\X)\\}^{-2}T$, the continuity of $F(\\cdot) $ from Assumption \\ref{adensity} and the consistency of $\\thetahatinit$ from Assumption \\ref{ainit}. 
Hence Lemma 19.24 of \\citet{van2000asymptotic}\ngives that\n\\bse\n\\mbG_n[\\{\\pis(\\X)\\}^{-1}T\\{\\psi(Y,\\thetahatinit)-\\psi(Y,\\vt)\\}]~=~o_p(1),\n\\ese\nwhich implies\\tcr{:}\n\\be\n\\E_n[\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\thetahatinit)]&~=~&\\E_\\Z[\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\thetahatinit)] +\\E_n[\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\vt)]- \\nonumber\\\\\n&&\\E_\\Z[\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\vt)]+o_p(n^{-1/2}).\\label{t12}\n\\ee\nFurther, the condition (\\ref{unipi2}) gives\n\\be\n\\E_{n+N}\\{\\phis(\\X,\\thetahatinit)\\}&~=~&\\E_\\X\\{\\phis(\\X,\\thetahatinit)\\}+\\E_{n+N}\\{\\phis(\\X,\\vt)\\}- \\nonumber\\\\\n&&\\E_\\X\\{\\phis(\\X,\\vt)\\}+o_p(n^{-1/2}).\n\\label{t13}\n\\ee\nSince either $\\phis(\\cdot,\\cdot)=\\phi(\\cdot,\\cdot) $ or $\\pis(\\cdot)=\\pi(\\cdot) $, we know that\n\\be\n\\E_\\Z[\\{\\pis(\\X)\\}^{-1}T\\{\\phis(\\X,\\vt)-\\psi(Y,\\vt)\\}]-\\E_\\X\\{\\phis(\\X,\\vt)\\}~=~0,\n\\label{t14}\n\\ee\nand that\n\\be\n&&\\phantom{~=~}\\E_\\Z[\\{\\pis(\\X)\\}^{-1}T\\{\\phis(\\X,\\thetahatinit)-\\psi(Y,\\thetahatinit)\\}]-\\E_\\X\\{\\phis(\\X,\\thetahatinit)\\} \\nonumber\\\\ &&~=~ -\\E_\\Z\\{\\psi(Y,\\thetahatinit)\\}.\n\\label{t15}\n\\ee\nIn addition, Taylor\'s expansion gives that\n\\be\n\\E_\\bfZ\\{\\psi(Y,\\thetahatinit)\\}&~=~&f(\\vt)(\\thetahatinit-\\vt)+O_p(|\\thetahatinit-\\vt|^2)\\nonumber\\\\\n&~=~&f(\\vt)(\\thetahatinit-\\vt)+O_p(u_n^2) \\label{df12} \\\\\n&~=~&O_p(u_n),\\label{df122}\n\\ee\nwhere the residual term in the first step is due to (\\ref{belong}) and the fact that $f(\\cdot) $ has a bounded derivative in $\\mbtv$ from Assumption \\ref{adensity}, the second step uses (\\ref{hvti}) in Assumption \\ref{ainit} and the last step holds by the fact that $u_n=o(1) $ from Assumption \\ref{ainit}. 
Therefore\\tcr{,}\n\\be\n\\E_n\\{\\omega_{n,N}(\\Z,\\thetahatinit)\\}&~=~&\\E_n\\{\\omega_{n,N}(\\Z,\\vt)\\}- \\E_\\Z\\{\\psi(Y,\\thetahatinit)\\}+o_p(n^{-1/2}) \\nonumber\\\\\n&~=~&\\E_n\\{\\omega_{n,N}(\\Z,\\vt)\\}-f(\\vt)(\\thetahatinit-\\vt)+O_p(u_n^2)+o_p(n^{-1/2}) \\label{taylor}\\\\\n&~=~&\\E_n\\{\\omega_{n,N}(\\Z,\\vt)\\}+O_p(u_n)+o_p(n^{-1/2}), \\nonumber\n\\ee\nwhere the first step uses (\\ref{t11})--(\\ref{t15}), the second step is due to (\\ref{df12}) and the last step holds by (\\ref{df122}). It now follows that\n\\be\n\\hL\\E_n\\{\\omega_{n,N}(\\Z,\\thetahatinit)\\}~=~O_p(u_nv_n)+o_p(n^{-1/2})\\label{diffl}\n\\ee\nfrom (\\ref{hl}) and the fact that $\\E_n\\{\\omega_{n,N}(\\Z,\\vt)\\}=O_p(n^{-1/2}) $ from the central limit theorem. Hence\\tcr{,} we have\n\\be\n&&T_1(\\thetahatinit)-\\vt~=~\\thetahatinit-\\vt+\\{\\hf(\\thetahatinit)\\}^{-1}\\E_n\\{\\omega_{n,N}(\\Z,\\thetahatinit)\\} \\nonumber\\\\\n&&\\phantom{T_1(\\thetahatinit)-\\vt}~=~\\thetahatinit-\\vt+\\{f(\\vt)\\}^{-1}\\E_n\\{\\omega_{n,N}(\\Z,\\thetahatinit)\\}+O_p(u_nv_n)+o_p(n^{-1/2}) \\nonumber\\\\\n&&\\phantom{T_1(\\thetahatinit)-\\vt}~=~\\thetahatinit-\\vt+\\{f(\\vt)\\}^{-1}[\\E_n\\{\\omega_{n,N}(\\Z,\\vt)\\}-f(\\vt)(\\thetahatinit-\\vt)]+ \\nonumber\\\\\n&&\\phantom{T_1(\\thetahatinit)-\\vt=}O_p(u_n^2+u_nv_n)+o_p(n^{-1/2}) \\nonumber\\\\\n&&\\phantom{T_1(\\thetahatinit)-\\vt}~=~\\{f(\\vt)\\}^{-1}\\E_n\\{\\omega_{n,N}(\\Z,\\vt)\\}+O_p(u_nv_n+u_n^2)+o_p(n^{-1/2}),\n\\label{t1}\n\\ee\nwhere the second step uses (\\ref{diffl}) and the third step is due to (\\ref{taylor}).\n\nNext, we control $T_2(\\thetahatinit) $. 
Denote\n\\bse\n\\mp_{n,k}^*~:=~\\{[\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]\\hpsi(\\X,\\theta):\\theta\\in\\mbtv\\}.\n\\ese\nDue to the boundedness of $[\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]$ from Assumption \\ref{api}, we have\n\\be\n&&\\phantom{=}N_{[\\,]} \\{c\\,\\eta,\\mp_{n,k}^*\\mid\\cl,L_2(\\P_\\X)\\}~\\leq~ N_{[\\,]} \\{\\eta,\\mp_{n,k}\\mid\\cl,L_2(\\P_\\X)\\}~\\leq~ H(\\cl)\\eta^{-c}, \\label{bracket}\\\\\n&&\\phantom{~=~}\\sbx|[\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]\\hpsi(\\X,\\theta)| \\nonumber\\\\\n&&~\\leq~ c\\,\\sbx|\\hpsi(\\X,\\theta)| =O_p(d_{n,\\infty}), \\label{bn}\\\\\n&&\\phantom{~~}[\\sb\\E_\\Z\\{([\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]\\hpsi(\\X,\\theta))\\}^2]^{1/2} \\nonumber\\\\\n&&~\\leq~ c\\,\\Delta_{k}(\\cl)=O_p(d_{n,2}) \\quad (k=1,\\ldots,\\kK)\\tcr{,} \\label{dn}\n\\ee\nfrom Assumption \\ref{aest}. Then\\tcr{,} (\\ref{bracket}) implies\\tcr{:}\n\\be\nN_{[\\,]} \\{\\eta,\\mp_{n,k}^*\\mid\\cl,L_2(\\P_\\X)\\}~\\leq~ c_1^{c_2} H(\\cl)\\eta^{-c_2}. \\label{an} %~~\\tcr{\\mbox{**What is ``$a$\'\' here? -- AC (1/2)**}} %% Comment posted on 1/2/22 AFTER arxiving when I noticed this possible typo -- AC. %%\n\\ee\nSince $c_1^{c_2} H(\\cl)=O_p(a_n) $ from Assumption \\ref{aest}, combining (\\ref{bn})--(\\ref{an})  and applying Lemma \\ref{1v2} yield that\n\\be\n\\sb|\\mbG_{n_\\kK,k}([\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]\\hpsi(\\X,\\theta))|~=~ O_p(r_n)\\tcr{,}\n\\label{mbgpi}\n\\ee\nwith the notation\n\\bse\n\\mbG_{n_\\kK,k}\\{\\ghat(\\Z)\\}~:=~n_\\kK^{1/2}[n_\\kK^{-1}\\hbox{$\\sum_{i\\in\\I_k}$}\\ghat(\\Z_i)-\\E_\\X\\{\\ghat(\\Z)\\}]\\quad (k=1,\\ldots,\\kK)\\tcr{,}\n\\ese\nfor any random function $\\ghat(\\cdot) $. 
In addition, we have\n\\be\n&&\\phantom{=}\\sb|\\E_\\Z([\\{\\pis(\\X)\\}^{-1}T-1]\\hpsi(\\X,\\theta))| \\nonumber\\\\\n&&~\\leq~ c\\, I\\{\\pis(\\X)\\neq\\pi(\\X)\\} \\sb\\E_\\Z\\{|\\hpsi(\\X,\\theta)|\\} \\nonumber\\\\\n&&~=~I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(d_{n,1}),\n\\label{idn}\n\\ee\nwhere the first step holds by the boundedness of $\\{\\pis(\\X)\\}^{-1}$ from Assumption \\ref{api} and the fact that\n\\bse\n\\E_\\Z([\\{\\pi(\\X)\\}^{-1}T-1]\\hpsi(\\X,\\theta))~=~0,\n\\ese\nand the last step is due to Assumption \\ref{aest}. Moreover, under Assumption \\ref{aest}, Lemma \\ref{1v2} implies that\n\\be\n&&\\phantom{~=~}\\sb|\\mbG_N\\{\\hpsi(\\X,\\theta)\\}| \\nonumber\\\\\n&&~=~O_p[d_{n,2}\\{\\log\\,a_n+\\log\\,(d_{n,2}^{-1})\\}+N^{-1/2}d_{n,\\infty}\\{(\\log\\,a_n)^2+(\\log\\,d_{n,2})^2\\}] \\nonumber\\\\\n&&~=~O_p(r_n)\\quad (k=1,\\ldots,\\kK). \\label{rr2}\n\\ee\nConsidering (\\ref{mbgpi})--(\\ref{rr2}), we know that\n\\bse\n&&T_2(\\thetahatinit)~=~\\kK^{-1}\\sk \\{n_\\kK^{-1/2}\\mbG_{n_\\kK,k}([\\{\\pis(\\X)\\}^{-1}T-\\nu_{n,N}]\\hpsi(\\X,\\thetahatinit))- \\\\\n&&\\phantom{T_2(\\thetahatinit)=\\kK^{-1}\\sk\\{}N^{-1/2}(1-\\nu_{n,N})\\mbG_N\\{\\hpsi(\\X,\\thetahatinit)\\}+ \\\\\n&&\\phantom{T_2(\\thetahatinit)=\\kK^{-1}\\sk\\{}\\E_\\Z([\\{\\pis(\\X)\\}^{-1}T-1]\\hpsi(\\X,\\thetahatinit))\\} \\\\\n&&\\phantom{T_2(\\thetahatinit) }~=~O_p(n^{-1/2}r_n)+I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(d_{n,1}),\n\\ese\nwhich, combined with (\\ref{hfo}), implies that\n\\be\n\\{\\hf(\\thetahatinit)\\}^{-1}T_2(\\thetahatinit)~=~O_p(n^{-1/2}r_n)+I\\{\\pis(\\X)\\neq\\pi(\\X)\\}O_p(d_{n,2}).\n\\label{t2}\n\\ee\n\nFurther, we \\tcr{now} handle $T_3(\\thetahatinit) $. Let $\\mh:= \\{\\hD(\\X)T\\phis(\\X,\\theta):\\theta\\in\\mbtv\\}$ and recall $\\mm= \\{\\phis(\\X,\\theta):\\theta\\in\\mbtv\\}$. 
We have\n\\be\n&&\\phantom{=}N_{[\\,]} \\{\\sx|\\hD(\\x)|\\eta,\\mh\\mid\\cu,L_2(\\P_\\X)\\} ~\\leq~ N_{[\\,]} \\{\\eta,\\mm,L_2(\\P_\\X)\\} \\leq c_1\\,\\eta^{-c_2}, \\label{bracketh}\\\\\n&&\\phantom{=}\\sbx|\\hD(\\X)T\\phis(\\X,\\theta)|~=~O_p(1), \\label{bnh}\\\\\n&&\\phantom{=}(\\sb\\E_\\Z[\\{\\hD(\\X)T\\phis(\\X,\\theta)\\}^2])^{1/2}~=~O_p(s_N), \\label{dnh}\n\\ee\nwhere (\\ref{bracketh}) uses (\\ref{bmm}) of Assumption \\ref{abound}, (\\ref{bnh}) holds by (\\ref{dsup}) of Assumption \\ref{api} and the boundedness of $\\phis(\\X,\\theta) $ from Assumption \\ref{abound}, and (\\ref{dnh}) is due to (\\ref{d2}) of Assumption \\ref{api} and the boundedness of $\\phis(\\X,\\theta) $ from Assumption \\ref{abound}. Then\\tcr{,} (\\ref{bracketh}) gives\n\\be\nN_{[\\,]} \\{\\eta,\\mh\\mid\\cu,L_2(\\P_\\X)\\} ~\\leq~ c_1\\,\\{\\sx|\\hD(\\x)|\\}^{c_2}\\eta^{-c_2}.\n\\label{anh}\n\\ee\nSince $c_1\\,\\{\\sx|\\hD(\\x)|\\}^{c_2}=O_p(1) $ from Assumption \\ref{api}, combining (\\ref{bnh})--(\\ref{anh})  and applying Lemma \\ref{1v2} yield that\n\\bse\n\\sb|\\mbG_{n}\\{\\hD(\\X)T\\phis(\\X,\\theta)\\}|~=~O_p(z_{n,N}),\n\\ese\nwhich gives that\n\\be\n|\\E_n\\{\\hD(\\X)T\\phis(\\X,\\thetahatinit)\\}-\\E_\\Z\\{\\hD(\\X)T\\phis(\\X,\\thetahatinit)\\}|~=~O_p(n^{-1/2}z_{n,N}).\n\\label{t311}\n\\ee\nAnalogously, by Example 19.6 of \\citet{van2000asymptotic} and the boundedness of $\\psi(Y,\\theta) $, we know that\n\\be\n|\\E_n\\{\\hD(\\X)T\\psi(Y,\\thetahatinit)\\}-\\E_\\Z\\{\\hD(\\X)T\\psi(Y,\\thetahatinit)\\}|~=~O_p(n^{-1/2}z_{n,N}).\n\\label{t312}\n\\ee\nCombining (\\ref{t311}) and (\\ref{t312}) yields\\tcr{:}\n\\be\n|T_3(\\thetahatinit)-\\E_\\Z\\{T_3(\\thetahatinit)\\}|~=~O_p(n^{-1/2}z_{n,N}).\n\\label{et3}\n\\ee\nIn addition, if $\\phis(\\X,\\theta)=\\phi(\\X,\\theta) $, then\n\\bse\n\\E_\\Z\\{T_3(\\thetahatinit)\\}~=~\\E_\\Z(\\E_\\Z[\\hD(\\X)T\\{\\phis(\\X,\\thetahatinit)-\\psi(Y,\\thetahatinit)\\} \\mid\\X])~=~0.\n\\ese\nOtherwise, we 
have\n\\bse\n|\\E_\\Z\\{T_{3}(\\thetahatinit)\\}|~\\leq~(\\E_\\X[\\{\\hD(\\X)\\}^2]\\E[\\{\\phis(\\X,\\thetahatinit)-\\psi(Y,\\thetahatinit)\\}^2])^{1/2} ~=~O_p(s_N),\n\\ese\nwhere the last step uses the boundedness of $\\phis(\\X,\\theta) $ from Assumption \\ref{abound}. Hence\\tcr{,}\n\\bse\n|\\E_\\Z\\{T_{3}(\\thetahatinit)\\}|~=~I\\{\\phis(\\X,\\theta)\\neq\\phi(\\X,\\theta)\\}O_p(s_N).\n\\ese\nThis, combined with (\\ref{hfo}) and (\\ref{et3}), implies\\tcr{:}\n\\be\n\\{\\hf(\\thetahatinit)\\}^{-1}T_3(\\thetahatinit)~=~O_p(n^{-1/2}z_{n,N})+I\\{\\phis(\\X,\\theta)\\neq\\phi(\\X,\\theta)\\}O_p(s_N).\n\\label{t3}\n\\ee\n\nEventually, we deal with $T_4(\\thetahatinit) $. Denote\n\\bse\n\\mq~:=~\\{\\hD (\\X)T\\hpsi(\\X,\\theta):\\theta\\in\\mbtv\\}.\n\\ese\nDue to (\\ref{dsup})  of Assumption \\ref{api}, we have\n\\be\n&&\\phantom{~=~}N_{[\\,]} \\{\\sx|\\hD(\\x)|\\eta,\\mq\\mid\\cl\\cup\\cu,L_2(\\P_\\X)\\} \\nonumber\\\\\n&&~\\leq~ N_{[\\,]} \\{\\eta,\\mp_{n,k}\\mid\\cl,L_2(\\P_\\X)\\} \\leq H(\\cl)\\eta^{-c}, \\label{bracket1}\\\\\n&&\\phantom{~=~}\\sbx|\\hD(\\X)\\hpsi(\\X,\\theta)| \\nonumber\\\\\n&&~\\leq~ \\sx|\\hD(\\x)|\\sbx|\\hpsi(\\X,\\theta)| =O_p(d_{n,\\infty}), \\label{bn1}\\\\\n&&\\phantom{~=~}(\\sb\\E_\\X[\\{\\hD(\\X)\\hpsi(\\X,\\theta)\\}^2])^{1/2} \\nonumber\\\\\n&&~\\leq~ \\sx|\\hD(\\x)|\\Delta_{k}(\\cl)=O_p(d_{n,2}) \\quad (k=1,\\ldots,\\kK)\\tcr{,} \\label{dn1}\n\\ee\nfrom Assumption \\ref{aest}. Then\\tcr{,} (\\ref{bracket1}) implies\\tcr{:}\n\\be\nN_{[\\,]} \\{\\eta,\\mq\\mid\\cl\\cup\\cu,L_2(\\P_\\X)\\}~\\leq~ \\{\\sx|\\hD(\\x)|\\}^c H(\\cl)\\eta^{-c}. 
\\label{an1}\n\\ee\nSince $\\{\\sx|\\hD(\\x)|\\}^c H(\\cl)=O_p(a_n) $\nfrom Assumptions \\ref{aest} and \\ref{api}, combining (\\ref{bn1})--(\\ref{an1})  and applying Lemma \\ref{1v2} yield that\n\\be\n\\sb|\\mbG_{n_\\kK,k}\\{\\hD(\\X)\\hpsi(\\X,\\theta)\\}|~=~O_p(r_n).\n\\label{mbgpi1}\n\\ee\nIn addition, we have\n\\be\n&&\\phantom{~=~}\\sb|\\E_\\X\\{\\hD(\\X)\\hpsi(\\X,\\theta)\\}|\\nonumber\\\\\n&&~\\leq~ (\\E_\\X[\\{\\hD(\\X)\\}^2]\\sb\\E_\\X[\\{\\hpsi(\\X,\\theta)\\}^2])^{1/2}=O_p(s_N d_{n,2}),\n\\label{idn1}\n\\ee\nwhere the first step holds by H\\"older\'s inequality and the last step is due to Assumptions \\ref{api} and \\ref{aest}. Considering (\\ref{mbgpi1}) and (\\ref{idn1}), we know that\n\\bse\nT_4(\\thetahatinit)&~=~&\\kK^{-1}\\sk [n_\\kK^{-1/2}\\mbG_{n_\\kK,k}\\{\\hD(\\X)\\hpsi(\\X,\\thetahatinit)\\}+\\E_\\X\\{\\hD(\\X)\\hpsi(\\X,\\thetahatinit)\\}] \\\\\n&~=~&O_p(n^{-1/2}r_n+s_N d_{n,2}),\n\\ese\nwhich, combined with (\\ref{hfo}), implies that\n\\be\n\\{\\hf(\\thetahatinit)\\}^{-1}T_4(\\thetahatinit)~=~O_p(n^{-1/2}r_n+s_N d_{n,2}). \\label{t4}\n\\ee\n\nSumming up,  the equations (\\ref{t1}), (\\ref{t2}), (\\ref{t3}) and (\\ref{t4}) conclude the result.\n\n\\subsection{Proof of Corollary \\ref{corqte}}\nSince $\\nu=0$, we have\n\\bse\n\\E_{n+N}\\{\\phis(\\X,\\vt)\\}=\\E\\{\\phis(\\X,\\vt)\\}+O_p\\{(n+N)^{-1/2}\\}~=~\\E\\{\\phis(\\X,\\vt)\\}+ o_p(n^{-1/2})\\tcr{,}\n\\ese\nby the central limit theorem. Then\\tcr{,} the stochastic expansion directly follows from Theorem \\ref{thqte} and the asymptotic normality is obvious.\n\n\\subsection{Proof of Corollary \\ref{corsup}}\nWith $\\E_{n+N}\\{\\phihatn(\\X,\\thetahatinit)\\}$ substituted by $\\E_n\\{\\phihatn(\\X,\\thetahatinit)\\}$, the proof of Theorem \\ref{thqte} directly gives the stochastic expansion followed by the asymptotic normality. 
Then\\tcr{,} we have\n\\bse\n&&\\phantom{~=~}\\cov[\\{\\pi(\\X)\\}^{-1}T\\{\\phis(\\X,\\vt)-\\psi(Y,\\vt)\\},\\phis(\\X,\\vt)] \\\\\n&&~=~\\E[\\{\\phis(\\X,\\vt)\\}^2]-\\E\\{\\phis(\\X,\\vt)\\psi(Y,\\vt)\\}-\\E\\{\\phis(\\X,\\vt)-\\psi(Y,\\vt)\\} \\E\\{\\phis(\\X,\\vt)\\} \\\\\n&&~=~\\var\\{\\phis(\\X,\\vt)\\}-\\E\\{\\phis(\\X,\\vt)\\psi(Y,\\vt)\\}.\n\\ese\nTherefore\\tcr{,}\n\\bse\n&&\\sigsup^2~=~\\var[\\{\\pi(\\X)\\}^{-1}T\\{\\psi(Y,\\vt)-\\phis(\\X,\\vt)\\}]+\\var\\{\\phis(\\X,\\vt)\\}- \\\\\n&&\\phantom{\\sigsup^2~=~}2\\,\\cov[\\{\\pi(\\X)\\}^{-1}T\\{\\phis(\\X,\\vt)-\\psi(Y,\\vt)\\},\\phis(\\X,\\vt)] \\\\\n&&\\phantom{\\sigsup^2}~=~\\var[\\{\\pi(\\X)\\}^{-1}T\\{\\psi(Y,\\vt)-\\phis(\\X,\\vt)\\}]-\\var\\{\\phis(\\X,\\vt)\\}+2\\,\\E\\{\\phis(\\X,\\vt)\\psi(Y,\\vt)\\}.\n\\ese\n\n\\subsection{Proof of Theorem \\ref{theorem_ks_ate}}\nDenote $\\ell^{(t) }(\\x,\\mbP)=\\kappa_t(\\mbP\\trans\\x)f_\\S(\\mbP\\trans\\x) $  $(t=0,1) $. We now derive the convergence rate of $\\hlo(\\x,\\hmbP)-\\ell^{(1) }(\\x,\\mbP) $. The case of $\\hlz(\\x,\\hmbP)-\\ell^{(0) }(\\x,\\mbP) $ is similar.\n\nWe first deal with the error from estimating $\\mbP_0$ by $\\hmbP$, i.e., $\\hlo(\\x,\\hmbP)-\\hlo(\\x,\\mbP_0) $. 
Taylor\'s expansion gives that, for\n\\be\n\\bar{\\s}_n~:=~h_n^{-1}\\{\\mbP_0\\trans+\\bmu(\\hmbP-\\mbP_0)\\trans\\}(\\x-\\X)\\tcr{,}\n\\label{ysbar}\n\\ee\nwith some $\\bmu:=\\diag(\\mu_1,\\ldots,\\mu_r) $ and $\\mu_j\\in(0,1) $ $(j=1,\\ldots,r) $,\n\\be\n&&\\phantom{~=~}\\hlo(\\x,\\hmbP)-\\hlo(\\x,\\mbP_0) \\nonumber \\\\\n&&~=~h_n^{-(r+1) }\\Enk[\\{\\nabla K(\\bar{\\s})\\} \\trans(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\{\\pihatN(\\X)\\}^{-1}TY] \\nonumber\\\\\n&&~=~U_n(\\x)+V_{n,N}(\\x) ,\n\\label{ydhbe}\n\\ee\nwhere\n\\bse\n&&U_n(\\x)~:=~h_n^{-(r+1) }\\Enk[\\{\\nabla K(\\bar{\\s})\\} \\trans(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\{\\pis(\\X)\\}^{-1}TY] ,\n\\nonumber \\\\\n&&V_{n,N}(\\x)~:=~h_n^{-(r+1) }\\Enk[\\{\\nabla K(\\bar{\\s})\\} \\trans(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\hD(\\X)TY].\n\\ese\nTo control $U_n(\\x) $, write\n\\be\nU_n(\\x)&~=~& h_n^{-(r+1) }\\trace ((\\hmbP-\\mbP_0)\\trans \\Enk[(\\x-\\X)\\{\\nabla K(\\bar{\\s})\\} \\trans\\{\\pis(\\X)\\}^{-1}TY]) \\nonumber\\\\\n&~=~&h_n^{-(r+1) }\\trace[(\\hmbP-\\mbP_0)\\trans\\{\\bfU_{n,1}(\\x)+\\bfU_{n,2}(\\x)-\\bfU_{n,3}(\\x)\\}],\n\\label{yun}\n\\ee\nwhere\n\\bse\n&&\\bfU_{n,1}(\\x)~:=~\\Enk((\\x-\\X)[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\{\\pis(\\X)\\}^{-1}TY),  \\\\\n&&\\bfU_{n,2}(\\x)~:=~\\Enk(\\x [\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\{\\pis(\\X)\\}^{-1}TY),  \\\\\n&&\\bfU_{n,3}(\\x)~:=~\\Enk(\\X [\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\{\\pis(\\X)\\}^{-1}TY).\n\\ese\nWe know\n\\be\n\\ss\\E[h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}|Y|]&~=~&\\ss\\hbox{$\\int$}h_n^{-r}\\rho\\{h_n^{-1}(\\s-\\bfv) \\} \\E(|Y|\\mid\\S=\\bfv) f_\\S(\\bfv)d\\bfv \\nonumber\\\\\n&~=~&\\ss\\hbox{$\\int$}\\rho(\\bft )\\E(|Y|\\mid\\S=\\s-h_n\\bft) f_\\S(\\s-h_n\\bft)d\\bft  \\nonumber\\\\\n&~=~& O(1).\n\\label{ygrho}\n\\ee\nwhere the second step uses change of variables while the last step holds by the boundedness of $\\E(|Y|\\mid\\S=\\cdot)f_\\S(\\cdot) $ from 
Assumptions \\ref{akernel} (ii)--(iii) and the integrability of $\\rho(\\cdot) $ from Assumption \\ref{ahbey} (ii). Moreover, under Assumptions \\ref{akernel} (ii)--(iii) and \\ref{ahbey} (ii), Theorem 2 of \\citet{hansen2008uniform} gives\\tcr{:}\n\\bse\n\\ss(\\Enk[h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}Y]-\\E[h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}Y])~=~O_p(\\xi_n)~=~o_p(1)\\tcr{.}\n\\ese\nThis, combined with (\\ref{ygrho}), implies\\tcr{:}\n\\be\n\\ss\\Enk[h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}Y]~=~O_p(1).\n\\label{yexrho}\n\\ee\nNext, we have\n\\be\n&&\\phantom{~=~}\\sx\\Enk (\\|[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]Y\\|) \\nonumber\\\\\n&&~\\leq~ \\sx\\Enk [\\|\\bar{\\s}_n-h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\|\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}|Y|] \\nonumber\\\\\n&&~\\leq~\\sx\\Enk [\\|(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\|h_n^{-1}\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}|Y|] \\nonumber\\\\\n&&~\\leq~ c\\,\\|\\hmbP-\\mbP_0\\|_1\\sxx\\|\\x-\\X\\|_{\\infty}\\ss\\Enk [h_n^{-1}\\rho\\{h_n^{-1}(\\s-\\S)\\}|Y|]\\nonumber \\\\\n&&~=~O_p(h_n^{r-1}\\alpha_n),\n\\label{yalphan}\n\\ee\nwhere the first step uses the local Lipschitz continuity of $\\nabla K(\\cdot) $ from Assumption \\ref{ahbey} (ii), the second step is due to the definition (\\ref{ysbar}) of $\\bar{\\s}_n$, the third step holds by H\\"older\'s inequality, and the last step is because of Assumptions \\ref{al1}, \\ref{ahbe} (i) and the equation (\\ref{yexrho}). 
Hence\\tcr{,}\n\\bse\n&&\\phantom{~=~}\\sx\\|\\bfU_{n,1}(\\x)\\|_{\\infty} \\\\\n&&~\\leq~ c\\,\\sx\\Enk (\\|\\x-\\X\\|_{\\infty}\\|[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]Y\\|) \\\\\n&&~\\leq~ c\\,\\sx\\Enk (\\|[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]Y\\|) =O_p(h_n^{r-1}\\alpha_n).\n\\ese\nwhere the first step holds by the boundedness of $\\{\\pis(\\X)\\}^{-1}T$, the second step is due to Assumption \\ref{ahbe} (i), and the last step uses (\\ref{yalphan}). This, combined with Assumption \\ref{al1} and H\\"older\'s inequality, implies\\tcr{:}\n\\be\n&&\\phantom{~=~}\\sx\\|(\\hmbP-\\mbP_0)\\trans \\bfU_{n,1}(\\x)\\|_\\infty \\nonumber\\\\\n&&~\\leq~\\|\\hmbP-\\mbP_0\\|_1\\sx\\|\\bfU_{n,1}(\\x)\\|_{\\infty}=O_p(h_n^{r-1}\\alpha_n^2).\n\\label{ybdn1}\n\\ee\nThen, under Assumptions \\ref{akernel} (ii)--(iii) and \\ref{ahbey} (ii), Theorem 2 of \\citet{hansen2008uniform} gives\n\\be\n&&\\sx\\|\\bfU_{n,2}(\\x)-\\E\\{\\bfU_{n,2}(\\x)\\} \\|_{\\infty}~=~O_p(h_n^{r}\\xi_n), \\label{ydn2}\\\\\n&&\\sx\\|\\bfU_{n,3}(\\x)-\\E\\{\\bfU_{n,3}(\\x)\\} \\|_{\\infty}~=~O_p(h_n^{r}\\xi_n).\n\\label{ydn3}\n\\ee\nLet $\\delta(\\s):=f_\\S(\\s)\\kappa_1(\\s) $ and $\\nabla\\delta(\\s):=\\partial \\delta(\\s)/\\partial \\s$. We \\tcr{then} have\n\\be\n&&\\phantom{~=~}\\sx\\|\\E\\{\\bfU_{n,2}(\\x)\\} \\|_\\infty \\nonumber\\\\\n&&~\\leq~ \\sx\\|\\x\\hbox{$\\int$}\\delta(\\s)[\\nabla K\\{h_n^{-1}(\\mbP_0\\trans\\x-s)\\}]\\trans ds\\|_\\infty \\nonumber\\\\\n&&~=~h_n^{r+1}\\sx\\|\\x\\hbox{$\\int$}\\{\\nabla\\delta(\\mbP_0\\trans\\x-h_n\\bft)\\} \\trans K(\\bft)d\\bft\\|_\\infty =O(h_n^{r+1}).\n\\label{yedn2}\n\\ee\nIn the above, the second step uses integration by parts and change of variables, and the last step holds by Assumption \\ref{ahbey} (i), the boundedness of $\\nabla\\delta(\\s) $ from Assumptions \\ref{akernel} (ii) and (iv), and the integrability of $K(\\cdot) $ from Assumption \\ref{akernel} (i). 
Set $\\bzeta(\\s):=f_\\S(\\s)\\bchi_1(\\s) $ and $\\nabla\\bzeta(\\s):=\\partial \\bzeta(\\s)/\\partial \\s$. Analogous to (\\ref{yedn2}), we know\n\\be\n&&\\phantom{~=~}\\sx\\|\\E\\{\\bfU_{n,3}(\\x)\\} \\|_\\infty  \\nonumber\\\\\n&&~\\leq~ \\sx\\|\\hbox{$\\int$}\\bzeta(\\s) [\\nabla K\\{h_n^{-1}(\\mbP_0\\trans\\x-s)\\}]\\trans ds\\|_\\infty \\nonumber\\\\\n&&~=~h_n^{r+1}\\sx\\|\\hbox{$\\int$}\\{\\nabla\\bzeta(\\mbP_0\\trans\\x-h_n\\bft)\\} \\trans K(\\bft)d\\bft\\|_\\infty =O(h_n^{r+1}),\n\\label{yedn3}\n\\ee\nwhere the last step holds by the boundedness of $\\|\\nabla\\bzeta(\\s)\\|_\\infty$ from Assumptions \\ref{akernel} (ii) and \\ref{ahbey} (iii), and the integrability of $K(\\cdot) $ from Assumption \\ref{akernel} (i). Combining (\\ref{ydn2})--(\\ref{yedn3}) yields\n\\bse\n\\sx\\|\\bfU_{n,2}(\\x)-\\bfU_{n,3}(\\x)\\|_\\infty~=~O_p(h_n^{r}\\xi_n+h_n^{r+1}),\n\\ese\nwhich implies that\n\\bse\n&&\\phantom{~=~}\\sx\\|(\\mbP_0-\\hmbP)\\trans\\{\\bfU_{n,2}(\\x)-\\bfU_{n,3}(\\x)\\} \\|_\\infty \\\\\n&&~\\leq~\\|\\mbP_0-\\hmbP\\|_1\\sx\\|\\bfU_{n,2}(\\x)-\\bfU_{n,3}(\\x)\\|_{\\infty} \\\\\n&&~=~O_p(h_n^{r}\\xi_n\\alpha_n+h_n^{r+1}\\alpha_n)\\tcr{,}\n\\ese\nusing H\\"older\'s inequality and Assumption \\ref{al1}. This, combined with (\\ref{yun}) and (\\ref{ybdn1}), gives\n\\be\n\\sx|U_n(\\x)|~=~O_p(h_n^{-2}\\alpha_n^2+h_n^{-1}\\xi_n\\alpha_n+\\alpha_n).\n\\label{yunr}\n\\ee\nThen\\tcr{,} we consider $V_{n,N}$. 
Write\n\\be\nV_{n,N}(\\x)&~=~& h_n^{-(r+1) }\\trace ((\\hmbP-\\mbP_0)\\trans \\Enk[(\\x-\\X)\\{\\nabla K(\\bar{\\s})\\} \\trans\\hD(\\X)TY]) \\nonumber\\\\\n&~=~&h_n^{-(r+1) }\\trace[(\\hmbP-\\mbP_0)\\trans\\{\\bfV^{(1) }_{n,N}(\\x)+\\bfV^{(2) }_{n,N}(\\x)\\}],\n\\label{yvn}\n\\ee\nwhere\n\\bse\n&&\\bfV^{(1) }_{n,N}(\\x)~:=~\\Enk((\\x-\\X)[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\hD(\\X)TY),  \\\\\n&&\\bfV^{(2) }_{n,N}(\\x)~:=~\\Enk((\\x-\\X) [\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\hD(\\X)TY).\n\\ese\nWe know\n\\be\n&&\\phantom{~=~}\\ss\\E(h_n^{-r}[\\rho \\{h_n^{-1}(\\s-\\S) \\}Y]^2) \\nonumber\\\\\n&&~=~\\ss\\hbox{$\\int$}h_n^{-r}[\\rho\\{h_n^{-1}(\\s-\\bfv) \\}]^2\\E(Y^2\\mid\\S=\\bfv) f_\\S(\\bfv)d\\bfv \\nonumber\\\\\n&&~=~\\ss\\hbox{$\\int$}\\{\\rho(\\bft )\\}^2\\E(Y^2\\mid\\S=\\s-h_n\\bft) f_\\S(\\s-h_n\\bft)d\\bft = O(1).\n\\label{ygrhosq}\n\\ee\nwhere the second step uses change of variables while the last step holds by the boundedness of $\\E(Y^2\\mid\\S=\\cdot)f_\\S(\\cdot) $ from Assumptions \\ref{akernel} (ii)--(iii) and the square integrability of $\\rho(\\cdot) $ from Assumption \\ref{ahbey} (ii). 
Moreover, under Assumptions \\ref{akernel} (ii)--(iii) and \\ref{ahbey} (ii), Theorem 2 of \\citet{hansen2008uniform} gives\n\\bse\n\\ss\\{\\Enk(h_n^{-r}[\\rho \\{h_n^{-1}(\\s-\\S) \\}Y]^2)-\\E(h_n^{-r}[\\rho \\{h_n^{-1}(\\s-\\S) \\}Y]^2)\\}=O_p(\\xi_n)~=~o_p(1)\\tcr{.}\n\\ese\nThis, combined with (\\ref{ygrhosq}), implies\n\\be\n\\ss\\Enk(h_n^{-r}[\\rho \\{h_n^{-1}(\\s-\\S) \\}Y]^2)~=~O_p(1).\n\\label{yexrhosq}\n\\ee\nNext, we have\n\\be\n&&\\phantom{~=~}\\sx\\Enk (\\|[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]Y\\|^2) \\nonumber\\\\\n&&~\\leq~\\sx\\Enk (\\|\\bar{\\s}_n-h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\|^2[\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}Y]^2) \\nonumber\\\\\n&&~\\leq~\\sx\\Enk (\\|(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\|^2h_n^{-2}[\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}Y]^2) \\nonumber\\\\\n&&~\\leq~ c\\,\\|\\hmbP-\\mbP_0\\|_1^2\\sxx\\|\\x-\\X\\|_{\\infty}^2\\ss\\Enk (h_n^{-2}[\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}Y]^2)\\nonumber \\\\\n&&~=~O_p(h_n^{r-2}\\alpha_n^2),\n\\label{yalphansq}\n\\ee\nwhere the first step uses the local Lipschitz continuity of $\\nabla K(\\cdot) $ from Assumption \\ref{ahbey} (ii), the second step is due to the definition (\\ref{ysbar}) of $\\bar{\\s}_n$, the third step holds by H\\"older\'s inequality, and the last step is because of Assumptions \\ref{al1}, \\ref{ahbe} (i) and the equation (\\ref{yexrhosq}). Thus\\tcr{,} we have\n\\be\n&&\\phantom{~=~}\\|\\bfV^{(1) }_{n,N}(\\x)\\|_{\\infty} \\nonumber\\\\\n&&~\\leq~ c\\, (\\E_{n,k}[\\{\\hD(\\X)\\}^2]\\sx\\Enk (\\|[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]Y\\|^2))^{1/2} \\nonumber \\\\\n&& ~=~O_p(h_n^{r/2-1}\\alpha_n s_N),\n\\label{yvn1}\n\\ee\nwhere the first step uses H\\"older\'s inequality and the boundedness of $\\sx\\|\\x-\\X\\|_\\infty $ from Assumption \\ref{ahbey} (i), and the last step holds by (\\ref{sn}) and (\\ref{yalphansq}). 
Next, we know that\n\\be\n&&\\phantom{~=~}|\\ss\\E_\\S([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}Y]^2)|\\nonumber\\\\\n&&~=~|\\ss\\hbox{$\\int$}[\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\bfv)\\}]^2 E(Y^2\\mid\\S=\\bfv)f_{\\S}(\\bfv)d\\bfv| \\nonumber\\\\\n&&~=~h_n^{r}|\\ss\\hbox{$\\int$}\\{\\nabla K_{[j]}(\\bft)\\}^2E(Y^2\\mid\\S=\\s-h_n\\bft)f_{\\S}(\\s-h_n\\bft)d\\bft|=O(h_n^{r}),\n\\label{yexp1}\n\\ee\nwhere the second step uses change of variables while the last step is due to the boundedness of $\\E(Y^2\\mid\\S=\\cdot)f_\\S(\\cdot) $ from Assumptions \\ref{akernel} (ii)--(iii) and the square integrability of $\\nabla K_{[j]}(\\cdot) $ from Assumption \\ref{akernel} (i). Then, under Assumptions \\ref{akernel} (ii)--(iii) and \\ref{ahbey} (ii), Theorem 2 of \\citet{hansen2008uniform} implies\\tcr{:}\n\\bse\n&&\\phantom{~=~}\\ss|\\Enk([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}Y]^2)-\\E_\\S([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}Y]^2)| \\\\\n&&~=~O_p(h_n^{r}\\xi_{n})=o_p(h_n^r)\\tcr{,}\n\\ese\nwhere the last step is because we assume $\\xi_{n}=o(1) $. This, combined with (\\ref{yexp1}), yields\n\\be\n\\ss\\Enk([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}Y]^2)~=~O_p(h_n^{r}).\n\\label{yone1}\n\\ee\nLet $v_{ij}(\\x) $ be the $(i,j) $th entry of $\\bfV^{(2) }_{n,N}(\\x) $ $(i=1,\\ldots,p;\\,j=1,\\ldots,r) $. We know\n\\bse\n&&\\phantom{~=~}\\sx|v_{ij}(\\x)| \\\\\n&&~\\equiv~\\sx|\\Enk[(\\x_{[i]}-\\X_{[i]}) \\nabla K_{[j]}\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\} \\hD(\\X)TY]| \\\\\n&&~\\leq~\\ss\\Enk[|\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\} \\hD(\\X)Y|] \\\\\n&&~\\leq~\\{\\ss\\Enk([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}Y]^2)\\Enk[\\{\\hD(\\X)\\}^2]\\}^{1/2}=O_p(h_n^{r/2}s_N),\n\\ese\nwhere the second step uses the boundedness of $\\sx\\|\\x-\\X\\|_\\infty$ from Assumption \\ref{ahbe} (i), the third step is due to H\\"older\'s inequality and the last step holds by (\\ref{yone1}) and (\\ref{sn}). 
It now follows that\n\\be\n\\sx\\|\\bfV^{(2) }_{n,N}(\\x)\\|_\\infty~=~O_p(h_n^{r/2}s_N).\n\\label{yvn2}\n\\ee\nTherefore\\tcr{,} we have\n\\bse\n&&\\phantom{~=~}\\sx\\|(\\mbP_0-\\hmbP)\\trans\\{\\bfV^{(1) }_{n,N}(\\x)+\\bfV^{(2) }_{n,N}(\\x)\\} \\|_\\infty \\\\\n&&~\\leq~\\|\\mbP_0-\\hmbP\\|_1\\sx\\|\\bfV^{(1) }_{n,N}(\\x)+\\bfV^{(2) }_{n,N}(\\x)\\|_{\\infty} \\\\\n&&~=~O_p(h_n^{r/2-1}\\alpha_n^2 s_N+h_n^{r/2}\\alpha_n s_N)~=~O_p(h_n^{r/2}\\alpha_n s_N),\n\\ese\nwhere the first step is due to H\\"older\'s inequality, the second step uses (\\ref{yvn1}), (\\ref{yvn2}) and Assumption \\ref{al1}, and the last step is because we assume $h_n^{-1}\\alpha_n=o(1) $. Combined with (\\ref{yvn}), it gives\n\\be\n\\sx |V_{n,N}(\\x)|~=~O_p\\{h_n^{-(r/2+1) }\\alpha_ns_N\\}.\n\\label{yvnr}\n\\ee\nConsidering (\\ref{ydhbe}), (\\ref{yunr}) and (\\ref{yvnr}), we know that\n\\be\n&&\\phantom{~=~}\\sx|\\hlo(\\x,\\hmbP)-\\hlo(\\x,\\mbP_0)| \\nonumber\\\\\n&&~=~O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-1}\\xi_n\\alpha_n+\\alpha_n+h_n^{-(r/2+1) }\\alpha_n s_N\\}.\n\\label{yhmbe}\n\\ee\n\nFurther, we control the error from estimating $\\pi(\\x) $ by $\\pihatN(\\x) $, i.e., $\\hlo(\\x,\\mbP_0)-\\lnk^{(1) }(\\x,\\mbP_0) $ with\n\\bse\n\\lnk^{(1) }(\\x,\\mbP)~:=~h_n^{-r}\\Enk [\\{\\pis(\\X)\\}^{-1}TY K_h\\{\\mbP\\trans(\\x-\\X)\\}].\n\\ese\nWe have\n\\be\n&&\\phantom{~=~}|\\ss\\E_\\S[h_n^{-r}\\{K_h(\\s-\\S)Y\\}^2]|\\nonumber\\\\\n&&~=~h_n^{-r}|\\ss\\hbox{$\\int$}[K\\{h_n^{-1}(\\s-\\bfv)\\}]^2\\E(Y^2\\mid\\S=\\bfv)f_{\\S}(\\bfv)d\\bfv| \\nonumber\\\\\n&&~=~|\\ss\\hbox{$\\int$}\\{K(\\bft)\\}^2\\E(Y^2\\mid\\S=\\s-h_n\\bft)f_{\\S}(\\s-h_n\\bft)d\\bft|~=~O(1),\n\\label{yexp}\n\\ee\nwhere the second step uses change of variables while the last step is due to the boundedness of $\\E(Y^2\\mid\\S=\\cdot)f_\\S(\\cdot) $ from Assumptions \\ref{akernel} (ii)--(iii) along with the square integrability of $K(\\cdot) $ from Assumption \\ref{akernel} (i). 
Then, under Assumptions \\ref{akernel}, Theorem 2 of \\citet{hansen2008uniform} gives\n\\bse\n\\ss|\\Enk[h_n^{-r}\\{K_h(\\s-\\S)Y\\}^2]-\\E_\\S[h_n^{-r}\\{K_h(\\s-\\S)Y\\}^2]|~=~O_p(\\xi_n)~=~o_p(1),\n\\ese\nwhere the last step is because we assume $\\xi_n=o(1) $. This, combined with (\\ref{yexp}), yields\n\\be\n\\ss\\Enk[h_n^{-r}\\{K_h(\\s-\\S)Y\\}^2]~=~O_p(1).\n\\label{yone}\n\\ee\nTherefore\\tcr{,} we know that\n\\be\n&&\\phantom{~=~}\\sx|\\hlo(\\x,\\mbP_0)-\\lnk^{(1) }(\\x,\\mbP_0)| \\nonumber\\\\\n&&~\\leq~ c\\,\\ss \\Enk \\{|\\hD(\\X)h_n^{-r}K_h(\\s-\\S)Y| \\} \\nonumber\\\\\n&&~\\leq~ c\\,h^{-r/2}\\{\\Enk [\\{\\hD(\\X)\\}^2] \\ss\\Enk[h_n^{-r}\\{K_h(\\s-\\S)Y\\}^2]\\}^{1/2} \\nonumber\\\\\n&&~=~O_p(h^{-r/2}s_N), \\label{ymnk}\n\\ee\nwhere the second step is due to H\\"older\'s inequality and the last step holds by (\\ref{sn}) and (\\ref{yone}).\n\nCombining (\\ref{yhmbe}) and (\\ref{ymnk}) yields that\n\\be\n&&\\phantom{~=~}\\sx|\\hlo(\\x,\\hmbP)-\\lnk^{(1) }(\\x,\\mbP_0)| \\nonumber\\\\ &&=O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-1}\\xi_n\\alpha_n+\\alpha_n+h_n^{-(r/2+1) }\\alpha_n s_N+h^{-r/2}s_N\\} \\nonumber\\\\\n&&~=~O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-1}\\xi_n\\alpha_n+\\alpha_n+h^{-r/2}s_N\\} ~=~O_p\\{b_{n,N}^{(2) }\\},\n\\label{yan2}\n\\ee\nwhere the second step holds by the fact that $h_n^{-(r/2+1) }\\alpha_n s_N=o(h^{-r/2}s_N) $ because we assume $h^{-1}\\alpha_n=o(1) $.\n\nNow we handle the error $\\lnk^{(1) }(\\x,\\mbP_0)-\\ell^{(1) }(\\x,\\mbP_0) $. 
Under Assumptions \\ref{akernel}, Theorem 2 of \\citet{hansen2008uniform} gives\n\\be\n\\sx|\\lnk^{(1) }(\\x,\\mbP_0)-\\E\\{\\lnk^{(1) }(\\x,\\mbP_0)\\}|~=~O_p(\\xi_n).\n\\label{ypt2}\n\\ee\nFurther, under Assumptions \\ref{akernel} (i), (ii) and (iv), standard arguments based on $d$th order Taylor\'s expansion of $\\ell^{(1) }(\\x,\\mbP_0) $ yield that\n\\be\n\\sx|\\E\\{\\lnk^{(1) }(\\x,\\mbP_0)\\}-\\ell^{(1) }(\\x,\\mbP_0)|~=~O(h_n^d).\n\\label{ypt3}\n\\ee\n\nCombining (\\ref{yan2}), (\\ref{ypt2}) and (\\ref{ypt3}) yields\n\\be\n\\sx|\\hlo(\\x,\\hmbP)-\\ell^{(1) }(\\x,\\mbP_0)|~=~O_p\\{b_n^{(1) }+b_{n,N}^{(2) }\\}.\n\\label{ynum}\n\\ee\nSimilar arguments imply that\n\\be\n\\sx|\\hlz(\\x,\\hmbP)-\\ell^{(0) }(\\x,\\mbP_0)|~=~O_p\\{b_n^{(1) }+b_{n,N}^{(2) }\\}.\n\\label{ydeno}\n\\ee\nTherefore\\tcr{,} we have\n\\bse\n&&\\phantom{~=~}\\sx|\\mhatnk(\\x,\\hmbP)-\\tmu(\\x,\\mbP_0)| \\nonumber\\\\\n&&~=~\\sx|\\{\\hlz(\\x,\\hmbP)\\}^{-1}\\hl^{(1) }(\\x,\\hmbP)-\\{\\ell^{(0) }(\\x,\\mbP_0)\\}^{-1}\\ell^{(1) }(\\x,\\mbP_0)| \\\\\n&&~\\leq~\\sx|\\{\\hlz(\\x,\\mbP_0)\\}^{-1}\\{\\hlo(\\x,\\hmbP)-\\ell^{(1) }(\\x,\\mbP_0)\\}|+ \\\\\n&&\\phantom{~=~}\\sx|[\\{\\hlz(\\x,\\mbP_0)\\}^{-1}-\\{\\ell^{(0) }(\\x,\\mbP_0)\\}^{-1}]\\ell^{(1) }(\\x,\\mbP_0)| \\\\\n&&~=~O_p\\{b_n^{(1) }+b_{n,N}^{(2) }\\},\n\\ese\nwhere the last step follows from the fact that $b_n^{(1) }+b_{n,N}^{(2) }=o(1) $, and repeated use of (\\ref{ynum}) and (\\ref{ydeno}) as well as Assumptions \\ref{api4} and \\ref{akernel} (ii).\n\n\\subsection{Proof of Proposition \\ref{thphi}}\nThe function $F(\\cdot\\mid\\S) $ is obviously bounded. 
For any $\\theta_1,\\theta_2\\in\\mbtv$, Taylor\'s expansion gives\n\\bse\n&&\\phantom{~=~}|[\\{\\pis(\\X)\\}^{-1}T]^m\\{\\phis(\\X,\\theta_1)-\\phis(\\X,\\theta_2)\\}| \\\\\n&&~\\leq~ c\\,|F(\\theta_1\\mid\\S)-F(\\theta_2\\mid\\S)| ~\\leq~ c\\,\\sb f(\\theta\\mid\\S)|\\theta_1-\\theta_2|\\quad (m=0,1),\n\\ese\nwhere the first step uses the boundedness of $\\{\\pis(\\X)\\}^{-1}$ from Assumption \\ref{api}. Therefore, the condition \\eqref{conditional_density} and Example 19.7 of \\citet{van2000asymptotic} give\n\\be\n&&N_{[\\,]}\\{\\eta,\\mm,L_2(\\P_\\X)\\}~\\leq~ c\\,\\eta^{-1}, \\label{mm} \\\\\n&&N_{[\\,]}\\{\\eta,\\mathcal{F}^*,L_2(\\P_\\X)\\}~\\leq~ c\\,\\eta^{-1}\\tcr{,} \\nonumber\n\\ee\nwith $\\mathcal{F}^*:=\\{\\{\\pis(\\X)\\}^{-1}T\\phis(\\X,\\theta):\\theta\\in\\mbtv\\}$, which implies that $\\mathcal{F}^*$ and $\\mm$ are $\\P$-Donsker according to Theorem 19.5 of \\citet{van2000asymptotic}. Further, we have that, for any sequence $\\tvt\\to\\vt$ in probability,\n\\bse\n&&\\phantom{~=~}\\E_\\X([\\{\\pis(\\X)\\}^{-2}T]^m\\{\\phis(\\X,\\tvt)-\\phis(\\X,\\vt)\\}^2) \\\\\n&&~\\leq~ c\\,\\E_\\S[\\{F(\\tvt\\mid\\S)-F(\\vt\\mid\\S)\\}^2] ~\\leq~ c\\,(\\tvt-\\vt)^2\\E[\\{\\sb f(\\theta\\mid\\S)\\}^2]\\to 0 \\;\\; (m=0,1)\n\\ese\nin probability, where the first step uses the boundedness of $\\{\\pis(\\X)\\}^{-2}$ from Assumption \\ref{api}, the second step uses Taylor\'s expansion as well as the fact that $\\tvt\\in\\mbtv$ with probability approaching one, and the last step holds by the condition \\eqref{conditional_density}. Thus applying Lemma 19.24 of \\citet{van2000asymptotic} concludes (\\ref{unipi1}) and (\\ref{unipi2}).\n\n\\subsection{Proof of Theorem \\ref{thhd}}\nDenote $e^{(t) }(\\x,\\theta,\\mbP)=\\varphi_t(\\mbP\\trans\\x,\\theta)f_\\S(\\mbP\\trans\\x) $  $(t=0,1) $. We now derive the convergence rate of $\\hateo(\\x,\\theta,\\hmbP)-e^{(1) }(\\x,\\theta,\\mbP) $. 
The case of $\\hatez(\\x,\\theta,\\hmbP) - e^{(0) }(\\x,\\theta,\\mbP) $ is similar.\n\nWe first deal with the error from estimating $\\mbP_0$ by $\\hmbP$, i.e., $\\hateo(\\x,\\theta,\\hmbP)-\\hateo(\\x,\\theta,\\mbP_0) $. Taylor\'s expansion gives that, for\n\\be\n\\bar{\\s}_n~:=~h_n^{-1}\\{\\mbP_0\\trans+\\bmu(\\hmbP-\\mbP_0)\\trans\\}(\\x-\\X)\n\\label{sbar}\n\\ee\nwith some $\\bmu:=\\diag(\\mu_1,\\ldots,\\mu_r) $ and $\\mu_j\\in(0,1) $ $(j=1,\\ldots,r) $,\n\\be\n&&\\phantom{~=~}\\hateo(\\x,\\theta,\\hmbP)-\\hateo(\\x,\\theta,\\mbP_0) \\nonumber \\\\\n&&~=~h_n^{-(r+1) }\\Enk[\\{\\nabla K(\\bar{\\s})\\} \\trans(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\{\\pihatN(\\X)\\}^{-1}T\\psi(Y,\\theta)] \\nonumber\\\\\n&&~=~U_n(\\x,\\theta)+V_{n,N}(\\x,\\theta) ,\n\\label{dhbe}\n\\ee\nwhere\n\\bse\n&&U_n(\\x,\\theta)~:=~h_n^{-(r+1) }\\Enk[\\{\\nabla K(\\bar{\\s})\\} \\trans(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta)] ,\n\\nonumber \\\\\n&&V_{n,N}(\\x,\\theta)~:=~h_n^{-(r+1) }\\Enk[\\{\\nabla K(\\bar{\\s})\\} \\trans(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\hD(\\X)T\\psi(Y,\\theta)].\n\\ese\nTo control $U_n(\\x,\\theta) $, write\n\\be\nU_n(\\x,\\theta)&~=~& h_n^{-(r+1) }\\trace ((\\hmbP-\\mbP_0)\\trans \\Enk[(\\x-\\X)\\{\\nabla K(\\bar{\\s})\\} \\trans\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta)]) \\nonumber\\\\\n&~=~&h_n^{-(r+1) }\\trace[(\\hmbP-\\mbP_0)\\trans\\{\\bfU_{n,1}(\\x,\\theta)+\\bfU_{n,2}(\\x,\\theta)-\\bfU_{n,3}(\\x,\\theta)\\}],\n\\label{un}\n\\ee\nwhere\n\\bse\n&&\\bfU_{n,1}(\\x,\\theta)~:=~\\Enk((\\x-\\X)[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta)),  \\\\\n&&\\bfU_{n,2}(\\x,\\theta)~:=~\\Enk(\\x [\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta)),  \\\\\n&&\\bfU_{n,3}(\\x,\\theta)~:=~\\Enk(\\X [\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta)).\n\\ese\nFor the function $\\rho(\\cdot) $ in 
Assumption \\ref{ahbe} (ii), denote $\\mathcal{J}_n:=\\{h^{-r}_n\\rho\\{h_n^{-1}(\\s-\\mbP_0\\trans\\X)\\}:\\s\\in\\ms\\}$. Taylor\'s expansion gives that, for  any $\\s_1,\\s_2\\in\\ms$  and some $\\bar{\\s}:=\\s_1+\\bmu(\\s_2-\\s_1) $ with $\\bmu:=\\diag(\\mu_1,\\ldots,\\mu_r) $ and $\\mu_j\\in(0,1) $ $(j=1,\\ldots,r) $,\n\\bse\n&&\\phantom{~=~}h^{-r}_n|\\rho\\{h_n^{-1}(\\s_1-\\mbP_0\\trans\\X)\\}-\\rho\\{h_n^{-1}(\\s_2-\\mbP_0\\trans\\X)\\}| \\\\\n&&~=~ h_n^{-(r+1) }|[\\nabla\\rho\\{h_n^{-1}(\\bar{\\s}-\\mbP_0\\trans\\X)\\}]\\trans(\\s_1-\\s_2)|\\leq c\\,h^{-(r+1) }_n\\|\\s_1-\\s_2\\|,\n\\ese\nwhere the second step uses the boundedness of $\\nabla\\rho(\\cdot) $ from Assumption \\ref{ahbe} (ii). Therefore Example 19.7 of \\citet{van2000asymptotic} implies\n\\be\nN_{[\\,]}\\{\\eta ,\\mathcal{J}_n,L_2(\\P_\\X)\\}~\\leq~ c\\,h_n^{-(r+1) }\\eta^{-r}.\n\\label{bracj}\n\\ee\nMoreover, we have that\n\\be\n\\ssx [h^{-r}_n\\rho\\{h_n^{-1}(\\s-\\mbP_0\\trans\\x)\\}]~=~O(h_n^{-r}).\n\\label{supj}\n\\ee\ndue to the boundedness of $\\rho(\\cdot) $ from Assumption \\ref{ahbe} (ii). In addition, we know that\n\\be\n\\ss\\E_\\S([h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}]^2)&~=~&h^{-r}\\ss\\hbox{$\\int$}h_n^{-r}[\\rho\\{h_n^{-1}(\\s-\\bfv) \\}]^2 f_\\S(\\bfv)d\\bfv \\nonumber\\\\\n&~=~&h_n^{-r}\\ss\\hbox{$\\int$}\\{\\rho(\\bft )\\}^2 f_\\S(\\s-h_n\\bft)d\\bft ~=~ O(h_n^{-r}),\n\\label{varj}\n\\ee\nwhere the second step uses change of variables while the last step holds by the boundedness of $f_\\S(\\cdot) $ from Assumption \\ref{akernel_qte} (ii) and the square integrability of $\\rho(\\cdot) $ from Assumption \\ref{ahbe} (ii). 
Based on (\\ref{bracj})--(\\ref{varj}), applying Lemma \\ref{1v2} yields that\n\\be\n&&\\phantom{~=~}\\ss|\\Enk[h^{-r}_n\\rho\\{h_n^{-1}(\\s-\\mbP_0\\trans\\X)\\}]-\\E_\\X[h^{-r}_n\\rho\\{h_n^{-1}(\\s-\\mbP_0\\trans\\X)\\}]| \\nonumber\\\\\n&&~=~O_p\\{n_{\\kK^-}^{-1/2}h_n^{-r/2}\\log(h_n^{-1})+n_{\\kK^-}^{-1}h_n^{-r}(\\log\\,h_n)^2\\}~=~o_p(1),\n\\label{grho}\n\\ee\nwhere the second step is because we assume $(nh_n^r)^{-1/2}\\log(h_n^{-r})=o(1) $. Then we know\n\\bse\n\\ss\\E_\\S[h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}]&~=~&\\ss\\hbox{$\\int$}h_n^{-r}\\rho\\{h_n^{-1}(\\s-\\bfv) \\} f_\\S(\\bfv)d\\bfv  \\\\\n&~=~&\\ss\\hbox{$\\int$}\\rho(\\bft ) f_\\S(\\s-h_n\\bft)d\\bft ~=~ O(1).\n\\ese\nwhere the second step uses change of variables while the last step holds by the boundedness of $f_\\S(\\cdot) $ from Assumption \\ref{akernel_qte} (ii) and the integrability of $\\rho(\\cdot) $ from Assumption \\ref{ahbe} (ii). This, combined with (\\ref{grho}), implies\\tcr{:}\n\\be\n\\ss\\Enk[h_n^{-r}\\rho \\{h_n^{-1}(\\s-\\S) \\}]~=~O_p(1).\n\\label{exrho}\n\\ee\nNext, we have\n\\be\n&&\\phantom{~=~}\\sx\\Enk [\\|\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\} \\|] \\nonumber\\\\\n&&~\\leq~\\sx\\Enk [\\|\\bar{\\s}_n-h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\|\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}] \\nonumber\\\\\n&&~\\leq~\\sx\\Enk [\\|(\\hmbP-\\mbP_0)\\trans(\\x-\\X)\\|h_n^{-1}\\rho\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}] \\nonumber\\\\\n&&~\\leq~ c\\,\\|\\hmbP-\\mbP_0\\|_1\\sxx\\|\\x-\\X\\|_{\\infty}\\ss\\Enk [h_n^{-1}\\rho\\{h_n^{-1}(\\s-\\S)\\}]\\nonumber \\\\\n&&~=~O_p(h_n^{r-1}\\alpha_n),\n\\label{alphan}\n\\ee\nwhere the first step uses the local Lipschitz continuity of $\\nabla K(\\cdot) $ from Assumption \\ref{ahbe} (ii), the second step is due to the definition (\\ref{sbar}) of $\\bar{\\s}_n$, the third step holds by H\\"older\'s inequality, and the last step is because of Assumptions \\ref{al1}, \\ref{ahbe} (i) and the equation (\\ref{exrho}). 
Hence\n\\bse\n&&\\phantom{~=~}\\sbx\\|\\bfU_{n,1}(\\x,\\theta)\\|_{\\infty} \\\\\n&&~\\leq~ c\\,\\sx\\Enk [\\|\\x-\\X\\|_{\\infty}\\|\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\} \\|] \\\\\n&&~\\leq~ c\\,\\sx\\Enk [\\|\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\} \\|] ~=~O_p(h_n^{r-1}\\alpha_n).\n\\ese\nwhere the first step holds by the boundedness of $\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta) $, the second step is due to Assumption \\ref{ahbe} (i), and the last step uses (\\ref{alphan}). This, combined with Assumption \\ref{al1} and H\\"older\'s inequality, implies\n\\be\n&&\\phantom{~=~}\\sbx\\|(\\hmbP-\\mbP_0)\\trans \\bfU_{n,1}(\\x,\\theta)\\|_\\infty \\nonumber\\\\\n&&~\\leq~\\|\\hmbP-\\mbP_0\\|_1\\sbx\\|\\bfU_{n,1}(\\x,\\theta)\\|_{\\infty}~=~O_p(h_n^{r-1}\\alpha_n^2).\n\\label{bdn1}\n\\ee\nThen, under Assumptions \\ref{akernel_qte} (ii) and \\ref{ahbe} (ii), as well as the fact that $\\{\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta):\\theta\\in\\mbtv\\}$ is a VC class with a bounded envelope function $\\sb[\\{\\pis(\\X)\\}^{-1}T|\\psi(Y,\\theta)|]$\nfrom Assumption \\ref{api}, Lemma B.4 of \\citet{escanciano2014uniform} gives that\n\\be\n&&\\sbx\\|\\bfU_{n,2}(\\x,\\theta)-\\E\\{\\bfU_{n,2}(\\x,\\theta)\\} \\|_{\\infty}~=~O_p(h_n^{r}\\gamma_n), \\label{dn2}\\\\\n&&\\sbx\\|\\bfU_{n,3}(\\x,\\theta)-\\E\\{\\bfU_{n,3}(\\x,\\theta)\\} \\|_{\\infty}~=~O_p(h_n^{r}\\gamma_n).\n\\label{dn3}\n\\ee\nLet $\\delta(\\s,\\theta):=f_\\S(\\s)\\varphi_1(\\s,\\theta) $ and $\\nabla\\delta(\\s,\\theta):=\\partial \\delta(\\s,\\theta)/\\partial \\s$. 
We have\n\\be\n&&\\phantom{~=~}\\sbx\\|\\E\\{\\bfU_{n,2}(\\x,\\theta)\\} \\|_\\infty \\nonumber\\\\\n&&~\\leq~ \\sbx\\|\\x\\hbox{$\\int$}\\delta(\\s,\\theta)[\\nabla K\\{h_n^{-1}(\\mbP_0\\trans\\x-s)\\}]\\trans ds\\|_\\infty \\nonumber\\\\\n&&~=~h_n^{r+1}\\sbx\\|\\x\\hbox{$\\int$}\\{\\nabla\\delta(\\mbP_0\\trans\\x-h_n\\bft,\\theta)\\} \\trans K(\\bft)d\\bft\\|_\\infty ~=~O(h_n^{r+1}).\n\\label{edn2}\n\\ee\nIn the above, the second step uses integration by parts and change of variables, while the last step holds by Assumption \\ref{ahbe} (i), the boundedness of $\\nabla\\delta(\\s,\\theta) $ from Assumptions \\ref{akernel_qte} (ii)--(iii), as well as the integrability of $K(\\cdot) $ from Assumption \\ref{akernel_qte} (i). Set $\\bzeta(\\s,\\theta):=f_\\S(\\s)\\bfeta_1(\\s,\\theta) $ and $\\nabla\\bzeta(\\s,\\theta):=\\partial \\bzeta(\\s,\\theta)/\\partial \\s$. Analogous to (\\ref{edn2}), we know\n\\be\n&&\\phantom{~=~}\\sbx\\|\\E\\{\\bfU_{n,3}(\\x,\\theta)\\} \\|_\\infty  \\nonumber\\\\\n&&~\\leq~ \\sbx\\|\\hbox{$\\int$}\\bzeta(\\s,\\theta) [\\nabla K\\{h_n^{-1}(\\mbP_0\\trans\\x-s)\\}]\\trans ds\\|_\\infty \\nonumber\\\\\n&&~=~h_n^{r+1}\\sbx\\|\\hbox{$\\int$}\\{\\nabla\\bzeta(\\mbP_0\\trans\\x-h_n\\bft,\\theta)\\} \\trans K(\\bft)d\\bft\\|_\\infty ~=~O(h_n^{r+1}),\n\\label{edn3}\n\\ee\nwhere the last step holds by the boundedness of $\\|\\nabla\\bzeta(\\s,\\theta)\\|_\\infty$ from Assumptions \\ref{akernel_qte} (ii) and \\ref{ahbe} (iii), as well as the integrability of $K(\\cdot) $ from Assumption \\ref{akernel_qte} (i). 
Combining (\\ref{dn2})--(\\ref{edn3}) yields\n\\bse\n\\sbx\\|\\bfU_{n,2}(\\x,\\theta)-\\bfU_{n,3}(\\x,\\theta)\\|_\\infty~=~O_p(h_n^{r}\\gamma_n+h_n^{r+1}),\n\\ese\nwhich implies that\n\\bse\n&&\\phantom{~=~}\\sbx\\|(\\mbP_0-\\hmbP)\\trans\\{\\bfU_{n,2}(\\x,\\theta)-\\bfU_{n,3}(\\x,\\theta)\\} \\|_\\infty \\\\\n&&~\\leq~\\|\\mbP_0-\\hmbP\\|_1\\sbx\\|\\bfU_{n,2}(\\x,\\theta)-\\bfU_{n,3}(\\x,\\theta)\\|_{\\infty} \\\\\n&&~=~O_p(h_n^{r}\\gamma_n\\alpha_n+h_n^{r+1}\\alpha_n)\\tcr{,}\n\\ese\nusing H\\"older\'s inequality and Assumption \\ref{al1}. This, combined with (\\ref{un}) and (\\ref{bdn1}), gives\n\\be\n\\sbx|U_n(\\x,\\theta)|~=~O_p(h_n^{-2}\\alpha_n^2+h_n^{-1}\\gamma_n\\alpha_n+\\alpha_n).\n\\label{unr}\n\\ee\nThen\\tcr{,} we consider $V_{n,N}$. Write\n\\be\nV_{n,N}(\\x,\\theta)&~=~& h_n^{-(r+1) }\\trace ((\\hmbP-\\mbP_0)\\trans \\Enk[(\\x-\\X)\\{\\nabla K(\\bar{\\s})\\} \\trans\\hD(\\X)T\\psi(Y,\\theta)]) \\nonumber\\\\\n&~=~&h_n^{-(r+1) }\\trace[(\\hmbP-\\mbP_0)\\trans\\{\\bfV^{(1) }_{n,N}(\\x,\\theta)+\\bfV^{(2) }_{n,N}(\\x,\\theta)\\}],\n\\label{vn}\n\\ee\nwhere\n\\bse\n&&\\bfV^{(1) }_{n,N}(\\x,\\theta)~:=~\\Enk((\\x-\\X)[\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\hD(\\X)T\\psi(Y,\\theta)),  \\\\\n&&\\bfV^{(2) }_{n,N}(\\x,\\theta)~:=~\\Enk((\\x-\\X) [\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\}]\\trans\\hD(\\X)T\\psi(Y,\\theta)).\n\\ese\nWe have\n\\be\n&&\\phantom{~=~}\\sbx\\|\\bfV^{(1) }_{n,N}(\\x,\\theta)\\|_{\\infty} \\nonumber\\\\\n&&~\\leq~ c\\,\\sx|\\hD(\\x)|\\sx\\Enk [\\|\\nabla K(\\bar{\\s}_n)-\\nabla K\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\} \\|] \\nonumber \\\\\n&& ~=~O_p(h_n^{r-1}\\alpha_n),\n\\label{vn1}\n\\ee\nwhere the first step uses the boundedness of $\\sx\\|\\x-\\X\\|_\\infty T\\psi(Y,\\theta) $ from Assumption \\ref{ahbe} (i), and the last step holds by (\\ref{alphan}) and (\\ref{dsup}) in Assumption \\ref{api}. 
Next, we know that\n\\be\n&&\\phantom{~=~}|\\ss\\E_\\S([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}]^2)|\\nonumber\\\\\n&&~=~|\\ss\\hbox{$\\int$}[\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\bfv)\\}]^2 f_{\\S}(\\bfv)d\\bfv| \\nonumber\\\\\n&&~=~h_n^{r}|\\ss\\hbox{$\\int$}\\{\\nabla K_{[j]}(\\bft)\\}^2f_{\\S}(\\s-h_n\\bft)d\\bft|=O(h_n^{r}),\n\\label{exp1}\n\\ee\nwhere the second step uses change of variables while the last step is due to the boundedness of $f_\\S(\\cdot) $ from Assumption \\ref{akernel_qte} (ii) and the square integrability of $\\nabla K_{[j]}(\\cdot) $ from Assumption \\ref{akernel_qte} (i). Then, under Assumptions \\ref{akernel_qte} (ii) and \\ref{ahbe} (ii), Lemma B.4 of \\citet{escanciano2014uniform} implies\\tcr{:}\n\\bse\n\\ss|\\Enk([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}]^2)-\\E_\\S([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}]^2)|~=~O_p(h_n^{r}\\gamma_{n})~=~o_p(h_n^r)\n\\ese\nwhere the last step is because we assume $\\gamma_{n}=o(1) $. This, combined with (\\ref{exp1}), yields\n\\be\n\\ss\\Enk([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}]^2)~=~O_p(h_n^{r}).\n\\label{one1}\n\\ee\nLet $v_{ij}(\\x,\\theta) $ be the $(i,j) $th entry of $\\bfV^{(2) }_{n,N}(\\x,\\theta) $ $(i=1,\\ldots,p;\\,j=1,\\ldots,r) $. We know\n\\bse\n&&\\phantom{~=~}\\sbx|v_{ij}(\\x,\\theta)| \\\\\n&&~\\equiv~\\sbx|\\Enk[(\\x_{[i]}-\\X_{[i]}) \\nabla K_{[j]}\\{h_n^{-1}\\mbP_0\\trans(\\x-\\X)\\} \\hD(\\X)T\\psi(Y,\\theta)]| \\\\\n&&~\\leq~\\ss\\Enk[|\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\} \\hD(\\X)|] \\\\\n&&~\\leq~\\{\\ss\\Enk([\\nabla K_{[j]}\\{h_n^{-1}(\\s-\\S)\\}]^2)\\Enk[\\{\\hD(\\X)\\}^2]\\}^{1/2}~=~O_p(h_n^{r/2}s_N),\n\\ese\nwhere the second step uses the boundedness of $\\sx\\|\\x-\\X\\|_\\infty T\\psi(Y,\\theta) $ from Assumption \\ref{ahbe} (i), the third step is due to H\\"older\'s inequality and the last step holds by (\\ref{one1}) and (\\ref{sn}). 
Therefore it follows that\n\\be\n\\sbx\\|\\bfV^{(2) }_{n,N}(\\x,\\theta)\\|_\\infty~=~O_p(h_n^{r/2}s_N).\n\\label{vn2}\n\\ee\nTherefore, we have\n\\bse\n&&\\phantom{~=~}\\sbx\\|(\\mbP_0-\\hmbP)\\trans\\{\\bfV^{(1) }_{n,N}(\\x,\\theta)+\\bfV^{(2) }_{n,N}(\\x,\\theta)\\} \\|_\\infty \\\\\n&&~\\leq~\\|\\mbP_0-\\hmbP\\|_1\\sbx\\|\\bfV^{(1) }_{n,N}(\\x,\\theta)+\\bfV^{(2) }_{n,N}(\\x,\\theta)\\|_{\\infty} \\\\\n&&~=~O_p(h_n^{r-1}\\alpha_n^2+h_n^{r/2}\\alpha_n s_N),\n\\ese\nwhere the first step is due to H\\"older\'s inequality and the last step uses (\\ref{vn1}), (\\ref{vn2}) and Assumption \\ref{al1}. Combined with (\\ref{vn}), it gives\n\\be\n\\sbx |V_{n,N}(\\x,\\theta)|~=~O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-(r/2+1) }\\alpha_ns_N\\}.\n\\label{vnr}\n\\ee\nConsidering (\\ref{dhbe}), (\\ref{unr}) and (\\ref{vnr}), we know that\n\\be\n&&\\phantom{~=~}\\sbx|\\hateo(\\x,\\theta,\\hmbP)-\\hateo(\\x,\\theta,\\mbP_0)| \\nonumber\\\\\n&&~=~O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-1}\\gamma_n\\alpha_n+\\alpha_n+h_n^{-(r/2+1) }\\alpha_n s_N\\}.\n\\label{hmbe}\n\\ee\n\nFurther, we control the error from estimating $\\pi(\\x) $ by $\\pihatN(\\x) $, i.e., $\\hateo(\\x,\\theta,\\mbP_0)-\\enk^{(1) }(\\x,\\theta,\\mbP_0) $ with\n\\bse\n\\enk^{(1) }(\\x,\\theta,\\mbP)~:=~h_n^{-r}\\Enk [\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta) K_h\\{\\mbP\\trans(\\x-\\X)\\}].\n\\ese\nWe have\n\\be\n&&\\phantom{~=~}|\\ss\\E_\\S[h_n^{-r}\\{K_h(\\s-\\S)\\}^2]|\\nonumber\\\\\n&&~=~h_n^{-r}|\\ss\\hbox{$\\int$}[K\\{h_n^{-1}(\\s-\\bfv)\\}]^2f_{\\S}(\\bfv)d\\bfv| \\nonumber\\\\\n&&~=~|\\ss\\hbox{$\\int$}\\{K(\\bft)\\}^2f_{\\S}(\\s-h_n\\bft)d\\bft|~=~O(1),\n\\label{exp}\n\\ee\nwhere the second step uses change of variables while the last step is due to the boundedness of $f_\\S(\\cdot) $ from Assumption \\ref{akernel_qte} (ii) and the square integrability of $K(\\cdot) $ from Assumption \\ref{akernel_qte} (i). 
Then, under Assumptions \\ref{akernel_qte} (i)--(ii) , Lemma B.4 of \\citet{escanciano2014uniform} implies\\tcr{:}\n\\bse\n\\ss|\\Enk[h_n^{-r}\\{K_h(\\s-\\S)\\}^2]-\\E_\\S[h_n^{-r}\\{K_h(\\s-\\S)\\}^2]|~=~O_p(\\gamma_{n})~=~o_p(1),\n\\ese\nwhere the last step is because we assume $\\gamma_{n}=o(1) $. This, combined with (\\ref{exp}), yields\n\\be\n\\ss\\Enk[h_n^{-r}\\{K_h(\\s-\\S)\\}^2]~=~O_p(1).\n\\label{one}\n\\ee\nTherefore\\tcr{,} we know that\n\\be\n&&\\phantom{~=~}\\sbx|\\hateo(\\x,\\theta,\\mbP_0)-\\enk^{(1) }(\\x,\\theta,\\mbP_0)| \\nonumber\\\\\n&&~\\leq~ c\\,\\ss \\Enk \\{|\\hD(\\X)h_n^{-r}K_h(\\s-\\S)| \\} \\nonumber\\\\\n&&~\\leq~ c\\,h^{-r/2}\\{\\Enk [\\{\\hD(\\X)\\}^2] \\ss\\Enk[h_n^{-r}\\{K_h(\\s-\\S)\\}^2]\\}^{1/2} \\nonumber\\\\\n&&~=~O_p(h^{-r/2}s_N)\\tcr{,} \\label{mnk}\n\\ee\nwhere the first step uses the boundedness of $T\\psi(Y,\\theta) $, the second step is due to H\\"older\'s inequality and the last step holds by (\\ref{sn}) and (\\ref{one}).\n\nCombining (\\ref{hmbe}) and (\\ref{mnk}) yields that\n\\be\n&&\\phantom{~=~}\\sbx|\\hateo(\\x,\\theta,\\hmbP)-\\enk^{(1) }(\\x,\\theta,\\mbP_0)| \\nonumber\\\\ &&~=~O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-1}\\gamma_n\\alpha_n+\\alpha_n+h_n^{-(r/2+1) }\\alpha_n s_N+h^{-r/2}s_N\\} \\nonumber\\\\\n&&~=~O_p\\{h_n^{-2}\\alpha_n^2+h_n^{-1}\\gamma_n\\alpha_n+\\alpha_n+h^{-r/2}s_N\\} ~=~O_p\\{a_{n,N}^{(2) }\\},\n\\label{an2}\n\\ee\nwhere the second step holds by the fact that $h_n^{-(r/2+1) }\\alpha_n s_N=o(h^{-r/2}s_N) $ because we assume $h^{-1}\\alpha_n=o(1) $.\n\nNow\\tcr{,} we handle the error $\\enk^{(1) }(\\x,\\theta,\\mbP_0)-e^{(1) }(\\x,\\theta,\\mbP_0) $. 
Under Assumptions \\ref{akernel_qte} (i)--(ii) and the fact that $\\{\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta):\\theta\\in\\mbtv\\}$ is a VC class with a bounded envelope function $\\sb[\\{\\pis(\\X)\\}^{-1}T\\psi(Y,\\theta)]$\nfrom Assumption \\ref{api}, Lemma B.4 of \\citet{escanciano2014uniform} gives that\n\\be\n\\sbx|\\enk^{(1) }(\\x,\\theta,\\mbP_0)-\\E\\{\\enk^{(1) }(\\x,\\theta,\\mbP_0)\\}|~=~O_p(\\gamma_n).\n\\label{pt2}\n\\ee\nFurther, under Assumptions \\ref{akernel_qte}, standard arguments based on $d$th order Taylor\'s expansion of $e^{(1) }(\\x,\\theta,\\mbP_0) $ yield that\n\\be\n\\sbx|\\E\\{\\enk^{(1) }(\\x,\\theta,\\mbP_0)\\}-e^{(1) }(\\x,\\theta,\\mbP_0)|~=~O(h_n^d).\n\\label{pt3}\n\\ee\n\nCombining (\\ref{an2}), (\\ref{pt2}) and (\\ref{pt3}) yields\n\\be\n\\sbx|\\hateo(\\x,\\theta,\\hmbP)-e^{(1) }(\\x,\\theta,\\mbP_0)|~=~O_p\\{a_{n}^{(1) }+a_{n,N}^{(2) }\\}.\n\\label{num}\n\\ee\nSimilar arguments imply that\n\\be\n\\sx|\\hatez(\\x,\\hmbP)-e^{(0) }(\\x,\\mbP_0)|~=~O_p\\{a_{n}^{(1) }+a_{n,N}^{(2) }\\},\n\\label{deno}\n\\ee\nwhere $\\hatez(\\x,\\mbP)\\equiv\\hatez(\\x,\\theta,\\mbP) $ and $\\ e^{(0) }(\\x,\\mbP)\\equiv e^{(0) }(\\x,\\theta,\\mbP) $.\nTherefore\\tcr{,} we have\n\\bse\n&&\\phantom{~=~}\\sbx|\\phihatnk(\\x,\\theta,\\hmbP)-\\tphi(\\x,\\theta,\\mbP_0)| \\nonumber\\\\\n&&~=~\\sbx|\\{\\hatez(\\x,\\hmbP)\\}^{-1}\\hateo(\\x,\\theta,\\hmbP)-\\{e^{(0) }(\\x,\\mbP_0)\\}^{-1}e^{(1) }(\\x,\\theta,\\mbP_0)| \\\\\n&&~\\leq~\\sbx|\\{\\hatez(\\x,\\mbP_0)\\}^{-1}\\{\\hateo(\\x,\\theta,\\hmbP)-e^{(1) }(\\x,\\theta,\\mbP_0)\\}|+ \\\\\n&&\\phantom{~=~}\\sbx|[\\{\\hatez(\\x,\\mbP_0)\\}^{-1}-\\{e^{(0) }(\\x,\\mbP_0)\\}^{-1}]e^{(1) }(\\x,\\theta,\\mbP_0)| \\\\\n&&~=~O_p\\{a_{n}^{(1) }+a_{n,N}^{(2) }\\},\n\\ese\nwhere the last step follows from the fact that $a_{n}^{(1) }+a_{n,N}^{(2) }=o(1) $, and repeated use of (\\ref{num}) and (\\ref{deno}) as well as Assumptions \\ref{api} and \\ref{akernel_qte} (ii).\n\n\n\\subsection{Proof of Proposition 
\\ref{thbn}}\nConsidering\n\\bse\n\\phihatnk(\\x,\\theta,\\hmbP)~\\equiv~ \\{\\hatez(\\x,\\theta,\\hmbP)\\}^{-1}\\hateo(\\x,\\theta,\\hmbP)\\equiv\\{\\hatez(\\x,\\hmbP)\\}^{-1}\\hateo(\\x,\\theta,\\hmbP)\\tcr{,}\n\\ese\nwith\n\\bse\n\\hateo(\\x,\\theta,\\mbP)~\\equiv~ h_n^{-r}\\Enk[\\{\\pihatN(\\X)\\}^{-1}T \\{I(Y<\\theta)-\\tau\\}K_h\\{\\mbP\\trans(\\x-\\X)\\}],\n\\ese\nit is obvious that, given $\\cl$,\n\\bse\n\\{\\phihatnk(\\X,\\theta,\\hmbP):\\theta\\in\\mbtv\\} \\subset\\{\\phihatnk(\\X,\\theta_i,\\hmbP):i=1,\\ldots,n+1\\},\n\\ese\nfor any $\\theta_1<Y_{(1) }$, $\\theta_i\\in[Y_{(i-1) },Y_{(i) }) $ $(i=2,\\ldots,n) $ and $\\theta_{n+1}\\geq Y_{(n) }$, where $Y_{(i) }$ is the $i$th order statistic of $\\{Y_i:i=1,\\ldots,n\\}$. Therefore the set $\\{\\phihatnk(\\X,\\theta,\\hmbP):\\theta\\in\\mbtv\\}$ contains at most $(n+1) $ different functions given $\\cl$. This, combined with (\\ref{mm}), implies the set\n\\bse\n\\mp_{n,k}~\\equiv~\\{\\phihatnk(\\X,\\theta,\\hmbP)-\\phis(\\X,\\theta):\\theta\\in\\mbtv\\}\n\\ese\nsatisfies $N_{[\\,]}\\{\\eta,\\mp_{n,k}\\mid\\cl,L_2(\\P_\\X)\\} \\leq c\\,(n+1)\\eta^{-1}$.\n\n\\section{Additional simulation results}\\label{sm_simulations}\nWe \\tcr{present here} %display\nin Tables \\ref{table_supp_efficiency} (efficiency) and \\ref{table_supp_infernce} (inference) the results of \\tcr{our} simulation\\tcr{s for the} cases with the null and double index outcome models (d)--(e)\\tcr{; s}ee %See\nSection \\ref{sec_simulations} for detailed descriptions of the simulation setups. In the null model (d) where $Y$ and $\\X$ are independent, it is apparent that the unlabeled data cannot help the estimation in theory, so the supervised and SS methods \\tcr{not surprisingly} have close efficiencies. When the outcome model is (e), our SS estimators show significant superiority over the supervised competitors and even outperform the ``oracle\'\' supervised estimators most of the time. 
As regards inference in the models (d) and (e), our methods still produce satisfactory results analogous \\tcr{in pattern} to those in Table \\ref{table_inferece} of Section \\ref{sec_simulations}. The quantities in Tables \\ref{table_supp_efficiency} and \\ref{table_supp_infernce} again confirm the advantage of our SS estimators compared to their supervised counterparts in terms of robustness and efficiency, which have already been demonstrated \\tcr{in detail} by the simulation results in Section \\ref{sec_simulations}.\n\n\\begin{table}%[H] %Removing the [H] here -- AC\n\\def~{\\hphantom{0}}\n\\caption{%\\tcr{**To be edited -- AC.**}\nEfficiencies of the ATE and the QTE estimators relative to the corresponding oracle supervised estimators when $p=10$; \\tcg{see Remark \\ref{remark_interpretation_RE} for interpretations of these relative efficiencies.} Here\\tcr{,} $n$ denotes the labeled data size, $p$ the number of covariates, $q$ the model sparsity, $m(\\X)\\equiv\\E(Y\\mid\\X) $, $\\pi(\\X)\\equiv\\E(T\\mid\\X) $, $\\hat{\\pi}(\\X) $ \\tcr{--} the estimated propensity score, Lin \\tcr{--} logistic regression of $T$ vs. $\\X$\\tcr{,} and Quad \\tcr{--} logistic regression of $T$ vs. $(\\X\\trans,\\X_{[1]}^2,\\ldots,\\X_{[p]}^2)\\trans$; KS$_1/$KS$_2$ represents kernel smoothing on the one$/$two direction(s) selected by linear regression$/$%the\n\\tcr{sliced} inverse regression; PR \\tcr{denotes} parametric regression\\tcr{,} and ORE \\tcr{denotes the} oracle relative efficiency. 
The \\textbf{\\tcn{blue}} color \\tcr{indicates} %implies\nthe best efficiency in each case.}{\n\\resizebox{\\textwidth}{!}{\n\\begin{tabular}{ccc||ccc|ccc||ccc|ccc||c}\n\\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{ATE}}   & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n&  &  & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(d) } & (i)       & Lin             & 0.89    & 0.83   & 0.87 & \\tcn{\\bf 0.95}   & 0.94   & 0.91 & 0.93    & 0.95   & 0.94 & 0.93   & \\tcn{\\bf 0.97}   & 0.93 & 1.00                 \\\\\n&           & Quad            & 0.68    & 0.50   & 0.64 & 0.95   & \\tcn{\\bf 0.96}   & 0.92 & 0.87    & 0.87   & 0.87 & 0.93   & \\tcn{\\bf 0.96}   & 0.93 & 1.00                 \\\\\n& (ii)      & Lin             & 0.86    & 0.85   & 0.87 & 0.92   & \\tcn{\\bf 0.93}   & 0.92 & 0.96    & 0.94   & 0.97 & 0.99   & \\tcn{\\bf 1.00}   & 0.97 & 1.00                 \\\\\n&           & Quad            & 0.75    & 0.77   & 0.67 & 0.92   & \\tcn{\\bf 0.94}   & 0.92 & 0.93    & 0.91   & 0.92 & 1.00   & \\tcn{\\bf 1.01}   & 0.98 & 1.00                 \\\\\n& (iii)     & Lin             & 0.85    & 0.84   & 0.85 & 0.88   & \\tcn{\\bf 0.91}   & 0.86 & 0.93    & 0.95   & 0.94 & 0.94   & \\tcn{\\bf 0.96}   & 0.94 & 1.00                 \\\\\n&           & Quad            & 0.71    & 0.72   & 0.72 & 0.90   & \\tcn{\\bf 0.92}   & 0.87 & 0.92    & 0.93   & 0.93 & 0.94   & \\tcn{\\bf 0.97}   & 0.95 & 1.00                 \\\\ \\hline\n\\multirow{6}{*}{(e) } & (i)       & Lin             & 0.76    & 0.75   & 0.41 & 1.73 
  & \\tcn{\\bf 1.80}   & 0.77 & 0.86    & 0.87   & 0.64 & 2.02   & \\tcn{\\bf 2.04}   & 0.88 & 5.41                 \\\\\n&           & Quad            & 0.68    & 0.70   & 0.29 & 1.74   & \\tcn{\\bf 1.78}   & 0.76 & 0.84    & 0.83   & 0.57 & 2.02   & \\tcn{\\bf 2.03}   & 0.88 & 5.41                 \\\\\n& (ii)      & Lin             & 0.73    & 0.63   & 0.24 & \\tcn{\\bf 1.18}   & 0.94   & 0.34 & 0.81    & 0.71   & 0.15 & \\tcn{\\bf 1.35}   & 1.18   & 0.19 & 3.93                 \\\\\n&           & Quad            & 0.69    & 0.59   & 0.27 & \\tcn{\\bf 1.25}   & 1.00   & 0.38 & 0.85    & 0.76   & 0.18 & \\tcn{\\bf 1.41}   & 1.23   & 0.21 & 3.93                 \\\\\n& (iii)     & Lin             & 0.75    & 0.71   & 0.41 & \\tcn{\\bf 1.60}   & 1.57   & 0.72 & 0.74    & 0.77   & 0.53 & 1.32   & \\tcn{\\bf 1.43}   & 0.65 & 4.78                 \\\\\n&           & Quad            & 0.74    & 0.75   & 0.52 & \\tcn{\\bf 1.83}   & 1.75   & 0.92 & 0.79    & 0.82   & 0.56 & 1.53   & \\tcn{\\bf 1.67}   & 0.85 & 4.78                 \\\\ \\hline\n\\multicolumn{16}{c}{}                                                                                                                                                           \\\\\n\\hline\n\\multicolumn{3}{c||}{\\multirow{2}{*}{QTE}}   & \\multicolumn{6}{c||}{$n=200$}                      & \\multicolumn{6}{c||}{$n=500$}                      & \\multirow{3}{*}{ORE} \\\\\n\\cline{4-15}\n&  &  & \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}& \\multicolumn{3}{c|}{Supervised} & \\multicolumn{3}{c||}{\\textbf{SS}}&                      \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   & KS$_1$  & KS$_2$ & PR   & KS$_1$ & KS$_2$ & PR   &                      \\\\ \\hline\n\\multirow{6}{*}{(d) } & (i)       & Lin             & 0.87    & 0.86   & 0.78 & 0.92   & \\tcn{\\bf 0.95}   & 0.79 & 0.93    & 0.92   & 0.92 & 0.98   & 
\\tcn{\\bf 0.98}   & 0.92 & 1.00                 \\\\\n&           & Quad            & 0.72    & 0.73   & 0.55 & 0.92   & \\tcn{\\bf 0.95}   & 0.79 & 0.89    & 0.88   & 0.89 & \\tcn{\\bf 0.99}   & 0.99   & 0.92 & 1.00                 \\\\\n& (ii)      & Lin             & 0.87    & 0.86   & 0.89 & 0.93   & \\tcn{\\bf 0.94}   & 0.89 & 0.92    & 0.90   & \\tcn{\\bf 0.99} & 0.95   & 0.93   & 0.97 & 1.00                 \\\\\n&           & Quad            & 0.71    & 0.71   & 0.71 & 0.94   & \\tcn{\\bf 0.96}   & 0.90 & 0.89    & 0.89   & 0.95 & 0.96   & 0.94   & \\tcn{\\bf 0.98} & 1.00                 \\\\\n& (iii)     & Lin             & 0.83    & 0.82   & 0.85 & \\tcn{\\bf 0.92}   & 0.92   & 0.83 & 0.94    & 0.93   & 0.95 & 0.96   & \\tcn{\\bf 0.97}   & 0.96 & 1.00                 \\\\\n&           & Quad            & 0.81    & 0.78   & 0.71 & 0.95   & \\tcn{\\bf 0.95}   & 0.83 & 0.92    & 0.92   & 0.94 & 0.97   & \\tcn{\\bf 0.99}   & 0.95 & 1.00                 \\\\ \\hline\n\\multirow{6}{*}{(e) } & (i)       & Lin             & 0.82    & 0.79   & 0.78 & \\tcn{\\bf 1.30}   & 1.23   & 1.13 & 0.85    & 0.84   & 0.89 & 1.37   & 1.34   & \\tcn{\\bf 1.42} & 1.85                 \\\\\n&           & Quad            & 0.65    & 0.68   & 0.61 & \\tcn{\\bf 1.30}   & 1.24   & 1.11 & 0.87    & 0.86   & 0.85 & 1.39   & 1.35   & \\tcn{\\bf 1.42} & 1.85                 \\\\\n& (ii)      & Lin             & 0.61    & 0.55   & 0.49 & \\tcn{\\bf 0.92}   & 0.73   & 0.65 & 0.81    & 0.71   & 0.40 & \\tcn{\\bf 1.16}   & 0.97   & 0.48 & 1.78                 \\\\\n&           & Quad            & 0.62    & 0.56   & 0.48 & \\tcn{\\bf 0.99}   & 0.80   & 0.70 & 0.82    & 0.73   & 0.44 & \\tcn{\\bf 1.23}   & 1.04   & 0.53 & 1.78                 \\\\\n& (iii)     & Lin             & 0.75    & 0.70   & 0.73 & 1.13   & 1.08   & \\tcn{\\bf 1.22} & 0.82    & 0.82   & 0.85 & \\tcn{\\bf 1.34}   & 1.33   & 1.18 & 1.93                 \\\\\n&           & Quad            & 0.78    & 0.74   & 0.84 & 1.28  
 & 1.23   & \\tcn{\\bf 1.44} & 0.86    & 0.87   & 0.85 & \\tcn{\\bf 1.45}   & 1.44   & 1.31 & 1.93                \\\\ \\hline\n\\end{tabular}\n}}\n\\label{table_supp_efficiency}\n\\end{table}\n\n\n\\begin{table}%[H] %Removing the [H] here -- AC\n\\def~{\\hphantom{0}}\n\\caption{%\\tcr{**To be edited -- AC.**}\nInference based on the SS estimators \\underline{\\tcr{using} kernel smoothing on the direction selected by linear regression \\tcr{(KS$_1$) }} \\tcr{as the choice of the working outcome model, for the ATE and the QTE,} when $n=500$ and $p=10$. Here\\tcr{,} ESE is the empirical standard error, \\tcr{Bias is the empirical bias,} ASE \\tcr{is} the average of the estimated standard errors\\tcr{,} and CR \\tcr{is} the \\tcr{empirical} coverage rate of the 95\\% confidence intervals. \\tcr{All o}ther notations are the same as in Table \\ref{table_supp_efficiency}. The \\textbf{{\\color{navyblue} blue}} color\n%implies both\n\\tcr{highlights settings where} the propensity scor\\tcr{e} %scores\nand the outcome mode\\tcr{l} %models\nare \\tcr{both} correctly specified, while the \\textbf{boldfaces} \\tcr{denote ones where} %mean\nthe propensity scor\\tcr{e is} %scores are\ncorrectly specified but the outcome %models are\nmode\\tcr{l is} not.}{\n\\begin{tabular}{ccc|cccc|cccc}\n\\hline\n&  &  & \\multicolumn{4}{c|}{ATE}   & \\multicolumn{4}{c}{QTE}   \\\\\n$m(\\X) $              & $\\pi(\\X) $            & $\\hat{\\pi}(\\X) $      & ESE  & Bias & ASE  & CR   & ESE  & Bias & ASE  & CR   \\\\ \\hline\n& (i)   & {\\color{navyblue} \\textbf{Lin}}  & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.94}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.10}} & {\\color{navyblue} \\textbf{0.96}} \\\\\n&       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.00}} & 
{\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.94}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.10}} & {\\color{navyblue} \\textbf{0.95}} \\\\\n& (ii)  & Lin                                  & 0.07                                 & 0.00                                 & 0.07                                 & 0.95                                 & 0.08                                 & 0.01                                 & 0.09                                 & 0.94                                 \\\\\n&       & Quad                                 & 0.06                                 & 0.00                                 & 0.07                                 & 0.95                                 & 0.08                                 & 0.01                                 & 0.09                                 & 0.95                                 \\\\\n& (iii) & Lin                                  & 0.07                                 & 0.00                                 & 0.07                                 & 0.94                                 & 0.08                                 & 0.01                                 & 0.09                                 & 0.97                                 \\\\\n\\multirow{-6}{*}{(d) } &       & {\\color{navyblue} \\textbf{Quad}} & {\\color{navyblue} \\textbf{0.07}} & {\\color{navyblue} \\textbf{0.00}} & {\\color{navyblue} \\textbf{0.06}} & {\\color{navyblue} \\textbf{0.93}} & {\\color{navyblue} \\textbf{0.08}} & {\\color{navyblue} \\textbf{0.01}} & {\\color{navyblue} \\textbf{0.09}} & {\\color{navyblue} \\textbf{0.96}} \\\\ \\hline\n& (i)   & \\textbf{Lin}                         & \\textbf{0.12}                        & \\textbf{0.00}                        & \\textbf{0.11}                        & \\textbf{0.93}                        & \\textbf{0.16}                        & \\textbf{0.03}                        & 
\\textbf{0.17}                        & \\textbf{0.94}                        \\\\\n&       & \\textbf{Quad}                        & \\textbf{0.12}                        & \\textbf{0.00}                        & \\textbf{0.11}                        & \\textbf{0.94}                        & \\textbf{0.16}                        & \\textbf{0.03}                        & \\textbf{0.17}                        & \\textbf{0.94}                        \\\\\n& (ii)  & Lin                                  & 0.10                                 & 0.04                                 & 0.11                                 & 0.95                                 & 0.15                                 & 0.06                                 & 0.16                                 & 0.96                                 \\\\\n&       & Quad                                 & 0.10                                 & 0.04                                 & 0.11                                 & 0.95                                 & 0.14                                 & 0.05                                 & 0.16                                 & 0.95                                 \\\\\n& (iii) & Lin                                  & 0.12                                 & 0.00                                 & 0.11                                 & 0.91                                 & 0.15                                 & 0.03                                 & 0.16                                 & 0.96                                 \\\\\n\\multirow{-6}{*}{(e) } &       & \\textbf{Quad}                        & \\textbf{0.11}                        & \\textbf{0.00}                        & \\textbf{0.10}                        & \\textbf{0.91}                        & \\textbf{0.14}                        & \\textbf{0.02}                        & \\textbf{0.15}                        & \\textbf{0.95}                        
\\\\\n\\hline\n\\end{tabular}\n}\n\\label{table_supp_infernce}\n\\end{table}\n\n\\section{Supplement to the data analysis in Section \\ref{sec_data_analysis}} \\label{sm_data_analysis}\nWe present in Table \\ref{table_data_analysis} the \\tcr{detailed} numerical results of the data analysis in Section \\ref{sec_data_analysis}, which\n%have been illustrated by\n\\tcr{were} illustrated \\tcr{in} Figures \\ref{figure_ate} and \\ref{figure_qte}, \\tcr{in course of our discussion of the analysis and the results.} %in Section \\ref{sec_data_analysis} %in that section.\n\\begin{table}[H]\n\\def~{\\hphantom{0}}\n\\caption{$95\\%$ confidence intervals of the ATE and the QTE in the HIV Drug Resistance data. Here\\tcr{,} $m$ is the position of mutatio\\tcr{n} %mutations\nregarded as the treatment. In the first row of the table, the notation\\tcr{s} \\tcr{of the form} \\tcr{`A-B\'} \\tcr{refer to} %means\nestimating the propensity score and the outcome model by the methods \\tcr{`A\'} and \\tcr{`B\'}, respectively. Lin stands for logistic regression of $T$ vs. $\\X$; KS$_2$ \\tcr{--} kernel smoothing on the two directions selected by %the\n\\tcr{sliced} inverse regression, PR \\tcr{--} parametric regression\\tcr{;} and RF \\tcr{--} random forest. The abbreviations Sup and SS refer to supervised and SS estimators, respectively. 
The \\textbf{\\tcn{blue}} color \\tcr{indicates} %implies\nthe shortest SS confidence interval in each case.}{\n\\resizebox{\\textwidth}{!}{\n\\begin{tabular}{cc|cc|cc|cc}\n\\hline\n& \\multirow{2}{*}{$m$} & \\multicolumn{2}{c|}{\\bf{Lin-KS$_2$}}                              & \\multicolumn{2}{c|}{\\bf{Lin-PR}}                                  & \\multicolumn{2}{c}{\\bf{RF-RF}}                                   \\\\\n&                      & Sup                          & \\bf{SS}                           & Sup                          & \\bf{SS}                        & Sup                          & \\bf{SS}                           \\\\ \\hline\n\\multirow{8}{*}{ATE} & 39                   & $[ 0.13 , 0.43 ]$            & $[ 0.13 , 0.38 ]$            & $[ 0.10 , 0.41 ]$            & $[ 0.11 , 0.36 ]$            & $[ 0.13 , 0.32 ]$            & $\\tcn{\\bf [ 0.13 , 0.32 ]}$            \\\\\n& 69                   & $[ 0.12 , 0.44 ]$            & $[ 0.19 , 0.44 ]$            & $[ 0.10 , 0.42 ]$            & $[ 0.18 , 0.43 ]$            & $[ 0.19 , 0.40 ]$            & $\\tcn{\\bf [ 0.24 , 0.43 ]}$            \\\\\n& 75                   & $[ 0.02 , 0.29 ]$            & $[ 0.08 , 0.32 ]$            & $[ 0.04 , 0.33 ]$            & $[ 0.07 , 0.33 ]$            & $[ 0.14 , 0.33 ]$            & $\\tcn{\\bf [ 0.17 , 0.35 ]}$            \\\\\n& 98                   & $[ \\hbox{-}0.02 ,  0.37   ]$ & $[  0.06 ,  0.37 ]$          & $[ 0.01 , 0.40 ]$            & $[ 0.05 , 0.36 ]$            & $[ 0.10 , 0.29 ]$            & $\\tcn{\\bf [ 0.13 , 0.33 ]}$            \\\\\n& 123                  & $[ \\hbox{-}0.16 ,  0.15   ]$ & $[ \\hbox{-}0.12 ,  0.13   ]$ & $[ \\hbox{-}0.15 ,  0.17   ]$ & $[ \\hbox{-}0.10 ,  0.15   ]$ & $[ \\hbox{-}0.15 ,  0.04   ]$ & $\\tcn{\\bf [ \\hbox{-}0.15 ,  0.05   ]}$ \\\\\n& 162                  & $[ \\hbox{-}0.16 ,  0.19   ]$ & $[ \\hbox{-}0.14 ,  0.12   ]$ & $[ \\hbox{-}0.16 ,  0.18   ]$ & $[ \\hbox{-}0.14 ,  0.13   ]$ & $[ \\hbox{-}0.13 ,  
0.07   ]$ & $\\tcn{\\bf [ \\hbox{-}0.12 ,  0.09   ]}$ \\\\\n& 184                  & $[ 2.02 , 2.36 ]$            & $[ 2.08 , 2.35 ]$            & $[ 2.03 , 2.37 ]$            & $[ 2.03 , 2.30 ]$            & $[ 2.08 , 2.30 ]$            & $\\tcn{\\bf [ 2.12 , 2.31 ]}$            \\\\\n& 203                  & $[ 0.08 , 0.50 ]$            & $[ 0.17 , 0.51 ]$            & $[ 0.00 , 0.45 ]$            & $[ 0.08 , 0.45 ]$            & $[ 0.14 , 0.33 ]$            & $\\tcn{\\bf [ 0.20 , 0.38 ]}$            \\\\ \\hline\n\\multirow{8}{*}{QTE} & 39  & $[ 0.07 , 0.43 ]$   & $[ 0.12 , 0.38 ]$   & $[ 0.05 , 0.42 ]$   & $[ 0.09 , 0.36 ]$   & $[ \\hbox{-}0.01 ,  0.32 ]$ & $\\tcn{\\bf [  0.05 ,    0.30 ]}$ \\\\\n& 69  & $[ \\hbox{-}0.14 ,  0.16 ]$ & $\\tcn{\\bf [ \\hbox{-}0.06 ,  0.18 ]}$ & $[ \\hbox{-}0.14 ,  0.17 ]$ & $[ \\hbox{-}0.06 ,  0.19 ]$ & $[ \\hbox{-}0.13 ,  0.22 ]$ & $[ \\hbox{-}0.06 ,  0.20 ]$   \\\\\n& 75  & $[ \\hbox{-}0.06 ,  0.29 ]$ & $\\tcn{\\bf [ \\hbox{-}0.01 ,  0.26 ]}$ & $[ \\hbox{-}0.09 ,  0.26 ]$ & $[ \\hbox{-}0.04 ,  0.23 ]$ & $[ 0.03 , 0.42 ]$   & $[ 0.11 , 0.39 ]$     \\\\\n& 98  & $[ 0.01 , 0.34 ]$   & $[ 0.00 , 0.29 ]$   & $[ 0.03 , 0.38 ]$   & $[ 0.00 , 0.28 ]$   & $[ \\hbox{-}0.04 ,  0.37 ]$ & $\\tcn{\\bf [  0.02 ,  0.30 ]}$   \\\\\n& 123 & $[ \\hbox{-}0.16 ,  0.21 ]$ & $\\tcn{\\bf [ \\hbox{-}0.12 ,  0.15 ]}$ & $[ \\hbox{-}0.16 ,  0.22 ]$ & $[ \\hbox{-}0.13 ,  0.15 ]$ & $[ \\hbox{-}0.17 ,  0.29 ]$ & $[ \\hbox{-}0.10 ,  0.18 ]$   \\\\\n& 162 & $[ \\hbox{-}0.25 ,  0.07 ]$ & $\\tcn{\\bf [ \\hbox{-}0.23 ,  0.02 ]}$ & $[ \\hbox{-}0.23 ,  0.09 ]$ & $[ \\hbox{-}0.20 ,  0.05 ]$ & $[ \\hbox{-}0.22 ,  0.16 ]$ & $[ \\hbox{-}0.15 ,  0.11 ]$   \\\\\n& 184 & $[ 2.16 , 2.50 ]$   & $[ 2.22 , 2.49 ]$   & $[ 2.15 , 2.49 ]$   & $\\tcn{\\bf [ 2.17 , 2.44 ]}$   & $[ 2.14 , 2.50 ]$   & $[ 2.23 , 2.50 ]$     \\\\\n& 203 & $[ \\hbox{-}0.15 ,  0.34 ]$ & $[  0.06 ,  0.41 ]$ & $[ \\hbox{-}0.14 ,  0.34 ]$ & $[  0.06 ,  0.40 ]$ & $[ 0.01 , 0.40 ]$   & $\\tcn{\\bf [ 0.09 , 
0.36 ]}$       \\\\ \\hline\n\\end{tabular}}\n}\n\\label{table_data_analysis}\n\\end{table}\n\n\\end{appendix}\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n%%                  The Bibliography                       %%\n%%                                                         %%\n%%  imsart-???.bst  will be used to                        %%\n%%  create a .BBL file for submission.                     %%\n%%                                                         %%\n%%  Note that the displayed Bibliography will not          %%\n%%  necessarily be rendered by Latex exactly as specified  %%\n%%  in the online Instructions for Authors.                %%\n%%                                                         %%\n%%  MR numbers will be added by VTeX.                      %%\n%%                                                         %%\n%%  Use \\citep{...} to cite references in text.             %%\n%%                                                         %%\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\n%% if your bibliography is in bibtex format, uncomment commands:\n\\bibliographystyle{imsart-nameyear} % Style BST file (imsart-number.bst or imsart-nameyear.bst)\n\\bibliography{myreference-te}       % Bibliography file (usually \'*.bib\')\n\n\n\n\n'], []), '\n']

In [None]:
# Query the parsed tree for every node named 'section'.
# NOTE(review): `soup` is not created in this part of the notebook; presumably
# it is the TexSoup parse result from an earlier cell -- confirm before re-running.
soup.find_all('section')

In [None]:
# Display the whole `soup` object for inspection (it is defined in an earlier cell).
soup

In [None]:
# Call find_bad_lines on infile_path, decoding the file as UTF-8.
# NOTE(review): both find_bad_lines and infile_path come from earlier cells
# that are not visible here -- confirm they are defined on a fresh kernel.
find_bad_lines(infile_path, encoding='utf-8')

In [None]:
# Scratch version of "pick the main .tex file of a submission tarball":
# every member whose name ends in '.tex' is classified with find_doc_class()
# and the resulting code is stored in `main_files` (member name -> code).
# The commented-out `return` lines mark where a function version would exit.
tar_path = "./data/2201_samp/2201.00008v2.tar.gz"
encoding = "utf-8"
with tarfile.open(tar_path, 'r') as in_tar:
    tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]

    # only one .tex member in the archive
    if len(tex_files) == 1:
        pass #return tex_files[0]

    main_files = {}
    for tf in tex_files:
        fp = in_tar.extractfile(tf)
        if fp is None:
            # extractfile() returns None for members that are not regular
            # files or links (e.g. a directory whose name ends in '.tex').
            continue
        # newline=None enables universal newlines; the `with` guarantees the
        # wrapper is closed even when find_doc_class raises.
        with io.TextIOWrapper(fp, newline=None, encoding=encoding) as wrapped_file:
            # find_doc_class returns a code describing the document-class line(s)
            main_files[tf] = find_doc_class(wrapped_file)

    # only one .tex member was classified (every readable .tex member gets an
    # entry, whether or not it has a document class)
    if len(main_files) == 1:
        pass #return next(iter(main_files))

    # account for multi-file submissions: the member with the highest code wins
    #return(max(main_files, key=main_files.get))

In [None]:
# Show the mapping built by the tarball-scanning cell:
# .tex member name -> value returned by find_doc_class for that member.
main_files

In [None]:
# Matches a line that begins (after optional whitespace) with \documentclass
# or \documentstyle.
doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")

# Print the first matching line of the 'main.tex' member of the tarball at
# tar_path (tar_path is set in an earlier cell).
with tarfile.open(tar_path, 'r', encoding='utf-8') as in_tar:
    #in_tar.getnames()
    fp = in_tar.extractfile('main.tex')
    wrapped_file = io.TextIOWrapper(fp, newline=None, encoding='utf-8') #universal newlines
    for line in wrapped_file:
        if doc_class_pat.search(line):
            # stop at the first hit; the iterator is left positioned just after it
            print(line)
            break
# NOTE(review): wrapped_file is never closed here, and it is still referenced
# after the tarfile context has exited (the next cell calls next(wrapped_file)).

In [None]:
# Fetch the line that follows the one printed by the previous cell.
# NOTE(review): this depends on `wrapped_file` being left open and partially
# consumed by that cell, and the tar archive is already closed at this point --
# confirm this still works on a fresh Run All.
next(wrapped_file)

In [None]:
# Sample: a minimal article whose document body contains only a commented-out
# \renewcommand line. It is run through pre_format() and then parsed.
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
# Sample: \newcommand taking one parameter (#1) whose body pairs \left\langle
# with \right| (different opening and closing delimiters).
TS.TexSoup(r'\newcommand{\bra}[1]{\left\langle#1\right|}')

In [None]:
# Sample: a \def whose replacement text is a command with a brace argument
# (\foo{equation}); parsed directly, without pre_format().
TS.TexSoup(r'\def\be{\foo{equation}}')

In [None]:
# Sample: the \renewcommand from the commented-out line used in other cells,
# this time uncommented and parsed on its own.
TS.TexSoup(r'\renewcommand{\shorttitle}{Avoiding Catastrophe}')
# Sample: \newenvironment whose begin/end definitions are each a lone `$`,
# i.e. unbalanced math-shift characters inside brace groups.
min_example = r"\newenvironment{inlinemath}{$}{$}".strip()
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
# Sample: inline math where \left is separated from its bracket by a space
# ("\left [") while the closing side is written "\right]".
min_example = r"In practice, the matrix $\left [ 4 \right]\Inv\M{D}^{(1)}_n $".strip()
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
# Same sentence as the "\left [" sample but with no space after \left; used
# to step through TexSoup's lower-level pipeline by hand.
min_example = r"In practice, the matrix $\left[ 4 \right]\Inv\M{D}^{(1)}_n $"


# categorize() output is fed straight into tokenize(); the tokens are
# materialised as a list for inspection.
cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

# A second categorize() pass, kept as a list so the categorized items can be
# inspected on their own.
char_codes = list(TS.category.categorize(min_example))

# Build a token buffer for just the bracketed fragment and call read_command
# on it in math mode.
# NOTE(review): the meaning of n_required_args=-1 and skip=1 is defined by the
# TexSoup fork in use -- confirm against its reader module.
buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(r'\left[ 4 \right]')))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=1 )

In [None]:
# Sample: two inline math spans written back to back, so the closing `$` of
# the first and the opening `$` of the second appear as `$$`.
min_example = r"$ t \in [0,1] $$ t \in [0,1] $"


# categorize() output is fed straight into tokenize(); the tokens are
# materialised as a list for inspection.
cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

# A second categorize() pass, kept as a list so each item's .category can be read.
char_codes = list(TS.category.categorize(min_example))

# Show the categorized items with their category (one column per item) and
# the token list, with pandas column/width limits changed for this block only.
# 'display.max_columns' is the canonical option name; the previous spelling
# 'display.max.columns' only resolved through pandas' regex matching of option names.
with pd.option_context('display.max_columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

# Low-level read of the same sample in math mode.
# NOTE(review): n_required_args, skip and tolerance semantics come from the
# TexSoup fork in use -- confirm against its reader module.
buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=3, tolerance=1)

# Fresh buffer (the previous one has been consumed) for a full read.
buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.read(buf, tolerance=1)

In [None]:
# Re-display the categorized items (with their category) and the token list
# left in `char_codes` / `tokens` by an earlier cell, with pandas column/width
# limits changed for this block only.
# 'display.max_columns' is the canonical option name; the previous spelling
# 'display.max.columns' only resolved through pandas' regex matching of option names.
with pd.option_context('display.max_columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

In [None]:
# Sample: "\left [" (space before the bracket) opening a longer matrix
# expression that is closed with "\right]".
min_example = r"In practice, the matrix $\left [\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n $"
print(min_example)
# Parse twice to compare: once after pre_format(), once on the raw text.
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)

In [None]:
# Sample: a minimal article whose document body contains only a commented-out
# \renewcommand line (the same text is used in an earlier cell).
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
# Sample: two \def macros whose bodies are a lone \begin{foo} and a lone
# \end{foo}, i.e. each brace group holds only half of an environment.
min_example=r"""
\def\bean {\begin{foo}}  \def\eean {\end{foo}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
# Parse twice to compare: once after pre_format(), once on the raw text.
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)
print(min_example)
# Sample: prose with an opening `{` that is never closed, followed by inline
# math and a \Ref inside parentheses; parsed with tolerance=1.
min_example=r"""
we {use $A=8B$ and $s=1$, then the scalar field becomes same with (\Ref{scalarfield}) and
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#TS.TexSoup(min_example)
# Print the text before and after pre_format() to see what the substitutions changed.
print(min_example)
print(pre_format(min_example))
# Delimiters that may follow a sizing command such as \left or \big.
# Fix: '.' and '|' are now separate entries. The original `'.' '|'` (missing
# comma) was implicitly concatenated by Python into the single bogus
# delimiter '.|'.
BRACKETS_DELIMITERS = {
    '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.', '|', r'\langle',
    r'\rangle', r'\lfloor', r'\rfloor', r'\lceil', r'\rceil', r'\ulcorner',
    r'\urcorner', r'\lbrack', r'\rbrack'
}
# TODO: looks like left-right do have to match
SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg')
# Every size prefix combined with every delimiter, with and without a single
# space between them (e.g. 'left(' and 'left ('). The former
# `.union({'|', '.'})` workaround is no longer needed because both delimiters
# are now in BRACKETS_DELIMITERS, and entries such as 'left.|' are gone.
PUNCTUATION_COMMANDS = {command + opt_space + bracket
                        for command in SIZE_PREFIX
                        for opt_space in {'', ' '}
                        for bracket in BRACKETS_DELIMITERS}
PUNCTUATION_COMMANDS

In [None]:
# A series of small LaTeX snippets, each parsed after pre_format(); the
# comment above each one describes what is unusual about the snippet.

# Sample: \def macros whose bodies are a lone \begin{eqnarray*} and a lone
# \end{eqnarray*} (starred environment split across two brace groups).
min_example=r"""
\def\bean {\begin{eqnarray*}}  \def\eean {\end{eqnarray*}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
# Sample: half-open interval "[0,1)" inside inline math (bracket closed by a parenthesis).
min_example=r"""
the interval $t\in[0,1)$. 
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
# Sample: longer excerpt mixing \beq/\eeq shorthands, a "[" closed by "\}",
# \item[(n)] optional arguments and an \end{lemma} with no matching \begin.
min_example=r"""







\beq
[\chF,\chG\}=\{\partial\chF,\chG\}.
\eeq

derivation $\CA\mapsto [\CB,\CA]$. 







The following characterizations of UAL chains are all equivalent:
\begin{itemize}
    \item[(1)] A skew-symmetric function $\cha:\Lambda^{q+1}\ra\mfkdal$ defines an element of $C_{q}(\mfkdal) $ if $\|\cha\|_{\alpha}<\infty$ for any $\alpha \in \NN$.
    \item[(2)] A skew-symmetric function $\cha:\Lambda^{q+1}\ra\mfkdal$ defines an element of $C_{q}(\mfkdal) $ if there is a function $b(r) \in \Orf$  such that for any $j_0,...,j_q$ the observable $\cha_{j_0...j_q}$ is $b$-localized at $j_a$ for any $a \in \{0,1,...,q\}$.
    \item[(3)] $C_{q}(\mfkdal) $ is the completion of $C_q(\mfkdl) $ with respect to the norms $\|\cdot\|_{\alpha}$.
\end{itemize}
\end{lemma}





""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: \newcommand with the new command name given without braces.
min_example=r"""
\newcommand\const{\operatorname{const}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: the \beq ... \eeq excerpt together with the \newcommand definitions
# of the shorthands it uses. Here the two .replace() calls are active (not
# commented out), so those substitutions run before pre_format() as well.
min_example=r"""
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\chF}{{\mathsf f}}
\newcommand{\chG}{{\mathsf g}}
\beq  
[\chF,\chG\}=\{\partial\chF,\chG\}.
\eeq
derivation $\CA\mapsto [\CB,\CA]$. 
""".strip().replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: display math \[ ... \] containing "[0,\infty)" and a "\}" with no
# matching "\{".
min_example=r"""
\[
r_p=d(p,\cdot)\colon \Gamma \to [0,\infty)|~ p \in M\}
\]
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: sized delimiters of different kinds, \bigl[ closed by \bigr).
min_example=r"""
$\bigl[ a \bigr)$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: open interval written with reversed brackets, "]0,...[".
min_example=r"""

$\varepsilon\in]0,\varepsilon_\star[$,  

""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: display math \[ ... \] containing only the half-open interval "[0,\infty)".
min_example=r"""
\[
i\colon [0,\infty) 
\]
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)

In [None]:
# A series of small LaTeX snippets, each parsed after pre_format() with
# tolerance=1; the comment above each one describes the snippet.

# Sample: \newcommand defining a command whose name is a digit (\1), given
# without braces around the name.
min_example=r"""
\newcommand\1{{\mathds 1}}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# !! This bug was specific to my fork
min_example=r"""
\newcommand{\linebreakand}{%
    \end{@IEEEauthorhalign}
    \hfill\mbox{}\par
    \mbox{}\hfill\begin{@IEEEauthorhalign}
    }
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: inline math with a half-open interval "[1,\infty)" after "\{0\} \bigcup".
min_example=r"""
 $S \subseteq \{0\} \bigcup [1,\infty) $ if $z^*_2=1$.  
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# two inline math envs next to eachother
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$\rm{W_{cyc} }\geq 0$$\;\;\square$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
\verb+$TEXMF/tex/latex/elsevier/+, %$%%%%%%%%%%%%%%%%%%%%%%%%%%%%
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# does not handle missing optional braces around arguments
min_example=r"""
$\sqrt {\frac 3 2} >p >1$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: another pair of inline math envs written back to back ("$...$$...$"),
# here preceded by a table column separator "&".
# NOTE(review): the previous comment here was a copy of the \verb note; this
# looks like the same adjacent-math case as the \rm{W_{cyc}} sample -- confirm.
min_example=r"""
&$\rm{N_{Diskbb}}$$(\times 10^4) $
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: inline math that ends with an unmatched "[" right after a \frac.
# NOTE(review): the previous comment here was a copy of the \verb note and did
# not describe this snippet.
min_example=r"""
$\frac{j+1+\epsilon}{m^{\alpha}}[$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: \frac with unbraced arguments ("\frac n2"), similar to the
# "\sqrt {\frac 3 2}" sample in this cell.
# NOTE(review): the previous comment here was a copy of the \verb note and did
# not describe this snippet.
min_example=r"""
$1\le k< \frac n2 $ 
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# Sample: equation/aligned environments where square brackets appear both as
# an optional argument ([t], \tensor*[]) and as literal content ("[T", "{]}").
# NOTE(review): the previous comment here was a copy of the \verb note and did
# not describe this snippet.
min_example=r"""
\begin{equation}
\begin{aligned}[t]
[T\tensor*[]{]}{_{\CT}^{\sp}} \\
[T]{_{\CT}^{\sp}}
\end{aligned}
\end{equation}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)

In [None]:
# Parse the contents of ./data/test.txt (whitespace-stripped) after
# pre_format(), with tolerance=0.
# NOTE(review): the previous comment here was a copy of the \verb note; what
# the file contains cannot be seen from this notebook.
with open('./data/test.txt', 'r') as infile:
    min_example=infile.read().strip()

TS.TexSoup(pre_format(min_example), tolerance=0)
#print(min_example)

In [26]:
# Sample: a \def that aliases a command taking arguments (\frac) without
# supplying those arguments. With tolerance=0 the recorded run of this cell
# raised "TypeError: ... Malformed argument", reporting that it had just
# parsed ['{', TexCmd('frac', [BraceGroup('}')])].
# NOTE(review): the previous comment here was a copy of the \verb note and did
# not describe this snippet.
min_example=r"""
\def\f{\frac}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=0)
#print(min_example)

TypeError: [Line: 0, Offset 6] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('frac', [BraceGroup('}')])]

In [None]:
import numpy as np
import pandas as pd

# Write a throw-away 10x3 table of random integers in [0, 100), with columns
# A, B and C, to a CSV file under the user's home directory.
random_cells = np.random.randint(0, 100, size=(10, 3))
random_frame = pd.DataFrame(random_cells, columns=list('ABC'))
random_frame.to_csv('~/Expire/test_console_upload.csv')

In [None]:
# Sample: \renewcommand redefining \subsection with one parameter; the body
# is a doubly braced \textit containing "#1" followed by ".~".
min_example=r"""
\renewcommand{\subsection}[1]{{\textit{#1.~}}}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=0)
#print(min_example)