In [None]:
import pandas as pd
import json
from unicodedata import normalize
import numpy as np


from pathlib import Path
from proxann.utils import process_responses, collect_fit_rank_data, compute_correlations_one, compute_agreement_per_topic

In [None]:
def read_json(fpath):
    """Load and return the parsed contents of a JSON file.

    Args:
        fpath: Path to the JSON file (anything accepted by ``open``).

    Returns:
        The Python object produced by ``json.load``.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which may differ (e.g. on Windows).
    with open(fpath, encoding="utf-8") as infile:
        return json.load(infile) # type: ignore

def load_data(
    path_data="/export/usuarios_ml4ds/lbartolome/Repos/umd/theta-evaluation/data/",
    base_path="../data/camera_ready_llm_out/mean/",
    start_date="2024-12-06 09:00:00",
    llm_model="gpt-4o-2024-08-06",
):
    """Load the human responses and the LLM outputs and join the per-topic statistics.

    All arguments default to the values that used to be hard-coded, so
    ``load_data()`` behaves as before; pass other locations to run the
    notebook on a different machine.

    Args:
        path_data: Directory containing the ``json_out/`` configuration files
            and the ``qualtrics/`` response exports.
        base_path: Directory containing ``<dataset>/<llm_model>/<run>/`` folders
            with ``llm_results_q1.json`` and ``llm_results_q2.json``.
        start_date: Forwarded to ``process_responses`` as ``start_date``.
        llm_model: Name of the model sub-directory to read LLM results from.

    Returns:
        A 5-tuple ``(corr_user_results, llm_q1_data, llm_fit_data,
        corr_data_as_dict, responses)``:
        the agreement table left-merged with the correlation table on
        ``id``/``model``/``topic``; the q1 and q2 LLM result records sorted by
        ``"id"``; the human correlation records keyed by ``"id"``; and the
        non-empty responses keyed by topic id.

    Raises:
        FileNotFoundError: If no run directory exists for a dataset/model pair.
    """
    data_root = Path(path_data)

    data_jsons = [
        str(data_root / "json_out" / name)
        for name in (
            "config_wiki_part1.json",
            "config_wiki_part2.json",
            "config_bills_part1.json",
            "config_bills_part2.json",
        )
    ]
    response_csvs = [
        str(data_root / "qualtrics" / name)
        for name in (
            "Cluster+Evaluation+-+Sort+and+Rank_December+12,+2024_05.19.csv",
            "Cluster+Evaluation+-+Sort+and+Rank+-+Bills_December+14,+2024_13.20.csv",
        )
    ]

    # Keep only topics that have at least one response; a topic id appearing in
    # a later CSV overwrites the entry from an earlier one.
    responses = {}
    for csv in response_csvs:
        for topic_id, topic_responses in process_responses(csv, data_jsons, start_date=start_date, path_save=None, removal_condition="loose").items(): # type: ignore
            if topic_responses:
                responses[topic_id] = topic_responses

    _, _, _, corr_data = collect_fit_rank_data(responses) # type: ignore
    corr_data = sorted(corr_data, key=lambda x: x["id"])
    corr_data_as_dict = {item["id"]: item for item in corr_data}

    # For each dataset use the first run directory in sorted order.
    model_output_paths = []
    for dataset in ("wiki", "bills"):
        model_dir = Path(base_path, dataset, llm_model)
        run_dirs = sorted(model_dir.glob("*"))
        if not run_dirs:
            raise FileNotFoundError(f"No LLM output runs found under {model_dir}")
        model_output_paths.append(run_dirs[0])

    llm_fit_data = []
    llm_q1_data = []
    for path in model_output_paths:
        llm_fit_data += read_json(path / "llm_results_q2.json")
        llm_q1_data += read_json(path / "llm_results_q1.json")
    llm_fit_data = sorted(llm_fit_data, key=lambda x: x["id"])
    llm_q1_data = sorted(llm_q1_data, key=lambda x: x["id"])

    agreement_per_topic, _ = compute_agreement_per_topic(responses)

    corr_results = compute_correlations_one(corr_data, fit_llm_data=llm_fit_data)

    # Left merge: every topic with an agreement row is kept even when no
    # correlation row matches it.
    corr_user_results = agreement_per_topic.merge(corr_results, on=["id", "model", "topic"], how="left")

    return corr_user_results, llm_q1_data, llm_fit_data, corr_data_as_dict, responses

In [None]:
# Module-level copies of the selection settings. They mirror the default
# arguments of get_agreement_disagreement_examples below.
# NOTE(review): the calls visible in this notebook do not pass these globals;
# they rely on the function's own defaults - confirm whether these are still needed.
fit_alpha_threshold = 0.7  
tau_selection_count = 2
model_to_use = "fit_tau" # alternative column: "fit_tau_users_gpt-4o-2024-08-06"

def get_agreement_disagreement_examples(corr_user_results, fit_alpha_threshold=0.7, tau_selection_count=2, model_to_use="fit_tau", keep_high_hh=False):
    """Pick example topics illustrating human-human (HH) and human-model (HM) (dis)agreement.

    Args:
        corr_user_results: DataFrame with one row per topic; must contain the
            ``fit_alpha`` column and the column named by ``model_to_use``.
        fit_alpha_threshold: Topics with ``fit_alpha`` strictly above this value
            count as high HH agreement, strictly below as low HH agreement.
            Rows exactly equal to the threshold fall in neither group.
        tau_selection_count: Number of topics taken for each of the HM groups.
        model_to_use: Column used to rank topics by human-model agreement.
        keep_high_hh: When True, also include the topics with the highest
            ``fit_alpha`` as a "Highest HH" group.

    Returns:
        A new DataFrame (fresh index) with the selected rows and an ordered
        categorical ``agreement_label`` column. Groups appear in the order
        Lowest HH, [Highest HH], Low HM, High HM; a topic can appear in more
        than one group. The threshold is rendered into the labels, so with the
        default threshold the labels read ``... 0.7 ...``.
    """
    # The HH groups always show two topics.
    hh_selection_count = 2

    threshold_text = f"{fit_alpha_threshold:g}"
    lowest_hh_label = f"Lowest HH ($\\alpha < {threshold_text}$)"
    highest_hh_label = f"Highest HH ($\\alpha > {threshold_text}$)"
    low_hm_label = f"Low HM (High HH $\\alpha > {threshold_text}$)"
    high_hm_label = f"High HM (High HH $\\alpha > {threshold_text}$)"

    high_hh_only = corr_user_results[corr_user_results["fit_alpha"] > fit_alpha_threshold]
    low_hh_only = corr_user_results[corr_user_results["fit_alpha"] < fit_alpha_threshold]

    # Lowest human-human agreement: smallest fit_alpha below the threshold.
    low_hh_agreement = (
        low_hh_only.sort_values("fit_alpha")
        .head(hh_selection_count)
        .assign(agreement_label=lowest_hh_label)
    )

    # Highest human-human agreement: largest fit_alpha above the threshold.
    high_hh_agreement = (
        high_hh_only.sort_values("fit_alpha", ascending=False)
        .head(hh_selection_count)
        .assign(agreement_label=highest_hh_label)
    )

    # Human-model agreement is only compared among topics where humans agree.
    low_hm_agreement = (
        high_hh_only.sort_values(model_to_use)
        .head(tau_selection_count)
        .assign(agreement_label=low_hm_label)
    )
    high_hm_agreement = (
        high_hh_only.sort_values(model_to_use, ascending=False)
        .head(tau_selection_count)
        .assign(agreement_label=high_hm_label)
    )

    # Honour the caller's keep_high_hh flag (it used to be overwritten with False here).
    if keep_high_hh:
        groups = [low_hh_agreement, high_hh_agreement, low_hm_agreement, high_hm_agreement]
        categories = [lowest_hh_label, highest_hh_label, low_hm_label, high_hm_label]
    else:
        groups = [low_hh_agreement, low_hm_agreement, high_hm_agreement]
        categories = [lowest_hh_label, low_hm_label, high_hm_label]

    selected = pd.concat(groups, ignore_index=True)

    # Ordered categorical so that grouping/sorting follows the presentation order.
    selected["agreement_label"] = pd.Categorical(
        selected["agreement_label"],
        categories=categories,
        ordered=True
    )

    return selected

In [6]:
def sanitize(text):
    """Make a value safe to paste into LaTeX source.

    The value is converted to ``str`` if necessary, folded to ASCII
    (NFKD decomposition, then characters with no ASCII form are dropped) and
    the LaTeX special characters ``& % # _ { }`` are backslash-escaped.

    ``$`` and ``\\`` are deliberately left untouched so that callers can pass
    strings that already contain inline math such as ``$\\alpha = 0.7$``.
    The function is not idempotent: sanitizing an already sanitized string
    escapes it a second time.

    Args:
        text: The value to escape; non-strings are converted with ``str``.

    Returns:
        The escaped ASCII string.
    """
    if not isinstance(text, str):
        text = str(text)
    # The ASCII fold must run for every input. It used to sit inside the
    # branch above and was therefore skipped for actual strings.
    text = normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    for special in ("&", "%", "#", "_", "{", "}"):
        text = text.replace(special, "\\" + special)
    return text

def truncate(text, max_chars=150):
    """Sanitize ``text`` and shorten it to roughly ``max_chars`` characters.

    The text is first passed through ``sanitize``. If the result fits within
    ``max_chars`` it is returned as is. Otherwise the first sentence (up to and
    including the first ``". "`` period) is returned when it fits; failing
    that, the text is cut at the last space before the limit and ``" [...]"``
    is appended.
    """
    escaped = sanitize(text)
    if len(escaped) > max_chars:
        # Position just past the first sentence-ending period (0 if none found).
        sentence_end = escaped.find(". ") + 1
        if 0 < sentence_end <= max_chars:
            return escaped[:sentence_end]

        # No usable sentence boundary: fall back to a word boundary.
        head = escaped[:max_chars].rstrip()
        before, space, _ = head.rpartition(" ")
        kept = before if space else head
        return kept + " [...]"
    return escaped


def format_exemplar(doc, fit_scores=None, llm_fit_scores=None):
    """Render one document as a LaTeX ``minipage`` with a statistics header.

    Args:
        doc: Mapping with the keys ``"text"``, ``"doc_id"`` and ``"prob"``.
        fit_scores: Optional object exposing ``.mean()`` and ``.std()``
            (e.g. a NumPy array of human fit scores); shown as mean +- std.
        llm_fit_scores: Optional number shown as the LLM fit score.

    Returns:
        A LaTeX string: an italic header line with the document id and
        statistics, followed by the truncated, escaped document text.
    """
    label = "\\ul{Text:}"
    # truncate() already LaTeX-escapes the text. Escaping again would turn
    # e.g. "\&" into "\\&", which LaTeX reads as a line break plus a column
    # separator.
    content = truncate(doc["text"])
    doc_id = f"\\textbf{{Doc ID}} {doc['doc_id']}:"

    stats = []
    if fit_scores is not None:
        stats.append(f"$\\text{{Human Fit}}={fit_scores.mean():.2f}\\pm{fit_scores.std():.2f}$")
    if llm_fit_scores is not None:
        stats.append(f"$\\text{{LLM Fit}}={llm_fit_scores:.2f}$")
    stats.append(f"$\\theta_d={doc['prob']:.2f}$")

    stats_line = ", ".join(stats)

    return (
        "\\begin{minipage}[t]{\\linewidth}\n"
        f"\\textit{{{doc_id} {stats_line}}} \\\\\n"
        f"{label} {content}\n"
        "\\end{minipage}"
    )




## Table 6

In [44]:
# Load everything needed for Table 6 and pick the example topics, ranking
# human-model agreement by the "fit_tau" column.
corr_user_results, llm_q1_data, llm_fit_data, corr_data_as_dict, responses = load_data()
selected = get_agreement_disagreement_examples(corr_user_results,model_to_use="fit_tau")

Total responses: 142
Total responses: 142
Removed: 25
Total responses: 121
Total responses: 121
Removed: 20


In [45]:
# Display the selected topics and their agreement labels.
selected

Unnamed: 0,id,model,topic,fit_alpha,fit_alpha_p,fit_ac2,fit_ac2_p,rank_alpha,rank_alpha_p,rank_ac2,...,rank_rho,rank_tau,fit_agree,fit_ndcg,rank_ndcg,fit_tau_users_gpt-4o-2024-08-06,fit_tau_tm_gpt-4o-2024-08-06,fit_agree_users_gpt-4o-2024-08-06,fit_agree_tm_gpt-4o-2024-08-06,agreement_label
0,bills-labeled/bertopic/3,bertopic,3,-0.16482,0.01843087,0.18325,0.1792181,0.33036,0.164627,0.30556,...,0.063636,-0.05,0.142857,0.912672,0.778174,0.370479,0.48795,0.0,0.857143,Lowest HH ($\alpha < 0.7$)
1,bills-labeled/mallet/12,mallet,12,-0.14565,0.004403206,0.31979,0.01321369,0.08036,0.637214,0.0463,...,0.654654,0.450564,0.714286,0.994317,0.953366,0.899735,0.714286,0.857143,0.571429,Lowest HH ($\alpha < 0.7$)
2,wikitext-labeled/bertopic/21,bertopic,21,0.71804,0.01954081,0.83725,5.003309e-05,0.41113,0.102941,0.45238,...,0.245455,0.15,1.0,0.932048,0.747733,0.617213,0.58554,0.857143,0.857143,Low HM (High HH $\alpha > 0.7$)
3,wikitext-labeled/mallet/5,mallet,5,0.80974,0.0008579068,0.886,2.553467e-05,0.66029,0.025912,0.73426,...,0.594619,0.48795,0.571429,0.85617,0.90871,0.720082,0.238095,0.142857,0.571429,Low HM (High HH $\alpha > 0.7$)
4,wikitext-labeled/ctm/35,ctm,35,0.94901,1.589718e-06,0.97392,9.474791e-09,0.56952,0.028607,0.625,...,0.504525,0.39036,0.571429,1.0,0.82331,0.732467,0.428571,0.714286,0.285714,High HM (High HH $\alpha > 0.7$)
5,wikitext-labeled/ctm/2,ctm,2,0.98077,4.531295e-09,0.98191,5.234873e-09,0.61199,0.000596,0.59259,...,0.727393,0.550689,1.0,1.0,0.969268,0.583212,0.428571,0.571429,0.571429,High HM (High HH $\alpha > 0.7$)


In [None]:
# Build the LaTeX rows of Table 6: one shaded header row per agreement group,
# then for every selected topic a title row and a content row (topic words,
# human categories, two exemplar documents, two evaluation documents).

# Number of evaluation documents indexed per topic.
# NOTE(review): assumes every topic has exactly 7 scored eval docs - confirm.
N_EVAL_DOCS = 7

latex_rows = []

# observed=True iterates only over labels that actually occur and avoids the
# pandas FutureWarning about the default for categorical groupers.
for label, rows in selected.groupby("agreement_label", observed=True):
    label_title = f"\\textbf{{{label}}}"

    latex_rows.append("\\midrule")
    latex_rows.append(f"\\rowcolor{{gray!15}} \\multicolumn{{6}}{{c}}{{{label_title}}} \\\\")

    for _, row in rows.iterrows():
        topic_id = row["id"]
        agreement_label = str(row["agreement_label"])
        # Topic ids look like "<corpus>/<model>/<topic number>".
        model_type = topic_id.split("/")[-2]
        if "wiki" in topic_id.lower():
            topic_id_formatted = f"Wiki on {model_type.upper()} ($\\alpha = {row['fit_alpha']:.2f}$)"
        elif "bills" in topic_id.lower():
            topic_id_formatted = f"Bills on {model_type.upper()} ($\\alpha = {row['fit_alpha']:.2f}$)"
        else:
            # Fallback so an unexpected corpus name cannot leave the title
            # unset (or silently reuse the previous topic's title).
            topic_id_formatted = f"{model_type.upper()} Topic ($\\alpha = {row['fit_alpha']:.2f}$)"

        if agreement_label.startswith("High HM"):
            # Re-open the parenthesis to append the human-model tau.
            topic_id_formatted = topic_id_formatted.split(")")[0]
            topic_id_formatted += f", $\\tau = {row['fit_tau']:.2f}$)"
        topic_info = responses[topic_id][0]

        # Italicized topic ID row
        latex_rows.append(f"\\multicolumn{{6}}{{c}}{{\\textit{{{sanitize(topic_id_formatted)}}}}} \\\\")

        # Content row
        topic_words = sanitize(", ".join(topic_info["topic_words"][:8]))

        # De-duplicate while keeping first-seen order so the table is
        # identical from run to run (set order is not deterministic).
        raw_categories = list(dict.fromkeys(sanitize(r.get("category", "")) for r in responses[topic_id]))
        cat_items = "\n".join(f"\\item {c}" for c in raw_categories)
        categories = (
            "\\begin{minipage}[t]{\\linewidth}\n"
            "\\begin{itemize}\n"
            "\\setlength{\\itemsep}{0pt}\n"
            "\\setlength{\\parskip}{0pt}\n"
            f"{cat_items}\n"
            "\\end{itemize}\n"
            "\\end{minipage}"
        )

        # we keep as exemplars the first two exemplar docs
        exemplars = topic_info.get("exemplar_docs", [])
        ex1 = format_exemplar(exemplars[0]) if len(exemplars) > 0 else ""
        ex2 = format_exemplar(exemplars[1]) if len(exemplars) > 1 else ""

        eval_docs = topic_info.get("eval_docs", [])
        corr_topic = corr_data_as_dict[topic_id]

        all_human_diffs = []
        all_human_tm_diffs = []
        fit_scores = []
        for i in range(N_EVAL_DOCS):
            # Column i of fit_data: presumably one human score per annotator
            # for evaluation document i - TODO confirm.
            human_fit_docs = corr_topic["fit_data"][:, i]
            tm_prob_doc = corr_topic["prob_data"][i]

            # Spread between the most and least favourable human score.
            all_human_diffs.append(np.abs(human_fit_docs.min() - human_fit_docs.max()))
            # Gap between the topic-model value and the mean human score.
            all_human_tm_diffs.append(np.abs(tm_prob_doc - human_fit_docs.mean()))
            fit_scores.append(human_fit_docs)

        # find the two documents with the highest human differences
        sorted_indices = np.argsort(all_human_diffs)[::-1]
        low_hh_agreement_evals = sorted_indices[:2]

        # find the two documents with the lowest human differences
        high_hh_agreement_evals = sorted_indices[-2:]

        # find the two documents with the highest human-tm differences
        low_hm_agreement_evals = np.argsort(all_human_tm_diffs)[-2:]

        if agreement_label.startswith("Lowest HH"):
            indexes_eval = low_hh_agreement_evals
        elif agreement_label.startswith("Low HM"):
            indexes_eval = low_hm_agreement_evals
        else:
            # "High HM" (and "Highest HH", if that group is present): show the
            # documents on which the annotators agreed most.
            indexes_eval = high_hh_agreement_evals

        first_idx, second_idx = indexes_eval[0], indexes_eval[1]
        ev1 = format_exemplar(eval_docs[first_idx], fit_scores=fit_scores[first_idx]) if first_idx < len(eval_docs) else ""
        # Each document is shown with its own scores (the second document
        # used to be paired with the first document's scores).
        ev2 = format_exemplar(eval_docs[second_idx], fit_scores=fit_scores[second_idx]) if second_idx < len(eval_docs) else ""

        latex_rows.append(
            f"{topic_words} & {categories} & {ex1} & {ex2} & {ev1} & {ev2} \\\\"
        )

        latex_rows.append("\\addlinespace")

latex_table = f"""
\\renewcommand{{\\arraystretch}}{{1.4}}
\\begin{{tabular}}{{ >{{\\centering\\arraybackslash}}p{{3cm}} p{{5cm}} p{{4.5cm}} p{{4.5cm}} p{{4.5cm}} p{{4.5cm}} }}
\\toprule
\\textbf{{Topic Words}} & \\textbf{{Categories}} & \\textbf{{Exemplar Document 1}} & \\textbf{{Exemplar document 2}} & \\textbf{{Evaluation Document 1}} & \\textbf{{Evaluation Document 2}} \\\\
\\midrule
{chr(10).join(latex_rows)}
\\bottomrule
\\end{{tabular}}
"""

print(latex_table)


\renewcommand{\arraystretch}{1.4}
\begin{tabular}{ >{\centering\arraybackslash}p{3cm} p{5cm} p{4.5cm} p{4.5cm} p{4.5cm} p{4.5cm} }
\toprule
\textbf{Topic Words} & \textbf{Categories} & \textbf{Exemplar Document 1} & \textbf{Exemplar document 2} & \textbf{Evaluation Document 1} & \textbf{Evaluation Document 2} \\
\midrule
\midrule
\rowcolor{gray!15} \multicolumn{6}{c}{\textbf{Lowest HH ($\alpha < 0.7$)}} \\
\multicolumn{6}{c}{\textit{Bills on BERTOPIC ($\alpha = -0.16$)}} \\
student, school, students, education, schools, leas, higher, ihes & \begin{minipage}[t]{\linewidth}
\begin{itemize}
\setlength{\itemsep}{0pt}
\setlength{\parskip}{0pt}
\item Aiding children in school to receive a proper, well informed, education in school and post secondary.
\item Educational reform and students' welfare
\item K-12 federal education legislation
\item High School Student Initiative
\end{itemize}
\end{minipage} & \begin{minipage}[t]{\linewidth}
\textit{\textbf{Doc ID} 1947: $\theta_d=1.00$} \\
\ul{Te

  for label, rows in selected.groupby("agreement_label"):


## Table 7

In [11]:
# Reload the data and pick example topics again, this time ranking human-model
# agreement by the "fit_tau_users_gpt-4o-2024-08-06" column.
# Note that this rebinds `selected`, which generate_latex_table_for_topic reads.
corr_user_results, llm_q1_data, llm_fit_data, corr_data_as_dict, responses = load_data()
selected = get_agreement_disagreement_examples(corr_user_results, model_to_use="fit_tau_users_gpt-4o-2024-08-06")
# Keep only the rows labelled "Low HM (High HH $\alpha > 0.7$)".
selected = selected[selected["agreement_label"] == "Low HM (High HH $\\alpha > 0.7$)"]

Total responses: 142
Total responses: 142
Removed: 25
Total responses: 121
Total responses: 121
Removed: 20


In [12]:
def generate_latex_table_for_topic(index):
    """Print a stand-alone LaTeX table describing one selected topic.

    Reads the notebook globals ``selected``, ``responses``, ``llm_q1_data``,
    ``llm_fit_data`` and ``corr_data_as_dict``. The table lists the topic
    words, the human-written categories, the first LLM category, up to three
    exemplar documents and every evaluation document with its human and LLM
    fit scores.

    Args:
        index: Positional row index into ``selected``.

    Returns:
        None. The LaTeX source is printed.
    """

    def _itemize(items, itemsep):
        # Wrap already-formatted strings in a compact itemize inside a minipage.
        body = "\n".join(f"\\item {item}" for item in items)
        return (
            "\\begin{minipage}[t]{\\linewidth}\n"
            "\\begin{itemize}\n"
            f"\\setlength{{\\itemsep}}{{{itemsep}}}\n"
            "\\setlength{\\parskip}{0pt}\n"
            f"{body}\n"
            "\\end{itemize}\n"
            "\\end{minipage}"
        )

    row = selected.iloc[index]
    topic_id = row["id"]
    # Topic ids look like "<corpus>/<model>/<topic number>".
    model_type = topic_id.split("/")[-2]

    alpha = row['fit_alpha']
    tau = row['fit_tau_users_gpt-4o-2024-08-06']
    stats = f"\\(\\alpha = {alpha:.2f},\\ \\tau = {tau:.2f}\\)"

    if "wiki" in topic_id.lower():
        topic_id_formatted = f"Wiki on {model_type.upper()} ({stats})"
    elif "bills" in topic_id.lower():
        topic_id_formatted = f"Bills on {model_type.upper()} ({stats})"
    else:
        topic_id_formatted = f"{model_type.upper()} Topic ({stats})"

    topic_info = responses[topic_id][0]
    topic_words = sanitize(", ".join(topic_info["topic_words"][:8]))

    # De-duplicate while keeping first-seen order so the output is identical
    # from run to run (set order is not deterministic).
    raw_categories = list(dict.fromkeys(sanitize(r.get("category", "")) for r in responses[topic_id]))
    categories = _itemize(raw_categories, "2pt")

    llm_label_topic = sanitize(next(d for d in llm_q1_data if d["id"] == topic_id)["categories"][0])

    exemplars = [format_exemplar(e) for e in topic_info.get("exemplar_docs", [])[:3]]
    ex_docs_formatted = _itemize(exemplars, "4pt")

    eval_docs = topic_info.get("eval_docs", [])
    llm_fit_topic = next(d for d in llm_fit_data if d["id"] == topic_id)
    corr_topic = corr_data_as_dict[topic_id]

    # Build a new list instead of overwriting the entries of `eval_docs`:
    # that list belongs to `responses`, and replacing its documents with
    # strings broke every later use of the same topic (including re-running
    # this function).
    formatted_eval_docs = [
        format_exemplar(
            doc,
            fit_scores=corr_topic["fit_data"][:, i],
            llm_fit_scores=llm_fit_topic["fit_data"][0][i],
        )
        for i, doc in enumerate(eval_docs)
    ]
    eval_docs_formatted = _itemize(formatted_eval_docs, "4pt")

    table = f"""
\\renewcommand{{\\arraystretch}}{{1.4}}
\\begin{{tabular}}{{p{{5cm}}p{{6cm}}p{{5cm}}}}
\\hline
\\rowcolor{{gray!15}}
\\multicolumn{{3}}{{c}}{{\\textbf{{{sanitize(topic_id_formatted)}}}}} \\\\
\\textbf{{Topic Words}} & \\textbf{{Human Categories}} & \\textbf{{LLM Category}} \\\\
{topic_words} & {categories} & {llm_label_topic} \\\\ \\addlinespace \\midrule
\\multicolumn{{3}}{{l}}{{\\textbf{{Exemplar Documents}}:}} \\\\
\\multicolumn{{3}}{{l}}{{{ex_docs_formatted}}} \\\\ \\addlinespace \\midrule
\\multicolumn{{3}}{{l}}{{\\textbf{{Evaluation Documents}}:}} \\\\
\\multicolumn{{3}}{{l}}{{{eval_docs_formatted}}} \\\\
\\addlinespace
\\bottomrule
\\end{{tabular}}
"""
    print(table)

In [13]:
# Print the table for the first row of `selected` (the first "Low HM" topic).
generate_latex_table_for_topic(0)


\renewcommand{\arraystretch}{1.4}
\begin{tabular}{p{5cm}p{6cm}p{5cm}}
\hline
\rowcolor{gray!15}
\multicolumn{3}{c}{\textbf{Bills on BERTOPIC (\(\alpha = 0.76,\ \tau = 0.48\))}} \\
\textbf{Topic Words} & \textbf{Human Categories} & \textbf{LLM Category} \\
spirits, distilled, beer, wine, excise, brewers, cider, wines & \begin{minipage}[t]{\linewidth}
\begin{itemize}
\setlength{\itemsep}{2pt}
\setlength{\parskip}{0pt}
\item tax reform on alcohol products
\item Distilled Goods Legislation
\item LEGAL INVOICE
\item Alcohol Internal Revenue Code
\end{itemize}
\end{minipage} & Alcoholic Beverage Taxation and Regulation \\ \addlinespace \midrule
\multicolumn{3}{l}{\textbf{Exemplar Documents}:} \\
\multicolumn{3}{l}{\begin{minipage}[t]{\linewidth}
\begin{itemize}
\setlength{\itemsep}{4pt}
\setlength{\parskip}{0pt}
\item \begin{minipage}[t]{\linewidth}
\textit{\textbf{Doc ID} 7046: $\theta_d=1.00$} \\
\ul{Text:} Amends the Internal Revenue Code to exclude from determination of the production p

In [14]:
# Print the table for the second row of `selected` (the second "Low HM" topic).
generate_latex_table_for_topic(1)


\renewcommand{\arraystretch}{1.4}
\begin{tabular}{p{5cm}p{6cm}p{5cm}}
\hline
\rowcolor{gray!15}
\multicolumn{3}{c}{\textbf{Wiki on CTM (\(\alpha = 0.98,\ \tau = 0.58\))}} \\
\textbf{Topic Words} & \textbf{Human Categories} & \textbf{LLM Category} \\
career, hit, games, season, league, baseball, major\_league\_baseball, signed & \begin{minipage}[t]{\linewidth}
\begin{itemize}
\setlength{\itemsep}{2pt}
\setlength{\parskip}{0pt}
\item Former MLB players
\item American baseball league
\item Professional baseball facts and figures
\end{itemize}
\end{minipage} & Major League Baseball Players and Achievements \\ \addlinespace \midrule
\multicolumn{3}{l}{\textbf{Exemplar Documents}:} \\
\multicolumn{3}{l}{\begin{minipage}[t]{\linewidth}
\begin{itemize}
\setlength{\itemsep}{4pt}
\setlength{\parskip}{0pt}
\item \begin{minipage}[t]{\linewidth}
\textit{\textbf{Doc ID} 2943: $\theta_d=0.61$} \\
\ul{Text:} Brian Wilson ( baseball ) = Brian Patrick Wilson ( born March 16 , 1982 ) is a former America