# make_tables.ipynb
**Purpose:** Generate LaTeX tables used in the manuscript (e.g., tuned hyperparameters, feature rankings, thresholds).  
**Inputs:** Excel/CSV outputs from Stage 1 and Stage 2 tuning scripts.  
**Outputs:** LaTeX `.txt` table files, plus an Excel workbook of the pivoted hyperparameters and a model-threshold table.

**Part of repository:** `3_Graphs_Tables`  
**Reproducibility:** This notebook is deterministic given the provided model output files.


In [5]:
import pandas as pd

In [4]:
# Display label for each subreddit key. The keys are what the rest of the
# notebook iterates over and interpolates into input/output file paths; the
# values are the human-readable names used in table captions.
SUBREDDIT_LABELS = dict(
    conspiracy="r/Conspiracy",
    crypto="r/CryptoCurrency",
    politics="r/politics",
)

In [None]:
# Fixed LaTeX fragments of a supplementary-material table. The intended
# assembly order of the final document is:
#   header - title - table_header - tabular - table_footer - caption
supp_mat_table = {
    # Opens the paragraph heading and a bold group that will hold the title.
    "header": r"\paragraph*{S Table.}{\bf",
    # Closes the bold title group, then opens the centred table environment.
    "table_header": "\n".join(
        [
            r"}",
            r"\label{S-Table.}",
            r"\begin{table}[!ht]",
            r"\centering",
        ]
    ),
    # Closes the table environment.
    "table_footer": r"\end{table}",
}

In [None]:
def built_supp_mat_table(title, tabular, caption):
    """Wrap a LaTeX tabular in the supplementary-table boilerplate.

    Every LaTeX row terminator (a double backslash) in ``tabular`` gets an
    ``\\hline`` appended, and the booktabs rules (``\\toprule``,
    ``\\midrule``, ``\\bottomrule``) are stripped out.

    Args:
        title: Text placed inside the bold heading opened by the header.
        tabular: LaTeX source of the tabular environment.
        caption: Text appended directly after the table footer.

    Returns:
        The concatenation header + title + table_header + tabular +
        table_footer + caption, using the fragments in ``supp_mat_table``.
    """
    # Add a horizontal line after every row terminator.
    body = tabular.replace(r"\\", r"\\ \hline")

    # Drop the booktabs rules; the \hline separators replace them.
    for rule in (r'\toprule', r'\midrule', r'\bottomrule'):
        body = body.replace(rule, "")

    pieces = [
        supp_mat_table["header"],
        title,
        supp_mat_table["table_header"],
        body,
        supp_mat_table["table_footer"],
        caption,
    ]
    return "".join(pieces)

In [None]:
# Build one supplementary LaTeX table of tuned LightGBM hyperparameters per
# subreddit, and collect the pivoted tables so they can be written to a single
# Excel workbook afterwards.
hyperparam_dfs = {}
sheet_name = "all_hyperparams"

# Input and output roots, defined once so the per-subreddit paths stay in sync.
outputs_root = "/home/cara/Documents/reddit_analyses/thread-size/Outputs/1_thread_start"
hyperparam_out_dir = "/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs"

for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"{outputs_root}/{subreddit}/4_model/evaluation.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)

    # Rename the columns positionally; this raises if the sheet does not have
    # exactly three columns.
    df.columns = ["Features", "Parameter", "Value"]

    # Forward-fill the feature count so rows with a blank "Features" cell
    # inherit the value from the row above, then store it as an integer.
    df["Features"] = df["Features"].ffill().astype(int)

    # Wide format: one row per feature count, one column per parameter.
    table = df.pivot(index="Features", columns="Parameter", values="Value")

    hyperparam_dfs[subreddit] = table.copy()

    # Turn the "Features" index back into an ordinary first column.
    table = table.reset_index()

    # One left-aligned column for "Features" followed by one right-aligned
    # column per hyperparameter. Deriving this from the table keeps the LaTeX
    # valid when the number of tuned hyperparameters changes (eight
    # hyperparameters give "|l|r|r|r|r|r|r|r|r|").
    column_format = "|l|" + "r|" * (len(table.columns) - 1)

    latex_str = table.to_latex(
        index=False,  # "Features" is already a regular column
        header=True,
        float_format="%.4f",
        column_format=column_format,
        escape=True,  # escape LaTeX special characters such as "_" in names
    )

    caption = f"Optimal LightGBM tree hyperparameters selected via cross-validated Optuna/TPE search for each number of features for {SUBREDDIT_LABELS[subreddit]}. Values represent cross-fold aggregated hyperparameters, using the mode for integer parameters and the mean for continuous parameters. These configurations were used for the final thread start model evaluation."
    # NOTE(review): the title uses the raw subreddit key while the caption
    # uses the display label from SUBREDDIT_LABELS; confirm this is intended.
    title = f"{subreddit} tuned thread start LightGBM hyperparameters by number of features."
    full_latex = built_supp_mat_table(title, latex_str, caption)

    # Save with an explicit encoding so the output does not depend on the
    # platform's default locale.
    with open(f"{hyperparam_out_dir}/{subreddit}.txt", "w", encoding="utf-8") as f:
        f.write(full_latex)

# Write every pivoted table into one workbook, one sheet per subreddit.
outfile = f"{hyperparam_out_dir}/hyperparams.xlsx"
with pd.ExcelWriter(outfile) as writer:
    for sub, df in hyperparam_dfs.items():
        df.to_excel(writer, sheet_name=sub)


In [8]:
# Display whatever DataFrame is currently bound to `df`; this depends on which
# cell assigned it last.
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Key,Value
0,1.0,0,n_feats,1
1,,1,MCC_before_thresh,0.061094
2,,2,final_threshold,0.642757
3,,3,MCC_after_thresh,0.102834
4,,4,final_class_weights,"{0: np.float64(1.0), 1: np.float64(1.752410111..."
...,...,...,...,...
195,,3,MCC_after_thresh,0.27879
196,,4,final_class_weights,"{0: np.float64(1.449421193232413), 1: np.float..."
197,,5,features,"['author_freq', 'question_ratio', 'domain_freq..."
198,,6,best_MCC,0.369864


In [13]:
# Collect, for every subreddit, the "model_threshold" value per feature count
# from the "all_params" sheet of its evaluation workbook.
sheet_name = "all_params"
model_threshold_dfs = {}
for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/1_thread_start/{subreddit}/4_model/evaluation.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)

    # Positional rename of the sheet's four columns.
    df.columns = ["Features", "param_index", "Parameter", "Value"]

    # Rows with a blank "Features" cell inherit the value from the row above.
    filled_features = df["Features"].ffill()
    df["Features"] = filled_features.astype(int)

    # Wide format: one row per feature count, one column per parameter.
    table = df.pivot(values="Value", index="Features", columns="Parameter")

    # Keep an independent single-column frame holding only the threshold.
    model_threshold_dfs[subreddit] = table[['model_threshold']].copy()


In [16]:
# Combine the per-subreddit threshold tables side by side (one top-level column
# per subreddit) and save them as an Excel workbook. The target path ends in
# ".xlsx", so the Excel writer is used: writing CSV text there would leave a
# file whose content does not match its extension.
model_thresholds_path = "/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/metrics/model_thresholds.xlsx"
pd.concat(model_threshold_dfs, axis=1).to_excel(model_thresholds_path)

In [11]:
# Display the "model_threshold" column of the pivot currently bound to `table`.
table[['model_threshold']]

Parameter,model_threshold
Features,Unnamed: 1_level_1
1,0.412345
2,0.393197
3,0.426633
4,0.502143
5,0.43905
6,0.462475
7,0.399566
8,0.416296
9,0.463276
10,0.421228


In [None]:

# Write every pivoted hyperparameter table into a single workbook, using the
# subreddit key as the sheet name.
outfile = "/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs/hyperparams.xlsx"
with pd.ExcelWriter(outfile) as writer:
    for sub in hyperparam_dfs:
        df = hyperparam_dfs[sub]
        df.to_excel(writer, sheet_name=sub)