In [1]:
import regex as re

# Human-readable labels used when rendering the LaTeX tables.

# Feature identifier -> display name of the modelled outcome.
featnames = {
    "lexdiv": "Lexical Diversity",
    "sentclass": "Sentiment",
    "sim": "Topical Similarity",
    "smog": "Readability",
}

# Model term -> display name. Keys are written with '-' here and normalised to
# '.' in the same expression (presumably to match how the fitted model spells
# its coefficient names -- confirm against the summary files).
polnames = {
    raw_key.replace("-", "."): label
    for raw_key, label in {
        "policy_chron": "Chronological",
        "policy_least-neg-abs": "Rev. Downvotes",
        "policy_neg-abs": "Downvotes",
        "policy_pos-abs": "Upvotes",
        "policy_pos-rel": "Relative Votes",
        "policy_rev-chron": "Rev. Chronological",
        "policy_random": "Random",
        "policy_pin-pred-lr": "Pred. Editors' Picks (LR)",
        "policy_pin-pred-rf": "Pred. Editors' Picks (RF)",
        "policy_pred-nb": "Pred. Upvotes (NBR)",
        "policy_pred-rf": "Pred. Upvotes (RF)",
        "replies_rh": "Replies Hidden",
        "replies_rt": "Reply Trees Shown",
        "replies_rl": "Replies Loose",
        "replies hidden": "Replies Hidden",
        "reply trees": "Reply Trees Shown",
        "replies loose": "Replies Loose",
        "pinned": "Editors' Picks Pinned",
        "not_pinned": "Editors' Picks Not Pinned",
    }.items()
}

# Discussion-scope identifier -> phrase used in the table caption.
titles = {
    "10": "First 10 comments",
    "N": "Full Comment Discussion",
}

def _format_p_value(tokens):
    """Render the trailing tokens of a coefficient row as a LaTeX p-value cell.

    Args:
        tokens: Whitespace-split tokens that follow the z-value column: the
            p-value (possibly two tokens, e.g. ``['<', '2e-16']``), optionally
            followed by a significance marker (``***``, ``**``, ``*`` or ``.``).

    Returns:
        The p-value wrapped in ``$...$`` followed by its marker. Rows that
        carry no marker get ``.`` appended.
    """
    marker = tokens[-1]
    if '*' in marker or marker == '.':
        value_tokens = tokens[:-1]
    else:
        # No marker column on this row: the last token is the p-value itself,
        # so keep it whole instead of slicing characters off it.
        value_tokens, marker = tokens, '.'
    p_value = ' '.join(value_tokens)
    # Apply scientific notation formatting: 2e-16 -> 2\times 10 ^ {-16}
    p_value = re.sub(r'e(-?\d+)', r'\\times 10 ^ {\1}', p_value)
    return '$' + p_value + '$' + marker


def read_model_output(filename, sample_size="172,590"):
    """Read a model summary text file and extract the data needed for the table.

    The parser looks for three things in the file:

    * the line starting with ``AIC``; the first three tokens of the next
      non-blank line are stored as ``AIC``, ``BIC`` and ``logLik``;
    * a line starting with ``Dispersion``; its last token is stored as
      ``dispersion``;
    * the coefficient table that follows ``Conditional model:`` and ends at a
      line consisting of ``---``.

    Args:
        filename: Path of the model summary text file.
        sample_size: Value stored under ``model_stats['sample_size']``. It is
            not parsed from the file; pass it explicitly if the default does
            not apply.

    Returns:
        A ``(data, model_stats)`` tuple. ``data`` is a list in which each
        coefficient is a ``(variable, estimate, std_error, z_value, p_value)``
        tuple of strings and each section heading is a plain string. Main
        effects come first, followed by the reply-structure interactions and
        the editors'-picks interactions. ``model_stats`` is a dict of strings.
    """
    main_rows = []
    reply_hidden_rows = []
    reply_tree_rows = []
    pin_rows = []
    model_stats = {}

    awaiting_fit_values = False
    in_coefficients = False
    with open(filename, 'r') as file:
        for line in file:
            stripped = line.strip()
            if stripped.startswith("AIC"):
                awaiting_fit_values = True
                continue
            if awaiting_fit_values:
                stats_parts = line.split()
                if not stats_parts:
                    # Tolerate blank lines between the header and its values
                    continue
                model_stats['AIC'] = stats_parts[0]
                model_stats['BIC'] = stats_parts[1]
                model_stats['logLik'] = stats_parts[2]
                awaiting_fit_values = False
                continue
            if stripped.startswith("Dispersion"):
                model_stats['dispersion'] = line.split()[-1]
                continue
            if stripped.startswith("Conditional model:"):
                in_coefficients = True
                continue
            if not in_coefficients:
                continue
            if stripped == "---":
                # Separator before the significance-code legend: table is done
                break
            parts = line.split()
            if len(parts) < 5:
                continue
            if parts[0] == 'Estimate':
                # Column header row ("Estimate Std. Error z value Pr(>|z|)")
                continue

            variable = parts[0]
            row = (variable, parts[1], parts[2], parts[3], _format_p_value(parts[4:]))
            if ':replies_rh' in variable:
                reply_hidden_rows.append(row)
            elif ':replies_rt' in variable:
                reply_tree_rows.append(row)
            elif ':pinned' in variable:
                pin_rows.append(row)
            else:
                main_rows.append(row)

    data = (main_rows
            + ['Reply Structure Interactions'] + reply_hidden_rows + reply_tree_rows
            + ['Editors\' Picks Interactions'] + pin_rows)

    model_stats['sample_size'] = sample_size
    return data, model_stats

def format_latex_table(data, model_stats, featname, n):
    """Format parsed model output as a LaTeX ``table*`` and return it as a string.

    The result contains two ``tabular`` environments: a one-row summary of the
    fit statistics, and the coefficient table with one row per entry of
    ``data``. Raw term names are then replaced by the display names in the
    module-level ``polnames`` mapping.

    Args:
        data: Sequence whose items are either 5-tuples
            ``(variable, estimate, std_error, z_value, p_value)`` or plain
            strings, which are rendered as italic section headings.
        model_stats: Dict with the keys ``sample_size``, ``AIC``, ``BIC``,
            ``logLik`` and ``dispersion``.
        featname: Key into the module-level ``featnames`` mapping.
        n: Key into the module-level ``titles`` mapping.

    Returns:
        The complete LaTeX source of the table.

    Raises:
        KeyError: If ``featname``, ``n`` or a required ``model_stats`` key is
            missing.
    """
    fname = "$\\text{FORUM}^{\\text{%s}}_{\\text{%s}}$" %(featnames[featname], n)
    preamble = ["\\begin{table*}[htbp]\n", "\\centering\n"]

    pre_table = ["\\begin{tabular}{c|c|c|c|c|c}\n",
                 "Outcome & Sample Size & AIC & BIC & Log-Likelihood & Dispersion ($\\phi$) \\\\ \\hline \n",
                 f"{fname} & {model_stats['sample_size']} & {model_stats['AIC']} & {model_stats['BIC']} & {model_stats['logLik']} & {model_stats['dispersion']} \\\\\n",
                 "\\end{tabular}\n\n\\ \n\n\\ \n\n"]

    main_table = ["\\begin{tabular}{l|r|r|r|c}\n",
                  "Variable & Estimate & Std. Error & $z$-value & $p$-value \\\\\n",
                  "\\hline\n",
                  "---\\textit{Main Effects} & & & & \\\\\n"]
    for d in data:
        if isinstance(d, str):
            # Section heading. Checked by type rather than by a failed unpack,
            # so a heading that happens to be five characters long is not
            # split into five columns.
            formatted_line = "---\\textit{%s} & & & & \\\\\n" %d
        else:
            var, est, err, z, p = d
            formatted_line = f"{var} & {est} & {err} & {z} & {p} \\\\\n"
        # Space out the ':' that joins the terms of an interaction
        main_table.append(formatted_line.replace(':', ' : '))
    main_table.append("\\end{tabular}\n")

    # "\\ge" is escaped explicitly: a bare "\g" is an invalid escape sequence.
    postamble = ["\\caption{Regression output for modelling FORUM score for %s over the %s (***: $p<0.001$, **: $p<0.01$, *: $p<0.05$, .: $p \\ge 0.05$).}\n" %(featnames[featname], titles[n].lower()),
                 "\\label{tab:results-%s_%s}\n" %(featname, n),
                 "\\end{table*}\n"]

    table = ''.join(preamble + pre_table + main_table + postamble)
    # Substitute longest keys first so that a key which is a substring of
    # another (e.g. 'pinned' inside 'not_pinned') cannot corrupt the longer one.
    for k, v in sorted(polnames.items(), key=lambda item: len(item[0]), reverse=True):
        table = table.replace(k, v)
    return table



In [2]:
# Print one LaTeX table per (feature, discussion scope) pair. `featname` and
# `n` are bound by the loops themselves, so no pre-assignment is needed.
for featname in featnames:
    for n in ['10', 'N']:
        data, model_stats = read_model_output(f'data/model_summaries/{featname}_{n}.txt')
        latex_table_string = format_latex_table(data, model_stats, featname, n)
        print(latex_table_string)

\begin{table*}[htbp]
\centering
\begin{tabular}{c|c|c|c|c|c}
Outcome & Sample Size & AIC & BIC & Log-Likelihood & Dispersion ($\phi$) \\ \hline 
$\text{FORUM}^{\text{Lexical Diversity}}_{\text{10}}$ & 172,590 & -294457.9 & -294005.3 & 147273.9 & 21.1 \\
\end{tabular}

\ 

\ 

\begin{tabular}{l|r|r|r|c}
Variable & Estimate & Std. Error & $z$-value & $p$-value \\
\hline
---\textit{Main Effects} & & & & \\
(Intercept) & -0.020218 & 0.006797 & -2.97 & $0.002933$** \\
Downvotes & 0.251277 & 0.009645 & 26.05 & $< 2\times 10 ^ {-16}$*** \\
Rev. Downvotes & 0.157662 & 0.009627 & 16.38 & $< 2\times 10 ^ {-16}$*** \\
Upvotes & 0.392939 & 0.009682 & 40.58 & $< 2\times 10 ^ {-16}$*** \\
Relative Votes & 0.378964 & 0.009677 & 39.16 & $< 2\times 10 ^ {-16}$*** \\
Rev. Chronological & 0.046104 & 0.009614 & 4.80 & $1.62\times 10 ^ {-06}$*** \\
Chronological & -0.134353 & 0.009617 & -13.97 & $< 2\times 10 ^ {-16}$*** \\
Pred. Editors' Picks (LR) & 0.892746 & 0.009992 & 89.35 & $< 2\times 10 ^ {-16}$***