In [1]:
import regex as re
import pandas as pd

# Display labels used when rendering the LaTeX tables.
# featnames: feature key -> label shown in captions and outcome names.
featnames = {
    "lexdiv": "Lexical Diversity",
    "sentcomp": "Sentiment",
    "sim": "Topical Similarity",
    "smog": "Readability",
}

# polnames: raw variable / policy identifier -> label shown in table rows.
# Insertion order matters: format_latex_table substitutes keys in this order.
polnames = {
    "policy_chron": "Chronological",
    "policy_least-neg-abs": "Rev. Downvotes",
    "policy_neg-abs": "Downvotes",
    "policy_pos-abs": "Upvotes",
    "policy_pos-rel": "Relative Votes",
    "policy_rev-chron": "Rev. Chronological",
    "policy_random": "Random",
    "policy_pin-pred-lr": "Pred. Editors' Picks (LR)",
    "policy_pin-pred-xgb": "Pred. Editors' Picks (XGB)",
    "policy_pred-nb": "Pred. Upvotes (NBR)",
    "policy_pred-xgb": "Pred. Upvotes (XGB)",
    "replies_rh": "Replies Hidden",
    "replies_rt": "Reply Trees Shown",
    "replies_rl": "Replies Loose",
    "replies hidden": "Replies Hidden",
    "reply trees": "Reply Trees Shown",
    "replies loose": "Replies Loose",
    "pinned": "Editors' Picks Pinned",
    "not_pinned": "Editors' Picks Not Pinned",
}

# titles: comment-window key -> phrase used in the table caption.
titles = {"10": "First 10 comments", "N": "Full Comment Discussion"}

# Re-key polnames with '.' in place of every '-'
# (presumably to match the variable names in the model summaries -- TODO confirm).
polnames = dict((key.replace("-", "."), label) for key, label in polnames.items())

def _format_p_value(tokens):
    """Render the trailing tokens of a coefficient row as a LaTeX p-value cell.

    `tokens` is everything after the z-value: the p-value (possibly split
    into several tokens, e.g. ``['<', '2e-16']``) optionally followed by a
    significance marker (``***``, ``**``, ``*`` or ``.``).
    """
    marker = tokens[-1]
    if '*' in marker:
        # Apply scientific notation formatting: 2e-16 -> 2\times 10 ^ {-16}
        value = re.sub(r'e(-?\d+)', r'\\times 10 ^ {\1}', ' '.join(tokens[:-1]))
        return '$' + value + '$' + marker
    if marker[-1] == '.':
        return '$' + ' '.join(tokens[:-1]) + '$' + marker
    # No marker present: the last token is the p-value itself. Keep it in
    # full (slicing off its last character would drop a digit) and tag it
    # with '.' like the other non-starred rows.
    return '$' + ' '.join(tokens) + '$' + '.'


def read_model_output(filename, sample_size="172,590"):
    """Reads a model summary text file and extracts coefficients and fit statistics.

    The parser looks for four landmarks in the file:
      * a line starting with "AIC"; the line after it supplies AIC, BIC
        and logLik (its first three whitespace-separated tokens),
      * a line starting with "Dispersion"; its last token is the dispersion,
      * a line starting with "Conditional model:"; the coefficient table
        follows it,
      * a line consisting of "---", which ends the coefficient table.

    Args:
        filename: Path of the summary file to read.
        sample_size: Value stored under ``model_stats['sample_size']``. The
            summary file is not searched for it; the default is the
            previously hard-coded value.

    Returns:
        A tuple ``(data, model_stats)``. ``data`` is a list whose items are
        either 5-tuples of strings ``(variable, estimate, std_error, z_value,
        p_value)`` or plain strings that act as section labels. Main-effect
        rows come first, then the label 'Reply Structure Interactions' with
        the ':replies_rh' and ':replies_rt' rows, then the label
        "Editors' Picks Interactions" with the ':pinned' rows.
        ``model_stats`` maps 'AIC', 'BIC', 'logLik', 'dispersion' (each only
        if found in the file) and 'sample_size' to strings.
    """
    main_data = []
    replyh_data = []
    replyt_data = []
    pin_data = []
    model_stats = {}

    with open(filename, 'r') as file:
        in_table = False
        header_skipped = False
        stats_on_next_line = False
        for line in file:
            stripped = line.strip()
            if stripped.startswith("AIC"):
                stats_on_next_line = True
                continue
            if stats_on_next_line:
                # Values sit on the line directly below the AIC/BIC/logLik header
                stats_parts = line.split()
                model_stats['AIC'] = stats_parts[0]
                model_stats['BIC'] = stats_parts[1]
                model_stats['logLik'] = stats_parts[2]
                stats_on_next_line = False
                continue
            if stripped.startswith("Dispersion"):
                model_stats['dispersion'] = line.split()[-1]
                continue
            if stripped.startswith("Conditional model:"):
                in_table = True
                continue
            if not in_table:
                continue
            if stripped == "---":
                break
            parts = line.split()
            if len(parts) < 5 or parts[0] == '---':
                continue
            if not header_skipped:
                # The first row-shaped line after the marker is the column
                # header (Estimate / Std. Error / ...), not a coefficient.
                header_skipped = True
                continue

            variable = parts[0]
            row = (variable, parts[1], parts[2], parts[3], _format_p_value(parts[4:]))
            if ':replies_rh' in variable:
                replyh_data.append(row)
            elif ':replies_rt' in variable:
                replyt_data.append(row)
            elif ':pinned' in variable:
                pin_data.append(row)
            else:
                main_data.append(row)

    data = (main_data
            + ['Reply Structure Interactions'] + replyh_data + replyt_data
            + ['Editors\' Picks Interactions'] + pin_data)

    # The sample size is not parsed from the file; pass `sample_size` to override
    model_stats['sample_size'] = sample_size
    return data, model_stats

def format_latex_table(data, model_stats, featname, n):
    """Formats the data into a LaTeX table and returns it as a string.

    The result is a `table` environment holding two tabulars: a one-row
    summary (outcome, sample size, AIC, BIC, log-likelihood, dispersion)
    and the coefficient table.

    Args:
        data: Sequence whose items are either 5-item rows
            ``(variable, estimate, std_error, z_value, p_value)`` or plain
            strings, which are rendered as italic section-heading rows.
        model_stats: Mapping providing 'sample_size', 'AIC', 'BIC',
            'logLik' and 'dispersion'.
        featname: Key into the module-level ``featnames`` dict; also used
            in the table label.
        n: Key into the module-level ``titles`` dict; also used in the
            outcome name and the table label.

    Returns:
        The LaTeX source as one string, with every key of the module-level
        ``polnames`` dict replaced by its display label.

    Raises:
        KeyError: If ``featname``, ``n`` or a required ``model_stats`` key
            is missing.
        ValueError: If a non-string item of ``data`` does not have exactly
            five elements.
    """
    fname = "$\\text{FORUM}^{\\text{%s}}_{\\text{%s}}$" %(featnames[featname], n)
    preamble = ["\\begin{table}[]\n", "\\centering\n",
                "\\tbl{Regression output for modelling FORUM score for %s over the %s (***: $p<0.001$, **: $p<0.01$, *: $p<0.05$, .: $p \\ge 0.05$).}{\n" %(featnames[featname], titles[n].lower())]

    # The closing "}}" below ends both the tabular and the \tbl{...}{ argument
    pre_table = ["\\begin{tabular}{c|c|c|c|c|c}\n",
                 "Outcome & Sample Size & AIC & BIC & Log-Likelihood & Dispersion ($\\phi$) \\\\ \\hline \n",
                f"{fname} & {model_stats['sample_size']} & {model_stats['AIC']} & {model_stats['BIC']} & {model_stats['logLik']} & {model_stats['dispersion']} \\\\\n",
                "\\end{tabular}}\n\n\\ \n\n\\ \n\n",]

    main_table = ["\\resizebox{\\textwidth}{!}{\n",
                  "\\begin{tabular}{l|r|r|r|c}\n",
                  "Variable & Estimate & Std. Error & $z$-value & $p$-value \\\\\n",
                  "\\hline\n",
                  "---\\textit{Main Effects} & & & & \\\\\n"]
    for d in data:
        if isinstance(d, str):
            # Section label. Checked by type rather than by a failed
            # unpacking, so a 5-character label is not split into characters.
            formatted_line = "---\\textit{%s} & & & & \\\\\n" %d
        else:
            var, est, err, z, p = d
            formatted_line = f"{var} & {est} & {err} & {z} & {p} \\\\\n"
        # Space out the ':' that joins the two sides of an interaction term
        main_table.append(formatted_line.replace(':', ' : '))
    main_table.append("\\end{tabular}}\n")

    postamble = ["\\label{tab:results-%s_%s}\n" %(featname, n),
                 "\\end{table}\n"]

    table = ''.join(preamble + pre_table + main_table + postamble)
    # Substitute longest keys first so that a key contained in another one
    # ('pinned' inside 'not_pinned') cannot rewrite part of the longer key
    # before the longer key has been matched.
    for k in sorted(polnames, key=len, reverse=True):
        table = table.replace(k, polnames[k])
    return table



In [2]:
# Print one LaTeX regression table per feature and comment window
# ('10' and 'N' are the keys of `titles`).
for featname in featnames:
    for n in ('10', 'N'):
        summary_path = f'model_output/model_summaries/{featname}_{n}.txt'
        data, model_stats = read_model_output(summary_path)
        latex_table_string = format_latex_table(data, model_stats, featname, n)
        print(latex_table_string)

\begin{table}[]
\centering
\tbl{Regression output for modelling FORUM score for Lexical Diversity over the first 10 comments (***: $p<0.001$, **: $p<0.01$, *: $p<0.05$, .: $p \ge 0.05$).}{
\begin{tabular}{c|c|c|c|c|c}
Outcome & Sample Size & AIC & BIC & Log-Likelihood & Dispersion ($\phi$) \\ \hline 
$\text{FORUM}^{\text{Lexical Diversity}}_{\text{10}}$ & 172,590 & -295289.0 & -294836.4 & 147689.5 & 21.6 \\
\end{tabular}}

\ 

\ 

\resizebox{\textwidth}{!}{
\begin{tabular}{l|r|r|r|c}
Variable & Estimate & Std. Error & $z$-value & $p$-value \\
\hline
---\textit{Main Effects} & & & & \\
(Intercept) & -0.0286517 & 0.0067262 & -4.26 & $2.05\times 10 ^ {-05}$*** \\
Downvotes & 0.2621861 & 0.0095443 & 27.47 & $< 2\times 10 ^ {-16}$*** \\
Rev. Downvotes & 0.1308048 & 0.0095217 & 13.74 & $< 2\times 10 ^ {-16}$*** \\
Upvotes & 0.3997788 & 0.0095808 & 41.73 & $< 2\times 10 ^ {-16}$*** \\
Relative Votes & 0.3852549 & 0.0095761 & 40.23 & $< 2\times 10 ^ {-16}$*** \\
Rev. Chronological & 0.0450165 

In [3]:
# Load the BIC score of every model (columns 1-6) for each feature / n pair.
model_columns = list(range(1, 7))
bics = pd.read_csv('model_output/bic_table.csv', index_col=0)
bics.columns = ['Feature', 'n'] + model_columns
# Swap the feature keys for their display labels
bics['Feature'] = bics['Feature'].map(featnames)
# Round before casting: astype(int) alone truncates toward zero, which would
# report e.g. -294836.6 as -294836 instead of -294837.
bics[model_columns] = bics[model_columns].round().astype(int)
bics

Unnamed: 0,Feature,n,1,2,3,4,5,6
1,Lexical Diversity,10,-288796,-292887,-290700,-294836,-294867,-294822
2,Sentiment,10,-170377,-170345,-170390,-170358,-170339,-170166
3,Topical Similarity,10,-148677,-152821,-148822,-152963,-153059,-153135
4,Readability,10,-321775,-323927,-322600,-324763,-324758,-324646
5,Lexical Diversity,N,-507087,-538298,-507043,-538269,-538260,-538039
6,Sentiment,N,-575856,-577116,-575925,-577187,-577198,-577299
7,Topical Similarity,N,-442190,-508419,-442621,-509120,-509382,-511008
8,Readability,N,-558329,-581645,-558244,-581565,-581547,-581315


In [4]:
def ff(s):
    """Highlight the two smallest values of a Series with LaTeX markup.

    Every value equal to the smallest one is wrapped in ``\\textbf{...}``,
    every value equal to the second entry of ``s.nsmallest(2)`` is wrapped
    in ``\\textit{...}``, and all other values are returned unchanged.
    """
    lowest_two = s.nsmallest(2)

    def mark(value):
        # Bold takes precedence, so a tie for the minimum is bolded throughout
        if value == lowest_two.iloc[0]:
            return '\\textbf{' + str(value) + '}'
        if value == lowest_two.iloc[1]:
            return '\\textit{' + str(value) + '}'
        return value

    return s.apply(mark)

# Work on a copy so the numeric `bics` frame stays untouched
model_columns = list(range(1, 7))
bicstr = bics.copy()

# Mark the lowest and second-lowest BIC within each row (axis=1)
bicstr[model_columns] = bicstr[model_columns].apply(ff, axis=1)

print(bicstr.to_latex())

\begin{tabular}{lllllllll}
\toprule
 & Feature & n & 1 & 2 & 3 & 4 & 5 & 6 \\
\midrule
1 & Lexical Diversity & 10 & -288796 & -292887 & -290700 & \textit{-294836} & \textbf{-294867} & -294822 \\
2 & Sentiment & 10 & \textit{-170377} & -170345 & \textbf{-170390} & -170358 & -170339 & -170166 \\
3 & Topical Similarity & 10 & -148677 & -152821 & -148822 & -152963 & \textit{-153059} & \textbf{-153135} \\
4 & Readability & 10 & -321775 & -323927 & -322600 & \textbf{-324763} & \textit{-324758} & -324646 \\
5 & Lexical Diversity & N & -507087 & \textbf{-538298} & -507043 & \textit{-538269} & -538260 & -538039 \\
6 & Sentiment & N & -575856 & -577116 & -575925 & -577187 & \textit{-577198} & \textbf{-577299} \\
7 & Topical Similarity & N & -442190 & -508419 & -442621 & -509120 & \textit{-509382} & \textbf{-511008} \\
8 & Readability & N & -558329 & \textbf{-581645} & -558244 & \textit{-581565} & -581547 & -581315 \\
\bottomrule
\end{tabular}



In [5]:
# Assemble the BIC table: pandas' LaTeX output is patched by plain string
# substitution, then wrapped in a table* environment.
pre = '\\begin{table*}[h]\n\\centering'

body = bicstr.to_latex(index=False)
# Turn the rule under the header into an \hline and drop the other two rules
body = body.replace('\n\\midrule', ' \\hline')
body = body.replace('\n\\toprule', '')
body = body.replace('\n\\bottomrule', '')
# Separate the n=10 rows from the n=N rows with an \hline
body = body.replace('\\\\\nLexical Diversity & N', '\\\\ \\hline\nLexical Diversity & N')
# Column spec: left-aligned feature, centred n, right-aligned scores
body = body.replace('llllllll', 'l|c|rrrrrr')
# Add a "Model" super-header spanning the six score columns
body = body.replace('Feature', ' & & \\multicolumn{6}{c}{Model} \\\\ \nFeature')
# Drop the final character (presumably the trailing newline from to_latex)
body = body[:-1]

post = '\n'.join([
    '\\caption{BIC scores for beta regression fit for each model on each feature and number of comments considered. Best models indicated in bold, second best indicated in italics.}',
    '\\label{tab:BICs}',
    '\\end{table*}',
])

print('\n'.join([pre, body, post]))

\begin{table*}[h]
\centering
\begin{tabular}{l|c|rrrrrr}
 & & \multicolumn{6}{c}{Model} \\ 
Feature & n & 1 & 2 & 3 & 4 & 5 & 6 \\ \hline
Lexical Diversity & 10 & -288796 & -292887 & -290700 & \textit{-294836} & \textbf{-294867} & -294822 \\
Sentiment & 10 & \textit{-170377} & -170345 & \textbf{-170390} & -170358 & -170339 & -170166 \\
Topical Similarity & 10 & -148677 & -152821 & -148822 & -152963 & \textit{-153059} & \textbf{-153135} \\
Readability & 10 & -321775 & -323927 & -322600 & \textbf{-324763} & \textit{-324758} & -324646 \\ \hline
Lexical Diversity & N & -507087 & \textbf{-538298} & -507043 & \textit{-538269} & -538260 & -538039 \\
Sentiment & N & -575856 & -577116 & -575925 & -577187 & \textit{-577198} & \textbf{-577299} \\
Topical Similarity & N & -442190 & -508419 & -442621 & -509120 & \textit{-509382} & \textbf{-511008} \\
Readability & N & -558329 & \textbf{-581645} & -558244 & \textit{-581565} & -581547 & -581315 \\
\end{tabular}
\caption{BIC scores for beta regression

In [6]:
# Rank the six models within each row (axis=1); rank 1 is the lowest BIC.
bicranks = bics.copy()
# method='min' gives tied models the same whole-number rank. The default
# ('average') produces fractional ranks on ties, which astype(int) would
# then silently truncate. Without ties both methods give the same result.
bicranks[list(range(1,7))] = bicranks[list(range(1,7))].rank(axis=1, method='min').astype(int)
bicranks

Unnamed: 0,Feature,n,1,2,3,4,5,6
1,Lexical Diversity,10,6,4,5,2,1,3
2,Sentiment,10,2,4,1,3,5,6
3,Topical Similarity,10,6,4,5,3,2,1
4,Readability,10,6,4,5,1,2,3
5,Lexical Diversity,N,5,1,6,2,3,4
6,Sentiment,N,6,4,5,3,2,1
7,Topical Similarity,N,6,4,5,3,2,1
8,Readability,N,5,1,6,2,3,4


In [7]:

# Assemble the BIC rank table: pandas' LaTeX output is patched by plain
# string substitution, then wrapped in a table environment.
pre = '\\begin{table}[h]\n\\centering'

body = bicranks.to_latex(index=False)
# Turn the rule under the header into an \hline and drop the other two rules
body = body.replace('\n\\midrule', ' \\hline')
body = body.replace('\n\\toprule', '')
body = body.replace('\n\\bottomrule', '')
# Separate the n=10 rows from the n=N rows with an \hline
body = body.replace('\\\\\nLexical Diversity & N', '\\\\ \\hline\nLexical Diversity & N')
# Column spec: left-aligned feature, centred n, right-aligned ranks
body = body.replace('llrrrrrr', 'l|c|rrrrrr')
# Add a "Model" super-header spanning the six rank columns
body = body.replace('Feature', ' & & \\multicolumn{6}{c}{Model} \\\\ \nFeature')
# Drop the final character (presumably the trailing newline from to_latex)
body = body[:-1]

post = '\n'.join([
    '\\caption{BIC ranks for beta regression fit for each model on each feature and number of comments considered.}',
    '\\label{tab:BICranks}',
    '\\end{table}',
])

print('\n'.join([pre, body, post]))

\begin{table}[h]
\centering
\begin{tabular}{l|c|rrrrrr}
 & & \multicolumn{6}{c}{Model} \\ 
Feature & n & 1 & 2 & 3 & 4 & 5 & 6 \\ \hline
Lexical Diversity & 10 & 6 & 4 & 5 & 2 & 1 & 3 \\
Sentiment & 10 & 2 & 4 & 1 & 3 & 5 & 6 \\
Topical Similarity & 10 & 6 & 4 & 5 & 3 & 2 & 1 \\
Readability & 10 & 6 & 4 & 5 & 1 & 2 & 3 \\ \hline
Lexical Diversity & N & 5 & 1 & 6 & 2 & 3 & 4 \\
Sentiment & N & 6 & 4 & 5 & 3 & 2 & 1 \\
Topical Similarity & N & 6 & 4 & 5 & 3 & 2 & 1 \\
Readability & N & 5 & 1 & 6 & 2 & 3 & 4 \\
\end{tabular}
\caption{BIC ranks for beta regression fit for each model on each feature and number of comments considered.}
\label{tab:BICranks}
\end{table}
