In [83]:
import pandas as pd
import numpy as np
from globals import BASE_DIR
import os

available_datasets = []

In [84]:
available_datasets = ["brightkite", "foursquaretky", "gowalla", "snowcard", "yelp"]

dataset = "brightkite"

In [85]:
df = pd.read_csv(f"{BASE_DIR}/{dataset}_dataset/evaluation_results.csv")

In [86]:
df

Unnamed: 0,dataset,model,method,user_group,ndcg,arp,poplift,js
0,brightkite,BPR,baseline,high,0.102547,0.057253,0.132808,0.123250
1,brightkite,BPR,baseline,medium,0.085653,0.050706,0.403658,0.116706
2,brightkite,BPR,baseline,low,0.072833,0.043878,0.854836,0.121113
3,brightkite,BPR,baseline,all,0.086468,0.050650,0.439724,0.119978
4,brightkite,BPR,cp,high,0.102547,0.057253,0.132808,0.123250
...,...,...,...,...,...,...,...,...
59,brightkite,SimpleX,cp_min_js,all,0.083867,0.040791,0.149150,0.000476
60,brightkite,SimpleX,upd,high,0.056563,0.049988,-0.034779,0.229380
61,brightkite,SimpleX,upd,medium,0.045232,0.036394,-0.015228,0.294947
62,brightkite,SimpleX,upd,low,0.040715,0.024285,-0.008525,0.093430


In [87]:
def format_best_second(row, col_group, best_criteria):
    """Format the best values as bold and the second-best as underlined, handling ties."""
    values = row[col_group]
    
    if best_criteria == "highest":  # For ndcg
        sorted_indices = np.argsort(values)[::-1]  # Descending
    elif best_criteria == "lowest":  # For arp
        sorted_indices = np.argsort(values)  # Ascending
    elif best_criteria == "closest_to_zero":  # For poplift
        sorted_indices = np.argsort(np.abs(values))  # Closest to zero

    formatted = values.astype(str)  # Convert to strings for LaTeX formatting

    if len(sorted_indices) > 0:
        # Identify the highest value and all its ties
        best_value = values[sorted_indices[0]]
        best_indices = np.where(values == best_value)[0]

        # Bold all highest values
        for idx in best_indices:
            formatted[idx] = f"\\textbf{{{formatted[idx]}}}"

        # Identify the second-highest value, if applicable
        if len(best_indices) == 1 and len(sorted_indices) > 1:
            second_value = values[sorted_indices[np.where(values[sorted_indices] != best_value)[0][0]]]
            second_indices = np.where(values == second_value)[0]

            # Underline all second-highest values
            for idx in second_indices:
                formatted[idx] = f"\\underline{{{formatted[idx]}}}"

    return formatted



def format_best_second_js(row, method_cols):
    for method_col in method_cols:
        values = row[method_col]
        min_value_idx = values.idxmin()  # Get the index of the minimum value
        second_min_value_idx = values.nsmallest(2).idxmax()  # Get the index of the second smallest value

        # Apply bold and underline formatting
        row[method_col] = row[method_col].apply(
            lambda x: f"\\textbf{{{x:.4f}}}" if x == min_value_idx 
            else (f"\\underline{{{x:.4f}}}" if x == second_min_value_idx 
                  else f"{x:.4f}")
        )
    
    return row




In [88]:
for dataset in available_datasets:
    df = pd.read_csv(f"{BASE_DIR}/{dataset}_dataset/evaluation_results.csv")
    
    df["user_group"] = df["user_group"].apply(lambda x: x.replace("high", "g1").replace("medium", "g2").replace("low", "g3"))

    float_columns = df.select_dtypes(include="float").columns
    df[float_columns] = df[float_columns].round(4)


    df_pivot = df.pivot(
    index=["model", "user_group"],
    columns="method",
    values=["ndcg", "arp", "poplift"]
    )

    # Apply formatting for each metric group
    metric_groups = ["ndcg", "arp", "poplift"]
    criteria = {"ndcg": "highest", "arp": "lowest", "poplift": "closest_to_zero"}

    for metric in metric_groups:
        method_cols = [col for col in df_pivot.columns if col[0] == metric]
        df_pivot[method_cols] = df_pivot.apply(
            lambda row: format_best_second(row, method_cols, criteria[metric]), axis=1
        )


    # Reset the index to include 'model' and 'user_group' in the DataFrame
    df_reset = df_pivot.reset_index()

    # Export to LaTeX while keeping multicolumn and multirow formatting
    latex = df_reset.to_latex(
        escape=False,  # To allow LaTeX formatting
        index=False,   # Don't write the default DataFrame index
        multicolumn=True,
        multirow=True,
        column_format="llcccccccccccc",  # Adjust for the number of columns
        header=True
    )

    # Modify LaTeX for custom formatting
    latex = latex.replace(
        "\\toprule", 
        "\\toprule\n\\multicolumn{2}{c}{} & \\multicolumn{4}{c}{\\textbf{ndcg}} & \\multicolumn{4}{c}{\\textbf{arp}} & \\multicolumn{4}{c}{\\textbf{poplift}} \\\\ \\cmidrule(lr){3-6} \\cmidrule(lr){7-10} \\cmidrule(lr){11-14}"
    )
    latex = latex.replace("\\midrule", "\\hline")
    latex = latex.replace("\\bottomrule", "\\hline")

    # Save

    with open(f"{BASE_DIR}/{dataset}_dataset/{dataset}_eval_test_if_okay.tex", "w") as f:
        f.write(latex)

    
    df_js = df[["model", "user_group", "js"]]
    float_columns = df_js.select_dtypes(include="float").columns
    df_js[float_columns] = df_js[float_columns].round(4)

   
    # First, create the JS table pivot (as before)
    df_pivot_js = df.pivot(
        index=["model", "user_group"],
        columns="method",
        values=["js"]
    )

    df_reset_js = df_pivot_js.reset_index()
   


    # Modify LaTeX table to apply formatting
    js_latex = df_reset_js.to_latex(
        escape=False,
        index=False,
        multicolumn=True,
        multirow=True,
        column_format="llcccc",  # Adjust for JS columns
        header=True
    )

    # Custom LaTeX formatting for JS table
    js_latex = js_latex.replace(
        "\\toprule", 
        "\\toprule\n\\multicolumn{2}{c}{} & \\multicolumn{4}{c}{\\textbf{Jensen-Shannon Divergence (JSD)}}\\\\ \\cmidrule(lr){3-6}"
    )
    js_latex = js_latex.replace("\\midrule", "\\hline")
    js_latex = js_latex.replace("\\bottomrule", "\\hline")

    # Save JS-only table
    with open(f"{BASE_DIR}/{dataset}_dataset/{dataset}_js_eval.tex", "w") as f:
        f.write(js_latex)
    

    

  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  second_value = values[sorted_indices[np.where(values[sorted_indices] != best_value)[0][0]]]
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  second_value = values[sorted_indices[np.where(values[sorted_indices] != best_value)[0][0]]]
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  best_value = values[so

ValueError: Columns must be same length as key

In [38]:
df_pivot_js

Unnamed: 0_level_0,Unnamed: 1_level_0,js,js,js,js
Unnamed: 0_level_1,method,baseline,cp,cp_min_js,upd
model,user_group,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
BPR,all,0.2545,0.1248,0.0042,0.0943
BPR,g1,0.1591,0.1567,0.0106,0.1622
BPR,g2,0.3105,0.299,0.007,0.167
BPR,g3,0.266,0.0008,0.0008,0.0459
LORE,all,0.0263,0.0019,0.0004,0.0871
LORE,g1,0.0769,0.001,0.001,0.1622
LORE,g2,0.0253,0.0007,0.0007,0.1449
LORE,g3,0.0114,0.0115,0.0003,0.0489
SimpleX,all,0.2081,0.114,0.0003,0.094
SimpleX,g1,0.1555,0.1545,0.0005,0.1622


In [None]:
# for dataset in available_datasets:
#     df = pd.read_csv(f"{BASE_DIR}/{dataset}_dataset/evaluation_results.csv")
    
#     df["user_group"] = df["user_group"].apply(lambda x: x.replace("high", "g1").replace("medium", "g2").replace("low", "g3"))

#     float_columns = df.select_dtypes(include="float").columns
#     df[float_columns] = df[float_columns].round(4)


#     df_pivot = df.pivot(
#     index=["model", "user_group"],
#     columns="method",
#     values=["ndcg", "arp", "poplift"]
#     )

#     # Apply formatting for each metric group
#     metric_groups = ["ndcg", "arp", "poplift"]
#     criteria = {"ndcg": "highest", "arp": "lowest", "poplift": "closest_to_zero"}

#     for metric in metric_groups:
#         method_cols = [col for col in df_pivot.columns if col[0] == metric]
#         df_pivot[method_cols] = df_pivot.apply(
#             lambda row: format_best_second(row, method_cols, criteria[metric]), axis=1
#         )


#     df_pivot_js = df.pivot(
#     index=["model", "user_group"],
#     columns="method",
#     values=["ndcg", "arp", "poplift", "js"]
#     )
#     # ----- Extra Table for JS -----
#     # Filter JS columns
#     js_cols = [col for col in df_pivot_js.columns if col[0] == "js"]
#     js_table = df_pivot_js[js_cols].reset_index()

#     # Apply formatting for JS (highlight the lowest value)
#     js_table[js_cols] = js_table.apply(
#         lambda row: format_best_second(row, js_cols, criteria["js"]), axis=1
#     )

#     # Export JS-only table to LaTeX
#     js_latex = js_table.to_latex(
#         escape=False,
#         index=False,
#         multicolumn=True,
#         multirow=True,
#         column_format="llcccc",  # Adjust for JS columns
#         header=True
#     )

#     # Custom LaTeX formatting for JS table
#     js_latex = js_latex.replace(
#         "\\toprule", 
#         "\\toprule\n\\multicolumn{2}{c}{} & \\multicolumn{4}{c}{\\textbf{Jensen-Shannon Divergence (JSD)}}\\\\ \\cmidrule(lr){3-6}"
#     )
#     js_latex = js_latex.replace("\\midrule", "\\hline")
#     js_latex = js_latex.replace("\\bottomrule", "\\hline")

#     # Save JS-only table
#     with open(f"{BASE_DIR}/{dataset}_dataset/{dataset}_js_eval.tex", "w") as f:
#         f.write(js_latex)

  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  second_value = values[sorted_indices[np.where(values[sorted_indices] != best_value)[0][0]]]
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  second_value = values[sorted_indices[np.where(values[sorted_indices] != best_value)[0][0]]]
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  best_value = values[sorted_indices[0]]
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  formatted[idx] = f"\\textbf{{{formatted[idx]}}}"
  second_value = values[sorted_indices[np.where(values[sorted_indices] != best_value)[0][0]]]
  formatted[idx] = f"\\underline{{{formatted[idx]}}}"
  formatted[idx] = 