In [4]:
from pathlib import Path

# Folder holding the Twitter dataset; its index.csv is read when the dataset is loaded.
# Relative path, so it is resolved against the kernel's working directory.
twitter_datasetFolder = Path("../datasets/twitter")
# Folder holding the exp01c outputs: one sub-folder per model, each containing
# the per-tweet *.json result files.
exp01c_resultsFolder = Path("../experiments/exp01c/results")

In [5]:
import pandas as pd
import json
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings

# Mute warnings: first the FutureWarning category, then every other category.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Display settings: never elide columns, and use a very wide console so that
# each row of a printed frame stays on a single line instead of wrapping.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)

In [6]:
# Load the twitter dataset index.
# tweet_id is parsed as a string at read time. Letting pandas infer a numeric
# dtype and casting afterwards corrupts the ids whenever the column is inferred
# as float (e.g. a row with a missing id), because 19-digit ids do not fit in
# float64 without loss of precision.
twitter_dataset = pd.read_csv(twitter_datasetFolder / "index.csv", dtype={"tweet_id": str})

# Map the textual class label to a binary target: positive -> 1, negative -> 0.
# Series.map yields NaN for any label missing from the mapping, so fail here
# with a clear message instead of much later when the metrics cast to int.
class_labels = twitter_dataset["class"]
class_codes = class_labels.map({"positive": 1, "negative": 0})
if class_codes.isna().any():
    unexpected_labels = class_labels[class_codes.isna()].unique().tolist()
    raise ValueError(f"Unexpected values in 'class' column: {unexpected_labels}")
twitter_dataset["class"] = class_codes.astype(int)

print(twitter_dataset.head())



              tweet_id  class  error[Truncated axis]  error[Dual axis]  error[Value as area/volume]  error[Inverted axis]  error[Uneven binning]  error[Unclear encoding]  error[Inappropriate encoding]  error[Cherry-picking]  error[Setting an arbitrary threshold]  error[Causal inference]  error[Issues with data validity]  error[Failure to account for statistical nuance]  error[Misrepresentation of scientific studies]  error[Incorrect reading of chart]
0  1220060594868555778      1                  False             False                         True                 False                  False                    False                          False                  False                                  False                    False                             False                                             False                                           False                              False
1  1234688701114060800      1                  False             False                    

In [7]:
# Load model results. Structure: results[modelName][tweet_id] = result_dict
results = {}

# glob() yields entries in an arbitrary, filesystem-dependent order; sorting
# makes the log below (and which file wins on a duplicated tweet_id)
# reproducible across machines.
for modelFolder in sorted(exp01c_resultsFolder.glob("*")):
    if not modelFolder.is_dir():
        continue
    modelName = modelFolder.name
    results[modelName] = {}

    for resultFile in sorted(modelFolder.glob("*.json")):
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(resultFile, "r", encoding="utf-8") as f:
            result = json.load(f)
        # Key by str so lookups match the dataset's string tweet_id column
        # even if a result file stores the id as a JSON number.
        twid = str(result["tweet_id"])
        results[modelName][twid] = result
    print(f"Loaded model results: {modelName} -- {len(results[modelName])} tweets")

Loaded model results: qwen2.5vl:72b -- 2336 tweets
Loaded model results: gemma3:27b -- 2336 tweets


In [8]:
# Analysis of classification results: one row per tweet holding the
# ground-truth class plus one 0/1 prediction column per model.
# Select the two needed columns first, then copy, so the whole wide dataset
# frame is not duplicated.
df_classification = twitter_dataset[["tweet_id", "class"]].copy()

for modelName in sorted(results.keys()):
    # tweet_id -> 0/1 prediction for this model
    predictions = {
        tweetId: int(result["response"]["is_misleading"])
        for tweetId, result in results[modelName].items()
    }
    # One vectorised lookup instead of a full boolean-mask scan per tweet.
    mapped = df_classification["tweet_id"].map(predictions)

    # Tweets without a result keep the historical default of 0, but say so:
    # a silent default is scored as a "not misleading" prediction and would
    # skew every metric computed from this frame.
    missingCount = int(mapped.isna().sum())
    if missingCount:
        print(
            f"WARNING: {modelName} has no result for {missingCount} of "
            f"{len(df_classification)} tweets; they are counted as 0 (not misleading)"
        )

    df_classification[modelName] = mapped.fillna(0).astype(int)


print("Classification Results RAW:")
print(df_classification.head())

Classification Results RAW:
              tweet_id  class  gemma3:27b  qwen2.5vl:72b
0  1220060594868555778      1           1              1
1  1234688701114060800      1           1              0
2  1236331391643779074      1           1              1
3  1238453491917631489      1           1              1
4  1239398342599364609      1           1              1


In [9]:
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix
)
import pandas as pd

df = df_classification.copy()
summary = []

_ROUNDING = 3  # decimal places for rounding metrics
# 0 = not-misleading, 1 = misleading. Passed explicitly to scikit-learn so the
# confusion matrix is always 2x2 (and the report always has both rows), even
# when only one class happens to be present in the data.
_LABELS = [0, 1]

# The ground truth is identical for every model: cast it once, outside the loops.
y_true = df["class"].astype(int)

for modelName in sorted(results.keys()):
    y_pred = df[modelName].astype(int)

    # Main metrics
    accuracy = accuracy_score(y_true, y_pred)
    # NOTE: y_pred holds hard 0/1 labels, not scores, so this "ROC AUC" is the
    # single-threshold value (TPR + TNR) / 2, i.e. the balanced accuracy.
    roc_auc = roc_auc_score(y_true, y_pred)

    # Macro averages (treat both classes equally)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    precision_macro = precision_score(y_true, y_pred, average="macro")
    recall_macro = recall_score(y_true, y_pred, average="macro")

    # Per-class metrics
    precision_0 = precision_score(y_true, y_pred, pos_label=0)
    recall_0 = recall_score(y_true, y_pred, pos_label=0)
    f1_0 = f1_score(y_true, y_pred, pos_label=0)

    precision_1 = precision_score(y_true, y_pred, pos_label=1)
    recall_1 = recall_score(y_true, y_pred, pos_label=1)
    f1_1 = f1_score(y_true, y_pred, pos_label=1)

    # Confusion matrix components; with labels=[0, 1] the flattened order is
    # always tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=_LABELS).ravel()

    summary.append({
        "model": modelName,
        "accuracy": accuracy,
        "roc_auc": roc_auc,
        "f1_macro": f1_macro,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "precision_NotMis": precision_0,
        "recall_NotMis": recall_0,
        "f1_NotMis": f1_0,
        "precision_Mis": precision_1,
        "recall_Mis": recall_1,
        "f1_Mis": f1_1,
        "TP": tp,
        "FP": fp,
        "FN": fn,
        "TN": tn
    })

# One row per model, best "roc_auc" first.
summary_df = pd.DataFrame(summary).round(_ROUNDING)
summary_df = summary_df.sort_values(by="roc_auc", ascending=False)

print("\n\nSummary of classification metrics:\n\n")

print(summary_df)

print("\n\n--------------------------------------------------\n\n")
print("Detailed classification reports:\n\n")

for modelName in sorted(results.keys()):
    # Same integer cast as in the summary loop so both sections agree.
    y_pred = df[modelName].astype(int)

    print(f"Classification report for {modelName}:")
    print(classification_report(
        y_true,
        y_pred,
        labels=_LABELS,
        target_names=["not-misleading", "misleading"],
        digits=_ROUNDING,
    ))

    roc_auc = roc_auc_score(y_true, y_pred)
    print(f"\nROC AUC score for {modelName}: {roc_auc}\n")

    print(f"Confusion matrix for {modelName}:")
    print(confusion_matrix(y_true, y_pred, labels=_LABELS))

    print("\n\n--------------------------------------------------\n\n")




Summary of classification metrics:


           model  accuracy  roc_auc  f1_macro  precision_macro  recall_macro  precision_NotMis  recall_NotMis  f1_NotMis  precision_Mis  recall_Mis  f1_Mis    TP   FP   FN   TN
1  qwen2.5vl:72b     0.580    0.580     0.531            0.639         0.580             0.728          0.257       0.38          0.549       0.904   0.683  1056  868  112  300
0     gemma3:27b     0.562    0.562     0.499            0.627         0.562             0.717          0.206       0.32          0.536       0.919   0.677  1073  927   95  241


--------------------------------------------------


Detailed classification reports:


Classification report for gemma3:27b:
                precision    recall  f1-score   support

not-misleading      0.717     0.206     0.320      1168
    misleading      0.536     0.919     0.677      1168

      accuracy                          0.562      2336
     macro avg      0.627     0.562     0.499      2336
  weighted avg      

In [10]:
# Condensed view of the summary: model name, accuracy and the four
# confusion-matrix counts.
subtable_columns = ["model", "accuracy", "TP", "FP", "FN", "TN"]
subtable = summary_df.loc[:, subtable_columns]
print("Subtable:", subtable, sep="\n")

Subtable:
           model  accuracy    TP   FP   FN   TN
1  qwen2.5vl:72b     0.580  1056  868  112  300
0     gemma3:27b     0.562  1073  927   95  241
