In [None]:
from pathlib import Path
import re
from typing import cast
import json
import pandas as pd


def extract_json(log_content: str) -> dict | None:
    match = re.search(r"\{.*\}", log_content, re.DOTALL)
    if match:
        return cast(dict, json.loads(match.group(0)))
    return None


# Root folder that holds one sub-directory per AMLT run.
amlt_path = Path("/data/share_folder_local/amlt")

# AMLT runs to compare.
amlt_names = [
    "adjusted-phoenix",
    "mighty-whippet",
    "devoted-louse",
    "warm-tahr",
]

# One record per experiment directory that contains a parsable grade.log.
records = []

for amlt_name in amlt_names:
    amlt_dir = amlt_path / amlt_name
    if not amlt_dir.is_dir():
        # A missing run folder should not abort the collection of the other runs.
        print(f"[warn] AMLT run directory not found, skipping: {amlt_dir}")
        continue

    for competition_dir in amlt_dir.iterdir():
        if not competition_dir.is_dir():
            continue

        grade_file = competition_dir / "grade.log"
        if not grade_file.exists():
            continue

        try:
            # Decode explicitly and tolerate stray bytes so one odd log line
            # cannot raise UnicodeDecodeError for the whole run.
            grade_data = extract_json(
                grade_file.read_text(encoding="utf-8", errors="replace")
            )
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError; skip the malformed log but say so.
            print(f"[warn] could not parse {grade_file}: {exc}")
            continue
        if grade_data is None:
            continue

        records.append({
            "amlt_name": amlt_name,
            "experiment_name": competition_dir.name,
            "competition_id": grade_data.get("competition_id"),
            "score": grade_data.get("score"),
            "any_medal": grade_data.get("any_medal", False),
        })

# ==== Build the long-format DataFrame ====
if not records:
    # Without this guard the sort below fails with an unhelpful KeyError.
    raise ValueError("No grade.log results were collected; nothing to tabulate.")

df = pd.DataFrame(records)

# Number the repeated experiments of the same (competition_id, amlt_name) pair
df = df.sort_values(
    ["competition_id", "amlt_name", "experiment_name"]
)

# 1-based position of each experiment within its (competition_id, amlt_name) group
df["run_id"] = (
    df.groupby(["competition_id", "amlt_name"])
      .cumcount()
      .add(1)
)

# Target column names in the wide table, e.g. "<amlt_name>_score_<run_id>"
df["score_col"] = (
    df["amlt_name"] + "_score_" + df["run_id"].astype(str)
)
df["medal_col"] = (
    df["amlt_name"] + "_medal_" + df["run_id"].astype(str)
)

# df["score_col"] = df["amlt_name"] + "_score_1"
# df["medal_col"] = df["amlt_name"] + "_medal_1"


# ==== Pivot (repeated experiments are no longer merged) ====
score_wide = df.pivot(
    index="competition_id",
    columns="score_col",
    values="score"
)

medal_wide = df.pivot(
    index="competition_id",
    columns="medal_col",
    values="any_medal"
)

# ==== Merge ====
result = (
    pd.concat([score_wide, medal_wide], axis=1)
      .reset_index()
)

# Bare last expression: the notebook renders the preview as a table.
result.head()


In [None]:
# Remove the adjusted-phoenix score columns from the comparison table.
# errors="ignore" keeps the cell re-runnable: `result` is reassigned here, so a second
# execution (or a run in which one of these columns was never produced) would otherwise
# raise KeyError.
result = result.drop(
    columns=["adjusted-phoenix_score_2", "adjusted-phoenix_score_1"],
    errors="ignore",
)


In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from the project's .env file (override enabled).
load_dotenv(
    dotenv_path="/data/userdata/v-lijingyuan/RD-agent-mcts/RD-Agent/.env",
    override=True,
)

# Verify: print the resulting CHAT_TEMPERATURE value.
print(os.environ.get("CHAT_TEMPERATURE"))

In [None]:
# NOTE(review): this only binds a Python variable in the notebook namespace, and none of
# the visible cells read it. If it is meant to configure rdagent's cache folder it presumably
# has to be exported as an environment variable before the import below — TODO confirm.
PICKLE_CACHE_FOLDER_PATH_STR="/home/bowen/RD-Agent_cache"
# The cells below call get_metric_direction(competition_id) and treat a truthy result as
# "higher score is better".
from rdagent.scenarios.kaggle.kaggle_crawler import get_metric_direction


In [None]:
# for comp in result["competition_id"]:
#     direction = get_metric_direction(comp)
#     print(f"Competition ID: {comp}, Metric Direction: {direction}")

In [None]:
# Names of every column currently in `result` whose name contains "_score_".
score_cols = [c for c in result.columns if "_score_" in c]

def highlight_best(row):
    """Row-wise Styler callback that bolds the best score(s) of a competition.

    Reads the module-level ``score_cols`` and asks ``get_metric_direction`` whether a
    higher score is better for ``row["competition_id"]``.

    Returns:
        One CSS string per entry of ``row``: ``"font-weight: bold"`` for score columns
        equal to the best score, ``""`` everywhere else (all ``""`` when every score
        is missing).
    """
    maximize = get_metric_direction(row["competition_id"])  # True / False
    scores = row[score_cols]

    styles = [""] * len(row)
    if scores.isna().all():
        return styles

    winner = scores.max() if maximize else scores.min()

    # Ties are all highlighted; non-score columns are never touched.
    for pos, label in enumerate(row.index):
        if label in score_cols and row[label] == winner:
            styles[pos] = "font-weight: bold"
    return styles

# Apply the row-wise highlighter; the bare expression below makes the styled table the cell output.
styled = result.style.apply(highlight_best, axis=1)
styled


In [None]:
def get_topk_cols(row, k=1):
    """Return the score columns that hold the k-th best distinct score of ``row``.

    Reads the module-level ``score_cols`` and asks ``get_metric_direction`` whether a
    higher score is better for ``row["competition_id"]``. Missing scores are ignored.

    Args:
        row: One row of the wide result table (indexable by column name).
        k: 1-based position among the distinct scores, best first.

    Returns:
        List of score column names whose value equals the k-th best distinct score.
        Empty when the row has no scores or fewer than ``k`` distinct scores.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # k - 1 would become a negative index and silently select from the worst end.
        raise ValueError(f"k must be >= 1, got {k}")

    comp = row["competition_id"]
    higher_is_better = get_metric_direction(comp)

    values = row[score_cols].dropna()
    if values.empty:
        return []

    # Sort best-first
    sorted_vals = values.sort_values(ascending=not higher_is_better)

    # Distinct values, in the sorted order
    distinct_vals = sorted_vals.unique()

    if len(distinct_vals) < k:
        return []

    target_val = distinct_vals[k - 1]
    return [c for c in score_cols if row[c] == target_val]

from collections import Counter

# Per score column: number of rows of `result` in which it holds the best (top1) or the
# second-best (top2) distinct score.
top1_counter = Counter()
top2_counter = Counter()

for _, row in result.iterrows():
    # Counter.update adds 1 for every column name in the returned list.
    top1_counter.update(get_topk_cols(row, k=1))
    top2_counter.update(get_topk_cols(row, k=2))


In [None]:
def get_ranks(row):
    """Rank the score columns of one row, best score first.

    Reads the module-level ``score_cols`` and asks ``get_metric_direction`` whether a
    higher score is better for ``row["competition_id"]``.

    Returns:
        The result of ``Series.rank(method="min")`` over ``row[score_cols]``, ranked
        descending when higher is better and ascending otherwise.
    """
    maximize = get_metric_direction(row["competition_id"])
    return row[score_cols].rank(method="min", ascending=not maximize)
from collections import Counter, defaultdict
import numpy as np

# Tallies per score column, accumulated over every row of `result`.
top1 = Counter()                # rows where the column holds the best distinct score
top2 = Counter()                # rows where the column holds the second-best distinct score
rank_sum = defaultdict(float)   # sum of the column's ranks
rank_cnt = defaultdict(int)     # number of rows in which the column has a rank

for _, row in result.iterrows():
    # Top-k
    top1.update(get_topk_cols(row, k=1))
    top2.update(get_topk_cols(row, k=2))

    # Rank: NaN ranks are left out of both the sum and the count
    ranks = get_ranks(row)
    for c, r in ranks.dropna().items():
        rank_sum[c] += r
        rank_cnt[c] += 1
import pandas as pd

# One summary row per score column.
methods = score_cols

# NOTE(review): `df` replaces the long-format frame of the same name built in the first cell.
# Sorted so the most frequently top-ranked columns come first; ties are broken by the
# top-1 count and then by the lower average rank.
df = pd.DataFrame({
    "method": methods,
    "top1_count": [top1[m] for m in methods],
    "top2_count": [top2[m] for m in methods],
    "top12_count": [top1[m] + top2[m] for m in methods],
    "avg_rank": [
        # Columns that never received a rank have no average.
        float("nan") if rank_cnt[m] <= 0 else rank_sum[m] / rank_cnt[m]
        for m in methods
    ],
}).sort_values(
    by=["top12_count", "top1_count", "avg_rank"],
    ascending=[False, False, True],
)


In [None]:
df

In [None]:
# Persist the wide comparison table as CSV.
out_path = "/data/userdata/v-lijingyuan/icml2026/ml-master.csv"
# to_csv does not create missing folders, so make sure the target directory exists first.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
result.to_csv(out_path, index=False)

In [None]:
# Score columns of the two runs compared head-to-head.
a = "devoted-louse_score_1"
b = "mighty-whippet_score_1"

# Keep only the rows where the competition id and both scores are present.
cmp = result[["competition_id", a, b]].dropna()

def devoted_louse_win(row):
    """Tell whether column ``a`` strictly beats column ``b`` in ``row``.

    "Beats" means greater when ``get_metric_direction`` reports that a higher score is
    better for ``row["competition_id"]``, and smaller otherwise. Equal scores are not a win.
    """
    maximize = get_metric_direction(row["competition_id"])
    candidate, rival = row[a], row[b]
    return candidate > rival if maximize else candidate < rival

def mighty_whippet_win(row):
    """Tell whether column ``b`` strictly beats column ``a`` in ``row``.

    "Beats" means greater when ``get_metric_direction`` reports that a higher score is
    better for ``row["competition_id"]``, and smaller otherwise. Equal scores are not a win.
    """
    maximize = get_metric_direction(row["competition_id"])
    candidate, rival = row[b], row[a]
    if maximize:
        return candidate > rival
    return candidate < rival

# Head-to-head tally over the rows of `cmp`.
# A plain Python loop is used instead of DataFrame.apply so that every count is an
# ordinary int and an empty `cmp` simply yields zeros.
stats = {
    "devoted_louse_win": sum(bool(devoted_louse_win(row)) for _, row in cmp.iterrows()),
    "mighty_whippet_win": sum(bool(mighty_whippet_win(row)) for _, row in cmp.iterrows()),
    "tie": int((cmp[a] == cmp[b]).sum()),
    "total": len(cmp),
}

stats


In [None]:
texts_for_reward = ["of custom CNNs, I will use transfer learning from a high-performing pre-trained model designed for small images: EfficientNet-b0 from the `timm` library, finetuned for binary classification. I'll freeze most layers and train only the final classifier head to prevent overfitting, given the relatively small dataset. During training, I'll use moderate data augmentation to improve generalization, perform a stratified train/validation split, and report ROC AUC for the validation set. I'll save the test predictions as required in `./submission/submission.csv`, with each row containing the image id and the model-predicted probability for the cactus class.", "of using a CNN built from scratch as in previous attempts, I propose using transfer learning with a lightweight pretrained architecture (such as MobileNetV2) from torchvision. We'll adapt the model by fine-tuning only the classifier head on the 32x32 images, applying simple augmentations (random horizontal/vertical flips) to reduce overfitting. We'll use a validation split to monitor ROC-AUC, saving the best model, and output test probabilities to `submission/submission.csv`. This approach leverages the strong feature extractors of pretrained models, which should boost performance compared to training from scratch.", "of using a custom CNN or a classifier built from scratch, I will leverage transfer learning by fine-tuning a lightweight pretrained CNN from timm (such as EfficientNet-B0) for this small image binary classification task. Even though the input images are only 32x32, pretrained models can still help by capturing robust low-level features. I'll resize all images to fit the model's expected input size, replace the classifier head for binary output, apply basic augmentations, and use early stopping on a validation ROC AUC. After training, I'll predict probabilities for the test images and write them to `./submission/submission.csv`. 
The ROC AUC on a validation set will be printed after training."]


In [None]:
len(texts_for_reward)

In [None]:
texts_for_reward[0]

In [None]:
texts_for_reward = ['implement a straightforward text classification pipeline using TF-IDF features and a Logistic Regression classifier for each toxicity target (multi-label, six heads). Each classifier will independently predict the probability for one label, and the final performance will be evaluated using mean column-wise ROC-AUC on a hold-out validation set from the training data. After validation, we will use the entire training set to retrain the classifiers and produce test set predictions. The submission CSV will be saved at `./submission/submission.csv`.->ove the previous solution, I propose stacking a second-level non-linear model (e.g., a LightGBM classifier) on top of the LogisticRegression base model\'s out-of-fold predictions. The first step is to generate out-of-fold predictions on the training set and predictions on the validation set for each target using the existing TF-IDF+LR pipeline. Then, these predictions (together with the original TF-IDF features, if desired) are used as input to train LightGBM OneVsRest classifiers. This stacking introduces nonlinearity, can leverage differing strengths of the models, and often boosts ROC-AUC in multilabel settings. We\'ll report the mean column-wise ROC-AUC on the validation set and output the test submission CSV as before.->her improve the existing stacked model (TF-IDF + Logistic Regression + LGBM) without major architectural change, I will add character n-gram features to the TF-IDF vectorizer in addition to word n-grams. Character n-grams are particularly effective for toxicity detection, as they can capture variations in offensive language and spelling that word n-grams may miss. Specifically, I will set `analyzer="char_wb"` with n-gram range (3,5) for a second vectorizer, concatenate its features with the word-based TF-IDF, and proceed as before. 
The code below includes this change, produces validation metrics, and outputs the required submission file.->ove the previous solution, I propose to add text preprocessing/cleaning before feature extraction, specifically by removing special characters, unnecessary white spaces, and converting all text to lowercase. This standardization can improve the quality of the TF-IDF features, potentially resulting in improved model performance. The rest of the modeling pipeline will remain unchanged, so we can directly observe the impact of preprocessing on validation ROC-AUC. The new cleaning step will be applied to all train, validation, and test comment texts before TF-IDF vectorization.->ove the previous stacked TF-IDF solution, I propose to add feature enrichment using a pretrained transformer-based sentence embedding model (specifically, `sentence-transformers/all-MiniLM-L6-v2`). We\'ll extract dense vector representations for each comment and concatenate these embeddings to the existing TF-IDF features. The rationale is that transformer-based semantic features should complement ngram-based models and help with generalization when stacked with the existing pipeline. The approach will be implemented for both training and test data, used in both level-1 (LogisticRegression) and level-2 (LightGBM) models, and evaluated via mean column-wise ROC-AUC on the validation set. The final test predictions will be saved to `./submission/submission.csv`.', 'implement a straightforward text classification pipeline using TF-IDF features and a Logistic Regression classifier for each toxicity target (multi-label, six heads). Each classifier will independently predict the probability for one label, and the final performance will be evaluated using mean column-wise ROC-AUC on a hold-out validation set from the training data. After validation, we will use the entire training set to retrain the classifiers and produce test set predictions. 
The submission CSV will be saved at `./submission/submission.csv`.->ove the previous solution, I propose stacking a second-level non-linear model (e.g., a LightGBM classifier) on top of the LogisticRegression base model\'s out-of-fold predictions. The first step is to generate out-of-fold predictions on the training set and predictions on the validation set for each target using the existing TF-IDF+LR pipeline. Then, these predictions (together with the original TF-IDF features, if desired) are used as input to train LightGBM OneVsRest classifiers. This stacking introduces nonlinearity, can leverage differing strengths of the models, and often boosts ROC-AUC in multilabel settings. We\'ll report the mean column-wise ROC-AUC on the validation set and output the test submission CSV as before.->her improve the existing stacked model (TF-IDF + Logistic Regression + LGBM) without major architectural change, I will add character n-gram features to the TF-IDF vectorizer in addition to word n-grams. Character n-grams are particularly effective for toxicity detection, as they can capture variations in offensive language and spelling that word n-grams may miss. Specifically, I will set `analyzer="char_wb"` with n-gram range (3,5) for a second vectorizer, concatenate its features with the word-based TF-IDF, and proceed as before. The code below includes this change, produces validation metrics, and outputs the required submission file.->ove the previous solution, I propose to add text preprocessing/cleaning before feature extraction, specifically by removing special characters, unnecessary white spaces, and converting all text to lowercase. This standardization can improve the quality of the TF-IDF features, potentially resulting in improved model performance. The rest of the modeling pipeline will remain unchanged, so we can directly observe the impact of preprocessing on validation ROC-AUC. 
The new cleaning step will be applied to all train, validation, and test comment texts before TF-IDF vectorization.->ove upon the previous solution, I propose the following: The previous approach stacked Logistic Regression (Level 1) predictions using LightGBM as a meta-model, but it did not utilize domain-specific features beyond TF-IDF. As an actionable next step, I will engineer several hand-crafted features derived from the raw text (such as comment length, number of capital letters, number of exclamation/question marks, count of unique words, proportion of uppercase, etc.). These features will be concatenated with the TF-IDF features, and the same pipeline will be used (Logistic Regression + LightGBM stack). This improvement is expected to help especially with short/loud/shouty or ‚Äúobvious‚Äù toxic comments and should provide an incremental boost in validation ROC-AUC.', 'implement a straightforward text classification pipeline using TF-IDF features and a Logistic Regression classifier for each toxicity target (multi-label, six heads). Each classifier will independently predict the probability for one label, and the final performance will be evaluated using mean column-wise ROC-AUC on a hold-out validation set from the training data. After validation, we will use the entire training set to retrain the classifiers and produce test set predictions. The submission CSV will be saved at `./submission/submission.csv`.->ove the previous solution, I propose stacking a second-level non-linear model (e.g., a LightGBM classifier) on top of the LogisticRegression base model\'s out-of-fold predictions. The first step is to generate out-of-fold predictions on the training set and predictions on the validation set for each target using the existing TF-IDF+LR pipeline. Then, these predictions (together with the original TF-IDF features, if desired) are used as input to train LightGBM OneVsRest classifiers. 
This stacking introduces nonlinearity, can leverage differing strengths of the models, and often boosts ROC-AUC in multilabel settings. We\'ll report the mean column-wise ROC-AUC on the validation set and output the test submission CSV as before.->her improve the existing stacked model (TF-IDF + Logistic Regression + LGBM) without major architectural change, I will add character n-gram features to the TF-IDF vectorizer in addition to word n-grams. Character n-grams are particularly effective for toxicity detection, as they can capture variations in offensive language and spelling that word n-grams may miss. Specifically, I will set `analyzer="char_wb"` with n-gram range (3,5) for a second vectorizer, concatenate its features with the word-based TF-IDF, and proceed as before. The code below includes this change, produces validation metrics, and outputs the required submission file.->ove the previous solution, I propose to add text preprocessing/cleaning before feature extraction, specifically by removing special characters, unnecessary white spaces, and converting all text to lowercase. This standardization can improve the quality of the TF-IDF features, potentially resulting in improved model performance. The rest of the modeling pipeline will remain unchanged, so we can directly observe the impact of preprocessing on validation ROC-AUC. The new cleaning step will be applied to all train, validation, and test comment texts before TF-IDF vectorization.->vious solution uses TF-IDF features and stacking (LogisticRegression + LightGBM). An actionable improvement is to enrich the model by adding pretrained transformer embeddings (e.g., MiniLM) as additional features, concatenated to the TF-IDF representation. This provides semantic information the classic TF-IDF cannot capture, and could improve accuracy especially on subtle examples. I will use `sentence-transformers` to extract MiniLM embeddings, concatenate with TF-IDF, then rerun the same stacking architecture. 
This preserves the validation/test pipeline and submission logic.']

In [None]:
texts_for_reward = ['implement a straightforward text classification pipeline using TF-IDF features and a Logistic Regression classifier for each toxicity target (multi-label, six heads). Each classifier will independently predict the probability for one label, and the final performance will be evaluated using mean column-wise ROC-AUC on a hold-out validation set from the training data. After validation, we will use the entire training set to retrain the classifiers and produce test set predictions. The submission CSV will be saved at `./submission/submission.csv`.->ove the previous solution, I propose stacking a second-level non-linear model (e.g., a LightGBM classifier) on top of the LogisticRegression base model\'s out-of-fold predictions. The first step is to generate out-of-fold predictions on the training set and predictions on the validation set for each target using the existing TF-IDF+LR pipeline. Then, these predictions (together with the original TF-IDF features, if desired) are used as input to train LightGBM OneVsRest classifiers. This stacking introduces nonlinearity, can leverage differing strengths of the models, and often boosts ROC-AUC in multilabel settings. We\'ll report the mean column-wise ROC-AUC on the validation set and output the test submission CSV as before.->her improve the existing stacked model (TF-IDF + Logistic Regression + LGBM) without major architectural change, I will add character n-gram features to the TF-IDF vectorizer in addition to word n-grams. Character n-grams are particularly effective for toxicity detection, as they can capture variations in offensive language and spelling that word n-grams may miss. Specifically, I will set `analyzer="char_wb"` with n-gram range (3,5) for a second vectorizer, concatenate its features with the word-based TF-IDF, and proceed as before. 
The code below includes this change, produces validation metrics, and outputs the required submission file.', "implement a straightforward text classification pipeline using TF-IDF features and a Logistic Regression classifier for each toxicity target (multi-label, six heads). Each classifier will independently predict the probability for one label, and the final performance will be evaluated using mean column-wise ROC-AUC on a hold-out validation set from the training data. After validation, we will use the entire training set to retrain the classifiers and produce test set predictions. The submission CSV will be saved at `./submission/submission.csv`.->ove the previous solution, I propose stacking a second-level non-linear model (e.g., a LightGBM classifier) on top of the LogisticRegression base model's out-of-fold predictions. The first step is to generate out-of-fold predictions on the training set and predictions on the validation set for each target using the existing TF-IDF+LR pipeline. Then, these predictions (together with the original TF-IDF features, if desired) are used as input to train LightGBM OneVsRest classifiers. This stacking introduces nonlinearity, can leverage differing strengths of the models, and often boosts ROC-AUC in multilabel settings. We'll report the mean column-wise ROC-AUC on the validation set and output the test submission CSV as before.->vious solution uses a Logistic Regression + LightGBM stacking pipeline with only TF-IDF-based meta-features for the stacker. A simple, atomic improvement is to augment the meta-feature stack: For the LightGBM (level-2) model, in addition to the Logistic Regression OOF predictions, supply a few hand-crafted meta-features derived from the comments themselves‚Äîsuch as length of the comment, number of capital letters, number of exclamation/question marks, and ratio of punctuation. This will provide additional signals for the stacking model, potentially enhancing discrimination and mean ROC-AUC. 
All other parts (CV, vectorization, stacking) are unchanged for isolation and clarity of effect.", "implement a straightforward text classification pipeline using TF-IDF features and a Logistic Regression classifier for each toxicity target (multi-label, six heads). Each classifier will independently predict the probability for one label, and the final performance will be evaluated using mean column-wise ROC-AUC on a hold-out validation set from the training data. After validation, we will use the entire training set to retrain the classifiers and produce test set predictions. The submission CSV will be saved at `./submission/submission.csv`.->ove the previous solution, I propose stacking a second-level non-linear model (e.g., a LightGBM classifier) on top of the LogisticRegression base model's out-of-fold predictions. The first step is to generate out-of-fold predictions on the training set and predictions on the validation set for each target using the existing TF-IDF+LR pipeline. Then, these predictions (together with the original TF-IDF features, if desired) are used as input to train LightGBM OneVsRest classifiers. This stacking introduces nonlinearity, can leverage differing strengths of the models, and often boosts ROC-AUC in multilabel settings. We'll report the mean column-wise ROC-AUC on the validation set and output the test submission CSV as before.->her improve the stacking ensemble, I propose to extend the input to the Level-2 (LightGBM) model by concatenating both the original TF-IDF features and the out-of-fold Logistic Regression (Level-1) prediction probabilities. This gives LightGBM richer information and lets it use high-dimensional text features alongside meta-predictions, increasing its modeling capacity while retaining interpretability. The code below modifies the stacking process accordingly and prints the validation ROC-AUC. The submission file is written to the required location."]


In [None]:
len(texts_for_reward)

In [None]:
texts_for_reward[0].split("->")

In [None]:
texts_for_reward[1].split("->")

In [None]:
texts_for_reward[2].split("->")