In [4]:
# =========================
# 0) Imports, constants & data loading
# =========================
import os, re, glob, time, json
import numpy as np
import pandas as pd

# Shared random seed: used for PCA, the train/validation split and CatBoost.
SEED = 42
# The four score columns predicted jointly by the multi-target regressor.
TARGET_COLS = [
    "task_achievement",
    "coherence_and_cohesion",
    "lexical_resource",
    "grammatical_range",
]
N_COMPONENTS_PCA = 256     # number of PCA components kept from the embeddings
ANCHOR_DIR = "./Anchors"   # directory scanned for anchor submission CSVs
OPTUNA_TRIALS = 500        # reduce/increase as needed
TEMP_SOFTMAX = 20.0        # anchor similarity temperature
TOPK_ANCHORS = 5           # number of most-similar anchors used per estimate
VAL_SIZE = 0.2              # single 80:20 split for speed


# Embedding tables and engineered-feature tables for both splits.
# NOTE(review): the embedding CSVs are assumed to be row-aligned with the
# corresponding feature CSVs — confirm against the notebook that wrote them.
train_emb = pd.read_csv("train_embeddings.csv")
test_emb  = pd.read_csv("test_embeddings.csv")
train_df  = pd.read_csv("train_feature_eng.csv")
test_df   = pd.read_csv("test_feature_eng.csv")

In [None]:
# ==========================
# The helpers below are kept commented out for reference only: they were part of
# the original notebook but have been moved to the Feature Engineering notebook.
# ==========================

# import numpy as np
# import pandas as pd
# import re
# import nltk
# from nltk import word_tokenize, pos_tag
# from collections import Counter
# from sentence_transformers import SentenceTransformer, util
# from nltk.corpus import wordnet as wn

# nltk.download("wordnet")
# nltk.download("omw-1.4")

# # ---------- Helper: Lexical sophistication ----------
# import requests

# url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
# response = requests.get(url)
# response.raise_for_status()  # Raise an error if the request fails
# COMMON_WORDS = set(response.text.split()[:2000])

# def lexical_sophistication(tokens):
#     rare_words = [t for t in tokens if t.lower() not in COMMON_WORDS]
#     return len(rare_words) / len(tokens) if tokens else 0

# # ---------- Helper: Root & Corrected TTR ----------
# def root_ttr(tokens):
#     return len(set(tokens)) / np.sqrt(len(tokens)) if tokens else 0

# def corrected_ttr(tokens):
#     return len(set(tokens)) / np.sqrt(2 * len(tokens)) if tokens else 0

# # ---------- Helper: MATTR ----------
# def mattr(tokens, window_size=50):
#     if len(tokens) < window_size:
#         return len(set(tokens)) / len(tokens)
#     scores = []
#     for i in range(len(tokens) - window_size + 1):
#         window = tokens[i:i+window_size]
#         scores.append(len(set(window)) / window_size)
#     return np.mean(scores)

# # ---------- Helper: Complex sentence ratio ----------
# def complex_sentence_ratio(text):
#     sentences = nltk.sent_tokenize(text)
#     complex_count = sum(1 for s in sentences if len(re.findall(r"\b(and|but|or|because|which|although)\b", s.lower())) >= 2)
#     return complex_count / len(sentences) if sentences else 0

# # ---------- Helper: Hypernym depth ----------
# def avg_hypernym_depth(tokens):
#     depths = []
#     for t in tokens:
#         synsets = wn.synsets(t)
#         if synsets:
#             depths.append(max((s.min_depth() for s in synsets), default=0))
#     return np.mean(depths) if depths else 0

# # ---------- Helper: Topic drift ----------
# def topic_drift(text, model):
#     paras = [p.strip() for p in text.split("\n") if p.strip()]
#     if len(paras) < 2:
#         return 0
#     emb_start = model.encode([paras[0]], convert_to_tensor=True)
#     emb_end = model.encode([paras[-1]], convert_to_tensor=True)
#     return 1 - util.cos_sim(emb_start, emb_end).item()

# # ---------- Helper: Alliteration ratio ----------
# def alliteration_ratio(tokens):
#     if len(tokens) < 2:
#         return 0
#     count = sum(1 for i in range(len(tokens)-1) if tokens[i][0].lower() == tokens[i+1][0].lower())
#     return count / (len(tokens)-1)

# # ---------- Helper: Hedging words ----------
# HEDGING_WORDS = {"might", "maybe", "perhaps", "possibly", "could", "should"}
# def hedging_count(tokens):
#     return sum(1 for t in tokens if t.lower() in HEDGING_WORDS)

# # ---------- Helper: Emotive words ----------
# EMOTIVE_WORDS = {"love", "hate", "happy", "sad", "angry", "excited", "worried", "proud"}
# def emotive_count(tokens):
#     return sum(1 for t in tokens if t.lower() in EMOTIVE_WORDS)

# # ---------- Main function ----------
# def add_extra_features(df, text_col="essay_clean", prompt_col="prompt_clean"):
#     sbert = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
#     new_feats = {
#         "root_ttr": [],
#         "corrected_ttr": [],
#         "mattr": [],
#         "lexical_sophistication": [],
#         "complex_sentence_ratio": [],
#         "avg_hypernym_depth": [],
#         "topic_drift": [],
#         "alliteration_ratio": [],
#         "hedging_count": [],
#         "emotive_count": []
#     }

#     for text, prompt in zip(df[text_col], df[prompt_col]):
#         tokens = [t for t in word_tokenize(text) if t.isalpha()]
#         new_feats["root_ttr"].append(root_ttr(tokens))
#         new_feats["corrected_ttr"].append(corrected_ttr(tokens))
#         new_feats["mattr"].append(mattr(tokens))
#         new_feats["lexical_sophistication"].append(lexical_sophistication(tokens))
#         new_feats["complex_sentence_ratio"].append(complex_sentence_ratio(text))
#         new_feats["avg_hypernym_depth"].append(avg_hypernym_depth(tokens))
#         new_feats["topic_drift"].append(topic_drift(text, sbert))
#         new_feats["alliteration_ratio"].append(alliteration_ratio(tokens))
#         new_feats["hedging_count"].append(hedging_count(tokens))
#         new_feats["emotive_count"].append(emotive_count(tokens))

#     return pd.concat([df, pd.DataFrame(new_feats)], axis=1)


# train_df = add_extra_features(train_df)
# test_df = add_extra_features(test_df)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/semiqolonn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/semiqolonn/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# =========================
# 1) PCA(emb) & build feature matrices
# =========================
from sklearn.decomposition import PCA

# Fit PCA on the training embeddings only and reuse the fitted projection for
# the test embeddings.
pca = PCA(n_components=N_COMPONENTS_PCA, random_state=SEED)
train_emb_pca = pca.fit_transform(train_emb)
test_emb_pca  = pca.transform(test_emb)
print(f"[PCA] explained_variance_sum = {pca.explained_variance_ratio_.sum():.4f}")

# Targets are kept separately; raw-text columns (and, for train, the targets)
# are removed from the feature frames. errors="ignore" tolerates absent columns.
y = train_df[TARGET_COLS].copy()
drop_test  = ["prompt","essay","essay_clean","prompt_clean","merged_text"]
drop_train = drop_test + TARGET_COLS

X_train_core = train_df.drop(columns=drop_train, errors="ignore").reset_index(drop=True)
X_test_core  = test_df.drop(columns=drop_test, errors="ignore").reset_index(drop=True)

train_emb_df = pd.DataFrame(train_emb_pca, columns=[f"emb_{i}" for i in range(N_COMPONENTS_PCA)])
test_emb_df  = pd.DataFrame(test_emb_pca,  columns=[f"emb_{i}" for i in range(N_COMPONENTS_PCA)])

# pd.concat(axis=1) aligns on the index, so a row-count mismatch would not
# fail: it would silently pad the shorter frame with NaN rows. Fail loudly.
if len(X_train_core) != len(train_emb_df):
    raise ValueError(
        f"train row mismatch: features={len(X_train_core)} embeddings={len(train_emb_df)}"
    )
if len(X_test_core) != len(test_emb_df):
    raise ValueError(
        f"test row mismatch: features={len(X_test_core)} embeddings={len(test_emb_df)}"
    )

train = pd.concat([X_train_core, train_emb_df], axis=1)
test  = pd.concat([X_test_core,  test_emb_df],  axis=1)

# The model only sees positional columns once .values is taken, so train and
# test must expose the same features in the same order.
if list(train.columns) != list(test.columns):
    raise ValueError(
        "train/test feature columns differ (names or order); symmetric difference: "
        f"{sorted(set(train.columns) ^ set(test.columns))}"
    )

X_mat  = train.values.astype(np.float32)
Y_mat  = y.values.astype(np.float32)  # may contain NaN
X_test = test.values.astype(np.float32)

print("Shapes -> X:", X_mat.shape, " Y:", Y_mat.shape, " X_test:", X_test.shape)

[PCA] explained_variance_sum = 0.9116
Shapes -> X: (9912, 321)  Y: (9912, 4)  X_test: (473, 321)


In [7]:
# =========================
# 2) Load anchors
# =========================
def _read_pred_vec(path: str, n_expected_rows: "int | None" = None) -> np.ndarray:
    """Read one submission CSV and flatten its target columns into a 1-D vector.

    Columns whose name starts with ``Unnamed`` are dropped first. If every name
    in ``TARGET_COLS`` is present, those columns are selected in that order;
    otherwise the last ``len(TARGET_COLS)`` columns are taken positionally and
    relabelled as ``TARGET_COLS``.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.
    n_expected_rows : int or None
        Required number of rows. ``None`` skips the row-count check.

    Returns
    -------
    np.ndarray
        float32 vector of length ``n_rows * len(TARGET_COLS)`` in row-major
        order (all targets of row 0, then row 1, ...).

    Raises
    ------
    ValueError
        If the file has fewer usable columns than ``TARGET_COLS`` or its row
        count differs from ``n_expected_rows``.
    """
    df = pd.read_csv(path)
    drop_unnamed = [c for c in df.columns if str(c).startswith("Unnamed")]
    if drop_unnamed:
        df = df.drop(columns=drop_unnamed)

    n_targets = len(TARGET_COLS)
    if all(c in df.columns for c in TARGET_COLS):
        df = df[TARGET_COLS]
    else:
        # Positional fallback: previously a too-narrow file surfaced as an
        # opaque pandas "Length mismatch" error when relabelling the columns.
        if df.shape[1] < n_targets:
            raise ValueError(
                f"{os.path.basename(path)} has {df.shape[1]} columns, need >= {n_targets}"
            )
        df = df.iloc[:, -n_targets:].copy()
        df.columns = TARGET_COLS

    # Only enforce the row count when the caller supplied one; comparing
    # against None would always be unequal and reject every file.
    if n_expected_rows is not None and len(df) != n_expected_rows:
        raise ValueError(f"{os.path.basename(path)} rows={len(df)} != expected {n_expected_rows}")
    return df.values.astype(np.float32).ravel()

def load_anchor_submissions(anchor_dir=ANCHOR_DIR, n_expected_rows=None):
    """Collect anchor submissions from ``anchor_dir``.

    Only files named ``<digits>.csv`` are used; the digits divided by 100 are
    taken as that submission's leaderboard score (``11.csv`` -> ``0.11``).
    Files are visited in sorted path order.

    Returns a list of dicts with keys ``"name"`` (file name), ``"lb"`` (score)
    and ``"vec"`` (flattened predictions from ``_read_pred_vec``).

    Raises ``ValueError`` when fewer than three anchors are found.
    """
    csv_paths = sorted(glob.glob(os.path.join(anchor_dir, "*.csv")))

    found = []
    for csv_path in csv_paths:
        file_name = os.path.basename(csv_path)
        score_match = re.match(r"(\d+)\.csv$", file_name)
        if score_match is None:
            # Not a score-named file; ignore it.
            continue
        found.append(
            {
                "name": file_name,
                "lb": float(score_match.group(1)) / 100.0,
                "vec": _read_pred_vec(csv_path, n_expected_rows),
            }
        )

    if len(found) < 3:
        raise ValueError("Need >=3 anchors")
    return found

# Load every anchor submission from ANCHOR_DIR; each file must have exactly as
# many rows as the `test` feature frame. The printout lists, per anchor, its
# file name, the score parsed from that name, and the flattened vector length.
ANCHORS = load_anchor_submissions(n_expected_rows=test.shape[0])
print("Anchors:", [(a["name"], a["lb"], len(a["vec"])) for a in ANCHORS])

Anchors: [('11.csv', 0.11, 1892), ('15.csv', 0.15, 1892), ('17.csv', 0.17, 1892), ('51.csv', 0.51, 1892), ('78.csv', 0.78, 1892), ('86.csv', 0.86, 1892)]


In [8]:
# =========================
# 3) Utils
# =========================
from sklearn.metrics import mean_squared_error
from numpy.linalg import norm
from sklearn.model_selection import train_test_split

def mean_rmse_across_targets(y_true, y_pred):
    """Average the per-target RMSE, skipping missing ground-truth entries.

    For each column of ``y_true`` the RMSE is computed only on rows where the
    true value is not NaN. Columns with no observed value are left out of the
    average. Returns ``inf`` if no column has any observed value.
    """
    n_targets = y_true.shape[1]
    scores = []
    for col in range(n_targets):
        observed = np.logical_not(np.isnan(y_true[:, col]))
        if not observed.any():
            continue
        mse = mean_squared_error(y_true[observed, col], y_pred[observed, col])
        scores.append(np.sqrt(mse))

    if not scores:
        return float("inf")
    return float(np.mean(scores))

def predict_lb_from_anchors(test_vec: np.ndarray, anchors, temp: float, k: int):
    """Estimate a leaderboard score from similarity to scored anchor submissions.

    The cosine similarity between ``test_vec`` and every anchor vector is
    computed, the ``k`` most similar anchors are kept, and their scores are
    averaged with softmax weights ``softmax(similarity * temp)``.

    Parameters
    ----------
    test_vec : np.ndarray
        Flattened prediction vector; must have the same length as each
        anchor's ``"vec"``.
    anchors : sequence of dict
        Each item provides ``"name"``, ``"lb"`` and ``"vec"``.
    temp : float
        Softmax temperature multiplier; larger values concentrate the weight
        on the most similar anchor.
    k : int
        Number of nearest anchors to use (capped at ``len(anchors)``).

    Returns
    -------
    (float, list)
        The weighted score and a list of ``(name, similarity, lb)`` tuples for
        the anchors used, most similar first.

    Raises
    ------
    ValueError
        If ``anchors`` is empty or ``k < 1``.
    """
    if len(anchors) == 0:
        raise ValueError("anchors must not be empty")
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # The small epsilon keeps the division defined for all-zero vectors.
    tvn = norm(test_vec) + 1e-12
    sims = np.array(
        [
            float(np.dot(test_vec, a["vec"]) / (tvn * (norm(a["vec"]) + 1e-12)))
            for a in anchors
        ],
        dtype=np.float32,
    )

    idx = np.argsort(-sims)[:min(k, len(sims))]
    sims_k = sims[idx]
    lbs_k = np.array([anchors[i]["lb"] for i in idx], dtype=np.float32)

    # Shift by the maximum before exponentiating: mathematically the same
    # softmax, but exp() can no longer overflow float32 for large `temp`
    # (which previously produced inf/inf = NaN weights).
    logits = sims_k * temp
    w = np.exp(logits - logits.max())
    w /= w.sum()

    pred_lb = float((w * lbs_k).sum())
    top_info = list(zip([anchors[i]["name"] for i in idx], sims_k, lbs_k))
    return pred_lb, top_info

In [9]:
# =========================
# 4) GPU-safe base params for tuning
# =========================
from catboost import CatBoostRegressor, Pool

# Default CatBoost settings shared by every tuning run.
# train_val_and_test_once() overlays the per-trial params on top of this dict,
# so any key that the Optuna objective sets (e.g. iterations, depth,
# learning_rate, bootstrap_type) takes precedence over the value given here.
BASE_PARAMS_GPU = dict(
    loss_function="MultiRMSE",     # joint loss over all target columns
    eval_metric="MultiRMSE",
    task_type="GPU",               # replaced by "CPU" when gpu=False is requested
    devices="0",
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    border_count=128,
    min_data_in_leaf=5,
    random_seed=SEED,
    use_best_model=True,           # an eval_set is supplied when fitting during tuning
    verbose=False,
)

def train_val_and_test_once(params, X_tr, Y_tr, X_va, Y_va, X_test, es_rounds=200, gpu=True):
    """Fit one CatBoost model with early stopping, score it, and predict the test set.

    Parameters
    ----------
    params : dict
        Overrides applied on top of ``BASE_PARAMS_GPU``.
    X_tr, Y_tr : array-like
        Training features and targets.
    X_va, Y_va : array-like
        Validation features and targets; used as the early-stopping eval set
        and for the returned validation score.
    X_test : array-like
        Features to predict after fitting.
    es_rounds : int
        Early-stopping patience passed to ``fit``.
    gpu : bool
        Train on GPU when True, otherwise on CPU.

    Returns
    -------
    (float, np.ndarray)
        Mean per-target validation RMSE and the flattened float32 test
        predictions.
    """
    base = dict(BASE_PARAMS_GPU)
    if not gpu:
        base["task_type"] = "CPU"
        # Device selection is a GPU setting; it has no role in a CPU run.
        base.pop("devices", None)
    merged = {**base, **params}

    # Poisson bootstrap is available on GPU only, so a CPU run that inherits it
    # from the tuned params would fail at construction/fit time. Map it to a
    # CPU-capable type that is compatible with the remaining sampling options:
    # Bayesian works with bagging_temperature, Bernoulli works with subsample.
    if merged.get("task_type") == "CPU" and merged.get("bootstrap_type") == "Poisson":
        if "subsample" in merged:
            merged["bootstrap_type"] = "Bernoulli"
            merged.pop("bagging_temperature", None)
        else:
            merged["bootstrap_type"] = "Bayesian"

    m = CatBoostRegressor(**merged)
    tr_pool = Pool(X_tr, label=Y_tr)
    va_pool = Pool(X_va, label=Y_va)
    m.fit(tr_pool, eval_set=va_pool, early_stopping_rounds=es_rounds, use_best_model=True)

    va_pred = m.predict(X_va)
    val_rmse = mean_rmse_across_targets(Y_va, va_pred)
    # Flatten so the vector is directly comparable with the anchor vectors.
    test_pred = m.predict(X_test).astype(np.float32).ravel()
    return val_rmse, test_pred

In [10]:
# =========================
# 5) Optuna tuning (GPU-safe) using anchors
# =========================
import optuna

# Keep only "complete cases": rows where every target column is present.
# Tuning uses the plain MultiRMSE loss (see BASE_PARAMS_GPU), so rows with
# missing targets are excluded here.
mask_cc = ~np.isnan(Y_mat).any(axis=1)
X_cc = X_mat[mask_cc]
Y_cc = Y_mat[mask_cc]
# Single shuffled hold-out split; VAL_SIZE is the validation fraction.
X_tr, X_va, Y_tr, Y_va = train_test_split(X_cc, Y_cc, test_size=VAL_SIZE, random_state=SEED, shuffle=True)

def tune_with_anchor_lb(X_tr, Y_tr, X_va, Y_va, X_test, n_trials=OPTUNA_TRIALS, temp=TEMP_SOFTMAX, k=TOPK_ANCHORS):
    """Run an Optuna search that minimises the anchor-estimated leaderboard score.

    Each trial trains one model (GPU first, CPU on failure), predicts the test
    set, and scores that prediction vector against the module-level ``ANCHORS``
    with ``predict_lb_from_anchors``. The validation RMSE is recorded as a
    trial user attribute but is not the optimisation target.

    Parameters
    ----------
    X_tr, Y_tr, X_va, Y_va : array-like
        Training and validation data passed to ``train_val_and_test_once``.
    X_test : array-like
        Test features whose predictions are compared with the anchors.
    n_trials : int
        Number of Optuna trials.
    temp, k : float, int
        Softmax temperature and neighbour count for the anchor estimate.

    Returns
    -------
    optuna.Study
        The finished study; each trial carries ``val_rmse`` and
        ``anchor_similar`` in ``user_attrs``.
    """
    # Seed the sampler so the sequence of suggested hyper-parameters is
    # repeatable across runs, consistent with the SEED used elsewhere.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )

    def objective(trial: optuna.Trial):
        params = {
            "iterations": trial.suggest_int("iterations", 800, 3000),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 30.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.1, 2.0),
            "boosting_type": "Plain",          # GPU-safe
            "bootstrap_type": "Poisson",       # safer for GPU
            "loss_function": "MultiRMSE",      # supported
        }
        try:
            val_rmse, test_vec = train_val_and_test_once(params, X_tr, Y_tr, X_va, Y_va, X_test, gpu=True)
        except Exception as e:
            print("[GPU failed, falling back to CPU]:", e)
            val_rmse, test_vec = train_val_and_test_once(params, X_tr, Y_tr, X_va, Y_va, X_test, gpu=False)

        assert len(test_vec) == len(ANCHORS[0]["vec"]), f"Vector mismatch {len(test_vec)} vs {len(ANCHORS[0]['vec'])}"
        pred_lb, top = predict_lb_from_anchors(test_vec, ANCHORS, temp=temp, k=k)

        trial.set_user_attr("val_rmse", float(val_rmse))
        # Store plain Python floats: NumPy scalars are not JSON-serialisable,
        # which matters as soon as the study is backed by a persistent storage.
        trial.set_user_attr(
            "anchor_similar",
            [(name, float(sim), float(lb)) for name, sim, lb in top],
        )
        return float(pred_lb)

    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study

# Run the search. Note that `study.best_value` is the anchor-estimated
# leaderboard score returned by the objective, not the validation RMSE; the
# latter is stored per trial in user_attrs["val_rmse"].
study = tune_with_anchor_lb(X_tr, Y_tr, X_va, Y_va, X_test)
print("Best predicted LB:", study.best_value)
print("Best params:", study.best_trial.params)

[I 2025-08-12 14:49:18,920] A new study created in memory with name: no-name-4f2dbea0-b858-42a2-8782-176c38ea2df2


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2025-08-12 14:49:50,934] Trial 0 finished with value: 0.5092242360115051 and parameters: {'iterations': 1145, 'depth': 9, 'learning_rate': 0.0010902463703813343, 'l2_leaf_reg': 4.595503123609981, 'bagging_temperature': 0.3397220528396424}. Best is trial 0 with value: 0.5092242360115051.
[I 2025-08-12 14:49:58,880] Trial 1 finished with value: 0.5034897923469543 and parameters: {'iterations': 2290, 'depth': 5, 'learning_rate': 0.0035537535861948546, 'l2_leaf_reg': 0.021257020489828447, 'bagging_temperature': 0.9718735962318823}. Best is trial 1 with value: 0.5034897923469543.
[I 2025-08-12 14:50:03,325] Trial 2 finished with value: 0.4999849200248718 and parameters: {'iterations': 1262, 'depth': 5, 'learning_rate': 0.018014828526503834, 'l2_leaf_reg': 0.5331002966678468, 'bagging_temperature': 1.8435907718884936}. Best is trial 2 with value: 0.4999849200248718.
[I 2025-08-12 14:50:10,058] Trial 3 finished with value: 0.5012486577033997 and parameters: {'iterations': 2934, 'depth': 9,

In [11]:
# =========================
# 6) Final CPU refit (with rsm + grow_policy + feature_border_type)
# =========================
# Parameters for the final model. The tuned values are read from the best
# Optuna trial, with the literal defaults used if a key is absent; the remaining
# settings (rsm, grow_policy, border settings, ...) are fixed by hand here.
FINAL_PARAMS = {
    # Unlike the tuning runs, the final fit uses the missing-value-aware loss,
    # because Y_mat (all training rows) may contain NaN targets.
    "loss_function": "MultiRMSEWithMissingValues",
    "eval_metric": "MultiRMSEWithMissingValues",
    "task_type": "CPU",
    "iterations":  study.best_trial.params.get("iterations", 2000),
    "learning_rate": study.best_trial.params.get("learning_rate", 0.05),
    "depth": study.best_trial.params.get("depth", 8),
    "l2_leaf_reg": study.best_trial.params.get("l2_leaf_reg", 3.0),
    "rsm": 0.93,  # put your tuned or fixed value here
    "grow_policy": "SymmetricTree",
    "bootstrap_type": "Bayesian",
    "bagging_temperature": study.best_trial.params.get("bagging_temperature", 1.0),
    "border_count": 128,
    "feature_border_type": "GreedyLogSum",
    "min_data_in_leaf": 5,
    "random_seed": SEED,
    "use_best_model": False,  # no eval_set is passed to fit() below
    "verbose": 200,
}

# Refit on the full training matrix (no hold-out, no early stopping).
final_model = CatBoostRegressor(**FINAL_PARAMS)
final_model.fit(Pool(X_mat, label=Y_mat))

# A Unix timestamp ties the saved model, params and (later) submission together.
ts = int(time.time())
model_path = f"final_model_{ts}.cbm"
final_model.save_model(model_path)
print("Saved CatBoost model:", model_path)

# Only the Optuna-suggested params are written, not the full FINAL_PARAMS dict.
param_path = f"best_params_{ts}.json"
with open(param_path, "w") as f:
    json.dump(study.best_trial.params, f, indent=2)
print("Saved best params:", param_path)

pred_test = final_model.predict(X_test)


0:	learn: 2.3266656	total: 195ms	remaining: 5m
200:	learn: 0.9987869	total: 39.2s	remaining: 4m 21s
400:	learn: 0.7089819	total: 1m 18s	remaining: 3m 42s
600:	learn: 0.6247775	total: 1m 57s	remaining: 3m 3s
800:	learn: 0.5993216	total: 2m 36s	remaining: 2m 24s
1000:	learn: 0.5886434	total: 3m 15s	remaining: 1m 45s
1200:	learn: 0.5825808	total: 3m 53s	remaining: 1m 6s
1400:	learn: 0.5786644	total: 4m 31s	remaining: 27.3s
1541:	learn: 0.5767213	total: 4m 58s	remaining: 0us
Saved CatBoost model: final_model_1754994580.cbm
Saved best params: best_params_1754994580.json


In [12]:
# Build the submission table: one column per target followed by a 1-based
# running ID, then write it next to the model using the same timestamp.
sub = pd.DataFrame(pred_test, columns=TARGET_COLS).assign(
    ID=list(range(1, len(pred_test) + 1))  # IDs start from 1
)
sub_path = f"submission_anchor_{ts}.csv"
sub.to_csv(sub_path, index=False)
print("Saved submission:", sub_path)

Saved submission: submission_anchor_1754994580.csv
