In [21]:
import pandas as pd
from catboost import CatBoostClassifier

SEED = 42

In [22]:
train_emb = pd.read_csv('train_embeddings.csv')
test_emb = pd.read_csv('test_embeddings.csv')
train = pd.read_csv('train_feature_eng.csv')
test = pd.read_csv('test_feature_eng.csv')

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=256, random_state=SEED)
train_emb_pca = pca.fit_transform(train_emb)
test_emb_pca = pca.transform(test_emb)

print(f"explained variance total: {sum(pca.explained_variance_ratio_)}")

train_drop = ['prompt', 'essay', 'task_achievement', 'coherence_and_cohesion',
       'lexical_resource', 'grammatical_range', 'essay_clean', 'prompt_clean',
       'merged_text',]

test_drop = ['prompt', 'essay', 'essay_clean', 'prompt_clean','merged_text',]

y_columns = ['task_achievement', 'coherence_and_cohesion',
       'lexical_resource', 'grammatical_range']


y = train[y_columns]
train = train.drop(train_drop, axis=1)
test = test.drop(test_drop, axis=1)

train_emb = pd.DataFrame(train_emb_pca, columns=[f'emb_{i}' for i in range(train_emb_pca.shape[1])])
test_emb = pd.DataFrame(test_emb_pca, columns=[f'emb_{i}' for i in range(test_emb_pca.shape[1])])

train = pd.concat([train, train_emb], axis=1)
test = pd.concat([test, test_emb], axis=1)

In [27]:
# ==============================================
# Final CatBoost + CV report + full-data refit
# ==============================================
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Convert to numpy arrays for easier handling
X_mat = train.values  # Using your existing 'train' variable
Y_mat = y.values      # Using your existing 'y' variable

def mean_rmse_across_targets(y_true, y_pred):
    rmses = []
    for i in range(y_true.shape[1]):
        m = ~np.isnan(y_true[:, i])
        if m.any():
            rmses.append(np.sqrt(mean_squared_error(y_true[m, i], y_pred[m, i])))
    return float(np.mean(rmses)) if rmses else float("inf")

# Define parameters (using your existing config as base + some improvements)
params_final = dict(
    loss_function="MultiRMSEWithMissingValues",
    eval_metric="MultiRMSEWithMissingValues",
    task_type="CPU",
    iterations=2000,  # Increased for better performance
    learning_rate=0.05,  # Slightly lower for more stable training
    depth=8,  # Increased depth
    l2_leaf_reg=3.0,  # Regularization
    rsm=0.8,  # Random subspace method
    grow_policy="SymmetricTree",
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    border_count=128,
    feature_border_type="GreedyLogSum",
    min_data_in_leaf=5,
    random_seed=SEED,
    use_best_model=True,
    verbose=False,
    thread_count=-1,
)

# 5-fold CV with the parameters
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
cv_scores = []

print("Starting 5-fold Cross Validation...")
for fold_idx, (tr_idx, va_idx) in enumerate(kf.split(X_mat)):
    X_tr, X_va = X_mat[tr_idx], X_mat[va_idx]
    y_tr, y_va = Y_mat[tr_idx], Y_mat[va_idx]

    train_pool = Pool(X_tr, label=y_tr)
    val_pool   = Pool(X_va, label=y_va)

    model = CatBoostRegressor(**params_final)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=200, use_best_model=True)

    va_pred = model.predict(val_pool)
    fold_rmse = mean_rmse_across_targets(y_va, va_pred)
    cv_scores.append(fold_rmse)
    print(f"Fold {fold_idx+1}: mean RMSE = {fold_rmse:.6f}")

print(f"\n5-fold CV mean RMSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")
print(f"CV scores: {cv_scores}")

# Refit on ALL training data (disable use_best_model for final training)
print("\nRetraining on full dataset...")
params_refit = params_final.copy()
params_refit["use_best_model"] = False
params_refit["verbose"] = 200

full_pool = Pool(X_mat, label=Y_mat)
model_final = CatBoostRegressor(**params_refit)
model_final.fit(full_pool)

print("Final model training completed!")

Starting 5-fold Cross Validation...
Fold 1: mean RMSE = 1.045270
Fold 1: mean RMSE = 1.045270
Fold 2: mean RMSE = 1.007846
Fold 2: mean RMSE = 1.007846
Fold 3: mean RMSE = 1.070313
Fold 3: mean RMSE = 1.070313
Fold 4: mean RMSE = 1.058785
Fold 4: mean RMSE = 1.058785
Fold 5: mean RMSE = 1.017316

5-fold CV mean RMSE: 1.039906 ± 0.023866
CV scores: [1.0452696783039896, 1.0078463578404309, 1.070312675287826, 1.0587850800250114, 1.0173164447162555]

Retraining on full dataset...
0:	learn: 2.3480663	total: 37.1ms	remaining: 1m 14s
Fold 5: mean RMSE = 1.017316

5-fold CV mean RMSE: 1.039906 ± 0.023866
CV scores: [1.0452696783039896, 1.0078463578404309, 1.070312675287826, 1.0587850800250114, 1.0173164447162555]

Retraining on full dataset...
0:	learn: 2.3480663	total: 37.1ms	remaining: 1m 14s
200:	learn: 1.7180557	total: 7.9s	remaining: 1m 10s
200:	learn: 1.7180557	total: 7.9s	remaining: 1m 10s
400:	learn: 1.4594262	total: 15.7s	remaining: 1m 2s
400:	learn: 1.4594262	total: 15.7s	remaining: 

In [30]:
# Make predictions on test set
print("Making predictions on test set...")
X_test_mat = test.values  # Using your existing 'test' variable
test_pool = Pool(X_test_mat)
test_pred = model_final.predict(test_pool)  # shape: [n_test, 4]

# Create submission dataframe
cols = ["task_achievement", "coherence_and_cohesion", "lexical_resource", "grammatical_range"]
test_predictions_df = pd.DataFrame(test_pred, columns=cols)

# Add essay_id if it exists in test data
test_predictions_df["ID"] = [ID for ID in range(1, len(test_predictions_df)+1)]

print("Test predictions shape:", test_predictions_df.shape)
print("Test predictions preview:")
print(test_predictions_df.head())

# Save predictions
test_predictions_df.to_csv("catboost_cv_predictions.csv", index=False)
print("Saved test predictions -> catboost_cv_predictions.csv")

Making predictions on test set...
Test predictions shape: (473, 5)
Test predictions preview:
   task_achievement  coherence_and_cohesion  lexical_resource  \
0          6.064768                5.735764          5.206620   
1          1.308847                1.260713          2.175877   
2          5.764201                4.806204          4.977789   
3          5.995566                5.980945          5.539767   
4          6.844544                7.337394          6.864787   

   grammatical_range  ID  
0           5.924695   1  
1           4.413276   2  
2           5.224118   3  
3           5.504005   4  
4           6.622644   5  
Saved test predictions -> catboost_cv_predictions.csv


In [31]:
# Feature importance analysis
feature_importance = model_final.get_feature_importance()
feature_names = train.columns.tolist()

# Create feature importance dataframe
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(importance_df.head(20))

# Save feature importance
importance_df.to_csv("catboost_feature_importance.csv", index=False)
print("\nSaved feature importance -> catboost_feature_importance.csv")

# save the model
model_final.save_model("catboost_final_model.cbm")
print("Model saved as catboost_final_model.cbm")

Top 20 most important features:
                     feature  importance
35   prompt_essay_similarity    4.087226
3               unique_words    1.746218
52     grammar_errors_per100    1.663121
1                 char_count    1.389962
15      flesch_kincaid_grade    0.897655
117                   emb_62    0.842553
123                   emb_68    0.840598
51    spelling_errors_per100    0.779712
13        syllables_per_word    0.779636
6             grammar_errors    0.777190
16          dale_chall_score    0.757980
7            avg_word_length    0.757165
10      spelling_error_ratio    0.749672
14       flesch_reading_ease    0.743704
207                  emb_152    0.732110
34            topic_coverage    0.715668
107                   emb_52    0.714639
269                  emb_214    0.632453
56                     emb_1    0.627597
157                  emb_102    0.617756

Saved feature importance -> catboost_feature_importance.csv
Model saved as catboost_final_model.cbm
