In [None]:
from google.colab import files
# Prompt for files through Colab's upload helper, then echo the names that arrived.
# `uploaded` is keyed by file name (the keys are what gets printed).
uploaded = files.upload()
print("Uploaded:", [*uploaded.keys()])

Saving label_encoder.joblib to label_encoder.joblib
Saving x_test_text.pkl to x_test_text.pkl
Saving x_train_text.pkl to x_train_text.pkl
Saving y_test.pkl to y_test.pkl
Saving y_train.pkl to y_train.pkl
Uploaded: ['label_encoder.joblib', 'x_test_text.pkl', 'x_train_text.pkl', 'y_test.pkl', 'y_train.pkl']


In [None]:
import joblib
import numpy as np
from pathlib import Path

# File names of the text splits, label splits and label encoder, looked up
# relative to the runtime's current working directory.
TRAIN_PKL = "x_train_text.pkl"
TEST_PKL = "x_test_text.pkl"
Y_TRAIN_PKL = "y_train.pkl"
Y_TEST_PKL = "y_test.pkl"
LE_PKL = "label_encoder.joblib"

# Fail fast, naming the first artifact that is missing.
for required in (TRAIN_PKL, TEST_PKL, Y_TRAIN_PKL, Y_TEST_PKL, LE_PKL):
    if Path(required).exists():
        continue
    raise FileNotFoundError(f"Required file not found in runtime: {required}. Re-upload if necessary.")

# Deserialize every artifact with joblib, in the same order as the constants above.
X_train = joblib.load(TRAIN_PKL)
X_test = joblib.load(TEST_PKL)
y_train = joblib.load(Y_TRAIN_PKL)
y_test = joblib.load(Y_TEST_PKL)
le = joblib.load(LE_PKL)

# Summarize what was loaded: one count line per split, then the encoder classes.
print("✅ Loaded files into memory.")
print("Counts:")
for label, split in (
    ("  X_train:", X_train),
    ("  X_test :", X_test),
    ("  y_train:", y_train),
    ("  y_test :", y_test),
):
    print(label, len(split))
print("Label encoder classes:", getattr(le, "classes_", None))

# Texts and labels must pair up one-to-one within each split.
assert len(y_train) == len(X_train), "Mismatch: X_train and y_train lengths differ!"
assert len(y_test) == len(X_test), "Mismatch: X_test and y_test lengths differ!"

# Preview the first three training texts, each truncated to 200 characters.
print("\nSample train texts (first 3):")
for idx, text in enumerate(X_train[:3]):
    print(f"[{idx}] {text[:200]}...\n")

✅ Loaded files into memory.
Counts:
  X_train: 86489
  X_test : 21623
  y_train: 86489
  y_test : 21623
Label encoder classes: [0 1]

Sample train texts (first 3):
[0] A Republican-led congressional committee sought on Friday to assert oversight over inquiries that about 20 states are making into Exxon Mobil and climate change, reiterating demands to know more about...

[1] If there s one thing that has always remained consistent about Donald Trump, it s his infamous hairdo. He s kept it the same way for decades, and it s pretty much his calling card.However, in a pictur...

[2] South Korea, U.S., Japan kick off two-day missile tracking drill: South Korea military SEOUL (Reuters) - South Korea, the United States and Japan started a two-day missile tracking drill on Tuesday, S...



In [None]:
import torch
# Report the installed PyTorch build and whether it can see a CUDA device.
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(label, value)

PyTorch version: 2.8.0+cu126
CUDA available: True


In [None]:
# Loading the SentenceTransformer model
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint handed to SentenceTransformer; the library
# resolves it (the cell output shows the files being fetched on first use).
MODEL_NAME = "all-mpnet-base-v2"
# `model` is reused by the embedding cell further down, so keep this name.
model = SentenceTransformer(MODEL_NAME)
print(f"\n Model '{MODEL_NAME}' loaded Successfully!!")

# Quick Embedding test
# Smoke test on the first three training texts: confirms encode() runs end to end
# and shows the embedding width before the full (long) embedding pass is started.
sample_texts = X_train[:3]
sample_emb = model.encode(sample_texts, convert_to_numpy=True)
print("Test Embedding Shape:", sample_emb.shape)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


 Model 'all-mpnet-base-v2' loaded Successfully!!
Test Embedding Shape: (3, 768)


In [None]:
# Generating and Saving BERT embeddings (test + train)
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path

BATCH_SIZE = 64  # default number of texts handed to model.encode per call


def embed_texts_batched(model, texts, batch_size=BATCH_SIZE):
    """Encode ``texts`` in fixed-size batches and stack the results row-wise.

    Args:
        model: Object exposing ``encode(batch, show_progress_bar=...,
            convert_to_numpy=...)``; in this notebook a ``SentenceTransformer``.
        texts: Sized, sliceable collection of input texts.
        batch_size: Positive number of texts sent to ``model.encode`` per call.

    Returns:
        A 2-D ``np.ndarray`` with one row per input text, in input order.
        For empty ``texts`` an array with zero rows is returned; its width is
        taken from ``model.get_sentence_embedding_dimension()`` when the model
        provides that method, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    n_texts = len(texts)
    if n_texts == 0:
        # np.vstack([]) raises ValueError, so build the empty result explicitly.
        dim_getter = getattr(model, "get_sentence_embedding_dimension", None)
        dim = dim_getter() if callable(dim_getter) else None
        # NOTE(review): float32 is assumed to match the encoder's output dtype — confirm.
        return np.empty((0, dim or 0), dtype=np.float32)

    all_embs = []
    for start in tqdm(range(0, n_texts, batch_size), desc="Embedding Batches"):
        # Materialize the slice as a plain list so encode() always receives a list,
        # whatever sliceable container the caller passed in.
        batch = list(texts[start:start + batch_size])
        emb = model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
        all_embs.append(emb)
    return np.vstack(all_embs)

# Embed the training split first, then the test split, with the same batch size.
print("Computing TRAIN embeddings...")
X_train_emb = embed_texts_batched(model, X_train, batch_size=BATCH_SIZE)

print("Computing TEST embeddings...")
X_test_emb = embed_texts_batched(model, X_test, batch_size=BATCH_SIZE)

# Write both matrices to .npy files in the working directory (train, then test).
for out_path, matrix in (
    ("X_train_emb.npy", X_train_emb),
    ("X_test_emb.npy", X_test_emb),
):
    np.save(out_path, matrix)

# Confirm completion and show the resulting array shapes.
print("\n Embeddings Generated and Saved Successfully!")
for caption, matrix in (
    ("TRAIN EMBEDDING SHAPE:", X_train_emb),
    ("TEST EMBEDDING SHAPE:", X_test_emb),
):
    print(caption, matrix.shape)



Computing TRAIN embeddings...


Embedding Batches:   0%|          | 0/1352 [00:00<?, ?it/s]

Computing TEST embeddings...


Embedding Batches:   0%|          | 0/338 [00:00<?, ?it/s]


 Embeddings Generated and Saved Successfully!
TRAIN EMBEDDING SHAPE: (86489, 768)
TEST EMBEDDING SHAPE: (21623, 768)


In [None]:
from google.colab import files
# Hand each saved embedding file to Colab's download helper, train file first.
for emb_file in ("X_train_emb.npy", "X_test_emb.npy"):
    files.download(emb_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# training XGBoost on BERT embeddings
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import numpy as np

# Reload every input of the training stage from disk so this cell also works in
# a fresh kernel: the embeddings, plus the labels and label encoder that the
# training/evaluation code reads (y_train, y_test, le).
X_train_emb = np.load("X_train_emb.npy")
X_test_emb  = np.load("X_test_emb.npy")

# NOTE(review): joblib.load unpickles arbitrary objects — only load files you produced.
y_train = joblib.load("y_train.pkl")
y_test  = joblib.load("y_test.pkl")
le      = joblib.load("label_encoder.joblib")

print("Embedding shapes:", X_train_emb.shape, X_test_emb.shape)
print("Label sizes:", len(y_train), len(y_test))

# The .npy files may come from an earlier run; make sure every embedding row
# still pairs with exactly one label before training on them.
assert X_train_emb.shape[0] == len(y_train), "Mismatch: X_train_emb rows and y_train length differ!"
assert X_test_emb.shape[0] == len(y_test), "Mismatch: X_test_emb rows and y_test length differ!"

Embedding shapes: (86489, 768) (21623, 768)
Label sizes: 86489 21623


In [None]:
# Base binary classifier handed to the randomized search; the searched
# hyper-parameters are set per candidate on top of these fixed settings.
# `use_label_encoder` is deliberately not passed: the installed XGBoost ignores
# it and printed 'Parameters: { "use_label_encoder" } are not used.' on every fit.
# NOTE(review): training runs on CPU with 4 threads; if a GPU-enabled XGBoost
# build is available, a GPU device setting may shorten the multi-minute fits — confirm.
clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=4,
)

# Discrete candidate values from which the randomized search samples.
param_dist = dict(
    n_estimators=[100, 200, 400],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_lambda=[1, 5, 10],
)

# Stratified, shuffled 3-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 10 sampled candidates, scored by F1 ("accuracy" or "f1_macro" are alternatives),
# evaluated one fit at a time (n_jobs=1) with per-fit progress logging.
rs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=10,
    cv=cv,
    scoring="f1",
    random_state=42,
    n_jobs=1,
    verbose=2,
)

# Run the hyper-parameter search on the training embeddings.
print("🚀 Starting training with RandomizedSearchCV...")
rs.fit(X_train_emb, y_train)

# Keep the estimator the search exposes as its best and report its parameters.
best = rs.best_estimator_
print("\n✅ Best parameters found:", rs.best_params_)

# Score the selected model on the held-out test embeddings.
y_pred = best.predict(X_test_emb)
acc = accuracy_score(y_test, y_pred)
print("\nTest Accuracy:", acc)
print(
    "\nDetailed classification report:\n",
    classification_report(
        y_test,
        y_pred,
        target_names=list(map(str, le.classes_)),
    ),
)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:\n", cm)

# Persist the selected model with joblib.
joblib.dump(best, "bert_xgb_model.pkl")
print("\nSaved trained model as bert_xgb_model.pkl")

🚀 Starting training with RandomizedSearchCV...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=9, n_estimators=200, reg_lambda=1, subsample=0.6; total time= 5.2min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=9, n_estimators=200, reg_lambda=1, subsample=0.6; total time= 5.1min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=9, n_estimators=200, reg_lambda=1, subsample=0.6; total time= 5.1min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=9, n_estimators=200, reg_lambda=5, subsample=0.8; total time= 5.5min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=9, n_estimators=200, reg_lambda=5, subsample=0.8; total time= 5.5min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=9, n_estimators=200, reg_lambda=5, subsample=0.8; total time= 5.5min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=9, n_estimators=100, reg_lambda=5, subsample=1.0; total time= 3.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=9, n_estimators=100, reg_lambda=5, subsample=1.0; total time= 3.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=9, n_estimators=100, reg_lambda=5, subsample=1.0; total time= 3.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, reg_lambda=5, subsample=0.8; total time= 1.2min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, reg_lambda=5, subsample=0.8; total time= 1.2min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, reg_lambda=5, subsample=0.8; total time= 1.2min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, reg_lambda=1, subsample=1.0; total time= 1.3min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, reg_lambda=1, subsample=1.0; total time= 1.3min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, reg_lambda=1, subsample=1.0; total time= 1.3min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=400, reg_lambda=5, subsample=1.0; total time= 1.8min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=400, reg_lambda=5, subsample=1.0; total time= 1.8min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=400, reg_lambda=5, subsample=1.0; total time= 1.8min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=9, n_estimators=400, reg_lambda=1, subsample=0.6; total time=11.3min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=9, n_estimators=400, reg_lambda=1, subsample=0.6; total time=11.2min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=9, n_estimators=400, reg_lambda=1, subsample=0.6; total time=11.2min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=1, subsample=0.6; total time=  34.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=1, subsample=0.6; total time=  33.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=1, subsample=0.6; total time=  34.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=400, reg_lambda=10, subsample=0.8; total time= 3.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=400, reg_lambda=10, subsample=0.8; total time= 3.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=400, reg_lambda=10, subsample=0.8; total time= 3.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=400, reg_lambda=1, subsample=0.6; total time= 1.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=400, reg_lambda=1, subsample=0.6; total time= 1.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=400, reg_lambda=1, subsample=0.6; total time= 1.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Best parameters found: {'subsample': 0.6, 'reg_lambda': 1, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

Test Accuracy: 0.9308144105813254

Detailed classification report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93     10437
           1       0.93      0.93      0.93     11186

    accuracy                           0.93     21623
   macro avg       0.93      0.93      0.93     21623
weighted avg       0.93      0.93      0.93     21623


Confusion matrix:
 [[ 9669   768]
 [  728 10458]]

Saved trained model as bert_xgb_model.pkl


In [None]:
from google.colab import files
# Hand the model file written by the training cell to Colab's download helper.
files.download("bert_xgb_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>