In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from transformers import (
    TFXLMRobertaModel,
    XLMRobertaTokenizer
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight


2025-12-25 17:41:28.340302: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-25 17:41:28.417359: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-25 17:41:28.607577: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-25 17:41:28.607895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-25 17:41:28.618271: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
# Load the cleaned dataset (expected columns: text, label, language).
df = pd.read_csv("cleaned_data.csv")  # text,label,language

# Rows without text carry no signal, and astype(str) below would turn a
# missing value into the literal string "nan", so drop them up front.
df = df.dropna(subset=["text"]).reset_index(drop=True)

# Integer encoding of the string labels: 0 = Approved, 1 = Not Approved.
LABEL_TO_ID = {
    "Approved": 0,
    "Not Approved": 1,
}
label_ids = df["label"].map(LABEL_TO_ID)

# Series.map yields NaN for any value that is not a key of the mapping.
# Fail loudly here instead of letting NaN labels reach the stratified split,
# the class weights and the loss.
unmapped_mask = label_ids.isna()
if unmapped_mask.any():
    unexpected = df.loc[unmapped_mask, "label"].unique().tolist()
    raise ValueError(
        f"Unexpected label values in cleaned_data.csv: {unexpected!r}; "
        f"expected one of {list(LABEL_TO_ID)!r}"
    )
df["label"] = label_ids.astype(int)

# Plain Python lists are what the split / tokenization cells consume.
texts = df["text"].astype(str).tolist()
labels = df["label"].tolist()


In [3]:
# Two-stage stratified split sharing the same fraction and seed:
#   1. hold out 15% of all samples as the test set;
#   2. hold out 15% of the remainder as the validation set.
_split_options = {"test_size": 0.15, "random_state": 42}

X_temp, X_test, y_temp, y_test = train_test_split(
    texts, labels, stratify=labels, **_split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, **_split_options
)


In [4]:
# Fixed token length used for padding/truncation and for the model inputs.
MAX_LEN = 128

# Checkpoint identifier shared by the tokenizer and the encoder.
MODEL_NAME = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

def tokenize(texts, *, max_length=None, tokenizer_fn=None):
    """Encode texts into fixed-length, padded TensorFlow tensors.

    Parameters
    ----------
    texts : str or iterable of str
        A single string, or any iterable of strings (list, tuple, pandas
        Series, NumPy array, generator, ...). Iterables are materialised
        into a list before being handed to the tokenizer; a single string
        is wrapped in a one-element list.
    max_length : int, optional
        Length every sequence is padded/truncated to. Defaults to the
        notebook-level ``MAX_LEN``.
    tokenizer_fn : callable, optional
        Callable with the same calling convention as the notebook-level
        ``tokenizer``. Defaults to that ``tokenizer``; mainly useful for
        swapping in another tokenizer or a stub.

    Returns
    -------
    Whatever the tokenizer callable returns for the batch (for the default
    tokenizer, an encoding requested with ``return_tensors="tf"``).
    """
    # A bare string must not be split into characters by list().
    if isinstance(texts, str):
        batch = [texts]
    else:
        batch = list(texts)

    # Resolve the notebook-level defaults lazily so explicit overrides never
    # touch the globals.
    encode = tokenizer if tokenizer_fn is None else tokenizer_fn
    target_length = MAX_LEN if max_length is None else max_length

    return encode(
        batch,
        padding="max_length",
        truncation=True,
        max_length=target_length,
        return_tensors="tf",
    )




KeyboardInterrupt: 

In [None]:
# Tokenize the three splits in train -> validation -> test order.
X_train_enc, X_val_enc, X_test_enc = [
    tokenize(split_texts) for split_texts in (X_train, X_val, X_test)
]


In [None]:
# Classes actually present in the training labels (np.unique returns them sorted).
train_classes = np.unique(y_train)

# "balanced" weighting; the returned array is aligned with `train_classes`.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_classes,
    y=y_train
)

# Pair each weight with its own class label instead of assuming the labels
# are exactly positions 0 and 1, and cast to plain Python int/float so the
# dict passed to Keras holds no NumPy scalar types.
class_weight_dict = {
    int(class_label): float(weight)
    for class_label, weight in zip(train_classes, class_weights)
}


In [None]:
# Pre-trained encoder that the classification head is stacked on.
base_model = TFXLMRobertaModel.from_pretrained(MODEL_NAME)

# Symbolic inputs: MAX_LEN token ids per example and the matching attention mask.
input_ids = tf.keras.Input(name="input_ids", shape=(MAX_LEN,), dtype=tf.int32)
attention_mask = tf.keras.Input(name="attention_mask", shape=(MAX_LEN,), dtype=tf.int32)

outputs = base_model(input_ids=input_ids, attention_mask=attention_mask)

# Hidden state at sequence position 0 serves as the sentence representation.
cls_token = outputs.last_hidden_state[:, 0, :]  # [CLS]

# Hidden part of the head: Dense(256, relu) -> Dropout(0.3) -> Dense(64, relu).
head_layers = (
    tf.keras.layers.Dense(units=256, activation="relu"),
    tf.keras.layers.Dropout(rate=0.3),
    tf.keras.layers.Dense(units=64, activation="relu"),
)
x = cls_token
for head_layer in head_layers:
    x = head_layer(x)

# Single sigmoid unit: one probability-like score per example.
output = tf.keras.layers.Dense(units=1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)


In [None]:
# Adam with a small learning rate (2e-5); binary cross-entropy matches the
# single sigmoid output unit.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
    loss="binary_crossentropy",
    optimizer=adam_optimizer,
    metrics=["accuracy"],
)


In [None]:
# Stop once validation loss has not improved for 2 consecutive epochs and
# roll the model back to the best weights seen.
early_stop_options = {
    "monitor": "val_loss",
    "patience": 2,
    "restore_best_weights": True,
    "verbose": 1,
}
early_stop = tf.keras.callbacks.EarlyStopping(**early_stop_options)


In [None]:
# Feed the model only the two tensors it was built with, keyed by input name.
train_inputs = {
    key: X_train_enc[key] for key in ("input_ids", "attention_mask")
}
val_inputs = {
    key: X_val_enc[key] for key in ("input_ids", "attention_mask")
}

# Train for at most 5 epochs; class weights counter label imbalance and the
# early-stopping callback may end training sooner.
history = model.fit(
    x=train_inputs,
    y=np.array(y_train),
    validation_data=(val_inputs, np.array(y_val)),
    batch_size=16,
    epochs=5,
    class_weight=class_weight_dict,
    callbacks=[early_stop],
    verbose=1,
)


In [None]:
# Decision cut-off on the sigmoid score; scores strictly above it become class 1.
# NOTE(review): 0.35 is below the usual 0.5, which favours predicting class 1 —
# confirm this was tuned on validation data rather than on the test set.
THRESHOLD = 0.35

test_inputs = {
    key: X_test_enc[key] for key in ("input_ids", "attention_mask")
}

# Flatten the model output to a 1-D vector of scores, then binarise.
probs = np.ravel(model.predict(test_inputs))
y_pred = (probs > THRESHOLD).astype(int)


In [None]:
# Display names for the integer labels, in label order (0, 1).
class_names = ["Approved", "Not Approved"]

print("\nClassification Report (XLM-R TensorFlow):\n")
report_text = classification_report(y_test, y_pred, target_names=class_names)
print(report_text)

# confusion_matrix puts true labels on rows and predicted labels on columns.
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)
