In [18]:
# Import necessary libraries
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tensorflow as tf
import os
import joblib

# Input CSV locations (relative to the notebook) and the output directory for saved artifacts
data_path = "../data/"
train_file = f"{data_path}train_data.csv"
val_file = f"{data_path}val_data.csv"
model_save_path = "../models/"


In [19]:

# Load training and validation data
def load_data(train_path, val_path):
    """
    Read the training and validation splits from their CSV files.

    Both files are parsed with ``pd.read_csv``. The success messages are only
    printed once both reads have completed. Any failure is reported on stdout
    and then re-raised unchanged.

    Returns:
        Tuple ``(train_df, val_df)`` of the two loaded DataFrames.
    """
    try:
        train_frame = pd.read_csv(train_path)
        val_frame = pd.read_csv(val_path)
        for message in (
            "[DEBUG] Training data loaded successfully.",
            "[DEBUG] Validation data loaded successfully.",
        ):
            print(message)
        return train_frame, val_frame
    except FileNotFoundError as missing_file:
        # Reported separately so a wrong path is easy to spot in the cell output.
        print(f"[ERROR] File not found: {missing_file}")
        raise
    except Exception as load_error:
        print(f"[ERROR] An unexpected error occurred while loading data: {load_error}")
        raise

# Read both splits from the configured CSV paths; load_data re-raises on any read failure.
train_df, val_df = load_data(train_file, val_file)


[DEBUG] Training data loaded successfully.
[DEBUG] Validation data loaded successfully.


In [20]:

# Split each frame into model inputs (text, with missing entries replaced by '') and targets
X_train, y_train = train_df['clean_text'].fillna(''), train_df['category']
X_val, y_val = val_df['clean_text'].fillna(''), val_df['category']

# Encode labels for DistilBERT
def encode_labels(y_train, y_val):
    """
    Encode class labels as consecutive integers for classification.

    The mapping is built from the sorted distinct, non-missing training labels,
    so the integer assigned to a label is its position in that sorted order.

    Args:
        y_train: pandas Series of training labels; defines the label vocabulary.
        y_val: pandas Series of validation labels; encoded with the same mapping.

    Returns:
        Tuple ``(y_train_encoded, y_val_encoded, label_to_int)`` where the first
        two items are int64 NumPy arrays and the last is the label -> id dict.

    Raises:
        ValueError: If either split contains a missing label or a label that
            does not occur in the training labels. Previously such values were
            silently mapped to NaN, producing a float array with NaN targets.
    """
    unique_labels = sorted(y_train.dropna().unique())
    label_to_int = {label: idx for idx, label in enumerate(unique_labels)}

    def _encode_split(labels, split_name):
        # Series.map yields NaN for every value that is not a key of the dict,
        # so NaN in the result marks labels that cannot be encoded.
        encoded = labels.map(label_to_int)
        unmapped = encoded.isna()
        if unmapped.any():
            offenders = labels[unmapped].unique().tolist()
            raise ValueError(
                f"Cannot encode {split_name} labels: {int(unmapped.sum())} value(s) are "
                f"missing or absent from the training labels: {offenders}"
            )
        # Explicit integer dtype: the loss expects integer class ids.
        return encoded.astype("int64").values

    y_train_encoded = _encode_split(y_train, "training")
    y_val_encoded = _encode_split(y_val, "validation")

    print("[DEBUG] Labels encoded successfully.")
    return y_train_encoded, y_val_encoded, label_to_int

y_train_encoded, y_val_encoded, label_to_int = encode_labels(y_train, y_val)

# Save label mapping for later use.
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists first (a no-op when it is already there).
os.makedirs(model_save_path, exist_ok=True)
joblib.dump(label_to_int, os.path.join(model_save_path, "label_mapping.pkl"))
print("[DEBUG] Label mapping saved.")


[DEBUG] Labels encoded successfully.
[DEBUG] Label mapping saved.


In [21]:

# Tokenize text for DistilBERT
def tokenize_texts(tokenizer, texts, max_len=128):
    """
    Encode raw texts into model inputs using the supplied tokenizer.

    The texts are materialised into a list and handed to ``tokenizer`` with
    truncation at ``max_len``, padding enabled and TensorFlow tensors requested.
    The input IDs of the first example are printed as a sanity check, and the
    tokenizer's result is returned as-is.
    """
    tokenizer_options = {
        "max_length": max_len,
        "truncation": True,
        "padding": True,
        "return_tensors": "tf",
    }
    encoded_batch = tokenizer(list(texts), **tokenizer_options)

    first_example_ids = encoded_batch['input_ids'][0]
    print(f"[DEBUG] Tokenization complete. Example input IDs: {first_example_ids}")
    return encoded_batch

# Fetch the pretrained uncased DistilBERT tokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Encode the training split, then the validation split, with the same tokenizer
X_train_enc = tokenize_texts(tokenizer=distilbert_tokenizer, texts=X_train)
X_val_enc = tokenize_texts(tokenizer=distilbert_tokenizer, texts=X_val)


[DEBUG] Tokenization complete. Example input IDs: [  101  9088 24954  4177  2419 16913  2072  1998  2283  2708 26445  2102
  7890  2097  2718  1996  3049  4446 23764   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0]
[DEBUG] Tokenization complete. Example input IDs: [  101  2165  2074  1061  2869  3288 12163  2152  3789 16913  2072  2005
  2178  1061  2869  2017  2215 12163  2152   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     

In [22]:

# Load pre-trained DistilBERT model
def load_distilbert_model(num_labels, model_name="distilbert-base-uncased"):
    """
    Load a pre-trained DistilBERT model for sequence classification.

    Args:
        num_labels: Number of output classes, forwarded to ``from_pretrained``.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
            Defaults to the checkpoint that was previously hard-coded, so
            existing calls behave exactly as before.

    Returns:
        The object returned by
        ``TFDistilBertForSequenceClassification.from_pretrained``.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    try:
        model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        print("[DEBUG] DistilBERT model loaded successfully.")
    except Exception as e:
        print(f"[ERROR] Error loading DistilBERT model: {e}")
        raise

    return model

# Size the classification head to one output per entry in the label mapping.
distilbert_model = load_distilbert_model(num_labels=len(label_to_int))



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

[DEBUG] DistilBERT model loaded successfully.


In [23]:
# Configure optimizer, loss and metric before training
try:
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    # from_logits=True: the loss is given raw logits rather than probabilities
    sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    distilbert_model.compile(
        optimizer=adam_optimizer,
        loss=sparse_ce_loss,
        metrics=['accuracy'],
    )
    print("[DEBUG] Model compiled successfully.")
except Exception as compile_error:
    print(f"[ERROR] Error compiling the model: {compile_error}")
    raise

# Fine-tune for a single epoch, validating on the held-out split
try:
    # Only the token ids and the attention mask are passed to the model
    train_inputs = {key: X_train_enc[key] for key in ("input_ids", "attention_mask")}
    val_inputs = {key: X_val_enc[key] for key in ("input_ids", "attention_mask")}

    distilbert_model.fit(
        x=train_inputs,
        y=y_train_encoded,
        validation_data=(val_inputs, y_val_encoded),
        epochs=1,
        batch_size=32,
    )
    print("[DEBUG] Model training complete.")
except Exception as training_error:
    print(f"[ERROR] Error during model training: {training_error}")
    raise



[DEBUG] Model compiled successfully.
[DEBUG] Model training complete.


In [24]:
try:
    # Write the trained model and its tokenizer into the same directory
    model_save_dir = os.path.join(model_save_path, "distilbert_model")
    for artifact in (distilbert_model, distilbert_tokenizer):
        artifact.save_pretrained(model_save_dir)
    print("[DEBUG] DistilBERT model and tokenizer saved successfully.")
except Exception as save_error:
    print(f"[ERROR] Error saving DistilBERT model or tokenizer: {save_error}")
    raise

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def evaluate_distilbert_model(model, encodings, labels, zero_division=1):
    """
    Evaluate the DistilBERT model on validation data with detailed debug output.

    Args:
        model: Object whose ``predict(inputs)`` result exposes a ``logits``
            attribute; the arg-max over axis 1 is taken as the predicted class.
        encodings: Mapping providing ``"input_ids"`` and ``"attention_mask"``.
        labels: Array-like of true class ids (NumPy array, list, Series, ...);
            it is converted with ``np.asarray`` and flattened to 1-D.
        zero_division: Forwarded to the scikit-learn F1/precision/recall calls.
            The default of 1 keeps the previously hard-coded value.
            NOTE(review): with ``average='weighted'`` a value of 1 scores
            undefined per-class metrics as perfect; pass 0 for a stricter report.

    Returns:
        dict with float entries ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. The same values are also printed.

    Raises:
        Exception: Any error raised during prediction or scoring is printed
            and re-raised.
    """
    try:
        # Perform predictions
        model_inputs = {"input_ids": encodings["input_ids"], "attention_mask": encodings["attention_mask"]}
        preds = model.predict(model_inputs)
        preds_labels = np.argmax(preds.logits, axis=1)

        # Debug information for predictions
        print(f"[DEBUG] Number of predictions made: {len(preds_labels)}")
        print(f"[DEBUG] Example of predictions: {preds_labels[:10]}")

        # np.asarray lets callers pass lists or Series, not only ndarrays;
        # reshape(-1) flattens column-shaped labels for the metric functions.
        labels = np.asarray(labels).reshape(-1)

        # Calculate evaluation metrics
        accuracy = accuracy_score(labels, preds_labels)
        f1 = f1_score(labels, preds_labels, average='weighted', zero_division=zero_division)
        precision = precision_score(labels, preds_labels, average='weighted', zero_division=zero_division)
        recall = recall_score(labels, preds_labels, average='weighted', zero_division=zero_division)

        # Print metrics
        print("\nDistilBERT Model Evaluation Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")

    except Exception as e:
        print(f"[ERROR] Error during model evaluation: {e}")
        raise

    # Return the numbers as well, so callers can store or compare them
    # instead of parsing the printed report.
    return {
        "accuracy": float(accuracy),
        "f1": float(f1),
        "precision": float(precision),
        "recall": float(recall),
    }


# evaluate_distilbert_model already prints "[ERROR] Error during model evaluation"
# and re-raises on failure, so no extra handler is needed here; wrapping the call
# in a second identical handler printed the same error twice.
evaluate_distilbert_model(distilbert_model, X_val_enc, y_val_encoded)
print("[DEBUG] Model evaluation complete.")


[DEBUG] DistilBERT model and tokenizer saved successfully.
[DEBUG] Number of predictions made: 26076
[DEBUG] Example of predictions: [2 1 1 1 1 1 1 0 1 0]

DistilBERT Model Evaluation Metrics:
Accuracy: 0.9565
F1-score: 0.9561
Precision: 0.9576
Recall: 0.9565
[DEBUG] Model evaluation complete.
