In [None]:
# Silence Python warnings for the rest of the session.
# NOTE(review): both calls install a blanket "ignore" filter (no category is given),
# so deprecation / future warnings raised by the libraries used below are hidden too.

import warnings
warnings.filterwarnings('ignore') 
warnings.simplefilter('ignore')

## Necessary imports

In [2]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizerFast
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report

2025-10-05 08:11:06.250658: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-05 08:11:06.250779: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-05 08:11:06.252347: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-05 08:11:06.262407: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Hyperparameters 

In [None]:

# HYPERPARAMETERS
# Task-specific parameters

NUM_LABELS = 2           # Two sentiment classes are kept: Negative (0) and Positive (1)
MODEL_NAME = "distilbert-base-uncased" # Checkpoint name handed to the tokenizer and model `from_pretrained` calls
BATCH_SIZE = 32            # Rows per batch for feature extraction and for model.fit
LEARNING_RATE = 1e-4       # Step size given to the AdamW optimizer
EPOCHS = 20                # Upper bound on training epochs (early stopping may end the run sooner)
DROPOUT_RATE = 0.4     # Dropout rate placed in front of the custom classification head
PATIENCE = 3               # Intended early-stopping patience, in epochs

## Data Processing

In [None]:
# Read the raw training tweets from disk, then display (n_rows, n_columns).
df = pd.read_csv(filepath_or_buffer="twitter_training.csv")
df.shape

(74682, 4)

In [5]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
# Count missing values per column (in this dataset only "Tweet content" has nulls: 686).
df.isna().sum()

Tweet ID           0
entity             0
sentiment          0
Tweet content    686
dtype: int64

In [None]:
# Discard every row that contains at least one missing value.
df1 = df.dropna()

In [8]:
# Shape after the rows with missing values were dropped.
df1.shape

(73996, 4)

In [9]:
# Class balance of the sentiment labels before any class is filtered out.
df1['sentiment'].value_counts()

sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [None]:
# Keep only the label and the tweet text; the ID and entity columns are dropped.
df2 = df1.drop(columns=["Tweet ID", "entity"])

df2.head()

Unnamed: 0,sentiment,Tweet content
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
# Exclude the tweets labelled 'Irrelevant'; .copy() detaches the result from df2
df3 = df2.loc[df2['sentiment'].ne('Irrelevant')].copy()

In [None]:
# Exclude the tweets labelled 'Neutral', leaving a binary Negative/Positive task
df4 = df3.loc[df3['sentiment'].ne('Neutral')].copy()

In [13]:
# Confirm which sentiment classes remain after filtering.
df4["sentiment"].value_counts()

sentiment
Negative    22358
Positive    20655
Name: count, dtype: int64

In [None]:
# Encode the sentiment labels to numerical values (Negative -> 0, Positive -> 1).
# The result is assigned back to the column: calling `.replace(..., inplace=True)`
# on `df4["sentiment"]` is chained assignment, which updates only a temporary
# object once pandas Copy-on-Write is active and would leave df4 unchanged.
# `.astype("int64")` pins an integer dtype regardless of how `replace` infers it.
df4["sentiment"] = (
    df4["sentiment"]
    .replace({"Negative": 0, "Positive": 1})
    .astype("int64")
)

In [15]:
import re

# --- 1. Noise Removal Function ---
def clean_tweet_text(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in this order: URLs (``http...``, ``www...``,
    ``pic.twitter.com...``), ``@handle`` mentions and the standalone retweet
    marker ``RT``; afterwards every run of whitespace is collapsed to a
    single space and both ends are trimmed.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned tweet (an empty string if nothing but noise was present).
    """
    # Remove URLs/Links (http, https, www, pic.twitter.com)
    text = re.sub(r'http\S+|www\S+|pic\.twitter\.com\S+', '', text)
    # Remove Twitter handles (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove retweet tags. The leading \b restricts the match to a standalone
    # "RT"; without it the pattern also ate the tail of ordinary upper-case
    # words followed by a space ("SMART phone" became "SMAphone").
    text = re.sub(r'\bRT\s+', '', text)
    # Collapse whitespace runs; split()/join already trims both ends
    return ' '.join(text.split())

# Clean every tweet, store the result in a new column and discard the raw text
df4['cleaned_content'] = df4['Tweet content'].map(clean_tweet_text)
df4.drop(columns=['Tweet content'], inplace=True)

In [16]:
# Preview the frame after the labels were encoded and the text was cleaned.
df4.head()

Unnamed: 0,sentiment,cleaned_content
0,1,im getting on borderlands and i will murder yo...
1,1,I am coming to the borders and I will kill you...
2,1,im getting on borderlands and i will kill you ...
3,1,im coming on borderlands and i will murder you...
4,1,im getting on borderlands 2 and i will murder ...


In [17]:
# Label counts after encoding (0 = Negative, 1 = Positive).
df4["sentiment"].value_counts()

sentiment
0    22358
1    20655
Name: count, dtype: int64

In [18]:
# Features: every column except the label
X = df4.drop(columns='sentiment')
# Target: the encoded sentiment column
y = df4["sentiment"]

In [None]:
# Hold out 20% of the rows as a test set; stratifying on the label keeps the
# class ratio the same in both partitions.
# NOTE(review): the preview of the raw data shows several rows that share one
# Tweet ID and look like variants of the same tweet; a row-level random split
# may place such variants on both sides. Consider a grouped split - TODO confirm.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [20]:
# Class counts in the training split.
y_train.value_counts()

sentiment
0    17886
1    16524
Name: count, dtype: int64

## Model Selection

In [None]:
# 1. Fetch the tokenizer that belongs to the MODEL_NAME checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# 2. Fetch DistilBERT with a sequence-classification head sized for NUM_LABELS classes;
#    `from_pretrained` initialises the model from the pre-trained checkpoint
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# 3. Show the layer / parameter breakdown of the freshly loaded model
print(f"Loaded {MODEL_NAME} for full fine-tuning. Total trainable parameters:")
model.summary()

2025-10-05 08:11:49.573463: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-10-05 08:11:49.582428: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-10-05 08:11:49.582428: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Loaded distilbert-base-uncased for full fine-tuning. Total trainable parameters:
Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_______________________________

In [None]:
# Tokenize all cleaned training tweets in a single call

tokenized = tokenizer.batch_encode_plus(
    list(X_train["cleaned_content"]),
    truncation=True,        # truncate over-long sentences
    padding=True,           # pad every sentence to the longest one
    return_tensors="tf",    # hand back TensorFlow tensors
)

In [23]:
# Inspect the padded token-id tensor produced for the training tweets.
tokenized["input_ids"]

<tf.Tensor: shape=(34410, 315), dtype=int32, numpy=
array([[  101, 14601,  3111, ...,     0,     0,     0],
       [  101,  1045,  1521, ...,     0,     0,     0],
       [  101,  1030, 19413, ...,     0,     0,     0],
       ...,
       [  101,  1030, 19413, ...,     0,     0,     0],
       [  101,  7632,  1010, ...,     0,     0,     0],
       [  101,  1039,  1005, ...,     0,     0,     0]], dtype=int32)>

In [None]:
# Decode a few tokenized rows back to text to verify the tokenization.
# Only the first rows are decoded: round-tripping every training row is slow and
# floods the cell output. Special tokens are skipped so that the long padded
# tail of each row does not drown out the actual text.

tokenizer.batch_decode(tokenized["input_ids"][:5], skip_special_tokens=True)

In [None]:
# Read the separate validation file from disk
val_dataset = pd.read_csv(filepath_or_buffer="twitter_validation.csv")

In [25]:
# Preview the first rows of the raw validation frame.
val_dataset.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [None]:
# Encode the sentiment labels to numerical values (Negative -> 0, Positive -> 1).
# The result is assigned back to the column: calling `.replace(..., inplace=True)`
# on `val_dataset["sentiment"]` is chained assignment, which updates only a
# temporary object once pandas Copy-on-Write is active.
# 'Irrelevant' and 'Neutral' are left as strings here; the cells below filter
# those rows out by name.
val_dataset["sentiment"] = val_dataset["sentiment"].replace({
    "Negative": 0,
    "Positive": 1,
})

In [27]:
# Reuse clean_tweet_text from the training-data cell instead of defining a second
# copy here: a duplicate definition silently shadows the first one and lets the
# training and validation preprocessing drift apart.

# Apply the cleaning function
val_dataset['cleaned_content'] = val_dataset['Tweet content'].apply(clean_tweet_text)
val_dataset.drop(['Tweet content'], axis=1, inplace=True)

In [None]:
# Exclude the validation tweets labelled 'Irrelevant'
val_dataset_filtered = val_dataset.loc[val_dataset['sentiment'].ne('Irrelevant')].copy()

In [None]:
# Exclude the validation tweets labelled 'Neutral'
val_dataset_filtered = val_dataset_filtered.loc[
    val_dataset_filtered['sentiment'].ne('Neutral')
].copy()

In [None]:
# Remove the ID and entity columns from the validation frame
val_dataset_filtered.drop(columns=['Tweet ID', 'entity'], inplace=True)

In [31]:
# Class counts in the filtered validation set.
val_dataset_filtered["sentiment"].value_counts()

sentiment
1    277
0    266
Name: count, dtype: int64

In [32]:
# Training labels as an int32 tensor.
# NOTE(review): no later cell visible in this notebook reads `training_labels`;
# model.fit is given `y_train` instead.
training_labels = tf.constant(y_train, dtype=tf.int32)

In [33]:
# Validation features (cleaned tweet text) and labels, taken from the filtered frame.
X_val = val_dataset_filtered["cleaned_content"]
y_val = val_dataset_filtered["sentiment"]

In [34]:
# Validation labels as an int32 tensor.
# NOTE(review): no later cell visible in this notebook reads `val_labels`;
# the validation tuple for model.fit is built from `y_val` instead.
val_labels = tf.constant(y_val, dtype=tf.int32)

In [None]:
# Tokenize all cleaned validation tweets in a single call
val_tokenized = tokenizer.batch_encode_plus(
    list(X_val),
    truncation=True,        # truncate over-long sentences
    padding=True,           # pad every sentence to the longest one
    return_tensors="tf",    # hand back TensorFlow tensors
)

In [None]:
import tensorflow as tf
import numpy as np

# Bundle the two tokenizer outputs that the encoder is called with, keyed by name
tokenized_inputs = {
    key: tokenized[key] for key in ('input_ids', 'attention_mask')
}

# Slice the tensors row-wise and group the rows into batches of BATCH_SIZE
feature_extraction_dataset = (
    tf.data.Dataset
    .from_tensor_slices(tokenized_inputs)
    .batch(BATCH_SIZE)
)

In [None]:
# Run the DistilBERT base layer over the training inputs batch by batch and
# collect, for every row, the hidden state at sequence position 0.
# NOTE(review): `pooled_output` is not read by any later cell visible in this
# notebook, so this pass over the training set may be removable - TODO confirm.
all_pooled_outputs = []

# Loop through the batched dataset
for batch_inputs in feature_extraction_dataset:
    
    # Run the model's base layer on the small batch
    batch_outputs = model.distilbert(
        input_ids=batch_inputs['input_ids'], 
        attention_mask=batch_inputs['attention_mask'], 
        training=False # Set training to False for inference
    )
    
    # Get the [CLS] token's hidden state (index 0 along the sequence axis)
    last_hidden_state = batch_outputs.last_hidden_state
    batch_pooled_output = last_hidden_state[:, 0, :]
    
    # Store the result as a NumPy array so the batches can be concatenated below
    all_pooled_outputs.append(batch_pooled_output.numpy())

# Concatenate all batches to get the final full pooled output array
pooled_output = np.concatenate(all_pooled_outputs, axis=0)



## Building a custom classifier on top of the DistilBERT model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model
# build_custom_classifier (below) receives the loaded
# TFDistilBertForSequenceClassification object as 'base_model' and the number of
# classes as 'num_labels'; this notebook passes NUM_LABELS = 2 (Negative / Positive).

DROPOUT_RATE = 0.4  # Dropout rate handed to build_custom_classifier (same value as in the hyperparameter cell)

def build_custom_classifier(base_model, num_labels, dropout_rate=0.2):
    """Wrap the DistilBERT encoder of ``base_model`` in a small classification head.

    Graph: token inputs -> ``base_model.distilbert`` -> hidden state at sequence
    position 0 (the [CLS] slot) -> Dropout -> Dense(64, relu) -> Dense(num_labels).

    Args:
        base_model: Object exposing the encoder as ``.distilbert`` (in this
            notebook the loaded TFDistilBertForSequenceClassification).
        num_labels: Number of units in the final Dense layer, one per class.
        dropout_rate: Rate of the Dropout layer placed in front of the head.

    Returns:
        A Keras ``Model`` that takes ``[input_ids, attention_mask]`` and yields
        the output of the last Dense layer; no activation is applied to it.
    """
    # Variable-length int32 inputs, named after the encoder's keyword arguments
    ids_in = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    mask_in = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

    # Only the 'distilbert' layer of the wrapper is used; its own head is bypassed
    encoder = base_model.distilbert
    cls_state = encoder(input_ids=ids_in, attention_mask=mask_in).last_hidden_state[:, 0, :]

    # Regularise, project through the 64-unit adapter, then map to the classes
    head = Dropout(dropout_rate, name="classifier_dropout")(cls_state)
    head = Dense(64, activation="relu", name="adapter_dense")(head)
    logits = Dense(num_labels, name="classification_output")(head)

    return Model(inputs=[ids_in, mask_in], outputs=logits)

# Build the custom classifier on top of the loaded Hugging Face model.
# `model` is re-bound to the new Keras model below, so the wrapper is remembered
# in `base_model` first. On a re-run of this cell `model` is already the custom
# Keras model (which is not expected to expose `.distilbert`), and the remembered
# wrapper is used instead of failing on the missing attribute.
if hasattr(model, "distilbert"):
    base_model = model
model = build_custom_classifier(base_model, NUM_LABELS, DROPOUT_RATE)

## Early stopping and Model checkpoint

In [None]:
# Stop once val_loss has not improved for PATIENCE epochs and roll the model back
# to the weights of its best epoch.
early_stopping = EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True)

# Keep the weights of the best epoch (lowest val_loss) on disk.
# - `save_weights_only=True`: the loading cell restores the file with
#   `model.load_weights`, so only the weights are needed.
# - The former `save_format="tf"` argument was dropped: it is not a
#   ModelCheckpoint parameter and it contradicted the `.h5` file name.
model_checkpoint = ModelCheckpoint(
    filepath='final_high_accuracy_weights.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)


In [None]:
import tensorflow as tf

# Frozen-encoder training: the DistilBERT layer is excluded from the weight
# updates, so only the custom layers added on top of it are trained. This keeps
# training fast and limits overfitting.
model.get_layer('distilbert').trainable = False

# AdamW with a small weight decay; the loss receives raw logits because the
# final Dense layer has no activation
learning_rate = LEARNING_RATE
optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=0.0001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])


In [None]:
import numpy as np

# Explicitly convert the labels to np.int32
y_train = np.array(y_train, dtype=np.int32)
y_val = np.array(y_val, dtype=np.int32)

print(f"y_train dtype successfully cast to: {y_train.dtype}")
print(f"y_val dtype successfully cast to: {y_val.dtype}")

# The dimensions of the labels might also be an issue.
# SparseCategoricalCrossentropy often expects a flat array shape (N_samples,).
# Each array is checked on its own, so that y_val is flattened whenever y_val
# itself is a column vector rather than only when y_train happens to be one.
if y_train.ndim > 1 and y_train.shape[1] == 1:
    y_train = y_train.flatten()
if y_val.ndim > 1 and y_val.shape[1] == 1:
    y_val = y_val.flatten()

print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

y_train dtype successfully cast to: int32
y_val dtype successfully cast to: int32
y_train shape: (34410,)
y_val shape: (543,)


## Model Training

In [None]:

# 1. Training features as an (input_ids, attention_mask) pair, in the order of the model's inputs
train_inputs_tuple = tuple(tokenized[name] for name in ('input_ids', 'attention_mask'))

# 2. Validation data as an (inputs, labels) pair, with the inputs in the same order
val_data_tuple = (
    (val_tokenized['input_ids'], val_tokenized['attention_mask']),
    y_val,
)

print("Starting FINAL training run using the correct Keras tuple structure...")

# 3. Train; early stopping and checkpointing are driven by the validation loss
history = model.fit(
    x=train_inputs_tuple,
    y=y_train,
    validation_data=val_data_tuple,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping, model_checkpoint],
)

Starting FINAL training run using the correct Keras tuple structure...
Epoch 1/20


2025-10-05 08:20:19.204683: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.
2025-10-05 08:20:20.247121: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fdf0d0da2c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-10-05 08:20:20.247210: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Quadro P5000, Compute Capability 6.1
2025-10-05 08:20:20.255197: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-10-05 08:20:20.279293: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
I0000 00:00:1759652420.399017    3871 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 2/20
























INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 3/20
























INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 4/20
Epoch 5/20
























INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 6/20
Epoch 7/20
























INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 8/20
Epoch 9/20
Epoch 10/20
























INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 11/20
Epoch 12/20
























INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


INFO:tensorflow:Assets written to: best_frozen_model_weights_stable/assets


Epoch 13/20
Epoch 14/20
Epoch 15/20


## Loading the best model saved from the model checkpoint

In [None]:
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# --- 1. Restore the checkpointed weights into the model built earlier ---
LOAD_PATH = 'final_high_accuracy_weights.h5'
# `best_model` is a second name for `model`; loading through it updates the same object
best_model = model
best_model.load_weights(LOAD_PATH)

print("Best weights successfully loaded (using load_weights).")



2025-10-05 10:04:05.976497: W tensorflow/core/util/tensor_slice_reader.cc:98] Could not open best_frozen_model_weights_stable: DATA_LOSS: file is too short to be an sstable: perhaps your file is in a different file format and you need to use a different restore operator?


Best weights successfully loaded (using load_weights).


## Model Evaluation

In [None]:
# --- Tokenize the held-out test split ---
y_test_np = np.array(y_test, dtype=np.int32)  # labels as int32
X_test_text = list(X_test["cleaned_content"])

test_tokenized = tokenizer.batch_encode_plus(
    X_test_text,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# --- Arrange the test data as ((input_ids, attention_mask), labels) ---
test_data_tuple = (
    (test_tokenized['input_ids'], test_tokenized['attention_mask']),
    y_test_np,
)

# --- Score the restored model on the test split ---

print("--- Running Final Evaluation on UNSEEN Test Data ---")

loss, accuracy = best_model.evaluate(x=test_data_tuple[0], y=test_data_tuple[1], verbose=0)

print(f"\nFinal Test Loss: {loss:.4f}")
print(f"Final Test Accuracy: {accuracy:.4f}")

--- Running Final Evaluation on UNSEEN Test Data ---

Final Test Loss: 0.4219
Final Test Accuracy: 0.8097


## Evaluation Metrics

In [None]:
# --- Prepare the test data as a batched tf.data pipeline of (features, labels) ---
# The batch size comes from BATCH_SIZE rather than a second hard-coded 32.
test_labels = tf.constant(y_test, dtype=tf.int32)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_tokenized), test_labels)) \
             .batch(BATCH_SIZE)

# --- Get predictions (logits) from the model ---
y_pred_logits = best_model.predict(test_dataset)

# --- Convert logits to predicted class labels ---
# Hugging Face TF models wrap the logits in an output object with a `.logits`
# attribute, while a plain Keras model returns the array itself. `getattr` with
# a default covers both without using an exception for control flow.
y_pred = np.argmax(getattr(y_pred_logits, "logits", y_pred_logits), axis=1)

# --- Get true labels ---
# The dataset is not shuffled, so the label tensor it was built from is already
# in prediction order; there is no need to iterate the dataset a second time.
y_true = test_labels.numpy()

# --- Print results ---
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, digits=4))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_true, y_pred))


Classification Report:

              precision    recall  f1-score   support

           0     0.8056    0.8356    0.8203      4472
           1     0.8146    0.7817    0.7978      4131

    accuracy                         0.8097      8603
   macro avg     0.8101    0.8086    0.8091      8603
weighted avg     0.8099    0.8097    0.8095      8603


Confusion Matrix:

[[3737  735]
 [ 902 3229]]
