In [None]:
!pip install transformers datasets torch scikit-learn
!pip install nltk
nltk.download('punkt')

In [None]:
!pip install ydata_profiling

In [None]:
from google.colab import files
files.upload()

In [5]:
import os
print(os.listdir("/content"))
# for file in os.listdir("/content"):
#     print(file)

['.config', 'trials.csv', 'sample_data']


In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required resources
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# EDA (Exploratory Data Analysis) to check duplicates , missing values , incorrect labels

In [11]:
def preprocess_text(text):
    """
    Perform text preprocessing using NLTK.

    Steps:
    1. Lowercasing
    2. Tokenization
    3. Removing Punctuation & Special Characters
    4. Stopword Removal
    5. Lemmatization

    Returns:
    - Cleaned text as a string
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Convert tokens back to text
    cleaned_text = " ".join(tokens)
    return cleaned_text

In [12]:
# Load dataset
df = pd.read_csv("trials.csv")  # Ensure it has 'description' and 'label' columns
df["text_length"] = df["description"].apply(len)
df["word_count"] = df["description"].apply(lambda x: len(x.split()))
df['tokens'] = df['description'].apply(lambda x: len(word_tokenize(x)))
# Apply preprocessing
df["cleaned_description"] = df["description"].apply(preprocess_text)
df['cleaned_tokens'] = df['cleaned_description'].apply(lambda x: len(word_tokenize(x)))

profile = ProfileReport(df, title="Research Grid EDA data exploration", explorative=True)
# Save the profile report to an HTML file
profile.to_file("rgrid_trials.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
from google.colab import files
files.download("rgrid_trials.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Bert Model Fine tuning with out text preprocessing...

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [14]:
# Load dataset
df = pd.read_csv("trials.csv")  # Ensure it has 'description' and 'label' columns

# Encode class labels into numerical values (if not already encoded)
label_mapping = {label: idx for idx, label in enumerate(df["label"].unique())}
df["label"] = df["label"].map(label_mapping)

# Split dataset into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["description"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, stratify=train_labels, random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)

import torch

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings, test_labels)
from transformers import BertForSequenceClassification

# Load BERT model with 5 output labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

# Define a function to compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get predicted labels

    # Compute accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Attach the metrics function
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

1266
141
352


In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.312258,0.900709,0.915511,0.900709,0.902135
2,No log,0.35178,0.921986,0.932934,0.921986,0.922009
3,No log,0.266698,0.950355,0.953959,0.950355,0.950886


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=477, training_loss=0.1570867062614649, metrics={'train_runtime': 193.1519, 'train_samples_per_second': 19.663, 'train_steps_per_second': 2.47, 'total_flos': 1332430273437696.0, 'train_loss': 0.1570867062614649, 'epoch': 3.0})

In [19]:
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

Evaluation Metrics: {'eval_loss': 0.2666977345943451, 'eval_accuracy': 0.950354609929078, 'eval_precision': 0.953959115561118, 'eval_recall': 0.950354609929078, 'eval_f1': 0.9508855506195933, 'eval_runtime': 2.0465, 'eval_samples_per_second': 68.898, 'eval_steps_per_second': 8.795, 'epoch': 3.0}


In [47]:
# Save the trained model
trainer.save_model("./saved_model")

# Save tokenizer for future use
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [89]:
from google.colab import files
files.download('my_directory.zip')
#files.download('./saved_model/added_tokens.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
# from google.colab import files
# files.download("./saved_model/tokenizer_config.json")
# files.download("./saved_model/special_tokens_map.json")
# files.download("./saved_model/vocab.txt")
# files.download("./saved_model/added_tokens.json")

import shutil
from google.colab import files

# Replace 'my_directory' with your folder name
shutil.make_archive("my_directory", "zip", "saved_model")

# Download the zipped folder
files.download("my_directory.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
# ------------------- Predict on Test Set -------------------
predictions = trainer.predict(test_dataset)

# Extract logits and labels
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1)
# print(preds)
# Compute & Print Final Metrics
final_metrics = compute_metrics((logits, labels))
print("\nFinal Model Performance:")
for metric, value in final_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")


Final Model Performance:
Accuracy: 0.9290
Precision: 0.9312
Recall: 0.9290
F1: 0.9292


## Sliding window approach for the text size length

In [21]:
def sliding_window_tokenization(text, tokenizer, max_length=512, stride=256):
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=False)

    chunks = []
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i + max_length]
        if len(chunk) < max_length:
            chunk += [tokenizer.pad_token_id] * (max_length - len(chunk))  # Padding
        chunks.append(chunk)

        if i + max_length >= len(tokens):  # Stop if we reach the end
            break

    return chunks

In [22]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [23]:
text = "This is an example text that exceeds 512 tokens. " * 50  # Simulated long text
tokenized_chunks = sliding_window_tokenization(text, tokenizer)

print(f"Total Chunks: {len(tokenized_chunks)}")
print(f"First Chunk: {tokenized_chunks[0]}")

Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors


Total Chunks: 2
First Chunk: [101, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012, 2023, 2003, 2019, 2742, 3793, 2008, 23651, 24406, 19204, 2015, 1012,

In [24]:
# Splitting into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df["description"], df["label"], test_size=0.2, stratify=df["label"], random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, stratify=train_labels, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [25]:
def tokenize_with_sliding_window(texts, labels, tokenizer, max_length=512, stride=256):
    """
    Tokenizes text using a sliding window approach to handle long sequences.

    Args:
    - texts (list): List of text inputs.
    - labels (list): Corresponding class labels.
    - tokenizer (BertTokenizer): BERT tokenizer.
    - max_length (int): Maximum sequence length.
    - stride (int): Overlapping stride size.

    Returns:
    - tokenized_inputs (dict): Tokenized inputs with 'input_ids' & 'attention_mask'.
    - expanded_labels (list): Labels repeated for each chunk.
    """
    tokenized_inputs = {"input_ids": [], "attention_mask": []}
    expanded_labels = []

    for i, text in enumerate(texts):
        label = labels.iloc[i]

        # Tokenize the text
        tokens = tokenizer(text, truncation=False, padding=False, return_tensors="pt")

        input_ids = tokens["input_ids"].squeeze().tolist()
        attention_mask = tokens["attention_mask"].squeeze().tolist()

        # Apply sliding window
        for start in range(0, len(input_ids), stride):
            end = start + max_length
            chunk_ids = input_ids[start:end]
            chunk_mask = attention_mask[start:end]

            # Padding if chunk is shorter than max_length
            if len(chunk_ids) < max_length:
                pad_length = max_length - len(chunk_ids)
                chunk_ids += [tokenizer.pad_token_id] * pad_length
                chunk_mask += [0] * pad_length

            tokenized_inputs["input_ids"].append(chunk_ids)
            tokenized_inputs["attention_mask"].append(chunk_mask)
            expanded_labels.append(label)

    return tokenized_inputs, expanded_labels


In [26]:
len(train_texts)

1266

In [27]:
# Tokenize datasets with sliding window approach
train_encodings, train_labels = tokenize_with_sliding_window(train_texts, train_labels, tokenizer)
val_encodings, val_labels = tokenize_with_sliding_window(val_texts, val_labels, tokenizer)
test_encodings, test_labels = tokenize_with_sliding_window(test_texts, test_labels, tokenizer)


Token indices sequence length is longer than the specified maximum sequence length for this model (1264 > 512). Running this sequence through the model will result in indexing errors


In [28]:
class BertTextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "labels": torch.tensor(self.labels[idx]),
        }

# Convert to PyTorch dataset
train_dataset = BertTextDataset(train_encodings, train_labels)
val_dataset = BertTextDataset(val_encodings, val_labels)
test_dataset = BertTextDataset(test_encodings, test_labels)

In [29]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Attach the metrics function
)



In [30]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.362053,0.874587,0.890426,0.874587,0.878044
2,0.304500,0.470185,0.894389,0.901234,0.894389,0.894955
3,0.095700,0.583487,0.891089,0.893592,0.891089,0.890742


TrainOutput(global_step=1017, training_loss=0.1986388850704407, metrics={'train_runtime': 385.3943, 'train_samples_per_second': 21.056, 'train_steps_per_second': 2.639, 'total_flos': 2135203726095360.0, 'train_loss': 0.1986388850704407, 'epoch': 3.0})

In [31]:
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

Evaluation Metrics: {'eval_loss': 0.5834872722625732, 'eval_accuracy': 0.8910891089108911, 'eval_precision': 0.8935922850183541, 'eval_recall': 0.8910891089108911, 'eval_f1': 0.8907423365135091, 'eval_runtime': 4.4262, 'eval_samples_per_second': 68.455, 'eval_steps_per_second': 8.585, 'epoch': 3.0}


In [32]:
# ------------------- Predict on Test Set -------------------
predictions = trainer.predict(test_dataset)

# Extract logits and labels
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1)
# print(preds)
# Compute & Print Final Metrics
final_metrics = compute_metrics((logits, labels))
print("\nFinal Model Performance:")
for metric, value in final_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")


Final Model Performance:
Accuracy: 0.8784
Precision: 0.8785
Recall: 0.8784
F1: 0.8783
