<a href="https://colab.research.google.com/github/aryamanpathak2022/Sentimental_analysis_wn/blob/main/models/BERT/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#BERT training

In [1]:
!pip install transformers
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('drive/MyDrive/Sentiment_analysis/manual_processed_Articles.csv')

# Extract the 'Articles' and 'sentiment' columns and convert them to lists
articles_list = df['Processed_Article'].tolist()
sentiment_list = df['sentiment']
label_mapping = {'positive': 1, 'negative': 2, 'neutral': 0}

df['sentiment'] = df['sentiment'].map(label_mapping)

# Print the lists
print(articles_list)
print(sentiment_list)




['least people dead others injured two train collided southeast india sunday police official said train traveling visakhapatnam state andhra pradesh rayagada odisha stopped due break overhead cable hit oncoming train vizianagaram district andhra pradesh reuters said citing senior railway official collision derailed two coach carrying people stationary train official told reuters preliminary investigation suggests human error caused overshooting signal stationary train led collision statement country railway ministry read people died person identified deepika superintendent police vizianagaram district told cnn monday said railway police would investigate incident sunday collision come four month separate train accident odisha three train collided leaving people dead injured rescue operation underway accident site injured moved hospital railway minister ashwini vaishnaw said x chief minister office andhra pradesh said x ordered official take quick relief measure ensure injured get promp

In [33]:
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Example usage
# take data formm file
texts = articles_list
labels = sentiment_list.to_numpy()
# get labels in numeric
labels = [label_mapping[label] for label in labels]
print(labels)

dataset = NewsDataset(texts, labels, tokenizer, max_len=128)
# divide the dataset in test and training
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


[2, 0, 1, 2, 2, 0, 2, 2, 0, 1, 1, 0, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 1]


In [9]:
# Set up optimizer and learning rate scheduler
num_epochs = 3

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tuning loop
model = model.to(device)
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

print("Fine-tuning completed")




Fine-tuning completed


In [37]:

model.eval()
correct_preds = 0
total_preds = 0

with torch.no_grad():
  for batch in train_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, predicted = torch.max(outputs.logits, dim=1)

    total_preds += labels.size(0)
    correct_preds += (predicted == labels).sum().item()

accuracy = correct_preds / total_preds
print(f"Initial model accuracy :{accuracy}")




Initial model accuracy :0.391304347826087


In [11]:
model_save_path = 'drive/MyDrive/Sentiment_analysis/fine_tuned_bert_model.pt'

# Save the model state dict and other information you may need
torch.save({
            'model_state_dict': model.state_dict(),
            # Add other information if needed
            }, model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to drive/MyDrive/Sentiment_analysis/fine_tuned_bert_model.pt


In [12]:
import pandas as pd
from transformers import BertTokenizer

# Load unlabelled CSV file
csv_file = 'drive/MyDrive/Sentiment_analysis/processed_Articles.csv'
df = pd.read_csv(csv_file)

# Extract text data
unlabeled_texts = df['Processed_Article'].tolist()



In [13]:
encoded_unlabeled_texts = tokenizer(unlabeled_texts, padding=True, truncation=True, return_tensors='pt')
self_train_dataloader = DataLoader(encoded_unlabeled_texts['input_ids'], batch_size=8)  # Use input_ids directly for self-training

In [14]:
learning_rate = 2e-5
batch_size = 8

In [31]:
# from torch.utils.data import DataLoader, Dataset
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# self_train_dataloader = DataLoader(encoded_texts, batch_size=8)
confidence_threshold = 0.4

model.train()
for epoch in range(num_epochs):
  train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)




    # Self-training with unlabeled data
  for input_ids in self_train_dataloader:
        input_ids = input_ids.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            print(outputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            max_probabilities, predicted_labels = torch.max(probabilities, dim=1)
            print(max_probabilities)
            print(predicted_labels)

        # Add confident predictions to labeled dataset for re-training
        new_labeled_data = {
            'input_ids': input_ids[max_probabilities >= confidence_threshold].cpu(),
            'labels': predicted_labels[max_probabilities >= confidence_threshold].cpu()
        }


        # Convert tensors to list and append to existing dataset
        texts_to_add = tokenizer.batch_decode(new_labeled_data['input_ids'], skip_special_tokens=True)
        labels_to_add = new_labeled_data['labels'].tolist()
        print(f"Texts to add: {len(texts_to_add)}")
        print(f"Labels to add: {labels_to_add}")

        print(labels_to_add)
        texts.extend(texts_to_add)
        labels=labels.tolist()
        labels.extend(labels_to_add)  # Concatenate labels
        # convert labels to np arrya
        labels = np.array(labels)
        print(labels)


        print(f"Total texts: {len(texts)}")
        print(f"Total labels: {len(labels)}")

        assert len(texts) == len(labels), "Lengths of texts and labels do not match"

        # Update dataset and dataloader with new labeled data
        dataset = NewsDataset(texts, labels, tokenizer, max_len=128)
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
        train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

  for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()



SequenceClassifierOutput(loss=None, logits=tensor([[-0.2820, -0.1112, -0.1235],
        [-0.0861, -0.0268, -0.0388],
        [-0.2345,  0.1159, -0.1165],
        [-0.2809,  0.1104, -0.2192],
        [-0.3468,  0.1699,  0.0677],
        [-0.2065, -0.0765,  0.2390],
        [-0.0583, -0.0641, -0.2126],
        [-0.0598,  0.0949, -0.0456]]), hidden_states=None, attentions=None)
tensor([0.3533, 0.3412, 0.4005, 0.4175, 0.4001, 0.4219, 0.3507, 0.3669])
tensor([1, 1, 1, 1, 1, 2, 0, 1])
Texts to add: 4
Labels to add: [1, 1, 1, 2]
[1, 1, 1, 2]
Total texts: 33
Total labels: 33
SequenceClassifierOutput(loss=None, logits=tensor([[-0.3049,  0.1578, -0.0976],
        [-0.3358,  0.0088,  0.1639],
        [-0.2608,  0.1263,  0.1847],
        [-0.2168,  0.0773, -0.0762],
        [-0.0359, -0.0437, -0.2233],
        [-0.0685,  0.0110, -0.1817],
        [-0.2010,  0.1078, -0.1741],
        [-0.1600, -0.0933, -0.3164]]), hidden_states=None, attentions=None)
tensor([0.4159, 0.4060, 0.3870, 0.3842, 0.3544, 

AssertionError: Lengths of texts and labels do not match

In [44]:
print(labels.tolist())

[1, 1, 0, 2, 0, 0]


In [45]:

model.eval()
correct_preds = 0
total_preds = 0

with torch.no_grad():
  for batch in test_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, predicted = torch.max(outputs.logits, dim=1)

    total_preds += labels.size(0)
    correct_preds += (predicted == labels).sum().item()

accuracy = correct_preds / total_preds
print(f"Initial model accuracy :{accuracy}")




Initial model accuracy :0.5
