<a href="https://colab.research.google.com/github/atharvadesai1/IPD-Project/blob/main/ipd_twitter_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report

In [None]:
# Load the dataset
# Reads the tweet CSV into `df`. The path is absolute, so the file must already
# exist at this location before the cell runs.
df = pd.read_csv("/content/Twitter_dataset10k.csv")

In [None]:
# Show the loaded frame; pandas elides the middle rows when rendering a long frame.
df

Unnamed: 0,clean_text,category
0,took your advise and going vote bjp only jai b...,0.0
1,manav show your identity atleast social media\...,-1.0
2,thanks modi for supporting buy new house thank...,1.0
3,lol then why did modi contested election from ...,1.0
4,thumara bol insult karre kowht the problem onl...,0.0
...,...,...
9995,did modi actually promise deposition lakh rupe...,-1.0
9996,dont have ghanta option,0.0
9997,come together creat new india with namo jai bj...,1.0
9998,drdo chief saraswat has categorically stated t...,1.0


In [None]:
# Count missing values per column to decide whether any rows must be dropped.
df.isnull().sum()

clean_text    1
category      0
dtype: int64

In [None]:
# Remove every row that contains a missing value; `df` itself is modified in place.
df.dropna(inplace=True)

In [None]:
# Tokenize text using BERT tokenizer
# Loads the vocabulary that belongs to the 'bert-base-uncased' checkpoint;
# do_lower_case=True asks the tokenizer to lowercase input text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Encode categories
# Shift the raw sentiment labels to consecutive class ids starting at 0:
# -1 -> 0, 0 -> 1, 1 -> 2.
category_mapping = {-1: 0, 0: 1, 1: 2}
df['category_encoded'] = df['category'].map(category_mapping)

# `Series.map` yields NaN for any value that is not a key of the mapping.
# Fail here, with the offending values listed, instead of letting NaN labels
# reach tensor construction or the loss computation.
unmapped_categories = df.loc[df['category_encoded'].isna(), 'category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        f"Unexpected values in 'category' column: {unmapped_categories.tolist()}"
    )


In [None]:
# Split data into train and validation sets
# 10% of the rows are held out for validation; the fixed random_state makes
# the split repeatable.
split_parts = train_test_split(
    df['clean_text'],
    df['category_encoded'],
    test_size=0.1,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = split_parts

In [None]:
# Sanity check: number of training examples after the split.
train_labels.shape

(8999,)

In [None]:
# Per-class counts of the training labels, to check class balance.
train_labels.value_counts()

0    3042
2    2993
1    2964
Name: category_encoded, dtype: int64

In [None]:
# Tokenize the train and validation texts.
# Each split is truncated and padded independently. No `return_tensors` is
# requested, so conversion to tensors happens when the datasets are built.
tokenize_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(list(train_texts), **tokenize_options)
val_encodings = tokenizer(list(val_texts), **tokenize_options)

In [None]:
# Bundle token ids, attention masks and labels of the training split into one
# dataset so that a DataLoader can batch them together.
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_label_ids = torch.tensor(train_labels.tolist())
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_label_ids)

In [None]:
# Bundle token ids, attention masks and labels of the validation split into one
# dataset so that a DataLoader can batch them together.
val_input_ids = torch.tensor(val_encodings['input_ids'])
val_attention_mask = torch.tensor(val_encodings['attention_mask'])
val_label_ids = torch.tensor(val_labels.tolist())
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_label_ids)

In [None]:
# Define BERT model
# One output unit per entry in `category_mapping`; attention maps and hidden
# states are not returned.
classifier_options = dict(
    num_labels=len(category_mapping),
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **classifier_options)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define training parameters
batch_size = 32        # examples per mini-batch fed to the DataLoaders
epochs = 15            # full passes over the training set
learning_rate = 2e-5   # initial learning rate handed to the optimizer

In [None]:
# Create DataLoaders
# Training batches are reshuffled each epoch; validation keeps dataset order.
loader_options = {'batch_size': batch_size}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [None]:
# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)

# The schedule horizon counts one step per training mini-batch per epoch;
# with zero warmup steps the learning rate decays linearly from the start.
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
# Fine-tune BERT model
# Pick the GPU when one is visible to PyTorch, otherwise stay on the CPU,
# then move the model's parameters to that device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Fine-tune with gradient accumulation: gradients of `accumulation_steps`
# consecutive mini-batches are summed before each optimizer update.
# Setting accumulation_steps = 1 gives a plain update after every mini-batch.
accumulation_steps = 4  # Accumulate gradients over 4 steps

num_batches = len(train_loader)

# The optimizer (and therefore the scheduler) steps once per accumulation
# window, plus once for a trailing partial window, not once per mini-batch.
# Rebuild the linear schedule so its horizon equals the number of updates
# actually performed below; otherwise the learning rate would decay only a
# fraction of the way to zero.
updates_per_epoch = (num_batches + accumulation_steps - 1) // accumulation_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=updates_per_epoch * epochs,
)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    # Start the epoch with clean gradients. They must NOT be cleared inside the
    # loop before backward(), or the accumulated gradients would be thrown away.
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()  # track the unscaled loss for reporting
        loss = loss / accumulation_steps  # Scale loss for gradient accumulation
        loss.backward()  # adds this mini-batch's gradients to the running sum

        window_complete = (i + 1) % accumulation_steps == 0
        is_last_batch = (i + 1) == num_batches
        if window_complete or is_last_batch:
            # Apply the accumulated gradients. The last-batch case flushes a
            # trailing partial window so its gradients are not discarded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / num_batches
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}')


Epoch 1/15, Average Training Loss: 0.1085
Epoch 2/15, Average Training Loss: 0.0979
Epoch 3/15, Average Training Loss: 0.1132
Epoch 4/15, Average Training Loss: 0.0906
Epoch 5/15, Average Training Loss: 0.0787
Epoch 6/15, Average Training Loss: 0.0661
Epoch 7/15, Average Training Loss: 0.0659
Epoch 8/15, Average Training Loss: 0.0773
Epoch 9/15, Average Training Loss: 0.0497
Epoch 10/15, Average Training Loss: 0.0530
Epoch 11/15, Average Training Loss: 0.0428
Epoch 12/15, Average Training Loss: 0.0378
Epoch 13/15, Average Training Loss: 0.0369
Epoch 14/15, Average Training Loss: 0.0345
Epoch 15/15, Average Training Loss: 0.0340


In [None]:
# Evaluate model on validation set
# Collect the predicted class id and the true class id of every validation
# example, in loader order, for the metrics computed afterwards.
model.eval()
val_preds = []
val_true = []

with torch.no_grad():  # inference only, no gradients needed
    for input_ids, attention_mask, labels in val_loader:
        labels = labels.to(device)
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels,
        )
        # The predicted class is the index of the largest logit in each row.
        val_preds.extend(outputs.logits.argmax(dim=1).tolist())
        val_true.extend(labels.tolist())

In [None]:
# Generate classification report
val_true = np.array(val_true)
val_preds = np.array(val_preds)
class_names = ['Negative', 'Neutral', 'Positive']
# Pin the label ids explicitly so that each name is always attached to its own
# class id (0, 1, 2), even if a class happens to be absent from the validation
# labels or predictions. Without `labels`, scikit-learn infers the classes from
# the data and a missing class makes `target_names` mismatch.
class_ids = list(range(len(class_names)))
report = classification_report(
    val_true,
    val_preds,
    labels=class_ids,
    target_names=class_names,
)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    Negative       0.85      0.87      0.86       344
     Neutral       0.94      0.88      0.91       345
    Positive       0.85      0.88      0.87       311

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000



In [None]:
# Analyze user input text
def analyze_tweet(tweet, max_length=128):
    """Classify one tweet with the fine-tuned model and print the result.

    Args:
        tweet: The text to classify.
        max_length: Number of tokens the input is truncated/padded to.

    Returns:
        None. A sentence naming the predicted sentiment is printed.
    """
    # Do not rely on an earlier cell having switched the model to eval mode:
    # in training mode dropout is active and predictions become non-deterministic.
    model.eval()

    # Same tokenizer call style as used for the training data.
    encoded_tweet = tokenizer(
        tweet,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids = encoded_tweet['input_ids'].to(device)
    attention_mask = encoded_tweet['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    # The batch holds a single example, so .item() extracts its class id.
    predicted_label = torch.argmax(logits, dim=1).item()
    sentiment = {0: "Negative", 1: "Neutral", 2: "Positive"}
    print(f"The sentiment of the tweet '{tweet}' is {sentiment[predicted_label]}.")


In [None]:
# Test with user input
# Blocks until the user types a line, then classifies that text.
user_input = input("Enter a tweet to analyze: ")
analyze_tweet(user_input)

Enter a tweet to analyze: Hi feeling great
The sentiment of the tweet 'Hi feeling great' is Positive.


In [None]:
import pickle

# Serialize the whole fine-tuned model object to a single pickle file in the
# current working directory.
# NOTE(review): a pickled model presumably needs compatible library versions
# (and the same device availability) at load time; confirm before sharing it.
pickle_file_path = 'model_bert.pkl'
with open(pickle_file_path, mode='wb') as model_sink:
    pickle.dump(model, model_sink)

print("Model saved successfully as", pickle_file_path)

Model saved successfully as model_bert.pkl
