In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project folder referenced by later cells is reachable as a normal path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import re
import os

In [3]:
# Project folder on the mounted Drive: the input workbook is read from here and
# the training results / logs are written underneath it by later cells.
curr_dir = '/content/drive/MyDrive/Sentiment_Analysis_Folder'

In [4]:
# Read both sheets in a single call: passing a list to `sheet_name` makes
# pandas return a {sheet name: DataFrame} dict, so the .xlsb workbook is
# opened and parsed once instead of twice and its path is written only once.
_sheets = pd.read_excel(
    os.path.join(curr_dir,'NLP test data for assignment 0324 (2) copy.xlsb'),
    sheet_name = ['training', 'validation'],
    engine = 'pyxlsb'
)
df_train = _sheets['training']          # tweets with a 'Sentiment' label, used for fine-tuning
df_validation = _sheets['validation']   # tweets that get a predicted sentiment later on

In [5]:
def clean_data(text):
  """Normalise one raw tweet into alphanumeric words separated by single spaces.

  Steps, in order:
    1. URLs (http://, https:// and bare www.), @mentions and #hashtags are
       replaced by a space so the words on either side are not glued together.
    2. Every character that is neither a letter, a digit nor whitespace is dropped.
    3. Runs of whitespace (including newlines) collapse to one space and the
       result is trimmed.

  Whitespace is deliberately kept: stripping it would merge the whole tweet
  into a single token, which leaves nothing meaningful for a tokenizer or a
  topic model to work with.

  Parameters:
    text: the raw tweet. None and float NaN (how an empty spreadsheet cell
      arrives) yield ''; any other non-string value is converted with str().

  Returns:
    The cleaned tweet as a str (possibly empty).
  """
  # `text != text` is only true for NaN; avoids needing pandas/math here.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  text = str(text)
  text = re.sub(r'https?://\S+|www\.\S+',' ',text)
  text = re.sub(r'@\w+',' ',text)
  text = re.sub(r'#\w+',' ',text)
  # Keep \s in the allowed set so word boundaries survive punctuation removal.
  text = re.sub(r'[^A-Za-z0-9\s]','',text)
  return re.sub(r'\s+',' ',text).strip()

# Cleaned copy of the training text.
df_train['CleanTweet'] = df_train['OriginalTweet'].apply(clean_data)

# The validation text is taken from the last column of the sheet as loaded.
# A plain `iloc[:, -1]` stops pointing at that column as soon as this notebook
# appends its own columns (CleanTweet here, PredictedSentiment and Topic in
# later cells), so re-running the cell would clean the wrong data. Skipping
# the notebook-derived columns keeps the first-run behaviour and makes the
# cell safe to re-run.
# NOTE(review): assumes the sheet itself has no column with one of these names.
_derived_cols = ('CleanTweet', 'PredictedSentiment', 'Topic')
_raw_positions = [
    pos for pos, col in enumerate(df_validation.columns)
    if col not in _derived_cols
]
df_validation['CleanTweet'] = df_validation.iloc[:, _raw_positions[-1]].apply(clean_data)

# Turn the sentiment strings into integer class ids; the fitted encoder is
# kept so that predicted ids can be mapped back to the original labels.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df_train['SentimentEncoded'] = label_encoder.fit_transform(df_train['Sentiment'])

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Tokenizer that belongs to the 'bert-base-uncased' checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TweetDataset(Dataset):
  """Dataset that tokenizes one tweet per item, on access.

  Parameters:
    tweets: indexable collection of tweets; each element is passed through
      str() before tokenization.
    labels: indexable collection of integer class ids, aligned with `tweets`.
    tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that accepts
      the keyword arguments used in `__getitem__` and returns a mapping with
      'input_ids' and 'attention_mask' tensors.
    max_length: every tweet is padded or truncated to this many tokens.

  Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors and a
  scalar int64 'labels' tensor.
  """
  def __init__(self,tweets,labels,tokenizer,max_length=128):
    self.tweets = tweets
    self.labels = labels
    self.max_length = max_length
    self.tokenizer = tokenizer

  def __len__(self):
    """Number of tweets in the dataset."""
    return len(self.tweets)

  def __getitem__(self,idx):
    """Tokenize the tweet at `idx` and pair it with its label."""
    tweet = str(self.tweets[idx])
    label = self.labels[idx]
    # Call the tokenizer directly: `encode_plus` is the legacy entry point and
    # `__call__` is the supported one. token_type_ids are no longer requested
    # because they were never part of the returned item.
    encoding = self.tokenizer(
        tweet,
        padding = 'max_length',
        truncation = True,
        max_length = self.max_length,
        add_special_tokens = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )
    return {
        # return_tensors='pt' yields a leading batch axis of size 1; flatten it
        # away so a DataLoader can stack items into (batch, max_length).
        'input_ids' : encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(label,dtype = torch.long)
    }

# Training set: cleaned tweets paired with their integer-encoded sentiment.
train_dataset = TweetDataset(
    tweets = df_train['CleanTweet'].values,
    labels = df_train['SentimentEncoded'].values,
    tokenizer = tokenizer
)

# The classification head added on top of the pretrained encoder is randomly
# initialised when the model is built, which happens before the Trainer exists
# and can apply its own seed. Seed torch first so repeated runs start from the
# same head weights.
# NOTE(review): 42 is chosen to match what is believed to be the
# TrainingArguments default seed - confirm for the installed version.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = len(label_encoder.classes_))

# Single location for checkpoints and for the final model + tokenizer.
results_dir = os.path.join(curr_dir,'results')

training_args = TrainingArguments(
    output_dir = results_dir,
    num_train_epochs = 2,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = os.path.join(curr_dir,'logs'),
    logging_steps = 10
)

# No eval_dataset is passed: this run only trains, scoring happens in a later cell.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset
)

trainer.train()

# Persist the fine-tuned weights together with the tokenizer files so the
# pair can be reloaded from one directory.
model.save_pretrained(results_dir)
tokenizer.save_pretrained(results_dir)

In [None]:
# Make sure the trained model sits on the device the batches will be sent to.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# The validation rows get an all-zero label list here; the labels are only
# there to satisfy the dataset constructor and are not read in the loop below.
validation_dataset = TweetDataset(
    df_validation['CleanTweet'].values,
    [0]*len(df_validation),
    tokenizer
)

validation_loader = DataLoader(validation_dataset, batch_size = 8)

# Inference mode: switch the model to eval() and disable gradient tracking.
model.eval()
predictions = []

with torch.no_grad():
  for batch in validation_loader:
    # Only the tensors the model needs for a forward pass are moved to the device.
    inputs = {key: batch[key].to(device) for key in ('input_ids', 'attention_mask')}
    outputs = model(**inputs)
    # Index of the largest logit per row = predicted class id.
    preds = outputs.logits.max(dim = 1).indices
    predictions.extend(preds.cpu().numpy())

# Map class ids back to the original sentiment strings and preview the result.
df_validation['PredictedSentiment'] = label_encoder.inverse_transform(predictions)
df_validation.head()

In [None]:
from bertopic import BERTopic


# Topic model with library-default settings.
topic_model = BERTopic()

# Fit on the cleaned training tweets; the first return value is stored per row
# as the tweet's topic below, the second one is kept but not used in this cell.
# NOTE(review): a pandas Series is passed here; the BERTopic docs describe the
# input as a list of strings - consider `.tolist()` if this ever misbehaves.
train_topics, train_probs = topic_model.fit_transform(df_train['CleanTweet'])


# Assign validation tweets to the topics learned from the training data
# (the model is not re-fitted here).
validation_topics, validation_probs = topic_model.transform(df_validation['CleanTweet'])


# Store the topic assignment next to each tweet for the plots that follow.
df_train['Topic'] = train_topics
df_validation['Topic'] = validation_topics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# For the training frame this column is the known label decoded back from its
# integer encoding (not a model prediction); it is stored under the same name
# the validation frame uses so both plots can share one `hue` column.
df_train['PredictedSentiment'] = label_encoder.inverse_transform(df_train['SentimentEncoded'])

# One count plot per split: tweets per topic, split by sentiment.
for frame, title in (
    (df_train, 'Sentiment Distribution Across Topics in Training Data'),
    (df_validation, 'Sentiment Distribution Across Topics in Validation Data'),
):
  plt.figure(figsize=(12, 6))
  sns.countplot(data=frame, x='Topic', hue='PredictedSentiment')
  plt.title(title)
  plt.show()