In [None]:
#Aadi
# Importing the relevant Libraries and mounting Google Drive. #
# Setup cell: installs extra packages, mounts Google Drive at /content/drive
# (the CSV, the model checkpoint and the output workbook used by the later
# cells are all addressed under that mount point) and imports every library
# the notebook uses.
print('-----------------------------------')
print('Importing the relevant Libraries and mounting Google Drive.')
# NOTE(review): openpyxl is presumably installed for the .xlsx export in the
# last cell -- confirm. None of the installs are version-pinned.
!pip install pandas openpyxl
# google.colab is provided by the Colab runtime; mounting prompts for
# authorisation the first time it runs in a session.
from google.colab import drive
drive.mount('/content/drive')
print('Mounted Google Drive.')
!pip install transformers
!pip install xlsxwriter
import torch.nn as nn
# Only BertTokenizer and BertModel are referenced in the cells visible here.
# NOTE(review): AdamW has been deprecated in transformers and may be missing
# from recent releases; with the unpinned install above this import line could
# then fail -- confirm, and either pin the version or drop the unused names.
from transformers import BertForSequenceClassification, BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import time
import pytz
from datetime import datetime
import torch
import os
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Date stamp used as the prefix of the output file name in the last cell.
# NOTE(review): this uses the runtime's local clock, whereas the tagging
# timestamps printed later are converted to Asia/Kolkata -- confirm whether
# the file date should be IST as well.
current_date = datetime.now().strftime('%Y-%m-%d')
print('Libraries successfully imported.')
print('-----------------------------------')

In [None]:
# Load the tagging-task CSV from Drive and collect the texts to classify. #
print('-----------------------------------')
print('Reading the CSV File to be tagged')
datafile = "/content/drive/MyDrive/tech-handover/tagging-automation/BERT Models/Accounts/CIPLA/Competitor/DRL/tagging_task.csv"
# `df` keeps the raw rows; the last cell reuses df['text'] for the export.
df = pd.read_csv(datafile)
print('Length of dataframe "df" is : ',len(df))
print('CSV file read successfully and store in "df".')
print('-----------------------------------')
print('Reading the text, storing in "text_mention."')
# Every value is coerced with str(), so non-string cells (numbers, missing
# values) are passed to the model as their string representation.
text_mention = list(map(str, df['text'].tolist()))
print('Length of "text to be tagged" column is : ',len(text_mention))
print('Reading successful.')
print('-----------------------------------')

In [None]:
# Defining Functions #
print('-----------------------------------')
print('Defining the function for Sentiment Tagging.')
def predict_sentiment(text, model, tokenizer, device, max_length=512):
    """Classify a single piece of text as "Negative", "Positive" or "Neutral".

    Args:
        text: The text to classify (one string per call).
        model: Callable module taking ``input_ids`` and ``attention_mask``
            keyword arguments and returning a logits tensor with one row per
            input and one column per class.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors for ``text``.
        device: Device the encoded tensors are moved to before the forward pass.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        "Positive" for class index 1, "Neutral" for class index 2 and
        "Negative" for any other index.

    Raises:
        Whatever ``Tensor.item()`` raises when the model returns more than one
        row, i.e. when more than one text is passed in a single call.
    """
    # Index 0 is "Negative"; it is also the fallback for unexpected indices.
    labels = {1: "Positive", 2: "Neutral"}

    model.eval()
    # padding=True pads only up to the longest sequence in the call, which for
    # a single text means no padding at all. Padding every mention out to
    # `max_length` made each forward pass pay for the full sequence length
    # even though the attention mask hides the pad positions from the model.
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding=True, truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_class = torch.argmax(logits, dim=1).item()
    return labels.get(predicted_class, "Negative")
print('Function Defined for Sentiment Tagging.')
print('-----------------------------------')

In [None]:
# Defining the BERT Classifier Class #
print('-----------------------------------')
print('Defining the BERT Classifier Class')
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The attribute names ``bert``, ``dropout`` and ``fc`` determine the
    parameter names in the module's state dict, so they must stay as they are
    for previously saved checkpoints to load.
    """

    def __init__(self, bert_model_name, num_classes):
        """Build the encoder from ``bert_model_name`` and a ``num_classes``-way head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head maps the encoder's hidden size to one logit per class.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for the given token ids and attention mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output, with dropout applied first.
        return self.fc(self.dropout(encoded.pooler_output))
print('BERT Classifier Defined Succesfully.')
print('-----------------------------------')

In [None]:
# Model configuration, checkpoint loading and tokenizer setup. #
print('-----------------------------------')
print('Loading the Models along with the Specs.')
bert_model_name = 'bert-base-multilingual-uncased'
num_classes_sentiment = 3
max_length = 512
# batch_size and learning_rate are not referenced by the cells visible here;
# they are kept in case other cells rely on them.
batch_size = 16
learning_rate = 2e-5
# Use the GPU when the runtime has one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('-----------------------------------')
print('Loading the Sentiment Model.....')
sentiment_model_path = "/content/drive/MyDrive/tech-handover/tagging-automation/BERT Models/Accounts/CIPLA/Competitor/DRL/cipla-competitor-drl-sentiment-v23.12.13-89.pt"
sentiment_model = BERTClassifier(bert_model_name, num_classes_sentiment).to(device)
# map_location remaps the stored tensors onto the device selected above.
# Without it, a checkpoint saved from a GPU session cannot be loaded on a
# CPU-only runtime, even though `device` has already fallen back to "cpu".
state_dict = torch.load(sentiment_model_path, map_location=device)
sentiment_model.load_state_dict(state_dict)
sentiment_model.eval()
print('Sentiment Model Loaded.')
print('-----------------------------------')
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
print('Tokenizer Loaded as well.')
print('-----------------------------------')

In [None]:
# Prediction
# Tags every mention with the sentiment model, reports timing and cost, then
# writes the texts and their predicted tags to an Excel workbook on Drive.
print('Processing the file and generating the Output.')
final_pred_sentiment = []
ist_timezone = pytz.timezone('Asia/Kolkata')
start_time = time.time()
start_time_ist = datetime.fromtimestamp(start_time, ist_timezone)
print(f"TAGGING started at: {start_time_ist.strftime('%a %b %d %I:%M %p %Y')}")
total_mentions = len(text_mention)
# text_mention already holds strings, so no per-item conversion is needed.
for position, text in enumerate(text_mention, start=1):
    print("Mentions done -> ",position,"/",total_mentions)
    output_sentiment = predict_sentiment(text, sentiment_model, tokenizer, device, max_length=max_length)
    final_pred_sentiment.append(output_sentiment)
mention_count = len(final_pred_sentiment)
end_time = time.time()
end_time_ist = datetime.fromtimestamp(end_time, ist_timezone)
epoch_time = end_time - start_time
# An empty input file would otherwise raise ZeroDivisionError here.
speed_per_mention = epoch_time / mention_count if mention_count else 0.0
minutes, seconds = divmod(int(epoch_time), 60)
print('Your Tagging Job is Done.')
print(f"TAGGING ended at: {end_time_ist.strftime('%a %b %d %I:%M %p %Y')}")
print(f"Time taken for TAGGING: {minutes} minutes and {seconds} seconds")
print(f"Speed of prediction per mention: {speed_per_mention:.2f} seconds/mention")
# NOTE(review): 0.45 is presumably the INR rate per started minute of runtime
# -- confirm the figure and where it comes from.
COST_PER_MINUTE_INR = 0.45
# The +1 also bills the partial minute that is in progress.
cost_minutes = minutes+1
final_cost = cost_minutes * COST_PER_MINUTE_INR
# Two decimals avoid float noise such as 1.3500000000000001 in the report.
print(f"Cost of predicition: {final_cost:.2f} INR")
final_df = pd.DataFrame({'text - mention': df['text'], 'tag - sentiment': final_pred_sentiment})
file_name = f'{current_date}-cipla-comp-drl-output.xlsx'
sheet_name = 'final_output'
google_drive_path = "/content/drive/MyDrive/tech-handover/tagging-automation/BERT Models/Accounts/CIPLA/Competitor/DRL/"
file_path = os.path.join(google_drive_path, file_name)
final_df.to_excel(file_path, sheet_name=sheet_name, index=False)