# SENTIMENT ANALYSIS

![IMAGE](emojis.jpg)

In [115]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd


# PREPROCESSING THE DATA

In [2]:
# Load the labelled examples and carve them into positional train/test frames.
# NOTE(review): rows are split in file order with no shuffling; if the CSV is
# sorted (e.g. by label) the two splits will not be comparable - confirm.
df = pd.read_csv('sentiment.csv')

train_ratio = 0.8  # Fraction of all rows that goes to the training split
train_size = int(len(df) * train_ratio)
train_df, test_df = df.iloc[:train_size], df.iloc[train_size:]

# BUILDING THE MODEL

In [3]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        text: Indexable, sized sequence of raw input strings.
        labels: Indexable, sized sequence of labels aligned with ``text``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``text`` and ``labels`` have different lengths.
    """

    def __init__(self, text, labels, tokenizer, max_length):
        if len(text) != len(labels):
            raise ValueError(
                f'text and labels must have the same length '
                f'(got {len(text)} and {len(labels)})'
            )
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading dimension removed) and ``'label'``
            (the stored label, unchanged).
        """
        encoded_input = self.tokenizer.encode_plus(
            self.text[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading dimension added by return_tensors='pt'. A bare
        # squeeze() would also collapse the sequence dimension when it has
        # size 1, yielding a 0-d tensor.
        input_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)
        label = self.labels[idx]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128
batch_size = 32


def _build_dataset(frame):
    """Wrap the ``word`` and ``sentiment`` columns of ``frame`` in a SentimentDataset."""
    return SentimentDataset(
        frame['word'].tolist(),
        frame['sentiment'].tolist(),
        tokenizer,
        max_length,
    )


train_dataset = _build_dataset(train_df)
test_dataset = _build_dataset(test_df)

# Only the training batches are shuffled; evaluation keeps the stored order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class TransformerModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (size of the logits' last dimension).
        pretrained_name: Name or path of the checkpoint passed to
            ``BertModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, num_classes, pretrained_name='bert-base-uncased', dropout=0.2):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than hard-coding
        # 768, so a checkpoint with a different hidden size still works.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch of tokenized inputs.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Output of the linear head applied to the encoder's ``pooler_output``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs['pooler_output'])
        return self.fc(pooled_output)


# TRAIN

In [65]:
num_classes = 2  # Number of sentiment classes (0 or 1)
model = TransformerModel(num_classes)

# Step 3: Training the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

num_epochs = 15


def _train_one_epoch(net, loader, loss_fn, opt, target_device):
    """Run one optimisation pass over ``loader`` and return the mean per-sample loss."""
    net.train()
    running_loss = 0.0
    for batch in loader:
        ids = batch['input_ids'].to(target_device)
        mask = batch['attention_mask'].to(target_device)
        targets = batch['label'].to(target_device)

        opt.zero_grad()
        batch_loss = loss_fn(net(ids, mask), targets)
        batch_loss.backward()
        opt.step()

        # Weight each batch loss by its batch size so the final division by
        # the dataset size gives a per-sample average.
        running_loss += batch_loss.item() * ids.size(0)
    return running_loss / len(loader.dataset)


for epoch in range(num_epochs):
    train_loss = _train_one_epoch(model, train_dataloader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}')
    


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/15, Training Loss: 0.6734
Epoch 2/15, Training Loss: 0.6262
Epoch 3/15, Training Loss: 0.5844
Epoch 4/15, Training Loss: 0.5704
Epoch 5/15, Training Loss: 0.5689
Epoch 6/15, Training Loss: 0.4994
Epoch 7/15, Training Loss: 0.4582
Epoch 8/15, Training Loss: 0.4178
Epoch 9/15, Training Loss: 0.3960
Epoch 10/15, Training Loss: 0.3799
Epoch 11/15, Training Loss: 0.3348
Epoch 12/15, Training Loss: 0.2902
Epoch 13/15, Training Loss: 0.2653
Epoch 14/15, Training Loss: 0.2437
Epoch 15/15, Training Loss: 0.2244


In [73]:
def predict_sentiment(text):
    """Return the predicted class index for ``text`` using the in-memory model.

    Relies on the notebook-level ``model``, ``tokenizer``, ``max_length`` and
    ``device``; switches ``model`` to evaluation mode as a side effect.
    """
    model.eval()

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids = encoding['input_ids'].to(device)
    mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask)

    # Index of the highest score along the class dimension, as a Python int.
    return torch.max(logits, dim=1).indices.item()

# Example usage:
text = "awesome"
predicted_sentiment = predict_sentiment(text)
print(f'Predicted Sentiment: {predicted_sentiment}')

Predicted Sentiment: 1


# ACCURACY

In [66]:
# Step 4: Inference and Accuracy Calculation
def compute_accuracy(model, dataloader, device=None):
    """Return the percentage of samples in ``dataloader`` that ``model`` classifies correctly.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            per-class scores with the classes along dimension 1. It is switched
            to evaluation mode as a side effect.
        dataloader: Iterable of batches, each a mapping holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has no
            parameters), so the function no longer depends on a notebook-level
            ``device`` variable defined in another cell.

    Returns:
        Accuracy as a float percentage (0-100); ``0.0`` when the dataloader
        yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            predicted_labels = outputs.argmax(dim=1)

            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    # Guard the division: an empty dataloader would otherwise raise ZeroDivisionError.
    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions * 100

# Score the trained model on the held-out test batches and report the
# percentage of correct predictions.
test_accuracy = compute_accuracy(model, test_dataloader)
print(f'Test Accuracy: {test_accuracy:.2f}%')

Test Accuracy: 77.27%


In [74]:
# Persist the whole trained module object (not just its weights) to disk.
# NOTE(review): loading this file back presumably requires the TransformerModel
# class to be defined/importable in the loading session - confirm.
torch.save(obj=model, f='sentiment.pth')

# INFER WITH LOCALLY SAVED MODEL

In [116]:
def infer(text, model_path='sentiment.pth'):
    """Load the saved model from disk and return the predicted class index for ``text``.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and ``device``.

    Args:
        text: Raw input string to classify.
        model_path: Path of the file written by ``torch.save(model, ...)``.

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    # SECURITY: this file holds a fully pickled module, and unpickling can
    # execute arbitrary code - only load checkpoints you created or trust.
    # weights_only=False is required because recent PyTorch releases default to
    # weights-only loading, which rejects pickled module objects.
    # map_location puts the weights on the same device the inputs are moved to
    # below, regardless of the device the model was saved from.
    model = torch.load(model_path, map_location=device, weights_only=False)
    model.eval()
    encoded_input = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, predicted_labels = torch.max(outputs, dim=1)
        return predicted_labels.item()
infer('This is Good')

1