In [13]:
!pip install transformers
!pip install lightning



In [14]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re



# Read the raw tweets; the text to be tokenized is in the "content" column.
df = pd.read_csv('text_emotion.csv')

# Fetch the NLTK "punkt" resource (reports "already up-to-date" when present).
nltk.download('punkt')

# A token is one of: a run of word characters, a run of the listed
# punctuation/symbol characters, or a single apostrophe.
pattern = r'\b\w+\b|[\.,!?()\[\]{}&*("^%$#@;")=]+|\''

# Lower-case every tweet, then attach its regex token list as a new column.
df = df.assign(
    content=lambda frame: frame['content'].map(lambda tweet: tweet.lower())
).assign(
    tokens=lambda frame: frame['content'].map(
        lambda tweet: nltk.regexp_tokenize(tweet, pattern)
    )
)

# Cell output: the frame including the new "tokens" column.
df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,tweet_id,sentiment,author,content,tokens
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,"[@, tiffanylue, i, know, i, was, listenin, to,..."
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, with, a, headache, ughhhh, ......"
2,1956967696,sadness,coolfunky,funeral ceremony...gloomy friday...,"[funeral, ceremony, ..., gloomy, friday, ...]"
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon!,"[wants, to, hang, out, with, friends, soon, !]"
4,1956968416,neutral,xkilljoyx,@dannycastillo we want to trade with someone w...,"[@, dannycastillo, we, want, to, trade, with, ..."
...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@johnlloydtaylor,"[@, johnlloydtaylor]"
39996,1753919001,love,drapeaux,happy mothers day all my love,"[happy, mothers, day, all, my, love]"
39997,1753919005,love,JenniRox,happy mother's day to all the mommies out ther...,"[happy, mother, ', s, day, to, all, the, mommi..."
39998,1753919043,happiness,ipdaman1,@niariley wassup beautiful!!! follow me!! pee...,"[@, niariley, wassup, beautiful, !!!, follow, ..."


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import json

# Load the pre-trained BERT model and tokenizer.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()  # inference only: keep dropout off so word vectors are deterministic

# Every word is pushed through BERT on its own (no sentence context), so its
# vector depends only on the word string.  Caching by word avoids re-running
# the model for repeated words while producing the same vectors.
_word_vector_cache = {}


def _embed_word(word):
    """Return the embedding of ``word`` as a list of floats.

    The word is split into word pieces, the pieces are run through BERT as a
    single-item batch and the last-hidden-state vectors of all pieces are
    summed.  The returned list is shared with the cache and must be treated as
    read-only by callers.
    """
    vector = _word_vector_cache.get(word)
    if vector is None:
        # An empty piece list would yield an empty id tensor that the model
        # cannot consume, so fall back to the tokenizer's unknown token.
        pieces = tokenizer.tokenize(word) or [tokenizer.unk_token]
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)
        # Add the batch dimension expected by the model: shape (1, n_pieces).
        input_ids = torch.tensor(piece_ids, dtype=torch.long).unsqueeze(0)

        with torch.no_grad():
            outputs = model(input_ids)

        # Sum over the word-piece axis, then drop the batch axis.
        vector = torch.sum(outputs.last_hidden_state, dim=1).squeeze(0).tolist()
        _word_vector_cache[word] = vector
    return vector


# One list of word vectors per tweet, in the order of df['tokens'].
trainData = [[_embed_word(word) for word in words] for words in df['tokens']]

# Pad every tweet with zero vectors up to the longest tweet so the data is
# rectangular (default=0 keeps this working for an empty frame).
max_length = max((len(sequence) for sequence in trainData), default=0)

padVec = [0] * model.config.hidden_size

for sequence in trainData:
    sequence.extend([padVec] * (max_length - len(sequence)))

file_name = "data.json"

# Open the file in write mode and store the data
with open(file_name, "w") as json_file:
    json.dump(trainData, json_file)

print(f"Data has been saved to {file_name}")


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


# Extract the 'sentiment' column as the target variable, wrapped so that each
# sample is a one-element row (OneHotEncoder expects 2-D input).
sentiments = [[label] for label in df['sentiment'].values]

# Initialize the OneHotEncoder with dense output.  The keyword for this was
# renamed from `sparse` to `sparse_output` in newer scikit-learn releases and
# the old name was later removed, so try the new one first and fall back to
# the old one on versions that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit the encoder on the labels and transform them in one step.
y = encoder.fit_transform(sentiments)

# y is now a dense array with one one-hot row per sample
y

In [2]:
import torch.nn.functional as F
import torch.optim as optim
import torch
import torch.nn as nn
import lightning as L

class DualCnnBiLsmtModel(L.LightningModule):
    """Classifier combining an LSTM->conv branch with a conv->LSTM branch.

    ``forward`` feeds the input through LSTM then convolution, and separately
    through convolution then LSTM, concatenates both results on the last
    dimension, max-pools that dimension to a single value and maps it through
    two linear layers to ``tagset_size`` class probabilities.

    NOTE(review): ``self.cnn`` is a ``Conv2d`` with ``in_channels=32``.  When
    the input is a 3-D tensor (as the notebook's DataLoader produces), the
    first dimension must therefore be exactly 32; in this notebook that is the
    mini-batch (``batch_size=32``, ``drop_last=True``), so the convolution
    mixes samples of one batch.  Likewise the LSTM is not ``batch_first``.
    Confirm whether this is intended before changing batch sizes.

    Args:
        embedding_dim: Input feature size of the LSTM (last input dimension).
        hidden_dim: Output width of the first dense layer.
        tagset_size: Number of output classes.
        senLen: Height of the convolution kernel (the notebook passes the
            padded sentence length).
        lr: Learning rate passed to Adam in ``configure_optimizers``.
    """

    def __init__(self, embedding_dim, hidden_dim, tagset_size,senLen,lr):
        super().__init__()
        # Fixed seed so the layer initialisation below is reproducible.
        # Layers are created in the original order to keep that init identical.
        torch.manual_seed(seed=42)
        self.hidden_dim = hidden_dim
        self.tagset_size = tagset_size
        self.cnn = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(senLen, 1))
        self.fc1 = nn.Linear(1, hidden_dim)  # Adjust the hidden_dim as needed
        self.fc2 = nn.Linear(hidden_dim, self.tagset_size)
        self.lstm = nn.LSTM(embedding_dim, 128//2, num_layers=1, bidirectional=True)
        # `hidden2tag` and `relu` are not used by `forward`; they are kept so
        # the module's attributes and state_dict keys stay unchanged.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.dropout = nn.Dropout(p=0.35)
        self.relu = nn.ReLU()
        self.learning_rate =lr

    def _logits(self, x):
        """Return the un-normalised class scores for ``x``."""
        # Branch 1: LSTM first, then convolution.
        lstm_output1, _ = self.lstm(x)
        cnn_output1 = self.dropout(F.relu(self.cnn(lstm_output1)))

        # Branch 2: convolution first, then LSTM.
        cnn_output2 = self.dropout(F.relu(self.cnn(x)))
        lstm_output2, _ = self.lstm(cnn_output2)

        combined_output = torch.cat((lstm_output2, cnn_output1), dim=2)

        # Collapse the whole last dimension to its maximum, then flatten
        # everything after the first dimension.
        pooled = F.max_pool1d(combined_output, combined_output.size(2))
        pooled = pooled.view(combined_output.size(0), -1)

        # Two dense layers map the pooled value to one score per class.
        return self.fc2(self.fc1(pooled))

    def forward(self, x):
        """Return class probabilities (softmax over dimension 1) for ``x``."""
        # dim=1 is what the implicit-dim softmax selected for this 2-D tensor;
        # stating it explicitly avoids the deprecation warning.
        return F.softmax(self._logits(x), dim=1)

    def loss(self, x, y):
        """Return the cross-entropy loss of the model on ``x`` against ``y``.

        Cross-entropy applies log-softmax itself, so it is given the raw
        logits rather than the probabilities returned by ``forward``.
        """
        logits = self._logits(x)
        if torch.is_floating_point(y):
            # Probability/one-hot targets (float64 in this notebook) are cast
            # to the logits' dtype so the loss is computed in the model dtype.
            y = y.to(logits.dtype)
        return F.cross_entropy(logits, y)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the loss for one training batch."""
        x, y = batch
        loss = self.loss(x, y)
        # Report through Lightning's logger instead of printing every step.
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``learning_rate``."""
        optimizer = optim.Adam(self.parameters(), lr= self.learning_rate)
        return optimizer


[1, 2, 3, 4, 5]

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import LearningRateFinder
import lightning as L

# Define the dimensions and size of your dataset
embedding_dim = 768  # Change this to match your word embedding dimension
hidden_dim = 128  # Change this to match your model architecture
# The model's Conv2d is built with in_channels=32 and consumes the first
# dimension of each batch, so batches must contain exactly 32 samples.
batch_size = 32

# `as_tensor` converts the list/array on the first run and is a no-op if the
# cell is re-run with data that is already a tensor of that dtype (no extra
# copy of the large embedding tensor, no copy-construct warning).
trainData = torch.as_tensor(trainData, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float64)

# Number of classes = width of the one-hot encoded label matrix.
tagset_size = y.shape[1]

dataset = TensorDataset(trainData, y)

# The last constructor argument is only a starting learning rate; the finder
# below sweeps over [min_lr, max_lr].
model = DualCnnBiLsmtModel(embedding_dim, hidden_dim, tagset_size, trainData.shape[1],0.1)

# drop_last=True discards a trailing batch with fewer than batch_size samples.
dataloader = DataLoader(dataset, batch_size=batch_size, drop_last=True)

trainer = L.Trainer(max_epochs=1000)
tuner = L.pytorch.tuner.Tuner(trainer)
# Run the learning rate finder
lr_finder = tuner.lr_find(model, train_dataloaders= dataloader, min_lr=0.000000001, max_lr=1,early_stop_threshold=None)

# Plot the learning rate finder results
fig = lr_finder.plot(suggest=True)
suggested_lr = lr_finder.suggestion()
print("Suggested Learning Rate:", suggested_lr)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from torch.utils.data.dataset import random_split

# Define the dimensions and size of your dataset
embedding_dim = 768  # Change this to match your word embedding dimension
hidden_dim = 128  # Change this to match your model architecture
# The model's Conv2d is built with in_channels=32 and consumes the first
# dimension of each batch, so batches must contain exactly 32 samples.
batch_size = 32

# `as_tensor` is a no-op when an earlier cell already converted the data,
# avoiding a second copy of the large embedding tensor.
trainData = torch.as_tensor(trainData, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float64)

# Number of classes = width of the one-hot encoded label matrix.
tagset_size = y.shape[1]

# Create a PyTorch dataset
dataset = TensorDataset(trainData, y)

# Define the number of folds for cross-validation
num_folds = 10

# Initialize KFold cross-validation
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-fold accuracies, plus their running sum for the final average.
fold_metrics = []
accuracy = 0

for fold, (train_indices, test_indices) in enumerate(kf.split(dataset)):
    # Create data loaders for the current fold.  drop_last=True discards a
    # trailing batch with fewer than batch_size samples.
    train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
    test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)

    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, drop_last=True)
    test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler, drop_last=True)

    # Create a new model AND a new trainer for each fold, so that no
    # loop/epoch state from the previous fold's fit carries over.
    model = DualCnnBiLsmtModel(embedding_dim, hidden_dim, tagset_size, trainData.shape[1],suggested_lr)
    trainer = L.Trainer(max_epochs=2000)  # Adjust max_epochs as needed

    # Train the model
    trainer.fit(model, train_loader, test_loader)

    # Switch to evaluation mode so dropout is disabled while scoring.
    model.eval()

    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # Disable gradient tracking
        for x_test, y_test in test_loader:
            # Forward pass to get model predictions
            predictions = model(x_test)

            # Class with the highest probability vs. the one-hot target's class.
            predicted_labels = torch.argmax(predictions, dim=1)
            true_labels = torch.argmax(y_test, dim=1)

            # Compare predicted labels with true labels
            correct_predictions += (predicted_labels == true_labels).sum().item()
            total_predictions += len(y_test)

    print(correct_predictions)
    print(total_predictions)

    # Guard against a fold that produced no full test batch.
    fold_accuracy = correct_predictions / total_predictions if total_predictions else 0.0
    fold_metrics.append(fold_accuracy)
    accuracy += fold_accuracy

print("Model 10-Fold accuracy = ",accuracy/num_folds)

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import torch.nn.functional as F
import torch.optim as optim
import torch
import torch.nn as nn
import lightning as L
import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import LearningRateFinder
import lightning as L
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from torch.utils.data.dataset import random_split