In [26]:
#for data processing
import pandas as pd
import numpy as np
import string
import re

#for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#for nlp
from collections import Counter
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#for regression model

from sklearn import metrics

#for evaluation
import scipy

In [33]:
# Load train data for different emotions
cols = ['id', 'text', 'label', 'intensity']

anger_ratings = pd.read_csv('data/anger-ratings.csv', header=0, names=cols)
fear_ratings = pd.read_csv('data/fear-ratings.csv', header=0, names=cols)
sad_ratings = pd.read_csv('data/sadness-ratings.csv', header=0, names=cols)
joy_ratings = pd.read_csv('data/joy-ratings.csv', header=0, names=cols)

# Display the first few rows of the joy_train DataFrame
anger_ratings.head()


Unnamed: 0,id,text,label,intensity
0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479
1,10858,@ArcticFantasy I would have almost took offens...,anger,0.458
2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger,0.562
3,10860,@CozanGaming that's what lisa asked before she...,anger,0.5
4,10861,Sometimes I get mad over something so minuscul...,anger,0.708


In [34]:
# Frames contain anger_ratings, fear_ratings, sad_ratings, joy_ratings
frames = [anger_ratings, fear_ratings, sad_ratings, joy_ratings]
data_training = pd.concat(frames)
data_training.reset_index(inplace=True)

# Filter out rows with label equal to 4
data_training = data_training[data_training['label'] != 4]

# Rename the label "Label" to the actual emotion labels
data_training['label'] = data_training['label'].replace({'Label': 'fear'})

# Reset index after filtering and renaming
data_training.reset_index(drop=True, inplace=True)

# Check label value counts after removal
print(data_training['label'].value_counts())


label
fear       2252
anger      1701
joy        1616
sadness    1533
Name: count, dtype: int64


In [35]:
data_training.head()

Unnamed: 0,index,id,text,label,intensity
0,0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479
1,1,10858,@ArcticFantasy I would have almost took offens...,anger,0.458
2,2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger,0.562
3,3,10860,@CozanGaming that's what lisa asked before she...,anger,0.5
4,4,10861,Sometimes I get mad over something so minuscul...,anger,0.708


In [36]:
punc = string.punctuation
data_training['word_count'] = data_training['text'].apply(lambda x:len(x.split()))
data_training['char_count'] = data_training['text'].apply(lambda x:len(x.replace(' ','')))
data_training['punc_count'] = data_training['text'].apply(lambda x:len([a for a in x if a in punc]))
data_training.head()

Unnamed: 0,index,id,text,label,intensity,word_count,char_count,punc_count
0,0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479,7,42,3
1,1,10858,@ArcticFantasy I would have almost took offens...,anger,0.458,14,68,1
2,2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger,0.562,20,95,4
3,3,10860,@CozanGaming that's what lisa asked before she...,anger,0.5,16,75,6
4,4,10861,Sometimes I get mad over something so minuscul...,anger,0.708,25,109,0


In [37]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\appin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

mention_pattern = r'@[A-Za-z0-9_]+'
url_pattern = r'https?://[A-Za-z0-9./]+'
number_pattern = r'[0-9]+'

combined_pattern = re.compile(f"({'|'.join([mention_pattern, url_pattern, number_pattern])})")

stop_words = set(stopwords.words('english'))

def tweet_cleaner(data_frame):
    print('Cleaning and parsing the tweets...\n')
    clean_data = []
    
    for index, row in data_frame.iterrows():
        cleaned_text = re.sub(combined_pattern, '', row.text)
        cleaned_text = cleaned_text.lower()
        words = word_tokenize(cleaned_text)
        filtered_words = [w for w in words if w not in stop_words]
        cleaned_sentence = ' '.join(filtered_words).strip()
        clean_data.append((cleaned_sentence, row.label, row.intensity))  # Include intensity value
    print('Done!')
    return clean_data


In [39]:
# Clean the data
clean_data_training_list = tweet_cleaner(data_training)

# Create a DataFrame from the cleaned data
clean_data_training_df = pd.DataFrame(clean_data_training_list, columns=['cleaned_text', 'label', 'intensity'])

# Save the DataFrame to a CSV file
clean_data_training_df.to_csv('cleaned_data_training.csv', index=False)


Cleaning and parsing the tweets...

Done!


In [40]:
!pip install transformers



DEPRECATION: Loading egg at c:\python311\lib\site-packages\mask_rcnn-2.1-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
import torch
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, BertConfig
from torch.utils.data import TensorDataset, DataLoader, random_split
from tqdm.auto import tqdm
from torch import nn
import torch.optim as optim

# Load and Preprocess Data

In [42]:
data = pd.read_csv("cleaned_data_training.csv")

In [43]:
data

Unnamed: 0,cleaned_text,label,intensity
0,pls dont insult word 'molna ',anger,0.479
1,would almost took offense actually snapped,anger,0.458
2,rutgers game abomination . affront god man . m...,anger,0.562
3,"'s lisa asked started raging , 'can call ? ' heh",anger,0.500
4,sometimes get mad something minuscule try ruin...,anger,0.708
...,...,...,...
7097,'s lack company liveliness makes bored .,joy,0.058
7098,quinn 's short hair makes sad . # glee,joy,0.040
7099,hate overthinking e v e r h n g like jus ' wan...,joy,0.040
7100,people cheer sports teams completely outside n...,joy,0.020


In [44]:
# Instantiate the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
# words = data.words.tolist()
words = [str(cleaned_text) for cleaned_text in data.cleaned_text.tolist()]
data_1 = tokenizer.batch_encode_plus(words, add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')


# Prepare the dataset
vad_scores1 = torch.tensor(data[['intensity']].values, dtype=torch.float32)
dataset = TensorDataset(data_1["input_ids"], data_1["attention_mask"], vad_scores1)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset,test_dataset = random_split(dataset, [train_size, val_size,test_size])

# Create data loaders
batch_size_dataset = 16
num_workers = 0
train_loader = DataLoader(train_dataset, batch_size=batch_size_dataset, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size_dataset, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size_dataset, num_workers=num_workers)

# Define the VADModel Class and Instantiate the Model

In [45]:
# Define the VADModel class
class VADModel(nn.Module):
    def __init__(self):
        super(VADModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(768, 1)  # 768 is the output size of BERT base model

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        vad_scores_pred = self.fc(pooled_output)
        return vad_scores_pred

# Create an instance of VADModel
model = VADModel()

# Define Loss Function, Optimizer, and Learning Rate Scheduler

In [47]:
# Set the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Define the loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Define the learning rate scheduler
num_epochs = 5
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training Loop

In [49]:
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error
model.train()

total_mse_across_epochs = 0.0
total_loss_across_epochs = 0.0

for epoch in range(num_epochs):
    losses = []
    total_mse = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}, Loss: ", leave=False)
    for batch in progress_bar:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        vad_scores = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs, vad_scores)
        losses.append(loss.item())

        mse = torch.mean(torch.square(outputs - vad_scores)).item()
        total_mse += mse

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    epoch_mse = total_mse / len(train_loader.dataset)
    epoch_loss = sum(losses) / len(losses)

    total_mse_across_epochs += epoch_mse
    total_loss_across_epochs += epoch_loss

    print(f"Epoch {epoch+1}/{num_epochs}: Loss = {epoch_loss:.4f}, MSE = {epoch_mse:.4f}")

    # Validation
    model.eval()
    val_losses = []
    val_total_mse = 0.0

    with torch.no_grad():
        for val_batch in val_loader:
            val_input_ids = val_batch[0].to(device)
            val_attention_mask = val_batch[1].to(device)
            val_vad_scores = val_batch[2].to(device)

            val_outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask)
            val_loss = loss_fn(val_outputs, val_vad_scores)
            val_losses.append(val_loss.item())

            val_mse = torch.mean(torch.square(val_outputs - val_vad_scores)).item()
            val_total_mse += val_mse

    val_epoch_mse = val_total_mse / len(val_loader.dataset)
    val_epoch_loss = sum(val_losses) / len(val_losses)

    print(f"Validation MSE: {val_epoch_mse:.4f}")
    print(f"Validation Loss: {val_epoch_loss:.4f}")

average_mse = total_mse_across_epochs / num_epochs
average_loss = total_loss_across_epochs / num_epochs
print(f"\nAverage MSE across all epochs: {average_mse:.4f}")
print(f"Average Loss across all epochs: {average_loss:.4f}")
# Calculate MAE and RMSE
y_true = []
y_pred = []

with torch.no_grad():
    for test_batch in test_loader:
        test_input_ids = test_batch[0].to(device)
        test_attention_mask = test_batch[1].to(device)
        test_vad_scores = test_batch[2].to(device)

        test_outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask)
        y_true.extend(test_vad_scores.cpu().numpy())
        y_pred.extend(test_outputs.cpu().numpy())

mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

Epoch 1/5, Loss:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 1/5: Loss = 0.0346, MSE = 0.0022
Validation MSE: 0.0017
Validation Loss: 0.0262


Epoch 2/5, Loss:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 2/5: Loss = 0.0198, MSE = 0.0012
Validation MSE: 0.0015
Validation Loss: 0.0229


Epoch 3/5, Loss:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 3/5: Loss = 0.0113, MSE = 0.0007
Validation MSE: 0.0014
Validation Loss: 0.0221


Epoch 4/5, Loss:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 4/5: Loss = 0.0058, MSE = 0.0004
Validation MSE: 0.0015
Validation Loss: 0.0233


Epoch 5/5, Loss:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 5/5: Loss = 0.0033, MSE = 0.0002
Validation MSE: 0.0015
Validation Loss: 0.0233

Average MSE across all epochs: 0.0009
Average Loss across all epochs: 0.0150
MAE: 0.1212
RMSE: 0.1564


In [50]:
PATH = "deep_learning_model.pt"

# Save
torch.save(model, PATH)

 # Prediction Function

In [51]:
# Function to predict VAD scores for a given text
def predict_vad_scores(model, tokenizer, text, device):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.cpu().numpy()

#  Prediction

In [52]:

# Read the CSV file
data = pd.read_csv("cleaned_data_training.csv")  # Replace "your_data.csv" with your file path

# Iterate over each row in the CSV file
predicted_scores = []
for index, row in data.iterrows():
    text = row['cleaned_text']  # Replace 'text_column_name' with the column containing text in your CSV
    vad_scores = predict_vad_scores(model, tokenizer, text, device)
    predicted_scores.append(vad_scores)

# Add predicted scores to the DataFrame
data['predicted_scores'] = predicted_scores

# Save the DataFrame to a new CSV file
data.to_csv("predicted_data.csv", index=False)

In [53]:
data = pd.read_csv("predicted_data.csv")
data

Unnamed: 0,cleaned_text,label,intensity,predicted_scores
0,pls dont insult word 'molna ',anger,0.479,[[0.5012173]]
1,would almost took offense actually snapped,anger,0.458,[[0.44910318]]
2,rutgers game abomination . affront god man . m...,anger,0.562,[[0.60562694]]
3,"'s lisa asked started raging , 'can call ? ' heh",anger,0.500,[[0.5181444]]
4,sometimes get mad something minuscule try ruin...,anger,0.708,[[0.6405146]]
...,...,...,...,...
7097,'s lack company liveliness makes bored .,joy,0.058,[[0.28966087]]
7098,quinn 's short hair makes sad . # glee,joy,0.040,[[0.15976156]]
7099,hate overthinking e v e r h n g like jus ' wan...,joy,0.040,[[0.2919076]]
7100,people cheer sports teams completely outside n...,joy,0.020,[[0.21450563]]
