In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import math
import copy
import sklearn
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torchtext
from torchtext.data import get_tokenizer
from nltk.tokenize import word_tokenize
import sentence_transformers

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
from google.colab import drive
from google.colab import files
# Mount Google Drive under /content/drive (google.colab helpers, so this cell
# only runs inside a Colab runtime).
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
# Upload a file via google.colab's `files` helper; the return value is kept in
# `uploaded`.
uploaded = files.upload()

Saving dataset.csv to dataset (1).csv


In [61]:
# Label encoding: sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5).
# NOTE(review): the upload cell's output reports the file was saved as
# "dataset (1).csv", so this path may be reading an older copy — confirm which
# file is intended.
# NOTE(review): data.info() lists an "Unnamed: 0" column, presumably a saved
# index; consider index_col=0 if it is not needed — confirm.
data = pd.read_csv("/content/dataset.csv")

In [62]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [63]:
# Column dtypes, non-null counts and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


In [64]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0    0
text          0
label         0
dtype: int64

In [65]:
# Label encoding: sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5).
# Number of rows per label; the counts below show the classes are far from
# balanced.
data['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [66]:
# Keep the English stop words in a set: clean_data tests every token for
# membership, and a set makes that a hash lookup instead of a scan of the list.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_data(tweet: str, stop_word_set=None) -> str:
  """Normalise a raw tweet into the text that is fed to the encoder.

  Steps, in order: lower-case, delete ASCII punctuation, drop stop words,
  delete digits, collapse whitespace runs to a single space and trim the ends.

  Punctuation is removed before the stop-word filter, so a stop word that
  contains an apostrophe is only ever compared in its stripped form
  (e.g. "don't" is compared as "dont").

  Args:
    tweet: The raw text to clean.
    stop_word_set: Optional collection of words to drop. When omitted, the
      module-level ``stop_words`` is used.

  Returns:
    The cleaned text, with no leading or trailing whitespace.
  """
  if stop_word_set is None:
    stop_word_set = stop_words

  # Convert to lower case
  tweet = tweet.lower()

  # Remove punctuation from the tweet
  tweet = tweet.translate(_PUNCTUATION_TABLE)

  # Remove stop words from the tweet
  kept_words = [word for word in tweet.split() if word not in stop_word_set]
  tweet = ' '.join(kept_words)

  # Remove digits
  tweet = re.sub(r"\d", '', tweet)

  # Deleting a digit-only token leaves a gap behind: collapse runs of
  # whitespace and strip the ends so such a token at the start or end of the
  # tweet does not leave a stray space.
  tweet = re.sub(r"\s+", " ", tweet).strip()

  return tweet

In [67]:
# Run every raw text through clean_data and keep the result in its own column,
# leaving the original 'text' column untouched.
data['model_input_text'] = data['text'].map(clean_data)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,model_input_text
0,0,i just feel really helpless and heavy hearted,4,feel really helpless heavy hearted
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoyed able slouch relax unwind frankly n...
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feeling distraught
3,3,i dont know i feel so lost,0,dont know feel lost
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughly weary job take...


In [88]:
from sentence_transformers import SentenceTransformer
# Load the pre-trained 'paraphrase-MiniLM-L6-v2' checkpoint once; the
# `embedding` helper defined below reuses this instance.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def embedding(sentence):
  """Encode ``sentence`` with the module-level SentenceTransformer ``model``.

  Returns whatever ``model.encode`` produces for the given input.
  """
  return model.encode(sentence)



In [89]:
# Encode all cleaned sentences with one batched call instead of one
# `model.encode` call per row: per-row calls pay the full model overhead for
# every one of the rows in the frame.
sentence_vectors = model.encode(
    data['model_input_text'].tolist(),
    batch_size=64,
    show_progress_bar=True,
)
# Store one 1-D vector per row, the same layout the per-row version produced.
data['embedded'] = list(sentence_vectors)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,model_input_text,embedded
0,0,i just feel really helpless and heavy hearted,4,feel really helpless heavy hearted,"[0.19374932, 0.33269206, 0.07816112, -0.028441..."
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoyed able slouch relax unwind frankly n...,"[0.0833772, -0.27273348, 0.64947385, 0.0682905..."
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feeling distraught,"[-0.2025333, 0.15025133, 0.65832657, 0.0836372..."
3,3,i dont know i feel so lost,0,dont know feel lost,"[0.4032204, -0.06682834, 0.3310123, 0.20955956..."
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughly weary job take...,"[0.1375044, 0.27389878, 0.21386795, 0.41600043..."


In [106]:
def get_positional_encodings(seq_len=1, embedding_dim=384):
    """Build the sinusoidal positional-encoding table.

    For position ``pos`` and pair index ``k`` the table holds::

        PE[pos, 2k]     = sin(pos / 10000 ** (2k / embedding_dim))
        PE[pos, 2k + 1] = cos(pos / 10000 ** (2k / embedding_dim))

    so the sine and cosine of one pair share the same frequency. When
    ``embedding_dim`` is odd, the last column is a sine with no cosine partner.

    Args:
        seq_len: Number of positions (rows) to generate.
        embedding_dim: Width of each encoding (columns).

    Returns:
        A float ``numpy.ndarray`` of shape ``(seq_len, embedding_dim)``.
    """
    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    # The even column index already equals 2k, so it is used as the exponent
    # numerator directly (it must not be doubled again).
    even_columns = np.arange(0, embedding_dim, 2, dtype=np.float64)
    angles = positions / np.power(10000.0, even_columns / embedding_dim)

    pos_enc = np.zeros((seq_len, embedding_dim))
    pos_enc[:, 0::2] = np.sin(angles)
    # With an odd embedding_dim there is one fewer odd column than even ones.
    pos_enc[:, 1::2] = np.cos(angles[:, : embedding_dim // 2])
    return pos_enc

In [114]:
# Encoding table for the default single position, shape (1, embedding_dim).
position_encoding = get_positional_encodings()

def combined_embedding(embedding):
  """Return ``embedding`` plus the single-position positional encoding.

  The leading length-1 axis of the module-level ``position_encoding`` is
  dropped before the element-wise addition.
  """
  single_position_row = position_encoding.squeeze(0)
  return embedding + single_position_row

In [115]:
# Add the positional encoding to every sentence vector. In the preview below
# the even-indexed entries are unchanged and the odd-indexed entries are
# shifted by 1 relative to 'embedded'.
data['combined_embedded'] = data['embedded'].map(combined_embedding)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,model_input_text,embedded,combined_embedded
0,0,i just feel really helpless and heavy hearted,4,feel really helpless heavy hearted,"[0.19374932, 0.33269206, 0.07816112, -0.028441...","[0.1937493234872818, 1.3326920568943024, 0.078..."
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoyed able slouch relax unwind frankly n...,"[0.0833772, -0.27273348, 0.64947385, 0.0682905...","[0.08337719738483429, 0.7272665202617645, 0.64..."
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feeling distraught,"[-0.2025333, 0.15025133, 0.65832657, 0.0836372...","[-0.2025333046913147, 1.15025132894516, 0.6583..."
3,3,i dont know i feel so lost,0,dont know feel lost,"[0.4032204, -0.06682834, 0.3310123, 0.20955956...","[0.40322038531303406, 0.9331716597080231, 0.33..."
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughly weary job take...,"[0.1375044, 0.27389878, 0.21386795, 0.41600043...","[0.13750439882278442, 1.2738987803459167, 0.21..."


In [71]:
# size_of_verification_data = 0.1
# size_of_test_data = 0.2

# from sklearn.model_selection import train_test_split

# X = data['padded_sequences']
# y = data['label']

# X_model, X_verify, y_model, y_verify = train_test_split(X,y,test_size=size_of_verification_data,shuffle=True)
# X_train, X_test, y_train, y_test = train_test_split(X_model, y_model, test_size=size_of_test_data, shuffle=True)

In [72]:
# print(f"Info of X_verify: {X_verify.info()} \n\n")
# print(f"Info of X_train: {X_train.info()} \n\n")
# print(f"Info of X_test: {X_test.info()} \n\n")

In [73]:
# from torch.utils.data import DataLoader
# BATCH_SIZE = 16


# train_dataloader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=BATCH_SIZE)
# for X_batch, y_batch in train_dataloader:
#     # print(X_batch, y_batch)
#     break

# test_dataloader = DataLoader(list(zip(X_test,y_test)), shuffle=True, batch_size=BATCH_SIZE)
# for X_batch, y_batch in test_dataloader:
#     # print(X_batch, y_batch)
#     break

# verify_dataloader = DataLoader(list(zip(X_verify,y_verify)), shuffle=True, batch_size=BATCH_SIZE)
# for X_batch, y_batch in verify_dataloader:
#     # print(X_batch, y_batch)
#     break

In [74]:
# print(f"Length of Train Dataloader is: {len(train_dataloader)}")
# print(f"Length of Test Dataloader is: {len(test_dataloader)}")
# print(f"Length of verify Dataloader is: {len(verify_dataloader)}")

In [75]:
# train_feature_batch, train_labels_batch = next(iter(train_dataloader))
# len(train_feature_batch), len(train_labels_batch)

In [81]:
# pos_encodings = pos_encoding(max_sent_len,d_model,10000)

In [83]:
# weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) # create a tensor of weights
# torch.multinomial(weights, 4,replacement=True)