In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import math
import copy
import sklearn
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torchtext
from torchtext.data import get_tokenizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [4]:
#The emotions are classified into six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5).
data = pd.read_csv("/content/dataset.csv")

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


In [7]:
data.isna().sum()

Unnamed: 0    0
text          0
label         0
dtype: int64

In [8]:
#The emotions are classified into six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5).
data['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [9]:
stop_words = stopwords.words('english')
def clean_data(tweet) -> str:
  #Convert to Lower Case
  tweet = tweet.lower()

  #Remove Punctuation from the tweet
  tweet = tweet.translate(str.maketrans('', '', string.punctuation))

  #Remove stopwords from the tweet
  remove_stopwords = [word for word in tweet.split() if word not in stop_words]
  tweet = ' '.join(remove_stopwords)

  #Remove digits
  tweet = re.sub(r"\d",'',tweet)

  #remove extra space
  tweet = re.sub(r"\s+"," ",tweet)

  return tweet

In [10]:
data['model_input_text'] = data['text'].apply(clean_data)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,model_input_text
0,0,i just feel really helpless and heavy hearted,4,feel really helpless heavy hearted
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoyed able slouch relax unwind frankly n...
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feeling distraught
3,3,i dont know i feel so lost,0,dont know feel lost
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughly weary job take...


In [11]:
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("You can now install TorchText using pip!")
tokens

['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']

In [12]:
def convert(lst):
  return ([i for i in lst.split()])

data['tokenized_sents'] = data['model_input_text'].apply(convert)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,model_input_text,tokenized_sents
0,0,i just feel really helpless and heavy hearted,4,feel really helpless heavy hearted,"[feel, really, helpless, heavy, hearted]"
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoyed able slouch relax unwind frankly n...,"[ive, enjoyed, able, slouch, relax, unwind, fr..."
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feeling distraught,"[gave, internship, dmrg, feeling, distraught]"
3,3,i dont know i feel so lost,0,dont know feel lost,"[dont, know, feel, lost]"
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughly weary job take...,"[kindergarten, teacher, thoroughly, weary, job..."


In [13]:
res = []
max_sent_len = 0

#Follwoing line of codes are used to concatenate all input cleaned text into one sentence and define a lookup table with each word as a key and an integer value
#Input cleaned sentence with maximum length is also recorded. This value is used to find the length of input sequenced list after padding
for index, row in data.iterrows():
    res += row['tokenized_sents']
    sent_len = len(row['tokenized_sents'])
    if sent_len > max_sent_len:
      max_sent_len = sent_len
look_up = {s: i for i, s in enumerate(res)}


def convert_text(lst):
  """Converts list of words into list of integer sequences by mapping each word in look_up table defined above"""
  res = list()
  for word in lst:
    code = look_up[word]
    res.append(code)
  return res

def padding(lst):
  """Pads list of integer sequences by adding zeroes in the front if the input sequence"""
  res = list()
  for i in range(max_sent_len - len(lst)):
    res.append(0)
  res += lst
  return res


data['tokenized_sequences'] = data['tokenized_sents'].apply(convert_text)
data['padded_sequences'] = data['tokenized_sequences'].apply(padding)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,model_input_text,tokenized_sents,tokenized_sequences,padded_sequences
0,0,i just feel really helpless and heavy hearted,4,feel really helpless heavy hearted,"[feel, really, helpless, heavy, hearted]","[3899824, 3899187, 3898073, 3897963, 3835981]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoyed able slouch relax unwind frankly n...,"[ive, enjoyed, able, slouch, relax, unwind, fr...","[3899364, 3882812, 3899815, 1377156, 3898375, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feeling distraught,"[gave, internship, dmrg, feeling, distraught]","[3899634, 3878954, 31, 3899780, 3896437]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,i dont know i feel so lost,0,dont know feel lost,"[dont, know, feel, lost]","[3899673, 3899722, 3899824, 3898864]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughly weary job take...,"[kindergarten, teacher, thoroughly, weary, job...","[3884466, 3897100, 3897666, 3886858, 3899761, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
size_of_verification_data = 0.1
size_of_test_data = 0.2

from sklearn.model_selection import train_test_split

X = data['padded_sequences']
y = data['label']

X_model, X_verify, y_model, y_verify = train_test_split(X,y,test_size=size_of_verification_data,shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X_model, y_model, test_size=size_of_test_data, shuffle=True)

In [15]:
print(f"Info of X_verify: {X_verify.info()} \n\n")
print(f"Info of X_train: {X_train.info()} \n\n")
print(f"Info of X_test: {X_test.info()} \n\n")

<class 'pandas.core.series.Series'>
Index: 41681 entries, 325310 to 107910
Series name: padded_sequences
Non-Null Count  Dtype 
--------------  ----- 
41681 non-null  object
dtypes: object(1)
memory usage: 651.3+ KB
Info of X_verify: None 


<class 'pandas.core.series.Series'>
Index: 300102 entries, 204901 to 247293
Series name: padded_sequences
Non-Null Count   Dtype 
--------------   ----- 
300102 non-null  object
dtypes: object(1)
memory usage: 4.6+ MB
Info of X_train: None 


<class 'pandas.core.series.Series'>
Index: 75026 entries, 53172 to 277446
Series name: padded_sequences
Non-Null Count  Dtype 
--------------  ----- 
75026 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
Info of X_test: None 




In [16]:
from torch.utils.data import DataLoader
BATCH_SIZE = 16


train_dataloader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=BATCH_SIZE)
for X_batch, y_batch in train_dataloader:
    # print(X_batch, y_batch)
    break

test_dataloader = DataLoader(list(zip(X_test,y_test)), shuffle=True, batch_size=BATCH_SIZE)
for X_batch, y_batch in test_dataloader:
    # print(X_batch, y_batch)
    break

verify_dataloader = DataLoader(list(zip(X_verify,y_verify)), shuffle=True, batch_size=BATCH_SIZE)
for X_batch, y_batch in verify_dataloader:
    # print(X_batch, y_batch)
    break

In [17]:
print(f"Length of Train Dataloader is: {len(train_dataloader)}")
print(f"Length of Test Dataloader is: {len(test_dataloader)}")
print(f"Length of verify Dataloader is: {len(verify_dataloader)}")

Length of Train Dataloader is: 18757
Length of Test Dataloader is: 4690
Length of verify Dataloader is: 2606


In [18]:
train_feature_batch, train_labels_batch = next(iter(train_dataloader))
len(train_feature_batch), len(train_labels_batch)

(79, 16)