In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [4]:
import torch.nn as nn

In [5]:
import gc

In [6]:
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

In [7]:
import spacy



In [8]:
import numpy as np

In [9]:
from tqdm.notebook import tqdm

In [10]:
sp = spacy.load('en_core_web_sm')

In [11]:
import pickle

In [12]:
def pickle_dump(f_name,obj):
  with open(f_name,"wb") as f:
    pickle.dump(file = f, obj = obj)


In [13]:
def pickle_load(f_name):
    with open(f_name,"rb") as f:
        obj = pickle.load(f)
    
    return obj

### Load Data

In [14]:
data = pd.read_csv("/kaggle/input/preprocessed-data/preprocessed_data.csv")

In [15]:
len(data)

172761

In [16]:
## since the data is too huge for our capacity, currently naively cutting it down to 70% of the original size. latter we will look into better ways to handle this large data
data = data[:int(0.7*len(data))]

In [17]:
len(data)

120932

In [18]:
data.head()

Unnamed: 0,text,emotions
0,probably mention feel proud actually keep new ...,joy
1,people feel like go grm worthwhile hour,joy
2,feel especially pleased long time come,joy
3,struggle awful feeling say sweet thing deserve...,joy
4,mean stupid trip make great album thing go fee...,joy


In [19]:
test_data_len = int(0.2*len(data))

train_data_len = len(data) - test_data_len

test_data_len,train_data_len

(24186, 96746)

In [20]:
data = data.sample(frac = 1).reset_index(drop=True)

In [21]:
train_data,test_data = data.loc[:train_data_len,:],data.loc[train_data_len:train_data_len+test_data_len,:]

In [22]:
len(train_data),len(test_data)

(96747, 24186)

In [23]:
del data
gc.collect()

515

### Custom Dataset for handling data

## Generating Vocacb from the text
> Why is it required? 
We cannot feed a string of text to pytorch, it deals with tensors , for which we will need to convert the text to tensor. One method could be to give the index of the word in the vocab

For Ex: Text : I have an apple.
index of I in vocab : 10
index of have in vocab : 20
index of an in vocab : 5
index of an apple : 6

hence,
input : [10,20,5,6]

In [24]:
def yield_tokens(list_of_text):
  '''
    Will return an iterator containing the tokens of each sen
  '''
  for text in list_of_text:
    yield text.strip().split()

In [25]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(yield_tokens(train_data.text),specials=["<unk>"])

vocab.set_default_index(vocab['<unk>'])

In [26]:
vocab["pleased"]

575

In [27]:
vocab["afda"]

0

In [28]:
len(vocab)

18889

### Convert Text to tensor

In [29]:
import gc

In [30]:
def convertTextToTensor(list_of_text,vocab):

    res_tensor = torch.zeros(len(list_of_text),len(vocab))
    
    for row,t in tqdm(enumerate(list_of_text)):#loop for each text in list of texts

      for col,w in enumerate(t.split()): #loop for each word in text:
        res_tensor[row][col] = vocab[w]
    
    return res_tensor


In [31]:
#since storing the entire text 

tensors_text  = convertTextToTensor(list(train_data.text),vocab)



0it [00:00, ?it/s]

In [32]:
tensors_text.dtype

torch.float32

In [33]:
pickle_dump(f_name="train_text_tensors.pkl",obj=tensors_text)

del tensors_text
gc.collect()

86

In [34]:
vocab['<unk>']

0

In [35]:
vocab["absgfd"]

0

In [36]:
tensors_text_test = convertTextToTensor(list(test_data.text),vocab)

pickle_dump(f_name="test_text_tensors.pkl",obj=tensors_text_test)

del tensors_text_test
gc.collect()

0it [00:00, ?it/s]

21

##Fitting LabelEncoder for converting labels to tensors

In [37]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_data.emotions)


### Converting Labels to tensors

In [38]:
def convertLabelToTensor(labels,label_enc_obj):
    labels = label_enc_obj.transform(labels)
    return labels

In [39]:
tensors_labels_train = convertLabelToTensor(train_data.emotions,le)
tensors_labels_test = convertLabelToTensor(test_data.emotions,le)

In [40]:
tensors_labels_train.shape

(96747,)

In [41]:
tensors_labels_test.shape

(24186,)

In [42]:
pickle_dump(f_name = "train_labels_tensors",obj=tensors_labels_train)
pickle_dump(f_name = "test_labels_tensors",obj=tensors_labels_test)