In [None]:
import pandas as pd

In [None]:
import torch

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
import torch.nn as nn

In [None]:
import gc

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

In [None]:
import spacy

In [None]:
import numpy as np

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Load the spaCy pipeline named 'en_core_web_sm' and keep it in `sp`.
sp = spacy.load('en_core_web_sm')

In [None]:
import pickle

In [None]:
def pickle_dump(f_name,obj):
    """Serialise ``obj`` with pickle into the file at ``f_name``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(f_name, "wb") as handle:
        pickle.dump(obj, handle)


In [None]:
def pickle_load(f_name):
    """Read and return the object pickled in the file at ``f_name``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source (e.g. written by ``pickle_dump``).
    """
    with open(f_name, "rb") as handle:
        return pickle.load(handle)

### Load Data

In [None]:
# Read the preprocessed dataset from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/preprocessed-data/preprocessed_data.csv")

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Hold out 20% of the rows (rounded down) for testing.
test_data_len = int(0.2*len(data))

# Every remaining row belongs to the training split.
train_data_len = len(data) - test_data_len

# Show both sizes as the cell output.
test_data_len,train_data_len

In [None]:
# Shuffle all rows and renumber the index from 0.
# NOTE(review): no `random_state` is passed, so the row order (and therefore
# the train/test split) differs on every run.
data = data.sample(frac = 1).reset_index(drop=True)

In [None]:
# Split by position with `.iloc` (end-exclusive). The previous `.loc` label
# slices include the end label, so row `train_data_len` ended up in BOTH
# splits (one leaked row) and the training split was one row too long.
train_data = data.iloc[:train_data_len, :]
test_data = data.iloc[train_data_len:train_data_len + test_data_len, :]

In [None]:
# Show the number of rows in each split.
len(train_data),len(test_data)

In [None]:
# The full DataFrame is not used by name after the split: drop the name and
# run a garbage-collection pass.
del data
gc.collect()

### Custom Dataset for handling data

## Generating Vocab from the text
> Why is it required?
We cannot feed a string of text to PyTorch: it works with tensors, so the text must first be converted to a tensor. One method is to represent each word by its index in the vocab.

For Ex: Text : I have an apple.
index of I in vocab : 10
index of have in vocab : 20
index of an in vocab : 5
index of apple in vocab : 6

hence,
input : [10,20,5,6]

In [None]:
def yield_tokens(list_of_text):
    """Lazily yield one list of whitespace-separated tokens per text.

    Each item of ``list_of_text`` is stripped and split on whitespace; the
    resulting token lists are produced in input order.
    """
    yield from (sentence.strip().split() for sentence in list_of_text)

In [None]:
from torchtext.vocab import build_vocab_from_iterator
# Build the vocabulary from the training texts only; "<unk>" is registered as
# a special token.
vocab = build_vocab_from_iterator(yield_tokens(train_data.text),specials=["<unk>"])

# Tokens that are not in the vocab resolve to the index of "<unk>".
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Sanity check: show the index assigned to the token "pleased".
vocab["pleased"]

In [None]:
# Sanity check: "afda" is presumably not in the vocab, so this should show
# the default ("<unk>") index.
vocab["afda"]

In [None]:
# Number of entries in the vocabulary (including the "<unk>" special).
len(vocab)

### Convert Text to tensor

In [None]:
import gc

In [None]:
def convertTextToTensor(list_of_text,vocab,max_len=None):
    """Encode whitespace-tokenised texts as a zero-padded 2-D tensor of vocab indices.

    Args:
        list_of_text: Sequence of strings; each one is split on whitespace.
        vocab: Object supporting ``len(vocab)`` and ``vocab[token]`` -> int.
        max_len: Number of token slots (columns) per row. Defaults to
            ``len(vocab)``, which reproduces the original width. Pass the
            desired sequence length instead to avoid allocating a
            ``len(list_of_text) x len(vocab)`` tensor.

    Returns:
        A tensor created with ``torch.zeros`` (default dtype) of shape
        ``(len(list_of_text), width)``. Row ``i`` holds the vocab indices of
        the tokens of text ``i`` followed by zeros. Tokens beyond ``width``
        are dropped (previously an over-long text raised ``IndexError``).

    NOTE(review): the padding value 0 is itself a valid vocab index (the index
    of "<unk>" when it is the first special token) — confirm that downstream
    code does not need to tell padding apart from real tokens.
    """
    width = len(vocab) if max_len is None else max_len
    res_tensor = torch.zeros(len(list_of_text), width)

    # enumerate() has no length, so pass total= to give tqdm a real progress bar.
    for row, text in tqdm(enumerate(list_of_text), total=len(list_of_text)):
        # Keep at most `width` tokens so the row assignment below always fits.
        indices = [vocab[token] for token in text.split()[:width]]
        if indices:
            # One row-slice assignment instead of one tensor write per token.
            res_tensor[row, :len(indices)] = torch.tensor(indices, dtype=res_tensor.dtype)

    return res_tensor


In [None]:
# Encode every training text as one row of vocab indices.

tensors_text  = convertTextToTensor(list(train_data.text),vocab)



In [None]:
# Persist the encoded training texts to disk ...
pickle_dump(f_name="train_text_tensors.pkl",obj=tensors_text)

# ... then drop the in-memory name and run a garbage-collection pass.
del tensors_text
gc.collect()

In [None]:
# `data` was already deleted right after the train/test split, so deleting it
# again here raised NameError on a top-to-bottom run; only collect garbage.
gc.collect()

In [None]:
# Do not delete `train_data` here: `train_data.emotions` is still needed
# further down to fit the LabelEncoder and to encode the training labels.
gc.collect()

In [None]:
# Show the index of the "<unk>" special token.
vocab['<unk>']

In [None]:
# Sanity check: "absgfd" is presumably not in the vocab, so this should show
# the same index as "<unk>".
vocab["absgfd"]

In [None]:
# Encode the test texts with the vocabulary built from the training texts.
tensors_text_test = convertTextToTensor(list(test_data.text),vocab)

# Persist the encoded test texts to disk ...
pickle_dump(f_name="test_text_tensors.pkl",obj=tensors_text_test)

# ... then drop the in-memory name and run a garbage-collection pass.
del tensors_text_test
gc.collect()

## Fitting LabelEncoder for converting labels to tensors

In [None]:
from sklearn import preprocessing
# Fit the label encoder on the training labels only.
le = preprocessing.LabelEncoder()
le.fit(train_data.emotions)


### Converting Labels to tensors

In [None]:
def convertLabelToTensor(labels,label_enc_obj):
    """Encode ``labels`` with an already-fitted encoder.

    Returns exactly what ``label_enc_obj.transform(labels)`` returns.
    NOTE(review): despite the name, no torch conversion happens here; with
    sklearn's LabelEncoder the result is presumably a NumPy array — confirm
    that callers convert it where a tensor is required.
    """
    return label_enc_obj.transform(labels)

In [None]:
# Encode the `emotions` column of both splits with the fitted encoder `le`.
tensors_labels_train = convertLabelToTensor(train_data.emotions,le)
tensors_labels_test = convertLabelToTensor(test_data.emotions,le)

In [None]:
# Show the shape of the encoded training labels.
tensors_labels_train.shape

In [None]:
# Show the shape of the encoded test labels.
tensors_labels_test.shape