In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # pre processing
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Read the training and test splits from CSV into two DataFrames.
train_df, test_df = (
  pd.read_csv(csv_path)
  for csv_path in ('/content/Train.csv', '/content/Test.csv')
)

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [None]:
# Fetch the NLTK corpora used by the preprocessing step:
# 'stopwords' for the English stop-word list, 'wordnet' for WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re
stopwords_list=nltk.corpus.stopwords.words('english')
# Set copy of the stop words: the membership test in preprocessing() runs once
# per token of every review, so O(1) lookups replace a linear scan of the list.
_stopword_set=frozenset(stopwords_list)
ws=WordNetLemmatizer()
def preprocessing(text):
  """Normalise one raw review string before tokenisation.

  Steps, in order: lower-case the text, replace every character that is not
  a-z or whitespace with a space, drop English stop words, lemmatize each
  remaining word with WordNetLemmatizer.

  Args:
    text: the raw review as a str.

  Returns:
    The cleaned words joined by single spaces.
  """
  text=text.lower()
  text=re.sub(r'[^a-z\s]',' ',text)  # digits, punctuation and other symbols become spaces
  kept=[word for word in text.split() if word not in _stopword_set]
  return ' '.join(ws.lemmatize(word) for word in kept)


In [None]:
# Clean the review text of both splits with preprocessing(); the cleaned
# text overwrites the raw 'text' column of each frame.
for split_df in (train_df, test_df):
  split_df['text'] = split_df['text'].apply(preprocessing)

In [None]:
# num_words caps the ids used when converting text to sequences to the most
# frequent words; every other word is mapped to the '<oov>' token.
tokenizer = Tokenizer(
  num_words=10000,
  oov_token='<oov>',
)
# Learn the word index from the training text only, so that no information
# from the test split leaks into the vocabulary.
tokenizer.fit_on_texts(train_df['text'])

In [None]:
# Replace each review's words with their integer token ids, for both splits.
train_sequences, test_sequences = (
  tokenizer.texts_to_sequences(frame['text'])
  for frame in (train_df, test_df)
)

In [None]:
# Inspect the token ids of the first training review.
train_sequences[0]

[1969,
 364,
 66,
 1495,
 6947,
 2473,
 249,
 187,
 157,
 6947,
 249,
 5218,
 249,
 352,
 1,
 909,
 5,
 352,
 1601,
 7171,
 757,
 301,
 595,
 425,
 144,
 15,
 3,
 1242,
 13,
 12,
 2252,
 312,
 144,
 1,
 1197,
 196,
 84,
 7172,
 408,
 1674,
 30,
 1326,
 114,
 416,
 6947,
 2233,
 293,
 2086,
 1642,
 5,
 560,
 881,
 55,
 69,
 5676,
 107,
 1,
 2041,
 201,
 912,
 1,
 1,
 251,
 58,
 2409,
 195,
 235,
 4112,
 329,
 4,
 1828,
 1677,
 1413,
 823,
 109,
 3022,
 8428,
 1,
 9654,
 1,
 1,
 1,
 510,
 2520,
 4918]

In [None]:
# Bring every sequence to exactly 100 token ids: shorter ones are padded at the
# end, longer ones are truncated. padding='post' must stay explicit because the
# Keras default is 'pre'.
pad_to_100 = dict(maxlen=100, padding='post')
train_padded_sequences = pad_sequences(train_sequences, **pad_to_100)
test_padded_sequences = pad_sequences(test_sequences, **pad_to_100)

Next, convert the padded sequences and the labels into PyTorch tensors; this is required before they can be fed to a PyTorch model.

In [None]:
# Token ids become int64 tensors (they are used as indices by the embedding layer).
train_padded_torch, test_padded_torch = (
  torch.tensor(padded, dtype=torch.long)
  for padded in (train_padded_sequences, test_padded_sequences)
)

# Labels become float32 tensors (they are the targets of the BCE loss).
train_labels_torch, test_labels_torch = (
  torch.tensor(frame['label'].values, dtype=torch.float32)
  for frame in (train_df, test_df)
)


The data is now prepared for use with a PyTorch `DataLoader`.
The next cell creates PyTorch `Dataset` objects from the input features (padded sequences) and the labels.

`TensorDataset` is a thin wrapper around tensors: it behaves like a dataset in which each item is a tuple of tensors, here `(input, label)`. Bundling the input and label tensors this way lets you:

Access them in pairs (input, label)

Feed them easily into a DataLoader for batching, shuffling, etc.

question on batches

In [None]:
# Pair each padded sequence with its label so that a DataLoader can later
# draw (input, label) batches from a single dataset object.
train_dataset, test_dataset = (
  TensorDataset(features, targets)
  for features, targets in (
    (train_padded_torch, train_labels_torch),
    (test_padded_torch, test_labels_torch),
  )
)

In [None]:
# Training batches of 32 samples, reshuffled at the start of every epoch.
train_loader=DataLoader(train_dataset,batch_size=32,shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition, and therefore the reported test metrics, identical between runs.
test_loader=DataLoader(test_dataset,batch_size=32,shuffle=False)

In [None]:
class TextClassification(nn.Module):
  """Bag-of-embeddings binary text classifier.

  Pipeline: token ids -> embedding lookup -> mean over the sequence axis ->
  Linear -> ReLU -> Linear -> Sigmoid.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: length of each word vector.
    hidden_units: width of the single hidden layer.
  """

  def __init__(self,vocab_size,embedding_dim,hidden_units):
    super().__init__()
    # Trainable lookup table: row i holds the vector of token id i.
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    # Averages the last axis down to length 1 (global average pooling).
    self.global_avg_pool=nn.AdaptiveAvgPool1d(1)
    # Hidden layer consumes the pooled document vector.
    self.fc1=nn.Linear(embedding_dim,hidden_units)
    self.relu=nn.ReLU()
    # Output layer: a single unit, turned into a probability by the sigmoid.
    self.fc2=nn.Linear(hidden_units,1)
    self.sigmoid=nn.Sigmoid()

  def forward(self,text):
    """Map token ids of shape (batch, seq_len) to probabilities of shape (batch, 1).

    No mask is applied, so every position of the sequence (padding included)
    contributes to the average.
    """
    word_vectors=self.embedding(text)            # (batch, seq_len, embedding_dim)
    # The pooling layer averages the LAST axis, so move seq_len there.
    channels_first=word_vectors.permute(0,2,1)   # (batch, embedding_dim, seq_len)
    averaged=self.global_avg_pool(channels_first)  # (batch, embedding_dim, 1)
    doc_vector=averaged.squeeze(2)               # (batch, embedding_dim)
    hidden=self.fc1(doc_vector)
    activated=self.relu(hidden)
    logits=self.fc2(activated)
    return self.sigmoid(logits)


In [None]:

# Seed PyTorch's RNG so that weight initialisation and the DataLoader
# shuffling that follows are repeatable across Restart & Run All.
torch.manual_seed(42)
device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of embedding rows. Must exceed the largest token id: id 0 is padding,
# id 1 is '<oov>', and Tokenizer(num_words=10000) should only emit ids below
# 10000, so 10002 is a safe upper bound.
vocab_size=10002
embedding_dim=300  # length of each word vector
hidden_units=256   # width of the single hidden layer
model=TextClassification(vocab_size,embedding_dim,hidden_units).to(device) # build the network and move it to the chosen device

In [None]:
# Binary cross-entropy on probabilities; pairs with the model's Sigmoid output.
loss_fn = nn.BCELoss()
# Adam over all model parameters with a learning rate of 3e-4.
optimizer = optim.Adam(params=model.parameters(), lr=3e-4)

In [None]:
def batch_accuracy(outputs,labels):
  """Fraction of samples whose thresholded prediction equals its label.

  Args:
    outputs: tensor of predicted probabilities, one per sample. Any shape
      with one element per label is accepted, e.g. (batch,) or (batch, 1).
    labels: tensor of 0/1 targets, one per sample.

  Returns:
    A Python float in [0, 1]; 0.0 when the batch is empty.
  """
  # Flatten both tensors so a (batch, 1) output is compared element-wise with
  # (batch,) labels instead of being broadcast into a (batch, batch) matrix.
  outputs=outputs.reshape(-1)
  labels=labels.reshape(-1)
  total=labels.numel()
  if total==0:
    return 0.0  # avoid dividing by zero on an empty batch
  predicted=(outputs>0.5).float()  # probabilities above 0.5 count as class 1
  correct=(predicted==labels).sum().item()
  return correct/total

In [None]:
# Train for a fixed number of epochs and report sample-weighted metrics per epoch.
epochs=10
for i in range(epochs):
  model.train()
  loss=0       # sum of per-sample losses seen this epoch
  accuracy=0   # number of correctly classified samples this epoch
  seen=0       # number of samples processed this epoch
  for texts,labels in train_loader:
    texts,labels=texts.to(device),labels.to(device)
    optimizer.zero_grad()
    # Model output is (batch, 1); drop only that last axis to get (batch,).
    # A bare squeeze() would also collapse a one-sample batch to a 0-d tensor,
    # which no longer matches the shape of labels.
    outputs=model(texts).squeeze(1)
    batch_loss=loss_fn(outputs,labels)
    batch_loss.backward()
    optimizer.step()
    # loss_fn returns the batch mean; weight by batch size so a shorter final
    # batch does not count as much as a full one.
    n_in_batch=labels.size(0)
    loss+=batch_loss.item()*n_in_batch
    accuracy+=batch_accuracy(outputs,labels)*n_in_batch
    seen+=n_in_batch
  print(f'Epoch: {i+1}, Training_Accuracy: {accuracy/seen}, Training_loss: {loss/seen}')

Epoch: 1, Training_Accuracy: 0.750775, Training_loss: 0.49339095450639725
Epoch: 2, Training_Accuracy: 0.860125, Training_loss: 0.3254195868849754
Epoch: 3, Training_Accuracy: 0.8835, Training_loss: 0.2791259401202202
Epoch: 4, Training_Accuracy: 0.89765, Training_loss: 0.25131699267625807
Epoch: 5, Training_Accuracy: 0.909025, Training_loss: 0.2290497768998146
Epoch: 6, Training_Accuracy: 0.915475, Training_loss: 0.21538467758893967
Epoch: 7, Training_Accuracy: 0.92035, Training_loss: 0.2016336963355541
Epoch: 8, Training_Accuracy: 0.92705, Training_loss: 0.18862017841339113
Epoch: 9, Training_Accuracy: 0.93005, Training_loss: 0.18122458250373602
Epoch: 10, Training_Accuracy: 0.935, Training_loss: 0.1729540966063738


In [None]:
def evaluate():
  """Print the accuracy and mean loss of `model` over `test_loader`.

  Uses the notebook-level `model`, `test_loader`, `loss_fn`, `device` and
  `batch_accuracy`. The model is switched to eval mode and gradients are
  disabled. Both metrics are averaged per sample, so a shorter final batch
  is weighted by its real size.
  """
  model.eval()
  loss=0       # sum of per-sample losses
  accuracy=0   # number of correctly classified samples
  seen=0       # number of samples processed
  with torch.no_grad():
    for texts,labels in test_loader:
      texts,labels=texts.to(device),labels.to(device)
      # (batch, 1) -> (batch,). squeeze(1) keeps a one-sample batch 1-D,
      # whereas a bare squeeze() would collapse it to a 0-d tensor.
      outputs=model(texts).squeeze(1)
      batch_loss=loss_fn(outputs,labels)
      n_in_batch=labels.size(0)
      loss+=batch_loss.item()*n_in_batch
      accuracy+=batch_accuracy(outputs,labels)*n_in_batch
      seen+=n_in_batch
  print(f'Testing_Accuracy: {accuracy/seen}, Testing_loss: {loss/seen}')

In [None]:
# Report accuracy and loss of the trained model on the held-out test split.
evaluate()

Testing_Accuracy: 0.8648487261146497, Testing_loss: 0.35987203150607977


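Test accuracy (about 86%) is noticeably lower than the final training accuracy (about 93%), so the model is overfitting and would benefit from hyperparameter tuning.
Moreover, this feed-forward network averages the word embeddings and therefore ignores word order; a sequence model such as an RNN is better suited to text classification.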

In [None]:
# The Keras Tokenizer has no `vocab_size` attribute. The learned vocabulary is
# the `word_index` dict (word -> id, including the '<oov>' entry), so its
# length is the number of distinct tokens seen in the training text.
len(tokenizer.word_index)

AttributeError: 'Tokenizer' object has no attribute 'vocab_size'