# Nepali News Classification 
Compare the accuracy of different models using different algorithms like LSTM , GRU to classify the news dataset into different categories (politics, sport, entertainment, tech, business). 

## Improrting Data and Data Preprocessing


In [1]:
import pandas as pd 
import numpy as np

  from pandas.core import (


In [2]:
pd.set_option('display.max_rows', None)

In [3]:
dataset = pd.read_csv('/home/supriya/Desktop/FDV/Nepali-News-Classification/Nepali_Dataset_New.csv')

In [4]:
dataset.shape

(4540, 2)

In [5]:
dataset.head()

Unnamed: 0,News,Category
0,"\n﻿काठमाण्डौ, ६ असार । नेपाल ललितकला प्रज्ञा प...",politics
1,नेपालको आशा जीवितैकप्तान पारस खड्काले ब्याट र ...,sport
2,"\n﻿राजविराज, २६ फागुन । नेकपा एमालेलाई औद्योगि...",politics
3,"सामाजिक कार्यका लागि सुन्दरी""\nसौन्दर्य प्रतिय...",entertainment
4,"खराब प्रदर्शनपछि प्रशिक्षण पिच""महिला राष्ट्रिय...",sport


In [6]:
dataset['Category'].unique()

array(['politics', 'sport', 'entertainment', 'tech', 'business'],
      dtype=object)

In [7]:
cats = {
    'politics' : 0, 
    'sport' : 1, 
    'entertainment' : 2, 
    'tech' : 3, 
    'business': 4
   
}

### Cleaning , Tokenizing and Removing stopwords

In [8]:
from nepali_stemmer.stemmer import NepStemmer
import nltk
from nltk.corpus import stopwords
import re

def clean_and_tokenize(text):
    #removing unnecessary symbols
    clean_text = re.sub(r'[\n,|।\'":]', '', text)


    #Tokenizing Text 
    nepstem = NepStemmer()
    tokenized_text = nepstem.stem(clean_text)

    #removing stopwords
    nep_stopwords = set(stopwords.words('nepali'))
    words = tokenized_text.split()
    filtered_words = [word for word in words if word.lower() not in nep_stopwords]
    C_T_S = ' '.join(filtered_words)

    return C_T_S

    

## Data Splitting
- Initial Splitting : X, y -> train_x, train_y 
- Tokenize train_x
- Further splitting for validation and testing 
        -To prevent data leakage

In [9]:
%%time
from sklearn.model_selection import train_test_split

x = dataset['News']
y = dataset['Category']

train_x, _, train_y, _ = train_test_split(x, y, stratify=y, test_size=0.95)

# clean the text
train_x = train_x.apply(lambda x: clean_and_tokenize(str(x)))

# convert the categories to ids
train_y = train_y.apply(lambda x: cats[x])

# first split the train dataset into (train, rest) set
train_x, rest_x, train_y, rest_y = train_test_split(train_x, train_y, stratify=train_y, test_size=0.2, random_state=747)

# then split the rest into (test, valid) set
test_x, valid_x, test_y, valid_y = train_test_split(rest_x, rest_y, stratify=rest_y, test_size=0.5, random_state=747)
print(len(train_x))
print(len(test_y), len(valid_y), len(train_y))

181
23 23 181
CPU times: user 10.1 s, sys: 227 ms, total: 10.3 s
Wall time: 11.8 s


## Creating Dataframe
- using the splitted data to create **train.csv** , **test.csv** , **valid.csv**

In [10]:
train_df = pd.concat([train_x, train_y], axis=1).reset_index()[['Category', 'News']]
test_df = pd.concat([test_x, test_y], axis=1).reset_index()[['Category', 'News']]
valid_df = pd.concat([valid_x, valid_y], axis=1).reset_index()[['Category', 'News']]

train_df.to_csv('train.csv', header=['classlabel', 'content'], encoding='utf-8', index=False)
test_df.to_csv('test.csv', header=['classlabel', 'content'], encoding='utf-8', index=False)
valid_df.to_csv('valid.csv', header=['classlabel', 'content'], encoding='utf-8', index=False)

In [11]:
ls

app.py      GRU.py       LSTM.py                 README.md     test.csv
FastApi.py  index.ipynb  Nepali_Dataset_New.csv  RNN.pth       train.csv
GRU.pth     index.py     [0m[01;34m__pycache__[0m/            test_0.ipynb  valid.csv


In [12]:
train_df.head()

Unnamed: 0,Category,News
0,4,आईओसी पेट्रोलियम पदार्थ आपूर्ति घटायोभारतीय आय...
1,4,पहाडी जिल्ला आपूर्ति बन्दजिविस ठेकेदार आयातित ...
2,0,﻿राजविराज २७ फागुन सप्तरी मलेठ गाउँ विकास समित...
3,3,सफ्टवेयर बाट भविष्यवाणीभाग्य विश्वास उत्तर एस ...
4,4,सिस्नो ढिंडो खाने सम्पन्नराजधानी निजी स्कुल गर...


## Setting up Data preprocessing and preparing Pytorch dataloaders


In [13]:
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import time
import random
import pandas as pd
import numpy as np

torch.backends.cudnn.deterministic = True

In [14]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 60_000
LEARNING_RATE = 1e-3
BATCH_SIZE = 32
NUM_EPOCHS = 50
DROPOUT = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 200
BIDIRECTIONAL = True
HIDDEN_DIM = 200
NUM_LAYERS = 2
OUTPUT_DIM = 5

### Defining Fields 

In [15]:
TEXT = data.Field(sequential=True,
#                   tokenize='spacy',
                  include_lengths=True) # necessary for packed_padded_sequence

LABEL = data.LabelField(dtype=torch.float)

## Creating Dataset 
- create dataset using **TabularDataset** from **torchtext** which allows loading data from CSV files.


In [16]:
fields = [('classlabel', LABEL), ('content', TEXT)]

train_dataset = data.TabularDataset(
    path="train.csv", format='csv',
    skip_header=True, fields=fields)

test_dataset = data.TabularDataset(
    path="test.csv", format='csv',
    skip_header=True, fields=fields)

valid_dataset = data.TabularDataset(
    path="valid.csv", format='csv',
    skip_header=True, fields=fields)

In [17]:
print(f'Num Train: {len(train_dataset)}')
print(f'Num Valid: {len(test_dataset)}')
print(f'Num Valid: {len(valid_dataset)}')

Num Train: 181
Num Valid: 23
Num Valid: 23


### Build Vocabulary

In [18]:
TEXT.build_vocab(train_dataset, test_dataset, valid_dataset,
                 min_freq=2)
LABEL.build_vocab(train_dataset)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocabulary size: 5571
Number of classes: 5


### Create Dataloaders

In [19]:
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset), 
    batch_size=BATCH_SIZE,
    sort_within_batch=True, # necessary for packed_padded_sequence
    sort_key=lambda x: len(x.content),
    device=DEVICE)

In [20]:
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.content[0].size()}')
    print(f'Target vector size: {batch.classlabel.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.content[0].size()}')
    print(f'Target vector size: {batch.classlabel.size()}')
    break
    
print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.content[0].size()}')
    print(f'Target vector size: {batch.classlabel.size()}')
    break

Train
Text matrix size: torch.Size([181, 32])
Target vector size: torch.Size([32])

Valid:
Text matrix size: torch.Size([701, 23])
Target vector size: torch.Size([23])

Test:
Text matrix size: torch.Size([1034, 23])
Target vector size: torch.Size([23])


## LSTM

In [21]:
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, bidirectional, hidden_dim, num_layers, output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim,
                           num_layers=num_layers,
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * num_layers, 64)
        self.fc2 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_length):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        hidden = self.fc1(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)
        return hidden

### Setting model Architecture

In [22]:
INPUT_DIM = len(TEXT.vocab)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

torch.manual_seed(RANDOM_SEED)
model = RNN(INPUT_DIM, EMBEDDING_DIM, BIDIRECTIONAL, HIDDEN_DIM, NUM_LAYERS, OUTPUT_DIM, DROPOUT, PAD_IDX)
model = model.to(DEVICE)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

RNN(
  (embedding): Embedding(5571, 200, padding_idx=1)
  (rnn): LSTM(200, 200, num_layers=2, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=400, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


### Function to evaluate accuracy of model 

In [23]:
def compute_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(data_loader):
            text, text_lengths = batch_data.content
            if 0 in text_lengths:
                continue
            logits = model(text, text_lengths.to('cpu'))
            _, predicted_labels = torch.max(logits, 1)
            num_examples += batch_data.classlabel.size(0)
            correct_pred += (predicted_labels.long() == batch_data.classlabel.long()).sum()
        return correct_pred.float()/num_examples * 100

### Training Model

In [24]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text, text_lengths = batch_data.content
        # print(text.shape, text_lengths.shape)
        if 0 in text_lengths:
            continue
        
        ### FORWARD AND BACK PROP
        logits = model(text, text_lengths.to('cpu'))
        cost = F.cross_entropy(logits, batch_data.classlabel.long())
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/050 | Batch 000/006 | Cost: 1.5908
training accuracy: 22.10%
valid accuracy: 21.74%
Time elapsed: 4.38 min
Epoch: 002/050 | Batch 000/006 | Cost: 1.6059
training accuracy: 35.91%
valid accuracy: 30.43%
Time elapsed: 8.32 min
Epoch: 003/050 | Batch 000/006 | Cost: 1.5287
training accuracy: 55.80%
valid accuracy: 34.78%
Time elapsed: 12.16 min
Epoch: 004/050 | Batch 000/006 | Cost: 1.4632
training accuracy: 66.85%
valid accuracy: 43.48%
Time elapsed: 16.64 min
Epoch: 005/050 | Batch 000/006 | Cost: 1.4139
training accuracy: 72.38%
valid accuracy: 43.48%
Time elapsed: 20.90 min
Epoch: 006/050 | Batch 000/006 | Cost: 1.3309
training accuracy: 75.69%
valid accuracy: 52.17%
Time elapsed: 24.74 min
Epoch: 007/050 | Batch 000/006 | Cost: 1.1706
training accuracy: 72.93%
valid accuracy: 52.17%
Time elapsed: 28.59 min
Epoch: 008/050 | Batch 000/006 | Cost: 1.2719
training accuracy: 91.16%
valid accuracy: 52.17%
Time elapsed: 32.58 min
Epoch: 009/050 | Batch 000/006 | Cost: 0.8465
trai

### Interpretation of LSTM 

After **50** epochs, seems like the model has achieved 100% accuracy on the training set, which suggests that the model might have **overfit** the training data
- **Validation Accuracy : 65.22%**
- **Test Accuracy : 60.87%**




In [None]:
cats = {
    'politics' : 0, 
    'sport' : 1, 
    'entertainment' : 2, 
    'tech' : 3, 
    'business': 4
   
}

map_dict = {v: k for k, v in cats.items()}

def predict(model, sentence, device='cpu'):
    model.eval()
    indexed = [TEXT.vocab.stoi[token] for token in clean_and_tokenize(sentence).split()]
#     indexed = [TEXT.vocab.stoi[i] for i in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    
    output = model(tensor, length_tensor)
    predictions = torch.softmax(output, dim=1)
    
    probs, label = predictions.max(dim=1)
    
    return predictions, probs.item(), label.item()

In [None]:
news = """
काठमाडौं । अमेरिका र वेस्ट इन्डिजमा अर्को महिना हुने आईसीसी टी-ट्वेन्टी विश्वकप खेल्ने नेपाली राष्ट्रिय क्रिकेट टिमको बुधबार घोषणा भएको छ । नेपाल क्रिकेट संघ क्यानले नेपाली टिम आज सार्वजनिक गरेको हो ।
रोहित पौडेलको कप्तानीमा मुख्य प्रशिक्षक मोन्टी देसाइले कमलसिंह ऐरी र सागर ढकाललाई समेट्दै १५ सदस्यीय टोली घोषणा गरे। विश्वकप खेल्ने टिममा अशिफ शेख, अनिल साह, कुशल भुर्तेल, कुशल मल्ल, दिपेन्द्रसिंह ऐरी, ललित राजवंशी, करण केसी, गुल्सन झा, सोमपाल कामी, प्रतिश जिसी, सन्दीप जोरा र अभिनास बोहरा छन् ।
"""
preds, probs, label = predict(model, news)

print(f'Class Label: {label} -> {map_dict[label]}')

Class Label: 1 -> sport


In [None]:
news = """
काठमाडौँ । नेपालमा स्वदेशी र विदेशी गरेर वार्षिक डेढ सयदेखि दुई सय वटासम्म चलचित्र प्रदर्शनमा आउने गरेका छन् । प्रदर्शनमा ल्याउनु अघि सबै चलचित्रले केन्द्रीय चलचित्र जाँच समिति (सेन्सर बोर्ड) बाट प्रदर्शनका लागि खुला (सेन्सर पास) प्रमाणपत्र लिन अनिवार्य हुन्छ ।

"""
preds, probs, label = predict(model, news)

print(f'Class Label: {label} -> {map_dict[label]}')

NameError: name 'model' is not defined

### Saving LSTM Model

In [None]:
torch.save(model.state_dict(), "RNN.pth")

## GRU

In [None]:
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, input_dim, embedding_dim, bidirectional, hidden_dim, num_layers, output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, 
                          hidden_dim,
                          num_layers=num_layers,
                          bidirectional=bidirectional, 
                          dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * num_layers, 64)
        self.fc2 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_length):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        packed_output, hidden = self.gru(packed_embedded)
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        hidden = self.fc1(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)
        return hidden


In [None]:
INPUT_DIM = len(TEXT.vocab)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

torch.manual_seed(RANDOM_SEED)
model_1 = GRU(INPUT_DIM, EMBEDDING_DIM, BIDIRECTIONAL, HIDDEN_DIM, NUM_LAYERS, OUTPUT_DIM, DROPOUT, PAD_IDX)
model_1 = model_1.to(DEVICE)
print(model_1)
optimizer = torch.optim.Adam(model_1.parameters(), lr=LEARNING_RATE)

GRU(
  (embedding): Embedding(5202, 200, padding_idx=1)
  (gru): GRU(200, 200, num_layers=2, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=400, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=8, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:
NUM_EPOCHS = 30


In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model_1.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text, text_lengths = batch_data.content
        # print(text.shape, text_lengths.shape)
        if 0 in text_lengths:
            continue
        
        ### FORWARD AND BACK PROP
        logits = model_1(text, text_lengths.to('cpu'))
        cost = F.cross_entropy(logits, batch_data.classlabel.long())
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL_1 PARAMETERS
        optimizer.step()
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model_1, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model_1, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model_1, test_loader, DEVICE):.2f}%')

Epoch: 001/030 | Batch 000/006 | Cost: 1.8624
training accuracy: 35.36%
valid accuracy: 30.43%
Time elapsed: 2.57 min
Epoch: 002/030 | Batch 000/006 | Cost: 1.6761
training accuracy: 51.93%
valid accuracy: 30.43%
Time elapsed: 5.11 min
Epoch: 003/030 | Batch 000/006 | Cost: 1.5015
training accuracy: 70.72%
valid accuracy: 39.13%
Time elapsed: 7.64 min
Epoch: 004/030 | Batch 000/006 | Cost: 1.4099
training accuracy: 72.38%
valid accuracy: 47.83%
Time elapsed: 10.18 min
Epoch: 005/030 | Batch 000/006 | Cost: 1.2767
training accuracy: 80.66%
valid accuracy: 56.52%
Time elapsed: 12.73 min
Epoch: 006/030 | Batch 000/006 | Cost: 1.1164
training accuracy: 90.61%
valid accuracy: 56.52%
Time elapsed: 15.25 min
Epoch: 007/030 | Batch 000/006 | Cost: 0.9552
training accuracy: 91.71%
valid accuracy: 60.87%
Time elapsed: 17.77 min
Epoch: 008/030 | Batch 000/006 | Cost: 0.7979
training accuracy: 95.03%
valid accuracy: 60.87%
Time elapsed: 20.23 min
Epoch: 009/030 | Batch 000/006 | Cost: 0.5574
train

### Interpretation of GRU Model 

- **Validation Accuracy : 69.87%**
- **Test Accuracy : 69.57%**


### GRU performance better than LSTM 

- **Simpler Architecture**: The GRU has a simpler architecture compared to LSTM, with fewer parameters. This simplicity may be the reason for GRU to be trained more efficiently and  prevent overfitting.
- **Fewer Gating Mechanisms**: While both GRU and LSTM use gating mechanisms to control the flow of information, GRU has fewer gating mechanisms (update and reset gates) compared to LSTM (input, forget, and output gates). This reduced complexity may make it easier for the GRU to capture relevant information and avoid vanishing gradient problems.
- **Dataset Characteristics**: The performance of different RNN architectures can vary depending on the characteristics of the dataset, such as the length of sequences, the presence of long-range dependencies, and the complexity of patterns. It's possible that the dataset used for text classification favors the characteristics of the GRU model, leading to better performance.

In [None]:
news = """
काठमाडौं । अमेरिका र वेस्ट इन्डिजमा अर्को महिना हुने आईसीसी टी-ट्वेन्टी विश्वकप खेल्ने नेपाली राष्ट्रिय क्रिकेट टिमको बुधबार घोषणा भएको छ । नेपाल क्रिकेट संघ क्यानले नेपाली टिम आज सार्वजनिक गरेको हो ।
रोहित पौडेलको कप्तानीमा मुख्य प्रशिक्षक मोन्टी देसाइले कमलसिंह ऐरी र सागर ढकाललाई समेट्दै १५ सदस्यीय टोली घोषणा गरे। विश्वकप खेल्ने टिममा अशिफ शेख, अनिल साह, कुशल भुर्तेल, कुशल मल्ल, दिपेन्द्रसिंह ऐरी, ललित राजवंशी, करण केसी, गुल्सन झा, सोमपाल कामी, प्रतिश जिसी, सन्दीप जोरा र अभिनास बोहरा छन् ।
"""
preds, probs, label = predict(model_1, news)

print(f'Class Label: {label} -> {map_dict[label]}')

Class Label: 0 -> politics


In [None]:
news = """
काठमाडौँ । नेपालमा स्वदेशी र विदेशी गरेर वार्षिक डेढ सयदेखि दुई सय वटासम्म चलचित्र प्रदर्शनमा आउने गरेका छन् । प्रदर्शनमा ल्याउनु अघि सबै चलचित्रले केन्द्रीय चलचित्र जाँच समिति (सेन्सर बोर्ड) बाट प्रदर्शनका लागि खुला (सेन्सर पास) प्रमाणपत्र लिन अनिवार्य हुन्छ ।
"""
preds, probs, label = predict(model_1, news)

print(f'Class Label: {label} -> {map_dict[label]}')

Class Label: 2 -> entertainment


In [None]:
torch.save(model_1.state_dict(), "GRU.pth")


### RNN

In [None]:
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embedding_dim, 
                          hidden_dim, 
                          dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_length):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        packed_output, hidden = self.rnn(packed_embedded)
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(hidden[-1,:,:])  # Take the last hidden state
        hidden = self.fc1(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)
        return hidden


In [None]:
import torch
from torchtext.vocab import Vocab


TEXT.build_vocab(train_dataset, test_dataset, valid_dataset,
                 min_freq=2)
LABEL.build_vocab(train_dataset)

# Assuming you have defined TEXT.vocab and other necessary variables

INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
EMBEDDING_DIM = 100  # Example value, please replace with your desired value
HIDDEN_DIM = 128  # Example value, please replace with your desired value
OUTPUT_DIM = 1  # Example value, please replace with your desired value
DROPOUT = 0.5  # Example value, please replace with your desired value

model_2 = SimpleRNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, PAD_IDX)
print(model_2)


SimpleRNN(
  (embedding): Embedding(5437, 100, padding_idx=1)
  (rnn): RNN(100, 128, dropout=0.5)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)




In [None]:
NUM_EPOCHS = 30


In [None]:
cats = {
    'politics' : 0, 
    'sport' : 1, 
    'entertainment' : 2, 
    'tech' : 3, 
    'business': 4
   
}

map_dict = {v: k for k, v in cats.items()}

def predict(model, sentence, device='cpu'):
    model.eval()
    indexed = [TEXT.vocab.stoi[token] for token in clean_and_tokenize(sentence).split()]
#     indexed = [TEXT.vocab.stoi[i] for i in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    
    output = model(tensor, length_tensor)
    predictions = torch.softmax(output, dim=1)
    
    probs, label = predictions.max(dim=1)
    
    return predictions, probs.item(), label.item()

In [None]:
def compute_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(data_loader):
            text, text_lengths = batch_data.content
            if 0 in text_lengths:
                continue
            logits = model(text, text_lengths.to('cpu'))
            _, predicted_labels = torch.max(logits, 1)
            num_examples += batch_data.classlabel.size(0)
            correct_pred += (predicted_labels.long() == batch_data.classlabel.long()).sum()
        return correct_pred.float()/num_examples * 100

In [None]:
import time

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model_2.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text, text_lengths = batch_data.content
        # print(text.shape, text_lengths.shape)
        if 0 in text_lengths:
            continue
        
        ### FORWARD AND BACK PROP
        logits = model_2(text, text_lengths.to('cpu'))
        cost = F.cross_entropy(logits, batch_data.classlabel.long())
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE model_2 PARAMETERS
        optimizer.step()
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model_2, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model_2, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model_2, test_loader, DEVICE):.2f}%')

IndexError: Target 4 is out of bounds.