<a href="https://colab.research.google.com/github/akshatjain2k/Data-Science-NLP/blob/Amey/News_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [26]:
df = pd.read_csv(r"/content/not_preprocessed_data.csv")
df.head()

Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",Partnership
1,Government test prep platform Adda247 on Octob...,Funding
2,Private equity and venture capital investments...,Merger/Acquisition
3,Digital book-keeping startup Khatabook said on...,Funding
4,Events are always important and exciting to or...,Research


In [27]:
df.category.value_counts()

Partnership           1587
IPO                   1413
Merger/Acquisition     990
Finance                989
Conference News        892
Funding                728
Research               469
Name: category, dtype: int64

In [28]:
# dropiing research category because of less data points
df = df.drop(df[df['category'] == 'Research'].index)

In [29]:
# Using Label Encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
print(df['category'].value_counts())

5    1587
3    1413
4     990
1     989
0     892
2     728
Name: category, dtype: int64


In [30]:
# split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.article_body, df.category, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(5279,)
(1320,)
(5279,)
(1320,)


In [31]:
import re
def preprocess_string(word):
  
  # Remove punctuation
  s = re.sub(r'[^\w\s]', '', word)
  
  # Convert to lowercase
  s = s.lower()

  # Remove extra whitespaces
  text = re.sub(r'\s+', ' ', s)

  # Remove numbers
  text = re.sub(r"\d", '', text)
  return text

In [32]:
import spacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
def create_word_list(x_train):
    word_list = []
    stop_words = set(en.Defaults.stop_words)
    for sent in x_train:
        for word in sent.lower().split(' '):
            word = preprocess_string(word)
            if word not in stop_words and word != '':
                word_list.append(word)
    return word_list
word_list = create_word_list(X_train)

In [33]:
def tokenize(x_train, x_test):
    corpus = Counter(word_list)
    corpus_ = sorted(corpus.items(), key = lambda x: x[1], reverse=True)[:10000]
    onehot_dict = {w[0]:i+1 for i, w in enumerate(corpus_)}

    final_list_train,final_list_test = [],[]
    for sent in x_train:
            final_list_train.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                     if preprocess_string(word) in onehot_dict.keys()])
    for sent in x_test:
            final_list_test.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                    if preprocess_string(word) in onehot_dict.keys()])
            
    return np.array(final_list_train),np.array(final_list_test),onehot_dict

In [34]:
X_train, X_test, vocab = tokenize(X_train, X_test)

  return np.array(final_list_train),np.array(final_list_test),onehot_dict


In [35]:
X_train

array([list([48, 101, 21, 1876, 7, 200, 3, 4, 940, 2278, 18, 3953, 1, 258, 254, 539, 1541, 4259, 8754, 2107, 3449, 557, 3953, 86, 7932, 2494, 1725, 3175, 29, 2238, 41, 4100, 9753, 5566, 6741, 7263, 1542, 7933, 5915, 160, 5026, 2441, 5, 3175, 29, 2238, 5, 823, 605, 5916, 18, 31, 263, 52, 95, 41, 3953, 3953, 6742, 3953, 2239, 6742, 2239, 905, 160, 160, 605, 170, 5027, 101, 293, 615, 724, 6743, 3953, 24, 67, 1877, 4613, 5296, 103, 1876, 43, 366, 615, 2279, 52, 9754, 3816, 951, 2442, 346, 65, 1293, 977, 3084, 1, 539, 103, 74, 3176, 29, 2278, 9755, 4260, 2495, 652, 370, 3953, 3274, 4614, 790, 989, 940, 2278, 165, 1082, 484, 318, 894, 5916, 1, 1877, 760, 113, 1876, 7, 17, 3, 4, 146, 16, 165, 246, 47, 231, 48, 101, 1168, 143, 1207, 8, 47, 5297, 48, 101, 106, 2550, 235, 1571, 15, 145, 28, 22, 101, 952, 63, 197, 355, 2280, 21, 3450, 910, 211, 18, 3954, 320, 31, 1457, 238, 21, 4419]),
       list([1876, 7, 5567, 47, 1, 204, 367, 146, 48, 101, 16, 3, 4, 88, 232, 1317, 14, 1383, 16, 569, 507, 31, 

In [36]:
len(vocab)

10000

In [37]:
def padding(sents, seq_len):
    features = np.zeros((len(sents), seq_len), dtype = int)
    for i, rev in enumerate(sents):
        if len(rev) != 0:
            features[i, -len(rev):] = np.array(rev)[:seq_len]
    return features

In [38]:
train_sentences = padding(X_train, 500)
test_sentences = padding(X_test, 500)

In [39]:
train_sentences

array([[   0,    0,    0, ...,  238,   21, 4419],
       [   0,    0,    0, ..., 1207, 1230,   24],
       [   0,    0,    0, ...,  925, 6746,   37],
       ...,
       [   0,    0,    0, ...,  316,  624,   93],
       [   0,    0,    0, ..., 2836,  239, 6504],
       [   0,    0,    0, ...,  569,    1,  350]])

In [40]:
train_data = TensorDataset(torch.from_numpy(np.array(train_sentences)), torch.from_numpy(np.array(y_train)))
test_data = TensorDataset(torch.from_numpy(np.array(test_sentences)), torch.from_numpy(np.array(y_test)))

In [41]:
batch_size = 64

In [42]:
trainloader = DataLoader(train_data, shuffle=True, batch_size = batch_size)
testloader = DataLoader(test_data, shuffle= True, batch_size = batch_size)

In [43]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [44]:
# LSTM
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTM,self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional = bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
       # Initialize hidden and cell states
        embedded = self.embedding(x)

        # Frwd Propogation
        output, (hidden, cell) = self.lstm(embedded)
        out_frwd = hidden[-2,:,:]
        out_reverse = hidden[-1,:,:]
        out_reduced = torch.cat((out_frwd, out_reverse), dim = 1)
        hidden = self.dropout(out_reduced)
        hidden = torch.squeeze(hidden,1)
        return hidden

In [45]:
# Hyperparameters
vocab_size = len(vocab) + 1
embedding_dim = 100
hidden_dim = 64
output_dim = 6
n_layers = 2
bidirection = True
dropout = 0.5

# Model
model = LSTM(vocab_size, 
             embedding_dim, 
             hidden_dim, 
             output_dim, 
             n_layers, 
             bidirection, 
             dropout)
print(model)

LSTM(
  (embedding): Embedding(10001, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [46]:
def train(model, device, trainloader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(device), labels.to(device)
        # cleaning the cache of optimizer
        optimizer.zero_grad()

        # Forward propogation
        outputs = model(inputs)

        # computing loss
        loss = criterion(outputs, labels)
        loss.backward()

        # updating weights
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(trainloader.dataset)
    return epoch_loss


def test(model, device, testloader, criterion):
    model.eval()
    running_loss = 0.0
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Compute Loss
            loss = criterion(outputs, labels)

            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.tolist())
            y_pred.extend(predicted.tolist())

    epoch_loss = running_loss / len(testloader.dataset)
    epoch_accuracy = accuracy_score(y_true, y_pred)
    return epoch_loss, epoch_accuracy


In [47]:
n_epochs = 100
learning_rate = 0.01
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTM(
  (embedding): Embedding(10001, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [48]:
# train the model
train_losses, test_losses, test_accuracies = [], [], []
for epoch in range(n_epochs):
    #print(f'Epoch {epoch + 1} / {n_epochs}', end=' ')
    
    train_loss = train(model, device, trainloader, optimizer, criterion)
    
    test_loss, test_accuracy = test(model, device, testloader, criterion)
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)
    
    if epoch %10 ==0: 
      print(f"\tTrain_loss {train_loss:.3f} | Epoch no.: {epoch+1}")
      print(f"\tTest_loss {test_loss:.3f} | Model_Acc: {test_accuracy*100:.2f}%")
      print()


	Train_loss 3.632 | Epoch no.: 1
	Test_loss 3.154 | Model_Acc: 32.50%

	Train_loss 3.480 | Epoch no.: 11
	Test_loss 3.088 | Model_Acc: 25.83%

	Train_loss 3.555 | Epoch no.: 21
	Test_loss 3.204 | Model_Acc: 31.97%

	Train_loss 3.579 | Epoch no.: 31
	Test_loss 3.193 | Model_Acc: 33.86%

	Train_loss 3.578 | Epoch no.: 41
	Test_loss 3.192 | Model_Acc: 40.15%

	Train_loss 3.566 | Epoch no.: 51
	Test_loss 3.197 | Model_Acc: 38.79%

	Train_loss 3.578 | Epoch no.: 61
	Test_loss 3.201 | Model_Acc: 39.70%

	Train_loss 3.565 | Epoch no.: 71
	Test_loss 3.193 | Model_Acc: 41.21%

	Train_loss 3.558 | Epoch no.: 81
	Test_loss 3.206 | Model_Acc: 41.67%

	Train_loss 3.555 | Epoch no.: 91
	Test_loss 3.202 | Model_Acc: 41.74%

