In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('complete_data.csv')
df.head()

Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",Partnership
1,Government test prep platform Adda247 on Octob...,Funding
2,Private equity and venture capital investments...,Merger/Acquisition
3,Digital book-keeping startup Khatabook said on...,Funding
4,Events are always important and exciting to or...,Research


In [3]:
# check if data is balance or unbalanced
counts = df['category'].value_counts()
print(counts)

Partnership           1587
IPO                   1413
Merger/Acquisition     990
Finance                989
Conference News        892
Funding                728
Research               469
Name: category, dtype: int64


In [4]:
# balance data
df = df.groupby('category').head(700)
df.category.value_counts()

Partnership           700
Funding               700
Merger/Acquisition    700
Conference News       700
Finance               700
IPO                   700
Research              469
Name: category, dtype: int64

In [5]:
# droping research category because of less data points
df = df.drop(df[df['category'] == 'Research'].index)


In [6]:
df.category.value_counts()

Partnership           700
Funding               700
Merger/Acquisition    700
Conference News       700
Finance               700
IPO                   700
Name: category, dtype: int64

In [7]:
# use map function to map the category to numerical values
df['category'] = df['category'].map({'Funding': 0, 'Partnership': 1, 'Merger/Acquisition': 2, 'Finance': 3, 'Conference News': 4, 'IPO': 5, "Research": 6})
df.head()

Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",1
1,Government test prep platform Adda247 on Octob...,0
2,Private equity and venture capital investments...,2
3,Digital book-keeping startup Khatabook said on...,0
5,it easier for everyone to experience the world...,1


In [8]:
#tokenization

import re
import spacy
import string
nlp = spacy.load("en_core_web_sm")

def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # remove punctuation
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]') 
    # convert to lower case
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in nlp.tokenizer(nopunct)]

In [9]:
#count number of occurences of each word
from collections import Counter
counts = Counter()
for index, row in df.iterrows():
    counts.update(tokenize(row['article_body']))

In [10]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 32488
num_words after: 19504


In [11]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [12]:
# To encode word into index using vocab2index
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [13]:
df['encoded'] = df['article_body'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df.head()

  df['encoded'] = df['article_body'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))


Unnamed: 0,article_body,category,encoded
0,"Long COVID community, which is an open and gro...",1,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 12, 13, 1..."
1,Government test prep platform Adda247 on Octob...,0,"[[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, ..."
2,Private equity and venture capital investments...,2,"[[283, 284, 10, 285, 54, 286, 287, 18, 288, 28..."
3,Digital book-keeping startup Khatabook said on...,0,"[[309, 310, 311, 141, 312, 45, 42, 313, 314, 1..."
5,it easier for everyone to experience the world...,1,"[[46, 395, 12, 396, 100, 397, 18, 398, 5, 18, ..."


In [14]:
df.to_csv('processed_data.csv', index=False)
df.head()

Unnamed: 0,article_body,category,encoded
0,"Long COVID community, which is an open and gro...",1,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 12, 13, 1..."
1,Government test prep platform Adda247 on Octob...,0,"[[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, ..."
2,Private equity and venture capital investments...,2,"[[283, 284, 10, 285, 54, 286, 287, 18, 288, 28..."
3,Digital book-keeping startup Khatabook said on...,0,"[[309, 310, 311, 141, 312, 45, 42, 313, 314, 1..."
5,it easier for everyone to experience the world...,1,"[[46, 395, 12, 396, 100, 397, 18, 398, 5, 18, ..."


In [15]:
from sklearn.model_selection import train_test_split
X = list(df['encoded'])
y = list(df['category'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [16]:
#library imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [17]:
class NewssDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [18]:
train_ds = NewssDataset(X_train, y_train)
valid_ds = NewssDataset(X_valid, y_valid)

In [38]:
def train_model(model, epochs=10, lr=0.001):
    # update parameter when requires_grad = true
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # using optimizer
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs): 
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)

            # zero the grad
            optimizer.zero_grad()

            # calculate loss
            loss = F.cross_entropy(y_pred, y)

            # backward pass
            loss.backward()

            # updates weights
            optimizer.step()

            # total loss
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))



In [20]:
def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]

        # adding if guess is right
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total
          # avg loss       accuracy of model  

In [29]:

batch_size = 250
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)


In [34]:
class LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        # creating Embedding layer-convert the input sequence into dense vector
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0) # padding_idx index of the padding token in the vocabulary.

        # lstm layer - size of the input, size of hidden state, batch size
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # fully connected layer
        self.linear = nn.Linear(hidden_dim, 6)

        # dropout layer
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l): # x- input sequence of token, l- length of each sequence in the batch
        # map the input sequence of dense vector using embedding layer
        x = self.embeddings(x)

        x = self.dropout(x)

        # produce output and final ht
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
        

In [35]:
model =  LSTM(vocab_size, 50, 50)
print(model)

LSTM(
  (embeddings): Embedding(19506, 50, padding_idx=0)
  (lstm): LSTM(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=6, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)


In [36]:
train_model(model, epochs=30, lr=0.01)

train loss 1.550, val loss 1.497, val accuracy 0.324, and val rmse 2.223
train loss 0.955, val loss 1.126, val accuracy 0.525, and val rmse 1.836
train loss 0.490, val loss 0.929, val accuracy 0.683, and val rmse 1.293
train loss 0.297, val loss 0.945, val accuracy 0.724, and val rmse 1.266
train loss 0.171, val loss 0.929, val accuracy 0.770, and val rmse 1.215
train loss 0.140, val loss 0.881, val accuracy 0.796, and val rmse 1.094
