In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [4]:
train_df = pd.read_csv('train.csv')

In [None]:
train_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [5]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [7]:
processed_text = pd.DataFrame({
    'Class Index': train_df['Class Index'],
    'Title': train_df['Title'].apply(preprocess_text),
    'Description': train_df['Description'].apply(preprocess_text)
})

In [8]:
processed_text

Unnamed: 0,Class Index,Title,Description
0,3,wall bear claw back black reuters,reuters wall street seeing green
1,3,carlyle look toward commercial aerospace reuters,reuters private investment firm carlyle group ...
2,3,oil economy cloud stock outlook reuters,reuters soaring crude price plus economy outlo...
3,3,iraq halt oil export main southern pipeline re...,reuters authority halted oil main pipeline sou...
4,3,oil price soar record posing new menace u econ...,afp tearaway world oil price toppling record s...
...,...,...,...
119995,1,pakistan musharraf say wo quit army chief,karachi reuters pakistani president pervez mus...
119996,2,renteria signing deal,red sox general manager theo epstein acknowled...
119997,2,saban going dolphin yet,miami dolphin put courtship lsu coach nick sab...
119998,2,today nfl game,pittsburgh ny giant time line steelers record ...


In [9]:
processed_text['Class Index'] = processed_text['Class Index'] - 1

In [10]:
all_tokens = [token for sublist in processed_text['Title'].tolist() + processed_text['Description'].tolist() for token in sublist]
vocab = {word: idx+1 for idx, (word, _) in enumerate(Counter(all_tokens).items())}

In [11]:
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]

In [12]:
processed_text['token_indices1'] = processed_text['Title'].apply(lambda tokens: tokens_to_indices(tokens, vocab))
processed_text['token_indices2'] = processed_text['Description'].apply(lambda tokens: tokens_to_indices(tokens, vocab))

In [13]:
processed_text

Unnamed: 0,Class Index,Title,Description,token_indices1,token_indices2
0,2,wall bear claw back black reuters,reuters wall street seeing green,"[1, 2, 3, 3, 4, 5, 6, 2, 7, 4, 8, 3, 2, 1, 4, ...","[7, 6, 10, 11, 6, 7, 12, 4, 1, 2, 3, 3, 4, 12,..."
1,2,carlyle look toward commercial aerospace reuters,reuters private investment firm carlyle group ...,"[8, 2, 7, 3, 13, 3, 6, 4, 3, 14, 14, 9, 4, 11,...","[7, 6, 10, 11, 6, 7, 12, 4, 18, 7, 17, 25, 2, ..."
2,2,oil economy cloud stock outlook reuters,reuters soaring crude price plus economy outlo...,"[14, 17, 3, 4, 6, 8, 14, 19, 14, 16, 13, 4, 8,...","[7, 6, 10, 11, 6, 7, 12, 4, 12, 14, 2, 7, 17, ..."
3,2,iraq halt oil export main southern pipeline re...,reuters authority halted oil main pipeline sou...,"[17, 7, 2, 20, 4, 21, 2, 3, 11, 4, 14, 17, 3, ...","[7, 6, 10, 11, 6, 7, 12, 4, 2, 10, 11, 21, 14,..."
4,2,oil price soar record posing new menace u econ...,afp tearaway world oil price toppling record s...,"[14, 17, 3, 4, 18, 7, 17, 8, 6, 4, 12, 14, 2, ...","[2, 24, 18, 4, 11, 6, 2, 7, 2, 1, 2, 13, 4, 1,..."
...,...,...,...,...,...
119995,0,pakistan musharraf say wo quit army chief,karachi reuters pakistani president pervez mus...,"[18, 2, 9, 17, 12, 11, 2, 19, 4, 16, 10, 12, 2...","[9, 2, 7, 2, 8, 21, 17, 4, 7, 6, 10, 11, 6, 7,..."
119996,1,renteria signing deal,red sox general manager theo epstein acknowled...,"[7, 6, 19, 11, 6, 7, 17, 2, 4, 12, 17, 23, 19,...","[7, 6, 15, 4, 12, 14, 22, 4, 23, 6, 19, 6, 7, ..."
119997,1,saban going dolphin yet,miami dolphin put courtship lsu coach nick sab...,"[12, 2, 5, 2, 19, 4, 23, 14, 17, 19, 23, 4, 15...","[16, 17, 2, 16, 17, 4, 15, 14, 3, 18, 21, 17, ..."
119998,1,today nfl game,pittsburgh ny giant time line steelers record ...,"[11, 14, 15, 2, 13, 4, 19, 24, 3, 4, 23, 2, 16...","[18, 17, 11, 11, 12, 5, 10, 7, 23, 21, 4, 19, ..."


In [14]:
max_seqlen = 142
def pad_sequence(sequence, max_len):
    return sequence + [0] * (max_len - len(sequence)) if len(sequence) < max_len else sequence[:max_len]

In [15]:
processed_text['padded_token_indices1'] = processed_text['token_indices1'].apply(lambda seq: pad_sequence(seq, max_seqlen))
processed_text['padded_token_indices2'] = processed_text['token_indices2'].apply(lambda seq: pad_sequence(seq, max_seqlen))

In [16]:
processed_text['combined_padded_indices'] = processed_text.apply(lambda row: row['padded_token_indices1'] + row['padded_token_indices2'], axis=1)

In [17]:
processed_text.head()

Unnamed: 0,Class Index,Title,Description,token_indices1,token_indices2,padded_token_indices1,padded_token_indices2,combined_padded_indices
0,2,wall bear claw back black reuters,reuters wall street seeing green,"[1, 2, 3, 3, 4, 5, 6, 2, 7, 4, 8, 3, 2, 1, 4, ...","[7, 6, 10, 11, 6, 7, 12, 4, 1, 2, 3, 3, 4, 12,...","[1, 2, 3, 3, 4, 5, 6, 2, 7, 4, 8, 3, 2, 1, 4, ...","[7, 6, 10, 11, 6, 7, 12, 4, 1, 2, 3, 3, 4, 12,...","[1, 2, 3, 3, 4, 5, 6, 2, 7, 4, 8, 3, 2, 1, 4, ..."
1,2,carlyle look toward commercial aerospace reuters,reuters private investment firm carlyle group ...,"[8, 2, 7, 3, 13, 3, 6, 4, 3, 14, 14, 9, 4, 11,...","[7, 6, 10, 11, 6, 7, 12, 4, 18, 7, 17, 25, 2, ...","[8, 2, 7, 3, 13, 3, 6, 4, 3, 14, 14, 9, 4, 11,...","[7, 6, 10, 11, 6, 7, 12, 4, 18, 7, 17, 25, 2, ...","[8, 2, 7, 3, 13, 3, 6, 4, 3, 14, 14, 9, 4, 11,..."
2,2,oil economy cloud stock outlook reuters,reuters soaring crude price plus economy outlo...,"[14, 17, 3, 4, 6, 8, 14, 19, 14, 16, 13, 4, 8,...","[7, 6, 10, 11, 6, 7, 12, 4, 12, 14, 2, 7, 17, ...","[14, 17, 3, 4, 6, 8, 14, 19, 14, 16, 13, 4, 8,...","[7, 6, 10, 11, 6, 7, 12, 4, 12, 14, 2, 7, 17, ...","[14, 17, 3, 4, 6, 8, 14, 19, 14, 16, 13, 4, 8,..."
3,2,iraq halt oil export main southern pipeline re...,reuters authority halted oil main pipeline sou...,"[17, 7, 2, 20, 4, 21, 2, 3, 11, 4, 14, 17, 3, ...","[7, 6, 10, 11, 6, 7, 12, 4, 2, 10, 11, 21, 14,...","[17, 7, 2, 20, 4, 21, 2, 3, 11, 4, 14, 17, 3, ...","[7, 6, 10, 11, 6, 7, 12, 4, 2, 10, 11, 21, 14,...","[17, 7, 2, 20, 4, 21, 2, 3, 11, 4, 14, 17, 3, ..."
4,2,oil price soar record posing new menace u econ...,afp tearaway world oil price toppling record s...,"[14, 17, 3, 4, 18, 7, 17, 8, 6, 4, 12, 14, 2, ...","[2, 24, 18, 4, 11, 6, 2, 7, 2, 1, 2, 13, 4, 1,...","[14, 17, 3, 4, 18, 7, 17, 8, 6, 4, 12, 14, 2, ...","[2, 24, 18, 4, 11, 6, 2, 7, 2, 1, 2, 13, 4, 1,...","[14, 17, 3, 4, 18, 7, 17, 8, 6, 4, 12, 14, 2, ..."


In [18]:
test_df = pd.read_csv('test.csv')

In [19]:
processed_text_test = pd.DataFrame({
    'Class Index': test_df['Class Index'],
    'Title': test_df['Title'].apply(preprocess_text),
    'Description': test_df['Description'].apply(preprocess_text)
})

In [20]:
processed_text_test['Class Index'] = processed_text_test['Class Index'] - 1

In [21]:
processed_text_test['token_indices1'] = processed_text_test['Title'].apply(lambda tokens: tokens_to_indices(tokens, vocab))
processed_text_test['token_indices2'] = processed_text_test['Description'].apply(lambda tokens: tokens_to_indices(tokens, vocab))

In [22]:
processed_text_test['padded_token_indices1'] = processed_text_test['token_indices1'].apply(lambda seq: pad_sequence(seq, max_seqlen))
processed_text_test['padded_token_indices2'] = processed_text_test['token_indices2'].apply(lambda seq: pad_sequence(seq, max_seqlen))

In [23]:
processed_text_test['combined_padded_indices'] = processed_text_test.apply(lambda row: row['padded_token_indices1'] + row['padded_token_indices2'], axis=1)

In [24]:
processed_text_test.head()

Unnamed: 0,Class Index,Title,Description,token_indices1,token_indices2,padded_token_indices1,padded_token_indices2,combined_padded_indices
0,2,fear n pension talk,union representing worker turner newall say ta...,"[24, 6, 2, 7, 4, 19, 4, 18, 6, 19, 12, 17, 14,...","[10, 19, 17, 14, 19, 4, 7, 6, 18, 7, 6, 12, 6,...","[24, 6, 2, 7, 4, 19, 4, 18, 6, 19, 12, 17, 14,...","[10, 19, 17, 14, 19, 4, 7, 6, 18, 7, 6, 12, 6,...","[24, 6, 2, 7, 4, 19, 4, 18, 6, 19, 12, 17, 14,..."
1,3,race second private team set launch date human...,toronto canada rocketeers competing million an...,"[7, 2, 8, 6, 4, 12, 6, 8, 14, 19, 15, 4, 18, 7...","[11, 14, 7, 14, 19, 11, 14, 4, 8, 2, 19, 2, 15...","[7, 2, 8, 6, 4, 12, 6, 8, 14, 19, 15, 4, 18, 7...","[11, 14, 7, 14, 19, 11, 14, 4, 8, 2, 19, 2, 15...","[7, 2, 8, 6, 4, 12, 6, 8, 14, 19, 15, 4, 18, 7..."
2,3,company win grant study peptide ap,ap company founded chemistry researcher univer...,"[8, 14, 16, 18, 2, 19, 13, 4, 1, 17, 19, 4, 23...","[2, 18, 4, 8, 14, 16, 18, 2, 19, 13, 4, 24, 14...","[8, 14, 16, 18, 2, 19, 13, 4, 1, 17, 19, 4, 23...","[2, 18, 4, 8, 14, 16, 18, 2, 19, 13, 4, 24, 14...","[8, 14, 16, 18, 2, 19, 13, 4, 1, 17, 19, 4, 23..."
3,3,prediction unit help forecast wildfire ap,ap barely dawn mike fitzpatrick start shift bl...,"[18, 7, 6, 15, 17, 8, 11, 17, 14, 19, 4, 10, 1...","[2, 18, 4, 5, 2, 7, 6, 3, 13, 4, 15, 2, 1, 19,...","[18, 7, 6, 15, 17, 8, 11, 17, 14, 19, 4, 10, 1...","[2, 18, 4, 5, 2, 7, 6, 3, 13, 4, 15, 2, 1, 19,...","[18, 7, 6, 15, 17, 8, 11, 17, 14, 19, 4, 10, 1..."
4,3,aim limit smog ap,ap southern california agency went emission bo...,"[2, 17, 16, 4, 3, 17, 16, 17, 11, 4, 12, 16, 1...","[2, 18, 4, 12, 14, 10, 11, 21, 6, 7, 19, 4, 8,...","[2, 17, 16, 4, 3, 17, 16, 17, 11, 4, 12, 16, 1...","[2, 18, 4, 12, 14, 10, 11, 21, 6, 7, 19, 4, 8,...","[2, 17, 16, 4, 3, 17, 16, 17, 11, 4, 12, 16, 1..."


In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

In [26]:
X_train = torch.tensor(processed_text['combined_padded_indices'].tolist(), dtype=torch.long)
y_train = torch.tensor(processed_text['Class Index'].values, dtype=torch.long)
X_test = torch.tensor(processed_text_test['combined_padded_indices'].tolist(), dtype=torch.long)
y_test = torch.tensor(processed_text_test['Class Index'].values, dtype=torch.long)

In [27]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [28]:
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

In [29]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [30]:
from torch import nn

In [31]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])
        return output

In [32]:
vocab_size = len(vocab) + 1
embedding_dim = 50
hidden_dim = 128
output_dim = len(processed_text['Class Index'].unique())

In [33]:
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [37]:
torch.cuda.is_available()

True

In [38]:
device = torch.device("cuda")
print("Using device:", device)

Using device: cuda


In [39]:
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)

In [40]:
for epoch in range(5):
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        predictions = model(batch_x)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 1.4052369594573975
Epoch 2, Loss: 1.3789639472961426
Epoch 3, Loss: 1.3958125114440918
Epoch 4, Loss: 1.4113852977752686
Epoch 5, Loss: 1.3859304189682007


In [86]:
model.eval()

LSTMClassifier(
  (embedding): Embedding(28, 50)
  (lstm): LSTM(50, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)

In [90]:
predictions = list()
true_labels = list()

In [91]:
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        outputs = model(batch_x)
        preds = torch.argmax(outputs, dim=1)
        predictions.append(preds.tolist())
        true_labels.append(batch_y.tolist())

In [94]:
true_labels_flat = [label for sublist in true_labels for label in sublist]
predictions_flat = [pred for sublist in predictions for pred in sublist]

In [99]:
import numpy as np
true_labels = np.array(true_labels_flat)
predictions = np.array(predictions_flat)

In [97]:
from sklearn.metrics import accuracy_score, f1_score

In [101]:
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average = 'weighted')

# **Poor Accuracy and F1-Scores probably due to simple model and minimal pre processing**


In [102]:
print("Accuracy:", accuracy)
print("f1-score:", f1)

Accuracy: 0.2719736842105263
f1-score: 0.1831790678476211
