In [1]:
import pandas as pd
import contractions
import re
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alvinrach/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
d = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Test.csv')
sample = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Sample%20Solution.csv')

In [3]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB


In [4]:
d = d.drop('ArticleId',axis=1)

In [5]:
d

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [6]:
def txtprocess(txt):
    txt = str(txt).lower()
    txt = contractions.fix(txt)

    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    txt = re.sub(' +', ' ', txt)

    txt = ' '.join(txt.split())

    return txt

d['Text'] = d['Text'].apply(txtprocess)

In [7]:
stop_words = set(nltk.corpus.stopwords.words('english'))

# kayak you'll gitu masih ada ' nya , apa bagusnya sebelum txtprocess, tapi kecil semua sih
def remove_stopwords(txt):
    no_stopword_txt = [w for w in txt.split() if not w in stop_words]
    return ' '.join(no_stopword_txt)

d['Text'] = d['Text'].apply(lambda x: remove_stopwords(x))

In [8]:
category = pd.get_dummies(d.Category, dtype=int)
d_new = pd.concat([d, category], axis=1)
d_new = d_new.drop('Category', axis=1)
d_new

Unnamed: 0,Text,business,entertainment,politics,sport,tech
0,worldcom ex boss launches defence lawyers defe...,1,0,0,0,0
1,german business confidence slides german busin...,1,0,0,0,0
2,bbc poll indicates economic gloom citizens maj...,1,0,0,0,0
3,lifestyle governs mobile choice faster better ...,0,0,0,0,1
4,enron bosses payout eighteen former enron dire...,1,0,0,0,0
...,...,...,...,...,...,...
1485,double eviction big brother model caprice holb...,0,1,0,0,0
1486,dj double act revamp chart show dj duo jk joel...,0,1,0,0,0
1487,weak dollar hits reuters revenues media group ...,1,0,0,0,0
1488,apple ipod family expands market apple expande...,0,0,0,0,1


In [9]:
article = d_new['Text'].values
label = d_new[category.columns].values

In [10]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize directly (no fit_on_texts needed - it's already pretrained!)
tokens = tokenizer(article.tolist(), padding=True, truncation=True, return_tensors='pt')

# Split the tokenized data
padded_train, padded_test, y_train, y_test = train_test_split(
    tokens['input_ids'], label, test_size=0.2, random_state=42
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
padded_train, y_train

(tensor([[  101, 11865,  6562,  ...,     0,     0,     0],
         [  101,  4121,  5481,  ...,     0,     0,     0],
         [  101,  7206,  3404,  ...,     0,     0,     0],
         ...,
         [  101, 23413,  2229,  ...,     0,     0,     0],
         [  101,  3153,  2189,  ...,     0,     0,     0],
         [  101,  3306,  3940,  ...,     0,     0,     0]]),
 array([[0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        ...,
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0]]))

In [12]:
tokenizer.vocab_size

30522

In [13]:
# dengan pad packed sequence, tp padding_idx 0
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class MyModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=500, hidden_dim=64, output_dim=5):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x: [batch_size, seq_len]
        x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]

        # compute lengths of non-padded tokens
        lengths = torch.sum(x.abs().sum(dim=2) != 0, dim=1)  # or use original input: x_input != 0
        # pack the sequence
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        # pass through LSTM
        packed_output, (h_n, c_n) = self.lstm(packed)
        # unpack if needed (not necessary if just taking last hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # take last hidden state for classification
        x = h_n[-1]  # [batch_size, hidden_dim]
        x = self.fc(x)  # [batch_size, output_dim]
        x = F.softmax(x, dim=1)
        return x

# Example usage
vocab_size = tokenizer.vocab_size
model = MyModel(vocab_size)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()  # same as categorical_crossentropy
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-7)

In [14]:
#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Move model to GPU
model = model.to(device)


import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Convert your data to PyTorch tensors
X_train_tensor = padded_train.clone().detach().long()
y_train_tensor = torch.tensor(np.argmax(y_train, axis=1), dtype=torch.long)  # if one-hot
X_val_tensor = padded_test.clone().detach().long()
y_val_tensor = torch.tensor(np.argmax(y_test, axis=1), dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

# Early stopping parameters
patience = 15
best_val_loss = float('inf')
counter = 0

num_epochs = 60

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X_batch.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            val_loss += loss.item() * X_batch.size(0)
            correct += (y_pred.argmax(1) == y_batch).sum().item()
    val_loss /= len(val_loader.dataset)
    val_acc = correct / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        best_model_state = model.state_dict()  # save best model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            model.load_state_dict(best_model_state)  # restore best model
            break

Using device: cuda
Epoch 1/60, Train Loss: 1.6044, Val Loss: 1.6050, Val Acc: 0.2114
Epoch 2/60, Train Loss: 1.5889, Val Loss: 1.5957, Val Acc: 0.2450
Epoch 3/60, Train Loss: 1.5617, Val Loss: 1.5879, Val Acc: 0.2819
Epoch 4/60, Train Loss: 1.5193, Val Loss: 1.5716, Val Acc: 0.3020
Epoch 5/60, Train Loss: 1.4602, Val Loss: 1.5502, Val Acc: 0.3456
Epoch 6/60, Train Loss: 1.3857, Val Loss: 1.5289, Val Acc: 0.3960
Epoch 7/60, Train Loss: 1.2929, Val Loss: 1.4957, Val Acc: 0.4329
Epoch 8/60, Train Loss: 1.1896, Val Loss: 1.4634, Val Acc: 0.4799
Epoch 9/60, Train Loss: 1.0926, Val Loss: 1.4240, Val Acc: 0.5235
Epoch 10/60, Train Loss: 1.0179, Val Loss: 1.3818, Val Acc: 0.5570
Epoch 11/60, Train Loss: 0.9697, Val Loss: 1.3506, Val Acc: 0.5772
Epoch 12/60, Train Loss: 0.9429, Val Loss: 1.3281, Val Acc: 0.5805
Epoch 13/60, Train Loss: 0.9302, Val Loss: 1.3107, Val Acc: 0.6007
Epoch 14/60, Train Loss: 0.9233, Val Loss: 1.2969, Val Acc: 0.6074
Epoch 15/60, Train Loss: 0.9191, Val Loss: 1.2865, V

In [15]:
sample

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics
...,...,...
730,1923,sport
731,373,tech
732,1704,business
733,206,entertainment


In [16]:
test['Text'] = test['Text'].apply(txtprocess)
test['Text'] = test['Text'].apply(lambda x: remove_stopwords(x))

testtext = test['Text'].values

paddedtesttext = tokenizer(testtext.tolist(), padding=True, truncation=True, return_tensors='pt')['input_ids']

In [17]:
torch.cuda.empty_cache()

In [18]:
test_dataset = TensorDataset(paddedtesttext)
test_loader = DataLoader(test_dataset, batch_size=256)

all_preds = []
model.eval()
with torch.no_grad():
    for (X_batch,) in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Map integer predictions to original category names
mapping = dict(enumerate(category.columns))
pred_labels = pd.Series(all_preds).map(mapping)

# Combine with ArticleId
answer = pd.concat([test['ArticleId'].reset_index(drop=True), pred_labels.reset_index(drop=True)], axis=1)
answer.columns = ['ArticleId', 'Category']

answer

Unnamed: 0,ArticleId,Category
0,1018,entertainment
1,1319,tech
2,1138,sport
3,459,business
4,1020,entertainment
...,...,...
730,1923,entertainment
731,373,entertainment
732,1704,politics
733,206,sport
