In [1]:
import pandas as pd
import contractions
import re
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alvinrach/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
d = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Test.csv')
sample = pd.read_csv('https://raw.githubusercontent.com/alvinrach/learn-ai-bbc/main/BBC%20News%20Sample%20Solution.csv')

In [3]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB


In [4]:
d = d.drop('ArticleId',axis=1)

In [5]:
d

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [6]:
def txtprocess(txt):
    txt = str(txt).lower()
    txt = contractions.fix(txt)

    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    txt = re.sub(' +', ' ', txt)

    txt = ' '.join(txt.split())

    return txt

d['Text'] = d['Text'].apply(txtprocess)

In [7]:
stop_words = set(nltk.corpus.stopwords.words('english'))

# kayak you'll gitu masih ada ' nya , apa bagusnya sebelum txtprocess, tapi kecil semua sih
def remove_stopwords(txt):
    no_stopword_txt = [w for w in txt.split() if not w in stop_words]
    return ' '.join(no_stopword_txt)

d['Text'] = d['Text'].apply(lambda x: remove_stopwords(x))

In [8]:
category = pd.get_dummies(d.Category, dtype=int)
d_new = pd.concat([d, category], axis=1)
d_new = d_new.drop('Category', axis=1)
d_new

Unnamed: 0,Text,business,entertainment,politics,sport,tech
0,worldcom ex boss launches defence lawyers defe...,1,0,0,0,0
1,german business confidence slides german busin...,1,0,0,0,0
2,bbc poll indicates economic gloom citizens maj...,1,0,0,0,0
3,lifestyle governs mobile choice faster better ...,0,0,0,0,1
4,enron bosses payout eighteen former enron dire...,1,0,0,0,0
...,...,...,...,...,...,...
1485,double eviction big brother model caprice holb...,0,1,0,0,0
1486,dj double act revamp chart show dj duo jk joel...,0,1,0,0,0
1487,weak dollar hits reuters revenues media group ...,1,0,0,0,0
1488,apple ipod family expands market apple expande...,0,0,0,0,1


In [9]:
article = d_new['Text'].values
label = d_new[category.columns].values

In [10]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize directly (no fit_on_texts needed - it's already pretrained!)
tokens = tokenizer(article.tolist(), padding=True, truncation=True, return_tensors='pt')

# Split the tokenized data
padded_train, padded_test, y_train, y_test = train_test_split(
    tokens['input_ids'], label, test_size=0.2, random_state=42
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
padded_train, y_train

(tensor([[  101, 11865,  6562,  ...,     0,     0,     0],
         [  101,  4121,  5481,  ...,     0,     0,     0],
         [  101,  7206,  3404,  ...,     0,     0,     0],
         ...,
         [  101, 23413,  2229,  ...,     0,     0,     0],
         [  101,  3153,  2189,  ...,     0,     0,     0],
         [  101,  3306,  3940,  ...,     0,     0,     0]]),
 array([[0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        ...,
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0]]))

In [12]:
tokenizer.vocab_size

30522

In [13]:
# katanya sih biar mimic keras kita harus mimic weight nya keras dan pake forget gate bias nya keras
# ini yang terbaik
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class MyModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=500, hidden_dim=64, output_dim=5):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x: [batch_size, seq_len]
        x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]

        # compute lengths of non-padded tokens
        lengths = torch.sum(x.abs().sum(dim=2) != 0, dim=1)  # or use original input: x_input != 0
        # pack the sequence
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        # pass through LSTM
        packed_output, (h_n, c_n) = self.lstm(packed)
        # unpack if needed (not necessary if just taking last hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # take last hidden state for classification
        x = h_n[-1]  # [batch_size, hidden_dim]
        x = self.fc(x)  # [batch_size, output_dim]
        x = F.softmax(x, dim=1)
        return x

# Example usage
vocab_size = tokenizer.vocab_size
model = MyModel(vocab_size)

# 4) Weight initialization to mimic Keras
def init_lstm_like_keras(m):
    if isinstance(m, nn.Embedding):
        # Keras embed init is usually uniform small; this is OK
        nn.init.normal_(m.weight, mean=0.0, std=0.01)
        if m.padding_idx is not None:
            with torch.no_grad():
                m.weight[m.padding_idx].fill_(0)
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)
    if isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param.data)   # kernel ~ glorot
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)       # recurrent ~ orthogonal
            elif 'bias' in name:
                nn.init.zeros_(param.data)

model.apply(init_lstm_like_keras)

# 5) Set forget-gate bias = 1 (handles bias_ih + bias_hh)
for names in model.lstm._all_weights:
    for name in names:
        if 'bias' in name:
            bias = getattr(model.lstm, name)
            n = bias.size(0)
            # gates are i, f, g, o => forget gate slice is n//4 to n//4*2
            start = n // 4
            end = start + n // 4
            with torch.no_grad():
                bias[start:end].fill_(1.0)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()  # same as categorical_crossentropy
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-7)

In [14]:
#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Move model to GPU
model = model.to(device)


import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Convert your data to PyTorch tensors
X_train_tensor = padded_train.clone().detach().long()
y_train_tensor = torch.tensor(np.argmax(y_train, axis=1), dtype=torch.long)  # if one-hot
X_val_tensor = padded_test.clone().detach().long()
y_val_tensor = torch.tensor(np.argmax(y_test, axis=1), dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

# Early stopping parameters
patience = 15
best_val_loss = float('inf')
counter = 0

num_epochs = 60

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X_batch.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            val_loss += loss.item() * X_batch.size(0)
            correct += (y_pred.argmax(1) == y_batch).sum().item()
    val_loss /= len(val_loader.dataset)
    val_acc = correct / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        best_model_state = model.state_dict()  # save best model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            model.load_state_dict(best_model_state)  # restore best model
            break

Using device: cuda
Epoch 1/60, Train Loss: 1.5462, Val Loss: 1.4764, Val Acc: 0.4799
Epoch 2/60, Train Loss: 1.4291, Val Loss: 1.3843, Val Acc: 0.5738
Epoch 3/60, Train Loss: 1.3124, Val Loss: 1.3364, Val Acc: 0.5503
Epoch 4/60, Train Loss: 1.2475, Val Loss: 1.2360, Val Acc: 0.6409
Epoch 5/60, Train Loss: 1.1760, Val Loss: 1.1996, Val Acc: 0.8255
Epoch 6/60, Train Loss: 1.0698, Val Loss: 1.1633, Val Acc: 0.8054
Epoch 7/60, Train Loss: 1.0025, Val Loss: 1.0639, Val Acc: 0.8658
Epoch 8/60, Train Loss: 0.9476, Val Loss: 1.0032, Val Acc: 0.9295
Epoch 9/60, Train Loss: 0.9247, Val Loss: 0.9998, Val Acc: 0.9161
Epoch 10/60, Train Loss: 0.9182, Val Loss: 0.9861, Val Acc: 0.9228
Epoch 11/60, Train Loss: 0.9129, Val Loss: 0.9831, Val Acc: 0.9262
Epoch 12/60, Train Loss: 0.9161, Val Loss: 0.9907, Val Acc: 0.9228
Epoch 13/60, Train Loss: 0.9164, Val Loss: 0.9870, Val Acc: 0.9262
Epoch 14/60, Train Loss: 0.9133, Val Loss: 0.9788, Val Acc: 0.9329
Epoch 15/60, Train Loss: 0.9102, Val Loss: 0.9760, V

In [15]:
model

MyModel(
  (embedding): Embedding(30523, 500, padding_idx=0)
  (lstm): LSTM(500, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=5, bias=True)
)

In [16]:
test['Text'] = test['Text'].apply(txtprocess)
test['Text'] = test['Text'].apply(lambda x: remove_stopwords(x))

testtext = test['Text'].values

paddedtesttext = tokenizer(testtext.tolist(), padding=True, truncation=True, return_tensors='pt')['input_ids']
paddedtesttext

tensor([[  101,  1053, 18098,  ...,     0,     0,     0],
        [  101,  4007,  3666,  ...,     0,     0,     0],
        [  101,  8115,  2100,  ...,     0,     0,     0],
        ...,
        [  101,  4368, 19244,  ...,     0,     0,     0],
        [  101, 22989, 12390,  ...,     0,     0,     0],
        [  101,  9068,  2724,  ...,     0,     0,     0]])

In [17]:
testtext[9]

'tsunami hit sri lanka banks sri lanka banks face hard times following december tsunami disaster officials warned sri lanka banks association said waves killed people also washed away huge amounts property securing loans according estimate much loans made private banks clients disaster zone written damaged state owned lenders may even worse hit said association estimates private banking sector bn rupees loans outstanding disaster zone one hand banks dealing death customers along damaged destroyed collateral extending cheap loans rebuilding recovery well giving clients time repay existing borrowing combination means revenue shortfall slba chairman commercial bank managing director al gooneratne told news conference banks given moratoriums collecting interest least quarter said public sector one ten state owned people bank customers south sri lanka affected bank spokesman told reuters estimated bank loss bn rupees'

# 1 First method to save model

In [18]:
# Method 1: Save entire model (less common in production)
torch.save(model, '1_complete_model.pth')

In [29]:
# To load, you MUST have:
# - Exact same Python environment
# - Same PyTorch version
# - Same model class definition imported
# - All dependencies available
# - Python Dependency hell
# - Deployment environment issues
# - Security concerns
# - Platform Limitation (cant in c++)

In [19]:
#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

loaded_model = torch.load("1_complete_model.pth", weights_only=False)
loaded_model.eval()

x_test = paddedtesttext[[9]]
with torch.no_grad():
    x_test = x_test.to(device)
    logits = loaded_model(x_test)
    pred = logits.argmax(dim=1).item()

pred

Using device: cuda


0

# 2 Second Method

This method needs to rebuild class architecture

In [20]:
# Method 2: Save state dict (recommended for training checkpoints)
torch.save(model.state_dict(), '2_model_weights.pth')

In [21]:
# it needs rebuilding architecture
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class MyModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=500, hidden_dim=64, output_dim=5):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x: [batch_size, seq_len]
        x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]

        # compute lengths of non-padded tokens
        lengths = torch.sum(x.abs().sum(dim=2) != 0, dim=1)  # or use original input: x_input != 0
        # pack the sequence
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        # pass through LSTM
        packed_output, (h_n, c_n) = self.lstm(packed)
        # unpack if needed (not necessary if just taking last hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # take last hidden state for classification
        x = h_n[-1]  # [batch_size, hidden_dim]
        x = self.fc(x)  # [batch_size, output_dim]
        x = F.softmax(x, dim=1)
        return x

# Example usage
vocab_size = tokenizer.vocab_size
loaded_model = MyModel(vocab_size)

# 4) Weight initialization to mimic Keras
def init_lstm_like_keras(m):
    if isinstance(m, nn.Embedding):
        # Keras embed init is usually uniform small; this is OK
        nn.init.normal_(m.weight, mean=0.0, std=0.01)
        if m.padding_idx is not None:
            with torch.no_grad():
                m.weight[m.padding_idx].fill_(0)
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)
    if isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param.data)   # kernel ~ glorot
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)       # recurrent ~ orthogonal
            elif 'bias' in name:
                nn.init.zeros_(param.data)

loaded_model.apply(init_lstm_like_keras)

# 5) Set forget-gate bias = 1 (handles bias_ih + bias_hh)
for names in loaded_model.lstm._all_weights:
    for name in names:
        if 'bias' in name:
            bias = getattr(loaded_model.lstm, name)
            n = bias.size(0)
            # gates are i, f, g, o => forget gate slice is n//4 to n//4*2
            start = n // 4
            end = start + n // 4
            with torch.no_grad():
                bias[start:end].fill_(1.0)

#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Move model to GPU
loaded_model = loaded_model.to(device)

loaded_model.load_state_dict(torch.load("2_model_weights.pth"))
loaded_model.eval()

x_test = paddedtesttext[[9]]
with torch.no_grad():
    x_test = x_test.to(device)
    logits = loaded_model(x_test)
    pred = logits.argmax(dim=1).item()

pred

Using device: cuda


0

# 3 Third Method

### 3a

In [31]:
# TorchScript creates a serialized representation that includes:
# - Model architecture
# - Trained weights
# - Computation graph
# - NO Python dependencies!
# - Self-contained
# - Cross-Platform Deployment (include C++)
# - Performance Optimized
# - Version Independence

In [32]:
# Example
# // Can even load in pure C++ applications!
# #include <torch/script.h>

# torch::jit::script::Module model = torch::jit::load("model.pt");

In [22]:
# Method 3: Save for production with TorchScript (RECOMMENDED FOR PRODUCTION) 
model.eval() # Set to evaluation mode

MyModel(
  (embedding): Embedding(30523, 500, padding_idx=0)
  (lstm): LSTM(500, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=5, bias=True)
)

In [23]:
# Tracing works by running the model once with a sample input and recording all tensor operations that happen.
# The traced graph is then saved as a TorchScript model.
# That means you must provide a real example input tensor (with the same shape/type as your real data).
# Without an input, TorchScript has no idea what the computation graph looks like.

x_test = paddedtesttext[[9]].to(device)

# Option 3a: Tracing (most common)
traced_model = torch.jit.trace(model, x_test)
traced_model.save('3a_model_traced.pt')

In [24]:
#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

loaded_model = torch.jit.load("3a_model_traced.pt")
loaded_model.eval()

x_test = paddedtesttext[[9]]
with torch.no_grad():
    x_test = x_test.to(device)
    logits = loaded_model(x_test)
    pred = logits.argmax(dim=1).item()

pred

Using device: cuda


0

### 3b

In [25]:
# Option 3b: Scripting (better for models with control flow)
scripted_model = torch.jit.script(model)
scripted_model.save('3b_model_scripted.pt')

In [26]:
#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

loaded_model = torch.jit.load("3b_model_scripted.pt")
loaded_model.eval()

x_test = paddedtesttext[[9]]
with torch.no_grad():
    x_test = x_test.to(device)
    logits = loaded_model(x_test)
    pred = logits.argmax(dim=1).item()

pred

Using device: cuda


0

# Summary

| Method | Use Case | Production Ready |
|--------|----------|------------------|
| Complete Model (1)| Research/Development | ❌ Python dependencies |
| State Dict (2)| Training Checkpoints | ❌ Needs class definition |
| TorchScript (3)| Production Serving | ✅ Self-contained |

### 3a or 3b

Use Tracing (3a) when:

- Standard feedforward networks (like our BBC classifier)
- CNNs with fixed architecture
- Transformers with fixed sequence length
- Models without dynamic branching
- You want maximum performance optimization

Use Scripting (3b) when:

- Models with if/else statements based on input
- Dynamic RNNs with variable sequence lengths
- Models with loops that depend on data
- Control flow that changes based on input values

In [34]:
# # 3a
# class BBCNewsClassifier(nn.Module):
#     def forward(self, x):
#         x = self.fc1(x)        # Always executed
#         x = self.relu(x)       # Always executed  
#         x = self.dropout(x)    # Always executed
#         x = self.fc2(x)        # Always executed
#         return x               # Always executed

# # Perfect for tracing - same path every time
# traced = torch.jit.trace(model, example_input)  # ✅ Use this

In [35]:
# # 3b
# class DynamicModel(nn.Module):
#     def forward(self, x):
#         if x.size(1) > 100:           # Dynamic branching!
#             x = self.large_path(x)
#         else:
#             x = self.small_path(x)
        
#         for i in range(x.size(0)):    # Dynamic loops!
#             x[i] = self.process_item(x[i])
        
#         return x

# # Tracing would only capture ONE path
# # Scripting captures ALL possible paths
# scripted = torch.jit.script(model)  # ✅ Use this for dynamic models

In [27]:
# Memory is almost the same for each model but ofc those are different

In [28]:
!ls -l

total 240828
-rw-rw-r-- 1 alvinrach alvinrach 61631287 Aug 31 17:08 1_complete_model.pth
-rw-rw-r-- 1 alvinrach alvinrach    25368 Aug 31 17:07 1_Deployment_4.ipynb
-rw-rw-r-- 1 alvinrach alvinrach 61629677 Aug 31 17:08 2_model_weights.pth
-rw-rw-r-- 1 alvinrach alvinrach 61638369 Aug 31 17:08 3a_model_traced.pt
-rw-rw-r-- 1 alvinrach alvinrach 61655091 Aug 31 17:08 3b_model_scripted.pt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
