In [1]:
import json
import torch
from collections import Counter
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path

In [4]:
# Load the JSON data
with open('../cleaned_data/cleaned_articles.json', 'r', encoding='utf-8') as file:
    articles = json.load(file)

# Flatten the data into a list of texts
texts = [article['content'] for article in articles] + [article['title'] for article in articles]

# Tokenize and count word frequencies
word_counter = Counter()
for text in texts:
    word_counter.update(text.split())

# Sort words by frequency
most_common_words = word_counter.most_common()

# Determine vocab size
desired_vocab_size = len(most_common_words)
print(f"Optimal Vocabulary Size: {desired_vocab_size}")


Optimal Vocabulary Size: 19739


In [58]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import torch
import json

from tokenizers import BertWordPieceTokenizer
import torch
import json

# Directory paths
data_files = ["../cleaned_data/cleaned_articles.json"]
vocab_size = 19739  # Based on your earlier calculation
special_tokens = ["[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"]

# Load the data
with open(data_files[0], 'r', encoding='utf-8') as file:
    articles = json.load(file)

# Extract text content
texts = [article['content'] for article in articles]
with open("texts.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text + "\n")

tokenizer = BertWordPieceTokenizer(lowercase=False)
tokenizer.train(files=["texts.txt"], vocab_size=vocab_size, special_tokens=special_tokens)
tokenizer.save("../cleaned_data/tokenizer.json")
print("BPE Tokenizer trained and saved.")

# Tokenize the texts
tokenized_texts = [tokenizer.encode(text).ids for text in texts]

# Save token IDs
tokenized_data_path_pt = "../cleaned_data/tokenized_articles.pt"
torch.save(tokenized_texts, tokenized_data_path_pt)
print(f"Full tokenized data saved to: {tokenized_data_path_pt}")





BPE Tokenizer trained and saved.
Full tokenized data saved to: ../cleaned_data/tokenized_articles.pt


In [55]:
import os

# Check contents before saving
print(os.listdir('../cleaned_data/'))

# Save the tokenizer again
tokenizer.save_model("../cleaned_data/")

# Check contents after saving
print(os.listdir('../cleaned_data/'))


['tokenized_articles.pt', 'cleaned_data.csv', 'merges.txt', 'vocab.txt', 'cleaned_articles.json']
['tokenized_articles.pt', 'cleaned_data.csv', 'merges.txt', 'vocab.txt', 'cleaned_articles.json']


In [49]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import json

class TextDataset(Dataset):
    def __init__(self, token_ids, max_length=512):
        self.token_ids = token_ids
        self.max_length = max_length
        self.labels = [0] * len(token_ids)  # Use dummy labels (0)
    
    def __len__(self):
        return len(self.token_ids)
    
    def __getitem__(self, idx):
        tokens = self.token_ids[idx]
        input_ids = torch.tensor(tokens, dtype=torch.long)
        # Pad/truncate the input_ids to max_length
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
        else:
            padding_length = self.max_length - len(input_ids)
            input_ids = torch.cat([input_ids, torch.zeros(padding_length, dtype=torch.long)])
        
        attention_mask = (input_ids != 0).long()  # Create attention mask
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }

# Load token IDs
token_ids = torch.load('../cleaned_data/tokenized_articles.pt')
dataset = TextDataset(token_ids)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


  token_ids = torch.load('../cleaned_data/tokenized_articles.pt')


In [14]:
from transformers import BertForSequenceClassification, AdamW

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [52]:
# Setup optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

def train(model, dataloader, optimizer, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            print(outputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            break
        print(f'Epoch {epoch + 1}/{epochs} completed. Loss: {loss.item()}')

# Train the model
train(model, dataloader, optimizer, device)





SequenceClassifierOutput(loss=tensor(0.0036, grad_fn=<NllLossBackward0>), logits=tensor([[ 2.9056, -2.9931],
        [ 2.6866, -2.7539],
        [ 2.9378, -2.8211],
        [ 2.7765, -2.9682],
        [ 2.6876, -2.7863],
        [ 2.9094, -2.8773],
        [ 2.3587, -2.7742],
        [ 2.9675, -3.1159]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Epoch 1/3 completed. Loss: 0.003602477489039302
SequenceClassifierOutput(loss=tensor(0.0032, grad_fn=<NllLossBackward0>), logits=tensor([[ 3.0805, -3.1363],
        [ 2.8175, -3.0067],
        [ 2.9313, -3.2063],
        [ 2.9199, -2.9347],
        [ 2.7873, -3.0725],
        [ 2.5048, -2.7379],
        [ 2.7103, -3.0190],
        [ 2.7033, -2.7771]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Epoch 2/3 completed. Loss: 0.0031860042363405228
SequenceClassifierOutput(loss=tensor(0.0021, grad_fn=<NllLossBackward0>), logits=tensor([[ 3.0636, -3.3252],
        [ 3.2086, -3.1514],
        [ 3.1547, -2.9644]

In [22]:
import torch

# Load tokenized texts
tokenized_texts = torch.load('../cleaned_data/tokenized_articles.pt')
print(type(tokenized_texts))
print(tokenized_texts[:2])  # Print first two entries to inspect


<class 'list'>
[['[CLS]', '𞤤𞤫𞤧𞤯𞤫', '𞤸𞤭𞥅𞤪𞤲𞤢', '##𞤲', '##𞤳𞤮𞥅', '##𞤶𞤫', '(', '𞤮', '##𞤳𞤧', '##𞤭𞤣', '##𞤢𞤲', ')', '𞥑𞥕', '𞤫', '𞤤𞤫𞤴', '𞤳𞤮𞤲𞤺𞤮𞤤', '𞤢𞤥𞤫𞤪𞤭𞤳', '𞤳𞤢𞤻𞤵𞤲', '𞤫', '𞤮', '##𞤧𞤼𞤢', '##𞤪𞤢𞤤', '##𞤭𞤴𞤢', '𞤽𞤭𞤻𞤭𞥅', '𞤷𞤢', '##𞤴𞤲', '##𞤢', '𞤧𞤢𞤦𞤵', '𞤱𞤢𞥄𞤧𞤵𞤺𞤮𞤤', '𞤥𞤢𞤩𞥆𞤫', '𞤼𞤮𞤼𞥆𞤵𞤺𞤮𞤤', '𞤸𞤢𞤹𞥆𞤫𞥅𞤶𞤭', '𞤱𞤭', '##𞤺𞤵', '##𞥅', '##𞤪', '##𞤩𞤫', '𞤩𞤫𞤲', '(', '𞤶𞤵𞥅𞤤𞤩𞤫', '𞤶𞤫𞤴𞥆𞤢𞤩𞤫', '𞤸𞤭𞥅𞤪𞤲𞤢𞥄𞤲𞤺𞤫', '-', '𞤲𞤢𞤲𞤮', '𞤷𞤢', '##𞤴𞤲', '##𞤢', '𞤫', '𞤤𞤢𞤪𞤢𞤤', '𞤭𞤲𞥆𞤫𞤼𞤫𞥅𞤲𞤺𞤢𞤤', '𞥃', '##𞤭𞤲', '##𞥁', '##𞤭𞤴𞤢𞤲', ')', '.', '𞤷𞤢', '##𞤴', '##𞤦𞤢', '𞤶𞤢𞥄𞤦𞤭𞥅', '𞤩𞤫', '⹁', '𞤸𞤮𞤤', '𞤳𞤮', '𞤱𞤭𞥅𞤴𞤵𞤯𞤮𞤲', '𞤫', '𞤳𞤮', '𞤱𞤮𞤲𞤭', '𞤫', '𞤬𞤫𞤰𞥆𞤵𞤣𞤫', '𞤦𞤢𞤽𞥆𞤫', '𞤺𞤢𞥄', '##𞥁', '##𞥆𞤢', '؟', '𞤲𞤭𞤤𞥆𞤢𞥄𞤶𞤮', '𞤷𞤢', '##𞤴𞤲', '##𞤢', '𞤳𞤢', '𞤲𞤶𞤵𞤩𞥆𞤵𞤣𞤭', '𞤺𞤫𞤲', '##𞤯𞤫', '𞤬𞤮𞤼', '##𞥆𞤵𞤯𞤫', '(', 'on', '##u', '–', 'un', ')', '𞤸𞤮𞤤𞥆𞤭', '𞤱𞤮𞤲𞤣𞤫𞤥𞤢', ':', '«', '𞤥𞤵𞤪𞤢𞥄𞤣𞤵', '𞤬𞤭𞥅', '𞤸𞤢𞤹𞥆𞤫𞥅𞤶𞤭', '𞤲𞤫𞤯𞥆𞤢𞤲𞤳𞤫', '𞤲𞤣𞤵', '𞤺𞤮𞥅𞤥𞤵', '𞤺𞤫𞤲', '##𞤯𞤫', '𞤬𞤮𞤼', '##𞥆𞤵𞤯𞤫', '𞤯𞤫𞤲', '𞤬𞤵𞤼𞤭', '𞤸𞤭𞥅𞤤𞤲𞤢𞥄𞤣𞤫', '𞤸𞤭𞤳𞥆𞤢', '⹁', '𞤧𞤭𞤳𞥆𞤫', '𞤢𞤤𞤢𞥄', '𞤳𞤮', '𞤺𞤢𞥄', '##𞥁', '##𞥆𞤢', '.', '»', '𞤮', '𞤩𞤫𞤴𞤣𞤭', '𞤳𞤢𞤣𞤭', ':', '«', '𞤯𞤮𞥅', '𞤴𞤮', '𞤮', '##𞤧𞤼𞤢', '##𞤪𞤢𞤤',

  tokenized_texts = torch.load('../cleaned_data/tokenized_articles.pt')
