In [1]:
import json
import torch
from collections import Counter
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path

In [4]:
# Load the JSON data
with open('../cleaned_data/cleaned_articles.json', 'r', encoding='utf-8') as file:
    articles = json.load(file)

# Flatten the data into a list of texts
texts = [article['content'] for article in articles] + [article['title'] for article in articles]

# Tokenize and count word frequencies
word_counter = Counter()
for text in texts:
    word_counter.update(text.split())

# Sort words by frequency
most_common_words = word_counter.most_common()

# Determine vocab size
desired_vocab_size = len(most_common_words)
print(f"Optimal Vocabulary Size: {desired_vocab_size}")


Optimal Vocabulary Size: 19739


In [11]:
from tokenizers import BertWordPieceTokenizer

# Directory paths
data_files = ["../cleaned_data/cleaned_articles.json"]
vocab_size = 19739  # Based on your earlier calculation
special_tokens = ["[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"]

# Load the data
with open(data_files[0], 'r', encoding='utf-8') as file:
    articles = json.load(file)

# Extract text content
texts = [article['content'] for article in articles]

# Save texts to a file for tokenizer training
with open("texts.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text + "\n")

# Initialize a BPE tokenizer
tokenizer = BertWordPieceTokenizer(lowercase=False)

# Train the tokenizer
tokenizer.train(files=["texts.txt"], vocab_size=vocab_size, special_tokens=special_tokens)

# Save the tokenizer
tokenizer.save_model("../cleaned_data/")
print("BPE Tokenizer trained and saved.")

# Load the tokenizer
tokenizer = BertWordPieceTokenizer(
    "../cleaned_data/vocab.txt",
    "../cleaned_data/tokenizer.json"
)

# Tokenize the texts
tokenized_texts = [tokenizer.encode(text).tokens for text in texts]

# Save tokenized samples and full tokenized data
tokenized_samples = {}
for i, text in enumerate(texts[:3]):
    tokenized_samples[f"Text {i + 1}"] = {
        "original_text": text,
        "tokens": tokenized_texts[i]
    }

tokenized_samples_path = "../cleaned_data/tokenized_samples.json"
with open(tokenized_samples_path, 'w', encoding='utf-8') as file:
    json.dump(tokenized_samples, file, ensure_ascii=False, indent=4)
print(f"Tokenized samples saved to: {tokenized_samples_path}")

tokenized_data_path_pt = "../cleaned_data/tokenized_articles.pt"
torch.save(tokenized_texts, tokenized_data_path_pt)
print(f"Full tokenized data saved to: {tokenized_data_path_pt}")





BPE Tokenizer trained and saved.
Tokenized samples saved to: ../cleaned_data/tokenized_samples.json
Full tokenized data saved to: ../cleaned_data/tokenized_articles.pt


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, tokenized_texts, max_length=512):
        self.tokenized_texts = tokenized_texts
        self.max_length = max_length
    
    def __len__(self):
        return len(self.tokenized_texts)
    
    def __getitem__(self, idx):
        tokens = self.tokenized_texts[idx]
        input_ids = torch.tensor(tokens, dtype=torch.long)
        attention_mask = (input_ids != 0).long()  # Assuming 0 is the padding token
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

# Load tokenized texts
tokenized_texts = torch.load('../cleaned_data/tokenized_articles.pt')

# Create dataset
dataset = TextDataset(tokenized_texts)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


  tokenized_texts = torch.load('../cleaned_data/tokenized_articles.pt')


In [None]:
from transformers import BertForSequenceClassification, AdamW

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Setup optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
def train(model, dataloader, optimizer, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=None)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}/{epochs} completed. Loss: {loss.item()}')

# Train the model
train(model, dataloader, optimizer, device)
