In [10]:
import json

DATA_DIR = "../data"
DATASET_NAME = "News_Category_Dataset_v3"


def load_articles(path: str) -> list[dict]:
    """Read a JSON Lines file and return one dict per non-blank line.

    Each record's "date" string is split on "-" and replaced, in place, by a
    dict with integer "year", "month" and "day" entries (taken from the first
    three components, in that order).

    Args:
        path: Location of the JSON Lines file to read.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "date" entry.
        IndexError, ValueError: If a "date" value does not have three
            integer components separated by "-".
    """
    records = []
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding.
    with open(path, encoding="utf-8") as file:
        # Iterate the handle lazily rather than materialising every line first.
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing in json.loads.
                continue
            record = json.loads(line)
            parts = record["date"].split("-")
            record["date"] = {
                "year": int(parts[0]),
                "month": int(parts[1]),
                "day": int(parts[2]),
            }
            records.append(record)
    return records


articles = load_articles(f"{DATA_DIR}/{DATASET_NAME}.json")

print(articles[0])

{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9', 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'category': 'U.S. NEWS', 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'authors': 'Carla K. Johnson, AP', 'date': {'year': 2022, 'month': 9, 'day': 23}}


In [None]:
import nltk

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure every NLTK resource used by the preprocessing step is available
# locally; quiet=True suppresses the downloader's console output.
for _resource in ("punkt_tab", "punkt", "stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)

# Shared, module-level helpers reused for every article.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words("english"))


def preprocess_texts(articles: list[dict]) -> None:
    """Tokenise and lemmatise each article's headline and short description.

    The articles are modified in place: every dict gains a
    "processed_short_description" and a "processed_headline" entry, each a
    list of lemmatised tokens. Nothing is returned.

    Args:
        articles: Article dicts; each must contain string values under
            "short_description" and "headline".

    Raises:
        KeyError: If an article lacks "short_description" or "headline".
    """

    def _process(text: str) -> list[str]:
        """Lower-case and tokenise ``text``, then lemmatise the kept tokens.

        A token is kept only if it is purely alphabetic, is not an English
        stopword, and is longer than two characters. The stopword check is
        applied to the raw token, before lemmatisation.
        """
        return [
            LEMMATIZER.lemmatize(token)
            for token in word_tokenize(text.lower())
            if token.isalpha() and token not in STOPWORDS and len(token) > 2
        ]

    for article in tqdm(articles):
        article["processed_short_description"] = _process(article["short_description"])
        article["processed_headline"] = _process(article["headline"])


# Run preprocessing over every loaded article; this adds the
# "processed_short_description" and "processed_headline" keys to each dict in place.
preprocess_texts(articles)

100%|██████████| 209527/209527 [00:33<00:00, 6171.05it/s]


In [12]:
# Write the article list to a sibling "<dataset>_processed.json" file as one
# JSON document, indented by 4 spaces with non-ASCII characters escaped.
with open(
    f"{DATA_DIR}/{DATASET_NAME}_processed.json", mode="w", encoding="utf-8"
) as out_f:
    json.dump(articles, fp=out_f, indent=4, ensure_ascii=True)

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer

# One space-joined string per article: headline tokens first, then the
# short-description tokens.
texts = [
    " ".join([*entry["processed_headline"], *entry["processed_short_description"]])
    for entry in articles
]

# Encode every string with the pretrained sentence-transformers checkpoint.
model = SentenceTransformer("distilbert-base-nli-mean-tokens")
embeddings = model.encode(texts, show_progress_bar=True)

# Save the encoder output next to the dataset so it can be reloaded with np.load.
np.save(f"{DATA_DIR}/{DATASET_NAME}_bert_embeddings.npy", embeddings)

Batches:   0%|          | 0/6548 [00:00<?, ?it/s]