In [2]:
import json

# Load the dataset: the file holds one JSON object (one article) per line.
articles = []

# `with` closes the handle even if a line fails to parse, and the explicit
# UTF-8 encoding keeps decoding independent of the platform default.
with open("dataset.json", "r", encoding="utf-8") as f:
    # Iterate the handle directly so the file is streamed line by line
    # instead of being read into memory in one go.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        x = json.loads(line)

        # Replace the "YYYY-MM-DD"-style date string with integer components.
        date = x['date'].split("-")
        x['date'] = {
            'year': int(date[0]),
            'month': int(date[1]),
            'day': int(date[2])
        }

        articles.append(x)

print(articles[0])

{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9', 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'category': 'U.S. NEWS', 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'authors': 'Carla K. Johnson, AP', 'date': {'year': 2022, 'month': 9, 'day': 23}}


In [3]:
from typing import List, Dict

import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages needed below, without progress chatter.
# Both "punkt_tab" and "punkt" are requested -- presumably so tokenization
# works across NLTK versions; TODO confirm which one the installed version uses.
for _resource in ("punkt_tab", "punkt", "stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)

# Shared, module-level helpers reused for every article.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words("english"))  # set for fast membership tests

def preprocess_texts(articles: List[Dict]) -> None:
    """Tokenize and lemmatize each article's headline and short description.

    The articles are modified in place: every dict gains (or has overwritten)
    the keys ``'processed_short_description'`` and ``'processed_headline'``,
    each holding a list of lemmatized tokens.

    Args:
        articles: Article dicts, each providing text under the
            ``'short_description'`` and ``'headline'`` keys.

    Returns:
        None. Results are stored on the article dicts themselves.

    Raises:
        KeyError: If an article lacks ``'short_description'`` or ``'headline'``.
    """

    def _process(text: str) -> List[str]:
        """Return the lemmatized tokens of ``text`` that survive filtering.

        The text is lowercased and tokenized; a token is kept only if it is
        purely alphabetic, is not in ``STOPWORDS`` and is longer than two
        characters. Filtering is applied to the raw token, before lemmatizing.
        """
        return [
            LEMMATIZER.lemmatize(token)
            for token in word_tokenize(text.lower())
            if token.isalpha() and token not in STOPWORDS and len(token) > 2
        ]

    for article in articles:
        article['processed_short_description'] = _process(article['short_description'])
        article['processed_headline'] = _process(article['headline'])

# Mutates `articles` in place: each dict gains 'processed_short_description'
# and 'processed_headline' token lists. Nothing is returned.
preprocess_texts(articles)

In [4]:
# Persist the enriched article list as one pretty-printed JSON array;
# non-ASCII characters are written as \uXXXX escapes.
with open("processed_articles.json", "w", encoding="utf-8") as json_out:
    json.dump(
        articles,
        json_out,
        indent=4,
        ensure_ascii=True,
    )

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer

import os

EMBEDDINGS_PATH = "bert_embeddings.npy"

# One document per article: the processed headline tokens followed by the
# processed short-description tokens, joined with single spaces.
texts = [
    " ".join(article["processed_headline"] + article["processed_short_description"])
    for article in articles
]

model = SentenceTransformer("distilbert-base-nli-mean-tokens")

# Encoding the full corpus is expensive (about 25 minutes in the recorded
# run), so reuse previously saved embeddings when they are present and have
# one row per text. Delete the file to force a recompute after changing the
# preprocessing or the model -- the row-count check cannot detect that.
embeddings = None
if os.path.exists(EMBEDDINGS_PATH):
    cached_embeddings = np.load(EMBEDDINGS_PATH)
    if cached_embeddings.shape[0] == len(texts):
        embeddings = cached_embeddings

if embeddings is None:
    embeddings = model.encode(texts, show_progress_bar=True)
    np.save(EMBEDDINGS_PATH, embeddings)

Batches: 100%|██████████| 6548/6548 [25:29<00:00,  4.28it/s]
