In [55]:
## Initialize
import json
import os
import re

import numpy as np
import requests
import spacy
from spacy.language import Language
from tqdm.notebook import trange, tqdm
  
# The NYT API key is read from the NYT_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook itself.
apiKey = os.environ.get("NYT_API_KEY", "")
if not apiKey:
  print("NYT_API_KEY is not set; requests to the NYT archive API will be rejected.")
# "#1" and "#2" are placeholders that are substituted with str.replace at
# request time (the fetch cell fills in "1963" and "10").
archiveUrl = f'https://api.nytimes.com/svc/archive/v1/#1/#2.json?api-key={apiKey}'

# Load the large English pipeline with the components this notebook does not
# use switched off (optional, but it helps when running over a large dataset).
# NOTE(review): "parser" is disabled while "merge_noun_chunks" is added below;
# noun chunks normally come from the dependency parse, so that component may
# have nothing to merge -- confirm this is intended.
nlp = spacy.load(
    "en_core_web_lg",
    disable=[
        "tok2vec",
        "parser",
        "senter",
        "lemmatizer",
        "tagger",
        "attribute_ruler",
    ],
)
# Retokenizing components, appended to the end of the pipeline in this order.
nlp.add_pipe("merge_noun_chunks")
nlp.add_pipe("merge_entities")

def parseDoc(doc) -> str:
  """Build the lower-cased text that is analysed for one article.

  The text is the headline (``print_headline`` if truthy, otherwise ``main``)
  followed by the longest of ``lead_paragraph``, ``abstract`` and ``snippet``
  surrounded by single spaces. Ties on length go to the earlier field in
  that order.

  Missing keys and ``None`` values are treated as empty strings so that a
  sparse article record yields a shorter (possibly empty) text instead of
  raising ``KeyError``/``TypeError``.

  Args:
    doc: mapping describing one article; expected to carry a ``headline``
      mapping plus the three text fields named above.

  Returns:
    The combined text, lower-cased. Empty string if nothing is available.
  """
  headline_info = doc.get("headline") or {}
  headline = headline_info.get("print_headline") or headline_info.get("main") or ""

  # Normalise None/missing to "" so len() is always defined.
  candidates = [doc.get(field) or "" for field in ("lead_paragraph", "abstract", "snippet")]
  # max() returns the first longest element, i.e. the same element a stable
  # descending sort by length would put first.
  longest = max(candidates, key=len)

  return str(
    headline
    + (f" {longest} " if longest else "")
  ).lower()

# Sample sentence for trying the pipeline out by hand
text = (
    "German Chancellor Angela Merkel died in 1936 in New York. "
    "She got shot by a mysterious terrorist, terror, terrorism"
)

In [56]:
## Fetch

# Fill the "#1"/"#2" placeholders once and reuse the result for logging and
# for the request itself.
requestUrl = archiveUrl.replace("#1", "1963").replace("#2", "10")
# Show only the part before the query string: the query carries the API key,
# and anything printed here is stored in the notebook's saved output.
print(requestUrl.split("?", 1)[0])
res = requests.get(requestUrl, timeout=60)
# Stop here with the HTTP status on a failed request instead of failing later
# while parsing or indexing an error payload.
res.raise_for_status()
obj = res.json()

https://api.nytimes.com/svc/archive/v1/1963/10.json?api-key=<redacted>


In [57]:
## Save fetch result for testing

#with open("articles.json", "w", encoding="utf-8") as json_file:
#    json_file.write(json.dumps(json.loads(res.text), indent="\t"))

In [None]:
## Search

def checkMatches(token, relevant_words, threshold=0.8):
    """Find the first known word that is similar enough to ``token``.

    Args:
        token: object exposing ``similarity(other)``; in this notebook a
            token produced by the ``nlp`` pipeline.
        relevant_words: mapping whose keys are the candidate words. Only the
            keys are used here.
        threshold: minimum similarity (inclusive) for a candidate to count
            as a match. Defaults to 0.8.

    Returns:
        ``(candidate, similarity)`` for the first candidate, in the mapping's
        iteration order, whose similarity reaches ``threshold``; ``False``
        if no candidate does.
    """
    for comp in relevant_words:
        # Each candidate is run through the full `nlp` pipeline on every
        # call, so the cost grows with the number of known words.
        comp_lg = nlp(comp)
        # NOTE(review): the recorded cell output shows a warning pointing at
        # this call -- presumably for inputs without word vectors; confirm.
        similarity = token.similarity(comp_lg)
        if similarity >= threshold:
            return (comp, similarity)
    return False

def countWordsInToken(token, wordCount, matches):
    """Register one token in the running tallies and return both tallies.

    Lookup order:
      1. text already recorded in ``matches``   -> bump its "amount";
      2. text already recorded in ``wordCount`` -> bump its "amount";
      3. otherwise, unless the token is a stop word or punctuation, ask
         ``checkMatches`` whether it resembles a word in ``wordCount``:
         a hit creates a ``matches`` entry (match, similarity, amount=1),
         a miss creates a new ``wordCount`` entry with amount=1.

    Both dictionaries are updated in place and also returned as
    ``[wordCount, matches]``.
    """
    word = token.text
    tallies = [wordCount, matches]

    if word in matches:
        matches[word]["amount"] += 1
        return tallies

    if word in wordCount:
        wordCount[word]["amount"] += 1
        return tallies

    # Stop words and punctuation never open a new entry.
    if token.is_stop or token.is_punct:
        return tallies

    hit = checkMatches(token, wordCount)
    if hit:
        matches[word] = {
            "match": hit[0],
            "similarity": hit[1],
            "amount": 1
        }
    else:
        wordCount[word] = {"amount": 1}
    return tallies

def transferToWordCount(matches, wordCount):
    """Fold every matched word into the ``wordCount`` entry it was matched to.

    For each ``matches`` item the target entry (``wordCount[item["match"]]``)
    gets the matched word listed under its "duplicates" mapping, with that
    word's amount and similarity, and the target's own "amount" is increased
    by the matched word's amount.

    ``wordCount`` is modified in place and returned.
    """
    for word, info in matches.items():
        target = wordCount[info["match"]]
        # Create the "duplicates" mapping on first use, then record the word.
        duplicates = target.setdefault("duplicates", {})
        duplicates[word] = {
            "amount": info["amount"],
            "similarity": info["similarity"]
        }
        target["amount"] += info["amount"]
    return wordCount

# Task 1: Extracting relevant words with the spaCy pipeline (`nlp`) set up above
articles = list(obj["response"]["docs"])
articleWordCount = {}
for article in tqdm(articles, desc='Article'):
    # Tally the words of this article's parsed text
    words = parseDoc(article)
    wordCount = {}
    matches: dict[str, dict] = {}
    doc_trf = nlp(words)
    for token in doc_trf:
        wordCount, matches = countWordsInToken(token, wordCount, matches)
    # Merge the similar-word matches back into the words they matched
    wordCount = transferToWordCount(matches, wordCount)
    # File the article summary under its publication date
    date = article["pub_date"]
    articleWordCount.setdefault(date, []).append({
        "href": article["web_url"],
        "topics": article["keywords"],
        "wordCount": wordCount
    })


Article:   0%|          | 0/11585 [00:00<?, ?it/s]

  similarity = token.similarity(comp_lg)


In [None]:
# Dump the tallies left over from the last processed article, separated by a rule.
print(wordCount, "\n--------------------\n", matches, sep="\n")

In [None]:
import spacy
# Groups of labels to look up with spacy.explain; each group has a display
# name and the list of label strings.
taggers = [
    {
        "name": "Taggers",
        "values": [
            "$", "''", ",", "-LRB-", "-RRB-", ".", ":",
            "ADD", "AFX", "CC", "CD", "DT", "EX", "FW", "HYPH", "IN",
            "JJ", "JJR", "JJS", "LS", "MD", "NFP",
            "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$",
            "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
            "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
            "WDT", "WP", "WP$", "WRB", "XX", "_SP", "``",
        ],
    }
]

# First pass: show each group's name followed by its raw label list.
for tagger in taggers:
    print(tagger["name"], tagger["values"], sep="\n")

# Second pass: under a header per group, print every label with its explanation.
for tagger in taggers:
    print(f"---------{tagger['name']}----------")
    for tag in tagger["values"]:
        print(f"{tag}: {spacy.explain(tag)}")

In [None]:
# Section identifiers paired with their display names, one entry per line.
sections = [
    {"section": "admin", "display_name": "Admin"},
    {"section": "arts", "display_name": "Arts"},
    {"section": "automobiles", "display_name": "Automobiles"},
    {"section": "books", "display_name": "Books"},
    {"section": "briefing", "display_name": "Briefing"},
    {"section": "business", "display_name": "Business"},
    {"section": "climate", "display_name": "Climate"},
    {"section": "corrections", "display_name": "Corrections"},
    {"section": "education", "display_name": "Education"},
    {"section": "en español", "display_name": "En español"},
    {"section": "fashion", "display_name": "Fashion"},
    {"section": "food", "display_name": "Food"},
    {"section": "gameplay", "display_name": "Gameplay"},
    {"section": "guide", "display_name": "Guide"},
    {"section": "health", "display_name": "Health"},
    {"section": "home & garden", "display_name": "Home & Garden"},
    {"section": "home page", "display_name": "Home Page"},
    {"section": "job market", "display_name": "Job Market"},
    {"section": "the learning network", "display_name": "The Learning Network"},
    {"section": "lens", "display_name": "Lens"},
    {"section": "magazine", "display_name": "Magazine"},
    {"section": "movies", "display_name": "Movies"},
    {"section": "multimedia/photos", "display_name": "Multimedia/Photos"},
    {"section": "new york", "display_name": "New York"},
    {"section": "obituaries", "display_name": "Obituaries"},
    {"section": "opinion", "display_name": "Opinion"},
    {"section": "parenting", "display_name": "Parenting"},
    {"section": "podcasts", "display_name": "Podcasts"},
    {"section": "reader center", "display_name": "Reader Center"},
    {"section": "real estate", "display_name": "Real Estate"},
    {"section": "smarter living", "display_name": "Smarter Living"},
    {"section": "science", "display_name": "Science"},
    {"section": "sports", "display_name": "Sports"},
    {"section": "style", "display_name": "Style"},
    {"section": "sunday review", "display_name": "Sunday Review"},
    {"section": "t brand", "display_name": "T Brand"},
    {"section": "t magazine", "display_name": "T Magazine"},
    {"section": "technology", "display_name": "Technology"},
    {"section": "theater", "display_name": "Theater"},
    {"section": "times insider", "display_name": "Times Insider"},
    {"section": "today’s paper", "display_name": "Today’s Paper"},
    {"section": "travel", "display_name": "Travel"},
    {"section": "u.s.", "display_name": "U.S."},
    {"section": "universal", "display_name": "Universal"},
    {"section": "the upshot", "display_name": "The Upshot"},
    {"section": "video", "display_name": "Video"},
    {"section": "the weekly", "display_name": "The Weekly"},
    {"section": "well", "display_name": "Well"},
    {"section": "world", "display_name": "World"},
    {"section": "your money", "display_name": "Your Money"},
]

# Cell result: just the section identifiers, in list order.
[entry["section"] for entry in sections]
