# Week 3: Content analysis with LLMs

In [22]:
import os 
import json
import ollama

### First, let's load the CNS and Diamondback data we scraped

In [17]:
# Load the scraped CNS stories into a Python object.
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding (JSON text is expected to be UTF-8).
with open('data/cns_stories.json', 'r', encoding='utf-8') as filepath:
    cns_stories = json.load(filepath)

In [19]:
# Load the scraped Diamondback stories into a Python object.
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding (JSON text is expected to be UTF-8).
with open('data/dbk_stories.json', 'r', encoding='utf-8') as dbk_filepath:
    dbk_stories = json.load(dbk_filepath)

### How many stories do we have in each dataset?

In [21]:
# Report the number of records loaded for each outlet.
cns_total = len(cns_stories)
dbk_total = len(dbk_stories)
print(f"Our CNS dataset contains {cns_total} stories.")
print(f"Our Diamondback dataset contains {dbk_total} stories.")

Our CNS dataset contains 1000 stories.
Our Diamondback dataset contains 1000 stories.


In [None]:
# Inspect the CNS data: the container type, the type of one record,
# and the field names available on that record.
first_cns_story = cns_stories[0]
print(type(cns_stories), type(first_cns_story))
print("CNS keys:", list(first_cns_story.keys()))


<class 'list'> <class 'dict'>
CNS keys: ['id', 'date', 'date_gmt', 'guid', 'modified', 'modified_gmt', 'slug', 'status', 'type', 'link', 'title', 'content', 'excerpt', 'author', 'featured_media', 'comment_status', 'ping_status', 'sticky', 'template', 'format', 'meta', 'categories', 'tags', 'coauthors', 'class_list', 'parsely', 'jetpack_publicize_connections', 'jetpack_featured_media_url', 'jetpack_shortlink', 'jetpack_sharing_enabled', '_links']


In [None]:
# List the field names on the first Diamondback record for comparison with CNS.
first_dbk_story = dbk_stories[0]
print("DBK keys:", list(first_dbk_story.keys()))


DBK keys: ['id', 'date', 'date_gmt', 'guid', 'modified', 'modified_gmt', 'slug', 'status', 'type', 'link', 'title', 'content', 'excerpt', 'author', 'featured_media', 'comment_status', 'ping_status', 'sticky', 'template', 'format', 'meta', 'categories', 'tags', 'class_list', 'acf', 'apple_news_notices', '_links']


### Let's look at a snippet of each to see how they're structured

In [31]:
# Pretty-print the first CNS record to see how its fields are nested.
cns_preview = json.dumps(cns_stories[0], indent=2)
print(cns_preview)


{
  "id": 126059,
  "date": "2026-02-17T20:31:13",
  "date_gmt": "2026-02-18T01:31:13",
  "guid": {
    "rendered": "https://cnsmaryland.org/?p=126059"
  },
  "modified": "2026-02-17T20:31:13",
  "modified_gmt": "2026-02-18T01:31:13",
  "slug": "the-pressure-to-perform-changes-and-challenges-in-youth-sports",
  "status": "publish",
  "type": "post",
  "link": "https://cnsmaryland.org/2026/02/17/the-pressure-to-perform-changes-and-challenges-in-youth-sports/",
  "title": {
    "rendered": "The pressure to perform: changes and challenges in youth sports"
  },
  "content": {
    "rendered": "<p><span style=\"font-weight: 400;\">COLLEGE PARK, Md. \u2013 Youth sports is changing fast.\u00a0 What was once primarily recreational has become more commercial and competitive.\u00a0 Financial investments are\u00a0 pressuring families and players to stand out earlier as collegiate and even potential professional opportunities lure young athletes.\u00a0 Coaches, athletes, parents, and sports psychol

In [32]:
# Pretty-print the first Diamondback record to see how its fields are nested.
dbk_preview = json.dumps(dbk_stories[0], indent=2)
print(dbk_preview)

{
  "id": 478595,
  "date": "2026-02-17T22:58:26",
  "date_gmt": "2026-02-18T03:58:26",
  "guid": {
    "rendered": "https://dbknews.com/?p=478595"
  },
  "modified": "2026-02-17T22:58:26",
  "modified_gmt": "2026-02-18T03:58:26",
  "slug": "sga-accountability-umbc-alum-disappeared-philippines",
  "status": "publish",
  "type": "post",
  "link": "https://dbknews.com/2026/02/17/sga-accountability-umbc-alum-disappeared-philippines/",
  "title": {
    "rendered": "UMD SGA calls for accountability after UMBC alum disappeared in Philippines"
  },
  "content": {
    "rendered": "<p>Sacha Llanto said one of the longest weeks of his life was when his friend, Chantal Anicoche, went missing in the Philippines after a military attack.</p>\n<p>Anicoche, a University of Maryland, Baltimore County alum, was conducting humanitarian work when the Armed Forces of the Philippines conducted bombings in Abra de Ilog, Occidental Mindoro on Jan. 1, according to a petition circulated by the Action Network. A

## Now, let's use Ollama to query the stories.
Let's work through one example together.

### Let's see if the LLM can identify the people who are quoted in a Diamondback story 

In [38]:
# Look up one Diamondback story by its URL and ask the model which people
# are directly quoted in it.
MODEL = "llama3"  # swap in the name of any model installed in Ollama
target_url = "https://dbknews.com/2026/02/13/umd-discovery-house-graduate-affordability/"

# Linear scan for the first record whose "link" matches; stays None if absent.
target_story = None
for candidate in dbk_stories:
    if candidate.get("link") == target_url:
        target_story = candidate
        break

if target_story:
    # Both fields are nested objects whose text lives under "rendered".
    title_field = target_story.get("title", {})
    content_field = target_story.get("content", {})
    title = title_field.get("rendered", "")
    content = content_field.get("rendered", "")

    # Fixed instructions first, then the story itself.
    instructions = (
        "From the story text, list EVERY person who is directly quoted. "
        "Return ONLY a python list of full names (strings). If none, return an empty array.\n\n"
    )
    prompt = instructions + f"Title: {title}\n\nStory:\n{content}"

    try:
        response = ollama.generate(model=MODEL, prompt=prompt)
        print(response["response"])
    except Exception as exc:
        # Report the failure instead of stopping the notebook.
        print("Ollama error:", exc)
else:
    print("Story not found in dbk_stories. Make sure dbk_stories.json includes this URL.")

Here is the list of directly quoted people:

`['Keegan Clements-Housser', 'Lucas Rengifo-Keller', 'Dennis Passarella-George', 'Stephen Roth', 'Jason Farman']`

Note that these are full names, as requested.


## What if I want to know the number of times someone is quoted OR paraphrased in that same story?

In [40]:
# For the same story, ask the model to count direct quotes and paraphrased
# attributions per person, with supporting snippets.
MODEL = "llama3"  # swap in another installed model if needed
target_url = "https://dbknews.com/2026/02/13/umd-discovery-house-graduate-affordability/"

# Linear scan for the first record whose "link" matches; stays None if absent.
target_story = None
for candidate in dbk_stories:
    if candidate.get("link") == target_url:
        target_story = candidate
        break

if target_story:
    # Both fields are nested objects whose text lives under "rendered".
    title_field = target_story.get("title", {})
    content_field = target_story.get("content", {})
    title = title_field.get("rendered", "")
    content = content_field.get("rendered", "")

    # The instructions are the part you control: edit them to change what
    # the model is asked to return.
    instructions = (
        "You are a meticulous editor. For the story below, identify EVERY person who is either (a) directly quoted, "
        "or (b) attributed via paraphrased information (e.g., 'X said', 'according to X', 'X stated').\n\n"
        "Return ONLY a JSON array. Each item must be an object with: "
        "name (full name), direct_quote_count (integer), paraphrase_count (integer), evidence (array of short snippets).\n\n"
        "If none, return an empty array.\n\n"
    )
    prompt = instructions + f"Title: {title}\n\nStory:\n{content}"

    try:
        response = ollama.generate(model=MODEL, prompt=prompt)
        print(response["response"])
    except Exception as exc:
        # Report the failure instead of stopping the notebook.
        print("Ollama error:", exc)
else:
    print("Story not found in dbk_stories. Make sure dbk_stories.json includes this URL.")

Here is the JSON array of people mentioned in the article:

```
[
  {
    "name": "Stephen Roth",
    "direct_quote_count": 1,
    "paraphrase_count": 2,
    "evidence": [
      "\"From focus groups, even before the thing was designed, helping to understand the market and the need within the community, and then throughout the process, they've been involved in the design efforts,\" ",
      "Roth said."
    ]
  },
  {
    "name": "Keegan Clements-Housser",
    "direct_quote_count": 2,
    "paraphrase_count": 3,
    "evidence": [
      "\"We really just need somewhere that's not priced out of our stipend level,\" ",
      "\"Because nobody can afford to live here, people don't live here\"",
      "\"I'm not close enough to campus to walk, so I had to get an e-bike, and my e-bike was stolen, and then I had to get a car\"",
      "Clements-Housser said."
    ]
  },
  {
    "name": "Lucas Rengifo-Keller",
    "direct_quote_count": 1,
    "paraphrase_count": 0,
    "evidence": [
      "\"I'm

### What if I want titles?

In [49]:
# For a different story, ask the model for quote/paraphrase counts plus each
# person's title or position.
# NOTE: MODEL is not set in this cell; it is reused from an earlier cell.
target_url = "https://dbknews.com/2026/02/17/sga-accountability-umbc-alum-disappeared-philippines/"

# Linear scan for the first record whose "link" matches; stays None if absent.
target_story = None
for candidate in dbk_stories:
    if candidate.get("link") == target_url:
        target_story = candidate
        break

if target_story:
    # Both fields are nested objects whose text lives under "rendered".
    title_field = target_story.get("title", {})
    content_field = target_story.get("content", {})
    title = title_field.get("rendered", "")
    content = content_field.get("rendered", "")

    # The instructions are the part you control: edit them to change what
    # the model is asked to return.
    instructions = (
        "For the story below, identify every person who is either (a) directly quoted, or (b) attributed via paraphrased information (e.g., 'X said', 'according to X', 'X stated').\n\n"
        "Return ONLY a JSON array with their full name, direct quote count, paraphrase count, and their title or position.\n\n"
        "If the person is a student, return their class year and major instead of title/position. If the person is a university official, return their department or office.\n\n"
        "If none, return an empty array.\n\n"
    )
    prompt = instructions + f"Title: {title}\n\nStory:\n{content}"

    try:
        response = ollama.generate(model=MODEL, prompt=prompt)
        print(response["response"])
    except Exception as exc:
        # Report the failure instead of stopping the notebook.
        print("Ollama error:", exc)
else:
    print("Story not found in dbk_stories. Make sure dbk_stories.json includes this URL.")

Here is the JSON array with the requested information:

```
[
  {
    "full_name": "Sacha Llanto",
    "direct_quote_count": 3,
    "paraphrase_count": 2,
    "title_position": "Senior Public Health Science Major, Member of Filipino Advocacy Organization"
  },
  {
    "full_name": "Zyad Khan",
    "direct_quote_count": 2,
    "paraphrase_count": 1,
    "title_position": "SGA Representative for Computer, Mathematical, and Natural Sciences College, Senior Computer Science Major"
  },
  {
    "full_name": "Gabriel Pasion",
    "direct_quote_count": 2,
    "paraphrase_count": 1,
    "title_position": "Vice Chairperson for Anakbayan College Park, University Alum"
  }
]
```

Let me know if you have any further questions!


### Now, what if I want to know who was quoted in the last 10 Diamondback stories as well as the number of times they were quoted?

In [50]:
# Build a per-story JSON result plus an aggregated tally across 10 DBK stories.
# `re` is used by the strip_html and extract_json helpers defined later in this cell.
import re
MODEL = "llama3"  # Ollama model name passed to ollama.generate; change if needed
# Takes the first 10 entries of dbk_stories.
# NOTE(review): presumably the list is ordered newest-first, making these the
# 10 most recent stories -- confirm against how the data was scraped.
last_10 = dbk_stories[:10]

def strip_html(html):
    """Convert an HTML fragment to plain text.

    Tags are replaced with spaces, HTML character references (for example
    ``&amp;``, ``&#8217;`` or ``&nbsp;``) are decoded to the characters they
    stand for, and runs of whitespace are collapsed to single spaces.

    Args:
        html: HTML markup as a string. ``None`` or an empty string is allowed.

    Returns:
        The plain-text content, stripped of leading/trailing whitespace, or
        ``""`` when there is no input.
    """
    # Imported here by function name because the parameter ``html`` hides the
    # standard-library module of the same name inside this function.
    from html import unescape

    if not html:
        return ""
    without_tags = re.sub(r"<[^>]+>", " ", html)
    # Decode entities only after tags are removed, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
    decoded = unescape(without_tags)
    # \s also matches the non-breaking space that "&nbsp;" decodes to.
    return re.sub(r"\s+", " ", decoded).strip()

def extract_json(text):
    """Parse JSON out of a model reply that may wrap it in extra prose.

    The whole reply is tried as JSON first. If that fails, each ``[`` or ``{``
    in the reply is tried in order as the start of a JSON value, and the first
    value that decodes successfully is returned. Text after that value (code
    fences, closing remarks, stray brackets) is ignored.

    Args:
        text: The raw reply string. ``None`` or an empty string is allowed.

    Returns:
        The decoded JSON value, or ``None`` when nothing in the reply can be
        decoded as JSON.
    """
    if not text:
        return None
    text = text.strip()
    if not text:
        return None

    # Fast path: the reply is nothing but JSON.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # raw_decode reads exactly one JSON value from the start of a string and
    # tolerates trailing text. A greedy regex from the first opener to the last
    # closer would swallow any bracket in the prose that follows the JSON.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", text):
        try:
            value, _end = decoder.raw_decode(text[opener.start():])
        except json.JSONDecodeError:
            continue
        return value
    return None

def people_mentions_for_story(story):
    """Ask the model which people a story quotes or paraphrases.

    The story's rendered title and content are converted to plain text with
    ``strip_html``, appended to a fixed block of instructions, and sent to
    ``ollama.generate`` using the module-level ``MODEL``.

    Args:
        story: A story record. ``title.rendered`` and ``content.rendered`` are
            read with ``.get`` and default to empty strings when missing.

    Returns:
        The list parsed from the model's reply. An empty list is returned when
        no JSON can be parsed from the reply (the first 500 characters of the
        reply are printed in that case) or when the parsed JSON is not a list.
    """
    headline = strip_html(story.get("title", {}).get("rendered", ""))
    body_text = strip_html(story.get("content", {}).get("rendered", ""))

    # Fixed instructions first, then the story itself.
    instructions = (
        "You are a meticulous editor. For the story below, identify EVERY person who is either (a) directly quoted, "
        "or (b) attributed via paraphrased information (e.g., 'X said', 'according to X', 'X stated').\n\n"
        "EXCLUDE reporters, staff writers, editors, or byline authors. Do NOT include staff writers.\n\n"
        "Return ONLY valid JSON (no extra text). The JSON must be an array. Each item must be an object with: "
        "name (full name), direct_quote_count (integer), paraphrase_count (integer).\n\n"
        "If none, return an empty array [].\n\n"
    )
    prompt = instructions + f"Title: {headline}\n\nStory:\n{body_text}"

    reply_text = ollama.generate(model=MODEL, prompt=prompt)["response"]
    parsed = extract_json(reply_text)

    if parsed is None:
        # Show the start of the reply so the formatting problem is visible.
        print("Non-JSON response:", reply_text[:500])
        return []
    if isinstance(parsed, list):
        return parsed
    return []

def _as_count(value):
    """Coerce a model-supplied count to an int, using 0 when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # The model may emit null, words ("two"), or other non-numeric values.
        return 0


# Query the model once per story, keeping the per-story result and a running
# per-person total across all stories.
per_story = []
counts = {}
for i, story in enumerate(last_10, start=1):
    try:
        people = people_mentions_for_story(story)
    except Exception as exc:
        # Best effort: report the failure and move on to the next story.
        print("Ollama error:", exc)
        continue
    per_story.append({"story_index": i, "link": story.get("link"), "people": people})
    for person in people:
        # Model output is untrusted: skip anything that is not an object with a
        # usable string name rather than letting one bad item abort the run.
        if not isinstance(person, dict):
            continue
        name = person.get("name")
        if not isinstance(name, str) or not name.strip():
            continue
        # Strip so the same name with stray whitespace lands in one bucket.
        name = name.strip()
        totals = counts.setdefault(name, {"direct_quote_count": 0, "paraphrase_count": 0})
        totals["direct_quote_count"] += _as_count(person.get("direct_quote_count", 0))
        totals["paraphrase_count"] += _as_count(person.get("paraphrase_count", 0))

# Flatten the totals into records, adding a combined total for sorting.
aggregated = [
    {
        "name": name,
        "direct_quote_count": vals["direct_quote_count"],
        "paraphrase_count": vals["paraphrase_count"],
        "total": vals["direct_quote_count"] + vals["paraphrase_count"],
    }
    for name, vals in counts.items()
]
# Most-cited people first.
aggregated.sort(key=lambda x: x["total"], reverse=True)

print(json.dumps({"per_story": per_story, "aggregated": aggregated}, indent=2))

{
  "per_story": [
    {
      "story_index": 1,
      "link": "https://dbknews.com/2026/02/17/sga-accountability-umbc-alum-disappeared-philippines/",
      "people": [
        {
          "name": "Sacha Llanto",
          "direct_quote_count": 2,
          "paraphrase_count": 1
        },
        {
          "name": "Zyad Khan",
          "direct_quote_count": 3,
          "paraphrase_count": 2
        },
        {
          "name": "Gabriel Pasion",
          "direct_quote_count": 2,
          "paraphrase_count": 1
        }
      ]
    },
    {
      "story_index": 2,
      "link": "https://dbknews.com/2026/02/17/maryland-softball-lauren-karn-transfer-portal/",
      "people": [
        {
          "name": "Lauren Karn",
          "direct_quote_count": 2,
          "paraphrase_count": 3
        },
        {
          "name": "Caroline Fox",
          "direct_quote_count": 0,
          "paraphrase_count": 1
        },
        {
          "name": "Anna McGowan",
          "direct_quot

In [None]:
# Try generating something on your own!!!!