# NER

In [2]:
import boto3
import pandas as pd, json, csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

from itertools import chain
from functools import partial

import requests
from tenacity import retry

from projects_secretes import *

# Basic Analysis

In [3]:
# Load the February domestic-news headline table from its parquet file.
feb_headlines_path = 'data/headlines/data_domestic_news_Feb.parquet'
df_domestic_news = pd.read_parquet(feb_headlines_path)

In [4]:
# Basic data cleaning: keep only rows that have a headline, then keep only
# headlines that split into at least three pieces on a single space.
headline_present = ~df_domestic_news['headline'].isna()
df_domestic_news = df_domestic_news[headline_present]  # drop rows with a missing headline

long_enough = df_domestic_news["headline"].apply(
    lambda headline: len(headline.split(" ")) >= 3
)
df_domestic_news = df_domestic_news[long_enough]  # drop very short headlines

In [5]:
# Number of distinct headlines left after cleaning (missing values, if any, count as one).
df_domestic_news["headline"].nunique(dropna=False)

131568

There are 156208 unique headlines in the first month of data collection, about 100 per day per site. (The count shown in the output above, 131568, comes from the February file loaded in this run.)

## NER

Perform NER to extract names and organizations.

In [4]:
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8991165
# https://huggingface.co/dslim/bert-base-NER

In [58]:
import requests, os

# The access token is read from the environment so it never lives in the notebook.
HUGGINGFACE_TOKEN = os.environ["HUGGINGFACE_TOKEN"]

# Inference endpoint that receives the NER requests.
API_URL = "https://qi5zn9v17nv6hljd.us-east-1.aws.endpoints.huggingface.cloud"

# HTTP headers sent with every request: JSON in, JSON out, bearer-token auth.
headers = {
    "Accept": "application/json",
    "Authorization": "Bearer " + HUGGINGFACE_TOKEN,
    "Content-Type": "application/json",
}

In [66]:
@retry
def get_NER(payload, url, headers = headers):
    """POST one payload to the NER endpoint and return the headline with its entities.

    Args:
        payload: JSON-serialisable request body; the headline text is read
            from ``payload["inputs"]``.
        url: endpoint URL the request is posted to.
        headers: HTTP headers for the request (defaults to the notebook-level
            ``headers`` dict as it exists when this function is defined).

    Returns:
        dict with two keys: ``"Headline"`` (the text from ``payload["inputs"]``)
        and ``"NER"`` (the decoded JSON response).

    Raises:
        RuntimeError: when the decoded response contains ``"error"``.

    Note:
        The bare ``@retry`` decorator re-invokes the function whenever it
        raises, so request failures, timeouts and the RuntimeError below all
        lead to another attempt rather than propagating to the caller.
    """
    response = requests.post(url, headers=headers, json=payload, timeout = 10)
    result = response.json()  # decode once and reuse for both the check and the return value

    if "error" in result:
        # error response
        print("error")
        # The original `raise exception` referenced an undefined name and
        # surfaced as a NameError; raise a real exception that says what happened.
        raise RuntimeError(f"NER endpoint returned an error: {result}")

    return {"Headline" : payload["inputs"], "NER" : result}

def get_NER_in_parllel(url, lines, parameters, maxworkers):
    """Run get_NER over many headlines concurrently, showing a progress bar.

    Args:
        url: endpoint URL forwarded to get_NER.
        lines: iterable of headline strings.
        parameters: value sent under "parameters" with every request.
        maxworkers: number of worker threads in the pool.

    Returns:
        list of get_NER results, in the same order as ``lines``.
    """
    # Wrap every headline in the request body shape get_NER posts.
    payloads = [dict(inputs=line, parameters=parameters) for line in lines]

    # Fix the endpoint once so the pool only has to supply the payload.
    fetch_one = partial(get_NER, url=url)

    # Issue the requests from a thread pool; map() yields results in input order.
    with ThreadPoolExecutor(max_workers=maxworkers) as pool:
        result_stream = pool.map(fetch_one, payloads)
        news_headlines = list(tqdm(result_stream, total=len(payloads)))

    return news_headlines

In [73]:
NER_lines = df_domestic_news["headline"].unique()
parameters = {"aggregation_strategy": "simple"} # preserve different tags
month = "Feb"

# Create the output folder before any request is sent; otherwise a missing
# folder only surfaces at the first write, after a whole chunk has been processed.
NER_output_dir = f'./data/NERs/{month}'
os.makedirs(NER_output_dir, exist_ok=True)

# Process the headlines in fixed-size chunks and write one JSON file per chunk.
cur_index = 0
step = 10000
while cur_index < len(NER_lines):
    # process NER
    NER_lines_seg = NER_lines[cur_index : cur_index + step]
    NER_result = get_NER_in_parllel(API_URL, NER_lines_seg, parameters, 64) # 4 A10g

    # save file (the suffix is the zero-based chunk number)
    NER_filename = f'./data/NERs/{month}/headline_NER_{month}_{int(cur_index / step)}.json'

    # Write to a JSON file
    with open(NER_filename, 'w') as file:
        json.dump(NER_result, file)

    cur_index += step

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:42<00:00, 24.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:44<00:00, 24.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:54<00:00, 24.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [07:05<00:00, 23.53it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████

## Merge NER

In [16]:
import json, glob

In [83]:
# Load each month's headline table and keep the first row for every distinct headline.
df_domestic_Jan = pd.read_parquet('data/headlines/data_domestic_news_Jan.parquet')
df_domestic_Jan = df_domestic_Jan.drop_duplicates(subset=['headline'])

df_domestic_Feb = pd.read_parquet('data/headlines/data_domestic_news_Feb.parquet')
df_domestic_Feb = df_domestic_Feb.drop_duplicates(subset=['headline'])

# Stack both months, then de-duplicate again for headlines that appear in both.
df_Jan_Feb = pd.concat([df_domestic_Jan, df_domestic_Feb])
df_Jan_Feb = df_Jan_Feb.drop_duplicates(subset=['headline'])

In [84]:
# Glob patterns for the per-month folders that hold the saved NER results
folder_path = ['./data/NERs/Jan/*.json', './data/NERs/Feb/*.json']

# Accumulates the records from every matching file
NER_list = []

# Each file holds a JSON list of records; append them all to NER_list
for pattern in folder_path:
    for file_name in glob.glob(pattern):
        with open(file_name, 'r') as file:
            NER_list.extend(json.load(file))

In [96]:
def simple_NER_merger(NER_list, min_score = 0.5):
    """Turn one headline's NER items into a list of whole entity strings.

    Items whose "word" starts with "##" are word-piece continuations: they
    are glued (without the "##") onto the entity kept immediately before them.

    Args:
        NER_list: sequence of dicts, each providing a "word" string and a
            numeric "score".
        min_score: items scoring below this value are dropped.

    Returns:
        list of entity strings, in input order.

    Rules:
        * A "##" continuation that directly follows a kept entity is always
          glued on, whatever its own score; dropping it would leave a
          truncated word.
        * A "##" continuation whose predecessor was dropped (or that comes
          first) is never glued onto an older, unrelated entity. If it passes
          ``min_score`` it becomes its own entry with the "##" removed.
        * Only the two-character "##" prefix marks a continuation, so a word
          that merely starts with "#" is kept intact.
    """
    # NOTE(review): "directly follows" means adjacency in NER_list, not in the
    # headline text; confirm whether that is precise enough for this data.
    NERs = []

    # True while NERs[-1] was produced by the item right before the current one.
    can_extend = False

    for item in NER_list:
        NE = item["word"]
        is_fragment = NE.startswith("##")

        if is_fragment and can_extend:
            # Continuation of the entity just kept: extend it in place.
            NERs[-1] += NE[2:]
            continue

        if item["score"] < min_score:
            # Dropped item: later fragments must not attach to an older entity.
            can_extend = False
            continue

        NERs.append(NE[2:] if is_fragment else NE)
        can_extend = True

    return NERs

# Spot-check the merger on one loaded record. The records loaded from the JSON
# files live in NER_list; `json_list` is not defined in this notebook.
simple_NER_merger(NER_list[20]["NER"])

['Sydney', 'Australia', 'New Year']

In [98]:
# Build a lookup from headline text to its merged entity list.
# Headlines are unique, so each key is written once.
NER_dict = {
    record["Headline"]: simple_NER_merger(record["NER"])
    for record in tqdm(NER_list)
}

100%|█████████████████████████████████████████████| 288106/288106 [00:01<00:00, 150077.83it/s]


In [111]:
def merge_NER_to_df(headline, NER_dict = NER_dict):
    """Return the entity list stored for ``headline``.

    Args:
        headline: headline text used as the lookup key.
        NER_dict: mapping from headline to entity list (defaults to the
            notebook-level ``NER_dict`` as it exists when this function is defined).

    Returns:
        The stored list when the headline is a key of ``NER_dict``; otherwise
        the headline is printed and a new empty list is returned.
    """
    if headline in NER_dict:
        return NER_dict[headline]

    # Unknown headline: print it so it can be inspected, and fall back to no entities.
    print(headline)
    return []

In [113]:
# Look up the entity list for every headline and store it in a new "NE" column.
df_Jan_Feb["NE"] = df_Jan_Feb["headline"].apply(merge_NER_to_df)

Interest-Based Ads
None
videos
SHARE THIS
FOX Weather
Fox Radio
SIGN IN
MSNBC Daily
Morning Joe
11th Hour
December 2023
October 2023
August 2023
Maui wildfires
Holding On
Metropolitan Diary
Wordle
Connections Companion
Connections
Spelling Bee
The Crossword
Letter Boxed
The frontlines
Final Thoughts
World
Legal
World-Check
Digital Accessibility
Wide Angle
Relationships
The World
George Neumayr
More Americana
Corruption Unbound
Midwinter
Crossword
Helen Roy
Collin Jones
Streaming Now
Talk Shows
Inspired Stories
The Winterkeeper
Winter magic
Tom Wilkinson
Quiz
CNN Business
Best-in-class
Expert-backed guides
Style
US
CNN podcasts
Sports
video
comments
1
3
300
'IT'S BEAUTIFUL'
Rachel Maddow
The Beat
Most Read
Shen Yun
Live Updates
5
2
193
315
85
Sponsored by
NPR Extra
The Magi
fashion
Beyond Science
Tetris Hint
10
350
33
Fox Nation
Photography
John Ruwitch/NPR
.cls-1{fill:#fff}
Israel-Hamas War
Sean Dong
The Athletic
Cooking
Product recommendations
Advertising Guidelines
Franklin Foer
Ruth

In [135]:
# Drop every row whose site is listed in f_site.
# NOTE(review): f_site is not defined in the cells visible here — confirm where it is set.
listed_site = df_Jan_Feb['siteName'].isin(f_site)
df_Jan_Feb = df_Jan_Feb[~listed_site]