# Thesis demo

In [12]:
import boto3
import pandas as pd, json, csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

from itertools import chain
from functools import partial

import requests
from tenacity import retry

from projects_secretes import *

In [1]:
# for disabling global var lookup for functions
# https://gist.github.com/ax3l/59d92c6e1edefcef85ac2540eb056da3
import types
from itertools import islice

def imports():
    """Yield ``(name, value)`` pairs for the modules and callables in ``globals()``.

    A global is yielded when it is a ``types.ModuleType`` instance or when it
    exposes a ``__call__`` attribute (functions, classes, ...). Every other
    global, e.g. a plain data variable, is skipped.
    """
    for name, val in globals().items():
        # module imports, or functions / callables
        if isinstance(val, types.ModuleType) or hasattr(val, '__call__'):
            yield name, val


def noglobal(fn):
    """Decorator: rebuild ``fn`` so its globals contain only modules and callables.

    The new function runs the same code object as ``fn``, but its global
    namespace is a fresh dict built from ``imports()`` at decoration time.
    Names bound in the notebook afterwards, and plain data variables, are
    therefore not visible from inside the decorated function.

    Positional defaults, keyword-only defaults, the closure and the usual
    wrapper metadata are carried over from ``fn``. Building the function from
    ``__code__`` and globals alone would silently drop the defaults and would
    raise for functions that close over variables.

    Parameters
    ----------
    fn : function
        A plain Python function (it must have a ``__code__`` attribute).

    Returns
    -------
    function
        The isolated copy of ``fn``.
    """
    # Local import: the cell defining this helper only imports `types`.
    from functools import update_wrapper

    isolated = types.FunctionType(
        fn.__code__,
        dict(imports()),
        fn.__name__,
        fn.__defaults__,
        fn.__closure__,
    )
    # Keyword-only defaults cannot be passed to the constructor.
    isolated.__kwdefaults__ = fn.__kwdefaults__
    # Copies __module__, __qualname__, __doc__, __dict__ and sets __wrapped__.
    return update_wrapper(isolated, fn)

## Get/update data from s3

In [104]:
# get all files name
@noglobal
def get_time_from_fileName(fileName):
    """Split an object key into its ``(date, siteName)`` parts.

    Only the text after the last "/" is used. Its final five characters are
    dropped (the length of ".json") and the rest is split on "_", which must
    produce exactly two pieces; any other count raises ``ValueError``.
    """
    baseName = fileName.rsplit("/", 1)[-1]
    stem = baseName[:-5]  # cut the 5-character extension
    date, siteName = stem.split("_")
    return date, siteName

@noglobal
def get_news_fileNames(bucket_name, prefix):
    """Return the keys ending in ".json" under ``prefix`` in an S3 bucket.

    The bucket is listed through the ``list_objects_v2`` paginator, so the
    result is not limited to a single listing page. Pages that carry no
    "Contents" entry contribute nothing.
    """
    paginator = boto3.client('s3').get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', ())
        if entry['Key'].endswith(".json")
    ]

In [105]:
# Buckets that hold the collected news files.
domestic_bucket_name = "news-collection-2024-3371"
international_bucket_name = 'international-news-collection-2024-2251'

# Keys of the ".json" objects in each bucket; the domestic bucket is listed
# under the "current_data" prefix, the international one without a prefix.
domestic_fnames = get_news_fileNames(domestic_bucket_name, prefix="current_data")
international_fnames = get_news_fileNames(international_bucket_name, prefix="")

In [214]:
## get data
@noglobal
def read_news_file(filekey, bucket):
    """Download one JSON file from S3 and flatten its articles into row dicts.

    Parameters
    ----------
    filekey : str
        Object key; the collection date and site name are parsed from it with
        ``get_time_from_fileName``.
    bucket : str
        Name of the bucket that holds ``filekey``.

    Returns
    -------
    tuple
        ``(True, rows)`` when the decoded JSON contains "articles", where
        ``rows`` is a list with one dict per article (keys: url, headline,
        datePublished_site, probability, date_collected, siteName).
        ``(False, filekey)`` otherwise, marking a failed collection.
    """
    s3 = boto3.resource('s3')
    content = s3.Object(bucket, filekey).get()['Body'].read()

    # get date and site name
    date, siteName = get_time_from_fileName(filekey)

    content = json.loads(content)
    if "articles" not in content:
        # failed collection
        return (False, filekey)

    result = []
    for x in content["articles"]:
        # "metadata" may be missing or null for an article. Read it as
        # tolerantly as the other fields, so one malformed article yields a
        # None probability instead of aborting the whole read.
        metadata = x.get("metadata") or {}
        result.append({
            "url": x.get("url"),
            "headline": x.get("headline"),
            "datePublished_site": x.get("datePublished"),
            "probability": metadata.get("probability"),
            "date_collected": date,
            "siteName": siteName,
        })

    return (True, result)

In [20]:
@noglobal
def read_files_in_parallel(bucketName, fnames):
    """Run ``read_news_file`` over ``fnames`` on a pool of four threads.

    Returns the per-file results as a list in the same order as ``fnames``;
    a tqdm progress bar is shown while the results are collected.
    """
    fetch_one = partial(read_news_file, bucket=bucketName)

    with ThreadPoolExecutor(max_workers=4) as pool:
        progress = tqdm(pool.map(fetch_one, fnames), total=len(fnames))
        news_headlines = list(progress)

    return news_headlines

@noglobal
def post_processsing(result):
    """Separate successful reads from failed ones.

    ``result`` holds ``(flag, payload)`` entries. Payloads with a truthy flag
    are lists of rows and are concatenated into one flat list; payloads with
    a falsy flag are collected unchanged. Returns ``(rows, failed)``.
    """
    rows_per_file, failed_sites = [], []
    for entry in result:
        target = rows_per_file if entry[0] else failed_sites
        target.append(entry[1])

    return (list(chain.from_iterable(rows_per_file)), failed_sites)

# Read every listed file, then split the outcome into article rows and the
# keys of the files whose collection failed.
domestic_read_results = read_files_in_parallel(domestic_bucket_name, domestic_fnames)
news_headlines_domestic, failed_collection_domestic = post_processsing(domestic_read_results)

international_read_results = read_files_in_parallel(international_bucket_name, international_fnames)
news_headlines_international, failed_collection_international = post_processsing(international_read_results)

In [216]:
# Build one DataFrame per collection from the flattened article rows ...
df_domestic_news = pd.DataFrame(news_headlines_domestic)
df_international_news = pd.DataFrame(news_headlines_international)

# ... and store both as parquet files (row index not written).
df_domestic_news.to_parquet('data/data_domestic_news.parquet', index=False)
df_international_news.to_parquet('data/data_international_news.parquet', index=False)

In [218]:
# Write the list to a CSV file

def save_to_csv(list, path):
    """Write every item of ``list`` to ``path`` as its own one-column CSV row.

    Parameters
    ----------
    list : iterable
        Items to write, one per row. The name shadows the built-in ``list``;
        it is kept so that existing callers keep working.
    path : str
        Destination file. It is opened in "w" mode, so an existing file is
        overwritten.
    """
    # UTF-8 is set explicitly so items containing non-ASCII characters do not
    # depend on the platform's default encoding. newline='' lets the csv
    # module control the line endings.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows([item] for item in list)

# One CSV per collection, listing the keys of the files that failed.
for failed_keys, csv_path in (
    (failed_collection_domestic, "data/failed_collection_domestic.csv"),
    (failed_collection_international, "data/failed_collection_international.csv"),
):
    save_to_csv(failed_keys, csv_path)

# Basic Analysis

In [8]:
# Reload both collections from the parquet files under data/.
df_domestic_news, df_international_news = (
    pd.read_parquet('data/data_domestic_news.parquet'),
    pd.read_parquet('data/data_international_news.parquet'),
)

In [9]:
# basic data cleaning
# Step 1: remove na headlines.
headline_missing = df_domestic_news['headline'].isna()
df_domestic_news = df_domestic_news[~headline_missing]

# Step 2: remove very short headlines (fewer than 3 pieces when split on " ").
long_enough = df_domestic_news["headline"].apply(lambda headline: len(headline.split(" ")) >= 3)
df_domestic_news = df_domestic_news[long_enough]

In [10]:
# Number of distinct headlines; dropna=False counts exactly what .unique() returns.
df_domestic_news["headline"].nunique(dropna=False)

156538

There are 156538 unique headlines in the first month of data collection, about 100 per day per site.

## NER

In [4]:
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8991165
# https://huggingface.co/dslim/bert-base-NER

In [41]:
@retry
def get_NER(payload, url):
    """POST one NER request and return the headline with the parsed response.

    Parameters
    ----------
    payload : dict
        JSON body of the request; its "inputs" entry is echoed back in the
        result under "Headline".
    url : str
        Endpoint that receives the POST.

    Returns
    -------
    dict
        ``{"Headline": payload["inputs"], "NER": <decoded JSON response>}``.

    Raises
    ------
    RuntimeError
        When the decoded response contains "error". Because the function is
        wrapped in a bare ``@retry``, this exception (like any other raised
        here) triggers another attempt instead of reaching the caller.
    """
    headers = {"Accept" : "application/json", "Content-Type": "application/json" }

    response = requests.post(url, headers=headers, json=payload, timeout = 10)
    result = response.json()  # decode the body once and reuse it
    if "error" in result:
        # error response
        print("error")
        # Raise a real exception; the previous `raise exception` referenced
        # an undefined name and surfaced as an unrelated NameError.
        raise RuntimeError(f"NER endpoint returned an error response: {result!r}")
    return {"Headline" : payload["inputs"], "NER" : result}

def get_NER_in_parllel(url, lines, parameters, maxworkers):
    """Send one NER request per entry of ``lines`` through a thread pool.

    Each line is wrapped as ``{"inputs": line, "parameters": parameters}``
    and passed to ``get_NER`` on a pool of ``maxworkers`` threads. The
    responses are returned as a list in the order of ``lines``; a tqdm
    progress bar is shown while they are collected.
    """
    payloads = [{"inputs": text, "parameters": parameters} for text in lines]
    request_one = partial(get_NER, url=url)

    with ThreadPoolExecutor(max_workers=maxworkers) as pool:
        responses = pool.map(request_one, payloads)
        return list(tqdm(responses, total=len(payloads)))

In [42]:
NER_lines = df_domestic_news["headline"].unique()
parameters = {"aggregation_strategy": "simple"} # preserve different tags

# Headlines are processed in batches of `step`; each batch is written to its
# own JSON file, so finished batches are already on disk if a later one fails.
# NOTE(review): NER_URL is not defined in this notebook; presumably it comes
# from the `projects_secretes` star import. Confirm.
step = 10000
for cur_index in range(0, len(NER_lines), step):
    NER_lines_seg = NER_lines[cur_index : cur_index + step]
    NER_result = get_NER_in_parllel(NER_URL, NER_lines_seg, parameters, 8)

    # Batch number = start offset // batch size (0, 1, 2, ...).
    NER_filename = f'./data/NERs/headline_NER_{cur_index // step}.json'

    # Write to a JSON file
    with open(NER_filename, 'w') as file:
        json.dump(NER_result, file)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:25<00:00, 48.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:24<00:00, 48.90it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:24<00:00, 48.80it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:25<00:00, 48.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/1

## SA