In [1]:
import requests
import json
from dotenv import load_dotenv
import os
import uuid
import pandas as pd

In [2]:
# Read the Event Registry API key from the project's .env file so the secret
# never has to be written into the notebook itself.
# NOTE: the path is relative to the kernel's working directory.
load_dotenv('dags/.env')  # Loads variables from .env into environment
apikey = os.getenv("NEWS_API_KEY")

# Fail fast: without a key the request below would be sent unauthenticated and
# only fail later with a much less obvious error.
if not apikey:
    raise RuntimeError("NEWS_API_KEY is not set; check dags/.env or the environment")

In [3]:
# Endpoint and query parameters for the article search request.
url = 'https://eventregistry.org/api/v1/article/getArticles?'

params = {
    # Authentication
    'apiKey': apikey,

    # Which kinds of content to return
    'dataType': ["news", "pr", "blog"],

    # Optional explicit date window (currently disabled)
    #'dateStart': "2023-01-01",
    #'dateEnd': "2023-01-02",

    # Ordering and extra per-article fields requested in the response
    'articlesSortByAsc': True,
    'includeArticleSocialScore': True,
    'includeArticleCategories': True,

    # Language filter
    'lang': "eng",

    # NOTE(review): presumably restricts the search window to the last 7 days;
    # confirm against the Event Registry API documentation.
    'forceMaxDataTimeWindow': 7,
}

In [4]:
# Always pass a timeout: without one, requests waits forever if the server
# stops responding and the notebook cell never finishes.
response = requests.get(url, params=params, timeout=30)

In [5]:
# Parse the API response into `data`.
# Any failure raises instead of just printing, so later cells can never run
# against a missing `data` (NameError) or a stale `data` left over from an
# earlier execution of this notebook.
if response.status_code != 200:
    # Truncate the body so a large error page does not flood the cell output
    raise RuntimeError(f"Error: {response.status_code} - {response.text[:500]}")

try:
    data = response.json()
except ValueError as e:
    raise RuntimeError(f"Error parsing JSON: {e}") from e

# Report success without dumping the whole payload into the notebook output
print(f"Success: {response.status_code} - {len(response.text)} characters received")



In [6]:
result = data['articles']['results']
result[0]

{'uri': '8340397762',
 'lang': 'eng',
 'isDuplicate': False,
 'date': '2024-09-28',
 'time': '00:00:00',
 'dateTime': '2024-09-28T00:00:00Z',
 'dateTimePub': '2024-09-27T23:59:35Z',
 'dataType': 'news',
 'sim': 0.7490196228027344,
 'url': 'https://www.reformer.com/news/national/ice-662-000-criminal-foreign-nationals-to-be-deported-are-living-free-nationwide/article_68d77761-a66f-565b-ac44-51250aaf3fea.html',
 'title': 'ICE: 662,000 criminal foreign nationals to be deported are living free nationwide',
 'body': '(The Center Square) - More than 660,000 criminal foreign nationals identified to be deported by U.S. Customs and Immigration Enforcement are freely living in communities nationwide.\n\nAmong them are those convicted or charged with violent crimes, including homicide, sexual assault and kidnapping, according to information released in response to a congressional request.\n\nICE was requested to provide information about the number of noncitizens on its docket for removal who are 

In [8]:
# Function to get first, second, and third-level keywords
def extract_levels(label):
    """Split a slash-separated category label into its first three levels.

    The first path segment (e.g. 'dmoz' or 'news') is skipped; the next three
    segments are returned as the first, second and third level.

    Args:
        label: Category label such as 'dmoz/Society/Crime/Theft'. May be
            None or empty, in which case every level is None.

    Returns:
        A 3-tuple (first_level, second_level, third_level); levels that are
        not present in the label are None.
    """
    # A missing label used to crash on `.split`; treat it as "no levels"
    if not label:
        return None, None, None

    parts = label.split('/')
    # Take segments 1..3 and pad with None so the result always has 3 items
    levels = parts[1:4] + [None] * 3
    first_level, second_level, third_level = levels[:3]
    return first_level, second_level, third_level

### Use the source link and the author email as unique IDs (i.e. primary keys)

In [12]:
# Flatten the nested API results into one list of records per target table.
# Every record carries `article_id` so the tables can be joined back together.
articles = []
sources = []
authors = []
categories = []
facebook_shares = []

# Loop through each item in the result
for item in result:
    article_id = item.get('uri')

    # Append article data using relevant keys
    articles.append({
        'id': article_id,
        'is_duplicate': item.get('isDuplicate'),
        'datetime_found': item.get('dateTime'),
        'datetime_published': item.get('dateTimePub'),
        'article_type': item.get('dataType'),
        'sim': item.get('sim'),  # cosine similarity of the article to the centroid of the story
        'url': item.get('url'),
        'title': item.get('title'),
        'body': item.get('body'),
        'image': item.get('image'),
        'sentiment': item.get('sentiment'),
        # 'wgt' is deliberately left out: it is used internally for sorting (DO NOT USE THE VALUE)
        'relevance': item.get('relevance'),  # represents how well the article matches the query;
                                             # the higher the value, the better the match
    })

    # Append source information.
    # `or {}` keeps an article with a missing/null source from raising on `.get`.
    source_list = item.get('source') or {}
    sources.append({
        'article_id': article_id,
        'source_name': source_list.get('title'),
        'source_link': source_list.get('uri'),
    })

    # Append author information.
    # `or [None]` covers a missing key, a null value and an empty list alike,
    # so an article without authors still gets exactly one placeholder row.
    author_list = item.get('authors') or [None]
    for author in author_list:
        # Entries that are not dicts are recorded as an all-None placeholder
        author_fields = author if isinstance(author, dict) else {}
        authors.append({
            'article_id': article_id,
            'author_name': author_fields.get('name'),
            'author_email': author_fields.get('uri'),
            'author_type': author_fields.get('type'),
            'is_agency': author_fields.get('isAgency'),
        })

    # Append category information (one row per category of the article)
    category_list = item.get('categories') or []
    for category in category_list:
        label = category.get('label')
        # Guard here as well: extract_levels is only called with a real label
        if label:
            first_level, second_level, third_level = extract_levels(label)
        else:
            first_level = second_level = third_level = None
        categories.append({
            'article_id': article_id,
            'label': label,
            'keyword_1': first_level,
            'keyword_2': second_level,
            'keyword_3': third_level,
        })

    # Append share information on social media.
    # `or {}` keeps an article with missing/null share data from raising.
    share_list = item.get('shares') or {}
    facebook_shares.append({
        'article_id': article_id,
        'facebook_share': share_list.get('facebook'),
    })

<br> </br>
### Alternative: give repeated authors a consistent `author_id` (requires `author_id_map = {}` to be initialised before the loop)
    # Append author information
    author_list = item.get('authors', [])
    if not author_list:
        # Handle case with no authors
        author_id = str(uuid.uuid4())  # Generate a unique ID for the author
        authors.append({
            'author_id': author_id,
            'article_id': item.get('uri'),
            'uri': None,
            'name': None,
            'type': None,
            'is_agency': None
        })
    else:
        # Loop through each author if multiple authors are present
        for author in author_list:
            if isinstance(author, dict):
                author_name = author.get('name', None)
                if author_name not in author_id_map:
                    # If the author is not in the map, create a new entry
                    author_id = str(uuid.uuid4())  # Generate a new unique ID
                    author_id_map[author_name] = author_id
                else:
                    # If the author is already in the map, use the existing ID
                    author_id = author_id_map[author_name]

                authors.append({
                    'author_id': author_id,  # Use consistent author ID
                    'article_id': item.get('uri'),  # explicitly add article_id for relational linking
                    'uri': author.get('uri', None),
                    'name': author_name,
                    'type': author.get('type', None),
                    'is_agency': author.get('isAgency', None)
                })
            else:
                # Handle cases where author data isn't a dict
                author_id = str(uuid.uuid4())  # Generate a new unique ID
                authors.append({
                    'author_id': author_id,
                    'article_id': item.get('uri'),
                    'uri': None,
                    'name': None,
                    'type': None,
                    'is_agency': None
                })
<br> </br>

In [13]:
len(articles)

100

In [14]:
len(authors)

101

In [15]:
len(sources)

100

In [16]:
len(categories)

449

In [17]:
len(facebook_shares)

100

In [18]:
print(articles)



In [19]:
print(sources)

[{'article_id': '8340397762', 'source_name': 'Brattleboro Reformer', 'source_link': 'reformer.com'}, {'article_id': '8340397475', 'source_name': 'The News-Gazette', 'source_link': 'news-gazette.com'}, {'article_id': '8340397800', 'source_name': 'Yakima Herald-Republic', 'source_link': 'yakimaherald.com'}, {'article_id': '8340395087', 'source_name': 'Yahoo', 'source_link': 'yahoo.com'}, {'article_id': '8340397441', 'source_name': 'The Globe and Mail', 'source_link': 'theglobeandmail.com'}, {'article_id': '8340397856', 'source_name': 'One America News Network', 'source_link': 'oann.com'}, {'article_id': '8340397420', 'source_name': 'Sports Illustrated', 'source_link': 'si.com'}, {'article_id': '8340397474', 'source_name': 'The News-Gazette', 'source_link': 'news-gazette.com'}, {'article_id': '8340397476', 'source_name': 'The News-Gazette', 'source_link': 'news-gazette.com'}, {'article_id': '8340395978', 'source_name': 'Owensboro Messenger-Inquirer', 'source_link': 'messenger-inquirer.com

In [20]:
print(authors)

[{'article_id': '8340397762', 'author_name': 'Bethany Blankley', 'author_email': 'bethany_blankley@reformer.com', 'author_type': 'author', 'is_agency': False}, {'article_id': '8340397475', 'author_name': 'Chuck Schilken', 'author_email': 'chuck_schilken@news-gazette.com', 'author_type': 'author', 'is_agency': False}, {'article_id': '8340397800', 'author_name': 'Associated Press', 'author_email': 'associated_press@yakimaherald.com', 'author_type': 'author', 'is_agency': True}, {'article_id': '8340395087', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8340397441', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8340397856', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8340397420', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8340397474', 'author_name': 'Dylan Hernández', 'aut

In [21]:
print(categories)

[{'article_id': '8340397762', 'label': 'dmoz/Society/Crime', 'keyword_1': 'Society', 'keyword_2': 'Crime', 'keyword_3': None}, {'article_id': '8340397762', 'label': 'dmoz/Society/Issues/Crime and Justice', 'keyword_1': 'Society', 'keyword_2': 'Issues', 'keyword_3': 'Crime and Justice'}, {'article_id': '8340397762', 'label': 'dmoz/Society/Crime/Theft', 'keyword_1': 'Society', 'keyword_2': 'Crime', 'keyword_3': 'Theft'}, {'article_id': '8340397762', 'label': 'dmoz/Society/Crime/Victims', 'keyword_1': 'Society', 'keyword_2': 'Crime', 'keyword_3': 'Victims'}, {'article_id': '8340397762', 'label': 'dmoz/Society/Crime/Research', 'keyword_1': 'Society', 'keyword_2': 'Crime', 'keyword_3': 'Research'}, {'article_id': '8340397762', 'label': 'news/Politics', 'keyword_1': 'Politics', 'keyword_2': None, 'keyword_3': None}, {'article_id': '8340397475', 'label': 'dmoz/Society/Relationships', 'keyword_1': 'Society', 'keyword_2': 'Relationships', 'keyword_3': None}, {'article_id': '8340397475', 'label'

In [22]:
print(facebook_shares)

[{'article_id': '8340397762', 'facebook_share': None}, {'article_id': '8340397475', 'facebook_share': None}, {'article_id': '8340397800', 'facebook_share': None}, {'article_id': '8340395087', 'facebook_share': None}, {'article_id': '8340397441', 'facebook_share': None}, {'article_id': '8340397856', 'facebook_share': 9}, {'article_id': '8340397420', 'facebook_share': None}, {'article_id': '8340397474', 'facebook_share': None}, {'article_id': '8340397476', 'facebook_share': None}, {'article_id': '8340395978', 'facebook_share': None}, {'article_id': '8340397799', 'facebook_share': None}, {'article_id': '8340398047', 'facebook_share': 16}, {'article_id': '8340397676', 'facebook_share': 1}, {'article_id': '8340398118', 'facebook_share': 7}, {'article_id': '8340398126', 'facebook_share': 1}, {'article_id': '8340395085', 'facebook_share': None}, {'article_id': '8340398121', 'facebook_share': 37}, {'article_id': '8340397943', 'facebook_share': None}, {'article_id': '8340395086', 'facebook_shar

In [None]:
# unique_author_ids = set(item['author_id'] for item in authors)
# unique_count = len(unique_author_ids)

# print("Number of unique author_ids:", unique_count)

In [None]:
# Count how many distinct articles are referenced by the author records.
unique_article_ids = {record['article_id'] for record in authors}
unique_count = len(unique_article_ids)

print("Number of unique article_ids:", unique_count)

In [None]:
# Split the article_ids referenced by `authors` into first-seen vs. repeated:
# an id that appears on more than one author row ends up in the duplicate set.
unique_article_ids = set()
duplicate_article_ids = set()

for article in authors:
    article_id = article['article_id']
    # First sighting goes to the "unique" set; any later sighting marks a duplicate
    target = duplicate_article_ids if article_id in unique_article_ids else unique_article_ids
    target.add(article_id)

# Materialise the duplicates as a list for printing
duplicates_list = [*duplicate_article_ids]

print("Duplicate article_ids:", duplicates_list)

In [None]:
df = pd.DataFrame(result)
df.head()

In [None]:
df.shape

In [None]:
df.value_counts(subset='dataType', dropna=True)

In [None]:
df.isnull().sum()

In [None]:
df['authors']

In [None]:
# Article URIs to inspect
uri_values = [
    '8384451260',
    '2024-10-528318960',
    '8384450686',
    '8384450685',
    '8384451709',
]

# Keep only the rows whose 'uri' is one of the values above
uri_mask = df['uri'].isin(uri_values)
filtered_df = df.loc[uri_mask]

print(filtered_df)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
print(df['authors'].loc[[47, 49, 72, 95, 97]])

In [None]:
print(authors[47]['author_type'])

In [None]:
print(result)

In [None]:
# Print the name of every author entry found in the raw API results.
for item in result:
    # `or []` covers both a missing 'authors' key and an explicit null value
    author_list = item.get('authors') or []
    for author in author_list:
        # Entries that are not dicts have no name; print None for them instead
        # of crashing, matching the placeholder rows built for `authors`
        name = author.get('name') if isinstance(author, dict) else None
        print(name)

In [None]:
category_df = pd.DataFrame(categories)

In [None]:
category_df.head()

In [None]:
category_df['article_id'].nunique()

In [None]:
# Number of distinct values at each category level, printed in level order
for level_column in ('keyword_1', 'keyword_2', 'keyword_3'):
    print(category_df[level_column].nunique())

In [None]:
category_df.value_counts(subset='keyword_1', dropna=False)

In [None]:
category_df.value_counts(subset='keyword_2', dropna=False)

In [None]:
category_df.value_counts(subset='keyword_3', dropna=False)