In [1]:
import requests
import json
from dotenv import load_dotenv
import os
import uuid
import pandas as pd

In [2]:
load_dotenv('dags/.env')  # Loads variables from .env into environment
apikey = os.getenv("NEWS_API_KEY")

In [3]:
url = 'https://eventregistry.org/api/v1/article/getArticles?'
para = {
    'apiKey' : apikey,
    'dataType' : ["news", "pr", "blog"],
    'dataStart' : "2023-01-01",
    'articlesSortByAsc' : True,
    'includeArticleSocialScore' : True,
    'includeArticleCategories' : True,
    'lang' : "eng",
}

In [4]:
response = requests.get(url, params=para)

In [5]:
if response.status_code == 200:
    try:
        data = response.json()
        print(f"Success: {response.status_code} - {response.text}")
    except ValueError as e:
        print("Error parsing JSON:", e)
else:
    print(f"Error: {response.status_code} - {response.text}")



In [11]:
result = data['articles']['results']['categories']
result[0]

TypeError: list indices must be integers or slices, not str

In [10]:
data['articles']['results'][5]

{'uri': '8338878037',
 'lang': 'eng',
 'isDuplicate': False,
 'date': '2024-09-27',
 'time': '00:00:04',
 'dateTime': '2024-09-27T00:00:04Z',
 'dateTimePub': '2024-09-26T23:59:38Z',
 'dataType': 'news',
 'sim': 0,
 'url': 'https://www.wtvm.com/2024/09/26/1-dead-2-injured-lee-county-crash-tuesday/',
 'title': '1 dead, 2 injured in Lee County crash Tuesday',
 'body': 'LEE COUNTY, Ala. (WTVM) - One person has died and two others were injured following a Tuesday evening crash in Lee County.\n\nAccording to the Alabama Law Enforcement Agency, 21-year-old John A. Lucas was critically injured when the car he was driving collided head-on with an SUV. Lucas was taken to East Alabama Medical Center, where he later died.\n\nTroopers said the driver and a 6-year-old passenger in the other vehicle were also taken to EAMC for their injuries.\n\nThe crash happened just before 5:30 p.m. Tuesday on Lee Road 166, about five miles south of Opelika, authorities said.\n\nNo further details surrounding the 

In [31]:
# Function to get first, second, and third-level keywords
def extract_levels(label):
    parts = label.split('/')
    first_level = parts[1] if len(parts) > 1 else None
    second_level = parts[2] if len(parts) > 2 else None
    third_level = parts[3] if len(parts) > 3 else None
    return first_level, second_level, third_level

### Using source link and author email as unique id, i.e. primary key

In [32]:
# Initialise empty lists to store data
articles = []
sources = []
authors = []
categories = []

# Loop through each item in the result
for item in result:
    # Append article data using relevant keys
    articles.append({
        'id': item.get('uri'),
        'is_duplicate': item.get('isDuplicate'),
        'datetime_found': item.get('dateTime'),
        'datetime_published': item.get('dateTimePub'),
        'article_type': item.get('dataType'),
        'sim': item.get('sim'), # cosine similarity of the article to the centroid of the story
        'url': item.get('url'),
        'title': item.get('title'),
        'body': item.get('body'),
        'image': item.get('image'),
        'sentiment': item.get('sentiment'),
        #'wgt': item.get('wgt'), # parameter used internally for sorting purposes (DO NOT USE THE VALUE)
        'relevance': item.get('relevance') # represents how well does the article match the query
                                           # the higher the value, the better the match
    })

    # Append source information
    source_list = item.get('source')
    sources.append({
            #'source_id': str(uuid.uuid4()),
            'article_id': item.get('uri'),
            'source_name': source_list.get('title'),
            'source_link': source_list.get('uri'),
    })

    # Append author information
    author_list = item.get('authors', [])
    if not author_list:
        authors.append({
            #'author_id': str(uuid.uuid4()),
            'article_id': item.get('uri'),
            'author_name': None,
            'author_email': None,
            'author_type': None,
            'is_agency': None
        })
    else:
        # Loop through each author if multiple authors are present
        for author in author_list:
            if isinstance(author, dict):
                authors.append({
                    #'author_id': str(uuid.uuid4()),
                    'article_id': item.get('uri'),
                    'author_name': author.get('name', None),
                    'author_email': author.get('uri', None),
                    'author_type': author.get('type', None),
                    'is_agency': author.get('isAgency', None)
                })
            else:
                # Handle cases where author data isn't a dict
                authors.append({
                    #'author_id': str(uuid.uuid4()),
                    'article_id': item.get('uri'),
                    'author_name': None,
                    'author_email': None,
                    'author_type': None,
                    'is_agency': None
                })
    # Append category information
    category_list = item.get('categories', [])
    for category in category_list:  # Iterate directly over the list
        first_level, second_level, third_level = extract_levels(category.get('label', None))
        categories.append({
                'article_id': item.get('uri'),
                #'uri': category.get('uri', None),
                'label': category.get('label', None),
                'keyword_1': first_level,
                'keyword_2': second_level,
                'keyword_3': third_level
            })

<br> </br>
### take into consideration of same authors
    # Append author information
    author_list = item.get('authors', [])
    if not author_list:
        # Handle case with no authors
        author_id = str(uuid.uuid4())  # Generate a unique ID for the author
        authors.append({
            'author_id': author_id,
            'article_id': item.get('uri'),
            'uri': None,
            'name': None,
            'type': None,
            'is_agency': None
        })
    else:
        # Loop through each author if multiple authors are present
        for author in author_list:
            if isinstance(author, dict):
                author_name = author.get('name', None)
                if author_name not in author_id_map:
                    # If the author is not in the map, create a new entry
                    author_id = str(uuid.uuid4())  # Generate a new unique ID
                    author_id_map[author_name] = author_id
                else:
                    # If the author is already in the map, use the existing ID
                    author_id = author_id_map[author_name]

                authors.append({
                    'author_id': author_id,  # Use consistent author ID
                    'article_id': item.get('uri'),  # explicitly add article_id for relational linking
                    'uri': author.get('uri', None),
                    'name': author_name,
                    'type': author.get('type', None),
                    'is_agency': author.get('isAgency', None)
                })
            else:
                # Handle cases where author data isn't a dict
                author_id = str(uuid.uuid4())  # Generate a new unique ID
                authors.append({
                    'author_id': author_id,
                    'article_id': item.get('uri'),
                    'uri': None,
                    'name': None,
                    'type': None,
                    'is_agency': None
                })
<br> </br>

In [18]:
len(articles)

100

In [19]:
len(authors)

101

In [20]:
len(sources)

100

In [21]:
len(categories)

448

In [22]:
print(articles)



In [23]:
print(sources)

[{'article_id': '8338877931', 'source_name': 'KCRW', 'source_link': 'kcrw.com'}, {'article_id': '8338878132', 'source_name': 'Ashcroft Cache Creek Journal', 'source_link': 'ashcroftcachecreekjournal.com'}, {'article_id': '8338877508', 'source_name': 'Victoria Buzz', 'source_link': 'victoriabuzz.com'}, {'article_id': '8338877641', 'source_name': 'Newsday', 'source_link': 'newsday.com'}, {'article_id': '8338878126', 'source_name': 'Lakers Nation', 'source_link': 's22928.pcdn.co'}, {'article_id': '8338878037', 'source_name': 'https://www.wtvm.com', 'source_link': 'wtvm.com'}, {'article_id': '8338878118', 'source_name': 'CNA', 'source_link': 'channelnewsasia.com'}, {'article_id': '8338878063', 'source_name': 'The Straits Times', 'source_link': 'straitstimes.com'}, {'article_id': '8338877930', 'source_name': 'KCRW', 'source_link': 'kcrw.com'}, {'article_id': '8338878060', 'source_name': 'TASS', 'source_link': 'tass.com'}, {'article_id': '8338878119', 'source_name': 'RTL Today', 'source_link

In [24]:
print(authors)

[{'article_id': '8338877931', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8338878132', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8338877508', 'author_name': 'Curtis Blandy', 'author_email': 'curtis_blandy@victoriabuzz.com', 'author_type': 'author', 'is_agency': False}, {'article_id': '8338877641', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8338878126', 'author_name': 'Daniel Starkand', 'author_email': 'daniel_starkand@s22928.pcdn.co', 'author_type': 'author', 'is_agency': False}, {'article_id': '8338878037', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8338878118', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'article_id': '8338878063', 'author_name': None, 'author_email': None, 'author_type': None, 'is_agency': None}, {'art

In [33]:
print(categories)

[{'article_id': '8338877931', 'label': 'dmoz/Home/Cooking/Baking and Confections', 'keyword_1': 'Home', 'keyword_2': 'Cooking', 'keyword_3': 'Baking and Confections'}, {'article_id': '8338877931', 'label': 'dmoz/Home/Cooking/For Children', 'keyword_1': 'Home', 'keyword_2': 'Cooking', 'keyword_3': 'For Children'}, {'article_id': '8338877931', 'label': 'dmoz/Home/Cooking/Outdoors', 'keyword_1': 'Home', 'keyword_2': 'Cooking', 'keyword_3': 'Outdoors'}, {'article_id': '8338877931', 'label': 'dmoz/Recreation/Humor/Food and Drink', 'keyword_1': 'Recreation', 'keyword_2': 'Humor', 'keyword_3': 'Food and Drink'}, {'article_id': '8338877931', 'label': 'news/Health', 'keyword_1': 'Health', 'keyword_2': None, 'keyword_3': None}, {'article_id': '8338878132', 'label': 'dmoz/Business/Transportation and Logistics/Maritime', 'keyword_1': 'Business', 'keyword_2': 'Transportation and Logistics', 'keyword_3': 'Maritime'}, {'article_id': '8338878132', 'label': 'dmoz/Recreation/Outdoors/Camping', 'keyword_

In [None]:
# unique_author_ids = set(item['author_id'] for item in authors)
# unique_count = len(unique_author_ids)

# print("Number of unique author_ids:", unique_count)

In [None]:
unique_article_ids = set(item['article_id'] for item in authors)
unique_count = len(unique_article_ids)

print("Number of unique article_ids:", unique_count)

In [None]:
# Initialize sets for tracking unique and duplicate author_ids
unique_article_ids = set()
duplicate_article_ids = set()

# Loop through each author in the list to check for duplicates
for article in authors:
    article_id = article['article_id']
    if article_id in unique_article_ids:
        duplicate_article_ids.add(article_id)
    else:
        unique_article_ids.add(article_id)

# Convert the set of duplicate IDs to a list (optional)
duplicates_list = list(duplicate_article_ids)

print("Duplicate article_ids:", duplicates_list)

In [None]:
df = pd.DataFrame(result)
df.head()

In [None]:
df.shape

In [None]:
df.value_counts(subset='dataType', dropna=True)

In [None]:
df.isnull().sum()

In [None]:
df['authors']

In [None]:
# Define the list of values to filter
uri_values = ['8384451260', '2024-10-528318960', '8384450686', '8384450685', '8384451709']

# Filter rows where 'uri' is in the specified list
filtered_df = df[df['uri'].isin(uri_values)]

print(filtered_df)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
print(df['authors'].loc[[47, 49, 72, 95, 97]])

In [None]:
print(authors[47]['author_type'])

In [None]:
print(result)

In [None]:
for item in result:
    author_list = item.get('authors', [])
    for author in author_list:
        name = author.get('name')
        print(name)

In [34]:
category_df = pd.DataFrame(categories)

In [35]:
category_df.head()

Unnamed: 0,article_id,label,keyword_1,keyword_2,keyword_3
0,8338877931,dmoz/Home/Cooking/Baking and Confections,Home,Cooking,Baking and Confections
1,8338877931,dmoz/Home/Cooking/For Children,Home,Cooking,For Children
2,8338877931,dmoz/Home/Cooking/Outdoors,Home,Cooking,Outdoors
3,8338877931,dmoz/Recreation/Humor/Food and Drink,Recreation,Humor,Food and Drink
4,8338877931,news/Health,Health,,


In [36]:
category_df['article_id'].nunique()

100

In [38]:
print(category_df['keyword_1'].nunique())
print(category_df['keyword_2'].nunique())
print(category_df['keyword_3'].nunique())

14
98
177


In [43]:
category_df.value_counts(subset='keyword_1', dropna=False)

keyword_1
Society                   128
Sports                     75
Business                   61
Recreation                 52
Games                      19
Home                       19
Politics                   17
Arts and Entertainment     16
Health                     15
Science                    15
Shopping                   12
Arts                       11
Technology                  6
Computers                   2
dtype: int64

In [44]:
category_df.value_counts(subset='keyword_2', dropna=False)

keyword_2
NaN                          86
Issues                       36
Bowling                      26
Humor                        25
Transgendered                17
                             ..
Recreation                    1
Religion and Spirituality     1
Retail Trade                  1
Human Resources               1
Accounting                    1
Length: 99, dtype: int64

In [45]:
category_df.value_counts(subset='keyword_3', dropna=False)

keyword_3
NaN                     145
Coming Out               16
Instructors               7
Candlepin                 7
Warfare and Conflict      6
                       ... 
Fuel Cells                1
Frugality                 1
Financial Planning        1
Fats and Oils             1
Korean                    1
Length: 178, dtype: int64