This notebook fetches and processes article content from URLs and checks for the presence of specific keywords. 

1. Fetch article content from URLs using `requests` and `BeautifulSoup`.
2. Check if the article content contains keywords.
3. Save articles that contain the keywords to a CSV file.

In [8]:
%pip install requests bs4 pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [10]:
# get article content from url
def fetch_article_content(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Parameters
    ----------
    url : str
        Address of the article to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to
        ``requests.get``. Defaults to 10. Without a timeout a single
        unresponsive server would block the whole run indefinitely.

    Returns
    -------
    str
        The text of every paragraph joined by single spaces, or an empty
        string when the request fails (connection error, timeout, or an
        HTTP error status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()     # turn 4xx/5xx responses into exceptions
    except requests.RequestException as e:
        print(f"Error fetching article content: {e}")
        return ""           # callers treat an empty string as "no content"

    # parse the HTML and keep only the paragraph text
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(para.get_text() for para in soup.find_all('p'))

In [11]:
# check if content contains keywords
def contains_keywords(content, keywords):
    """Return True when ``content`` mentions at least one of ``keywords``.

    Matching is a case-insensitive substring test.

    Parameters
    ----------
    content : str or None
        Text to search. ``None`` or an empty string never matches.
    keywords : iterable of str
        Keywords to look for; an empty iterable never matches.

    Returns
    -------
    bool
    """
    if not content:
        return False
    # lowercase the text once instead of once per keyword
    haystack = content.lower()
    return any(keyword.lower() in haystack for keyword in keywords)

In [12]:
# get row count from the output CSV file
def get_existing_row_count(file_path):
    """Count the data rows already stored in a CSV file.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to inspect.

    Returns
    -------
    int
        Number of rows (header excluded). 0 when the file does not exist
        or exists but contains nothing to parse.
    """
    if not os.path.exists(file_path):
        return 0
    try:
        return len(pd.read_csv(file_path))
    except pd.errors.EmptyDataError:
        # the file exists but is empty, so there are no rows yet
        return 0

In [13]:
# process articles in chunks and save filtered articles to output file
def process_articles_in_chunks(df, chunk_size=50, output_file='filtered_articles.csv',
                               keywords=('nike', 'adidas')):
    """Fetch each article in ``df`` and append the keyword matches to a CSV.

    The frame is walked in slices of ``chunk_size`` rows. After every
    slice the matching articles are appended to ``output_file``, so work
    done so far is kept if a later slice fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``url``, ``athlete`` and ``title``.
    chunk_size : int, optional
        Number of rows handled between two writes. Defaults to 50.
    output_file : str, optional
        CSV file the matches are appended to.
    keywords : iterable of str, optional
        An article is kept when its text contains any of these
        (case-insensitive). Defaults to ``('nike', 'adidas')``.

    Returns
    -------
    None
        Results are written to ``output_file``; progress is printed.
    """
    columns = ['athlete', 'title', 'url', 'content']

    # running total of rows in the output file, starting with what is already there
    offset = get_existing_row_count(output_file)

    for chunk_start in range(0, len(df), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(df))
        chunk = df.iloc[chunk_start:chunk_end]

        results = []        # matching articles of this chunk
        for index, row in chunk.iterrows():
            url = row['url']
            print(f'Processing article {index}: {url}')
            try:
                content = fetch_article_content(url)
                if contains_keywords(content, keywords):
                    results.append({
                        'athlete': row['athlete'],
                        'title': row['title'],
                        'url': url,
                        'content': content
                    })
            except Exception as e:
                # best effort: one bad article must not abort the whole run
                print(f"Error processing article {url}: {e}")

        appended = 0
        if results:
            # Only touch the file when there is something to write: an empty
            # frame would create the file without a header row, and every
            # later append would then skip the header as well.
            results_df = pd.DataFrame(results, columns=columns)
            needs_header = (not os.path.exists(output_file)
                            or os.path.getsize(output_file) == 0)
            try:
                results_df.to_csv(output_file, mode='a', header=needs_header, index=False)
                appended = len(results_df)      # count rows only once they are saved
            except Exception as e:
                print(f"Error saving results to file {output_file}: {e}")

        offset += appended

        print(f"Processed chunk from index {chunk_start} to {chunk_end}. "
              f"{appended} rows appended to {output_file} ({offset} rows in total)")

#### Handle long-running processes with timeout
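This cell defines a function that runs a task in a background thread and waits for it for at most a specified timeout. If the task exceeds the timeout, the function stops waiting and returns; the task itself is not interrupted and keeps running in its thread.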

In [15]:
import time
import threading

# run function with timeout
def run_with_timeout(timeout, func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` in a thread and wait at most ``timeout`` seconds.

    Python threads cannot be killed, so on timeout this function only
    stops waiting; the task keeps running in the background. The worker
    is a daemon thread so an abandoned task does not keep the
    interpreter alive at shutdown.

    Parameters
    ----------
    timeout : float
        Maximum number of seconds to wait for the task.
    func : callable
        The task to run.
    *args, **kwargs
        Forwarded to ``func``.

    Returns
    -------
    bool
        True when the task finished within the timeout without raising,
        False when it timed out or raised. Exceptions raised by the task
        are reported with ``print`` and not propagated.
    """
    outcome = {}

    def wrapper():
        try:
            func(*args, **kwargs)
            outcome['ok'] = True
        except Exception as e:
            # keep the best-effort behaviour, but do not hide the failure
            print(f"Task raised {type(e).__name__}: {e}")

    thread = threading.Thread(target=wrapper, daemon=True)
    thread.start()
    thread.join(timeout)
    if thread.is_alive():
        print(f"Task did not finish within {timeout} seconds; no longer waiting for it")
        return False
    return outcome.get('ok', False)

# Stand-in for a slow job, used to demonstrate the timeout helper.
# It deliberately has its own name: defining it as process_articles_in_chunks
# would overwrite the real implementation defined earlier in the notebook.
def simulate_long_running_task(articles, seconds=30):
    """Sleep for ``seconds`` to mimic a long-running processing job.

    ``articles`` is accepted only so the call looks like the real
    processing call; it is not used.
    """
    time.sleep(seconds)

try:
    if os.path.exists('articles.csv') and os.path.getsize('articles.csv') > 0:
        articles = pd.read_csv('articles.csv')
        # the simulated task sleeps longer than the timeout, so the wait is cut short
        run_with_timeout(20, simulate_long_running_task, articles)
    else:
        print("articles.csv is missing or empty; skipping the timeout demo")
except Exception as e:
    # best effort: report the problem instead of hiding it
    print(f"Timeout demo failed: {e}")

In [4]:
# read the saved keyword matches back in and preview the first five rows
filtered_articles = pd.read_csv(filepath_or_buffer='data/articles/filtered_articles.csv')
filtered_articles.head(n=5)

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try brew install
[31m   [0m xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a Python library that isn't in Homebrew,
[31m   [0m use a virtual environment:
[31m   [0m 
[31m   [0m python3 -m venv path/to/venv
[31m   [0m source path/to/venv/bin/activate
[31m   [0m python3 -m pip install xyz
[31m   [0m 
[31m   [0m If you wish to install a Python application that isn't in Homebrew,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. You can install pipx with
[31m   [0m 
[31m   [0m brew install pipx
[31m   [0m 
[31m   [0m You may restore the old behavior of pip by passing
[31m   [0m the '--break-system-packages' flag to pip, or by adding
[31m   [0m 'break-system-packag

ModuleNotFoundError: No module named 'pandas'