In [286]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime, timedelta
import nltk
from nltk.stem import WordNetLemmatizer
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In verband met dat er aan het begin tot aan 1 week staat dat het bijvoorbeeld 5 dagen geleden is word dat omgezet naar een absolute datum. Ook in het geval van 3 uur geleden word dit omgezet in een absolute datum. 

In [288]:
# Functie om relatieve of absolute datums om te zetten naar een absolute datum
def convert_date(date_str):
    try:
        # Controleer op relatieve datums zoals "7 days ago"
        if "day" in date_str:
            days_ago = int(date_str.split()[0])
            return (datetime.now() - timedelta(days=days_ago)).date()
        elif "hour" in date_str:
            hours_ago = int(date_str.split()[0])
            return (datetime.now() - timedelta(hours=hours_ago)).date()
        elif "minute" in date_str:
            minutes_ago = int(date_str.split()[0])
            return (datetime.now() - timedelta(minutes=minutes_ago)).date()

        # Als geen relatieve datum, probeer dan een absolute datum te parsen
        # Formaat: "Month Day, Year" (bijvoorbeeld "June 26, 2024")
        try:
            return datetime.strptime(date_str.strip(), '%B %d, %Y').date()
        except ValueError as ve:
            print(f"ValueError while parsing date: {date_str}. Error: {ve}")
            return None  # Als parsing mislukt, retourneer None

    except Exception as e:
        print(f"Error converting date: {date_str}. Exception: {e}")
        return None


### Beschrijving van de scraper

De scraper begint met het initialiseren van de variabelen `news_data` (om de artikelen op te slaan) en `offset` (voor het bezoeken van de volgende pagina). Vervolgens wordt een header aangemaakt met een `User-Agent` om te voorkomen dat de scraper door de website als een bot wordt geïdentificeerd. De `offset` wordt gebruikt om na elke 10 artikelen door te gaan naar de volgende pagina van de resultaten.

De HTML-structuur van de opgehaalde pagina wordt omgezet met BeautifulSoup om specifieke elementen te kunnen vinden. Daarna worden de links naar de artikelen opgehaald met behulp van `soup.find_all()`. Artikelen worden geïdentificeerd aan de hand van de bijbehorende HTML-classes.

Vervolgens haalt de scraper de gegevens van elk artikel op, zoals de titel, de link, het fragment (excerpt), en de publicatiedatum. Als een artikel geen datum bevat, wordt dit artikel overgeslagen.

Wat daarna gebeurt, is het omzetten van relatieve of absolute datums naar een uniform formaat (bijvoorbeeld een `datetime.date`). De functie biedt nu flexibiliteit: je kunt óf een specifieke einddatum (`until_date`) meegeven, óf een relatieve periode in dagen (`days_ago`). Als beide ontbreken, wordt standaard gekeken naar artikelen van maximaal 30 dagen geleden. Zodra een artikel wordt gevonden dat ouder is dan de opgegeven periode of datum, stopt de scraper. Dit is efficiënt, omdat de artikelen op de website meestal al op publicatiedatum worden gesorteerd.

De opgehaalde attributen worden in het laatste gedeelte opgeslagen in een lijst genaamd `news_data`. Hierbij wordt de URL van elk artikel omgezet naar een volledige URL. Deze lijst wordt aan het einde van de functie geretourneerd.

In [289]:
from dateutil.parser import parse

# Scraper functie voor nieuwsartikelen
def scrape_news_financialpost(base_url, until_date=None, days_ago=None):
    # Bepaal de until_date op basis van de parameters
    try:
        if until_date is None and days_ago is not None:
            until_date = datetime.now().date() - timedelta(days=days_ago)
        elif until_date is None:
            # Standaard 30 dagen geleden als fallback
            until_date = datetime.now().date() - timedelta(days=30)
        else:
            # Controleer of de opgegeven until_date een geldig datumformaat heeft
            if isinstance(until_date, str):
                try:
                    until_date = datetime.strptime(until_date, '%Y-%m-%d').date()
                except ValueError:
                    raise ValueError("until_date is not in the correct format (expected 'YYYY-MM-DD').")
            elif not isinstance(until_date, datetime.date):
                raise TypeError("until_date must be a string in 'YYYY-MM-DD' format or a datetime.date object.")
    except Exception as e:
        print(f"Error setting until_date: {e}")
        # Optioneel: fallback naar standaardwaarde
        until_date = datetime.now().date() - timedelta(days=30)
        print(f"Fallback: until_date set to {until_date}")
    
    news_data = []
    offset = 0  # Start offset for pagination
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    
    while True:
        url = f"{base_url}&from={offset}"
        print(f"Fetching URL: {url}")
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all article containers
        articles = soup.find_all('a', {'class': 'article-card__link'})
        
        if not articles:
            print("No more articles found. Stopping pagination.")
            break  # Stop if no articles are found on this page

        for article in articles:
            title_tag = article.find('h3', {'class': 'article-card__headline'})
            excerpt_tag = article.find('p', {'class': 'article-card__excerpt'})
            meta_bottom_tag = article.find_next('div', {'class': 'article-card__meta-bottom'})
            date_tag = meta_bottom_tag.find('span', {'class': 'article-card__time-clamp'}) if meta_bottom_tag else None

            if not date_tag:
                print("HTML of the article without a date:")
                print(article.prettify())
                continue  # Skip articles without a date

            title = title_tag.get_text(strip=True) if title_tag else None
            link = article['href'] if article.has_attr('href') else None
            excerpt = excerpt_tag.get_text(strip=True) if excerpt_tag else None
            date_str = date_tag.get_text(strip=True) if date_tag else None

            # Parse the date if available
            date = None
            if date_str:
                try:
                    # Gebruik de dateutil parser
                    date = parse(date_str).date()
                except ValueError:
                    date = convert_date(date_str)
                    if date is None:
                        print(f"Error parsing date with convert_date: {date_str}. Skipping this article.")
                        continue


            # Controleer consistent op datetime-objecten
            if until_date and date and isinstance(date, datetime):
                date = date.date()
            if until_date and date < until_date:
                print(f"Article is older than {until_date}. Stopping.")
                return news_data

            # Add valid articles to the list
            if title and link:
                news_data.append({
                    'title': title,
                    'link': f"https://financialpost.com{link}",
                    'excerpt': excerpt,
                    'date': date
                })

        # Pagination logic: increase offset by 10
        offset += 10

    return news_data


### Beschrijving van de functie `get_article_content`

De functie `get_article_content` wordt gebruikt om de inhoud van een individueel artikel op te halen, op basis van de link die wordt meegegeven. Het proces begint met het toevoegen van een `User-Agent` header aan de aanvraag om toegang te krijgen tot de webpagina zonder dat de scraper als een bot wordt gezien.

Vervolgens wordt een GET-verzoek verstuurd naar de opgegeven URL met de headers toegevoegd. Als het verzoek succesvol is (statuscode 200), wordt de HTML-inhoud van de pagina verwerkt met BeautifulSoup, wat zorgt voor een handige manier om de structuur van de pagina te navigeren en relevante data te extraheren.

De scraper zoekt naar mogelijke secties van het artikel die verschillende HTML-tags kunnen bevatten, zoals `<div>`, `<section>`, en `<article>`. Dit gebeurt via de `find_all()` functie van BeautifulSoup, die meerdere secties teruggeeft die als mogelijke inhoud van het artikel dienen.

Daarna itereren we door elke gevonden sectie en extraheren de alinea’s (`<p>` tags) binnen deze secties. Voor elke alinea wordt de tekst verzameld. Als er sterke tekst (zoals vetgedrukte woorden) wordt gevonden binnen een alinea, wordt deze eerst toegevoegd aan de inhoud.

Het resultaat is de volledige tekst van het artikel, inclusief eventuele sterke tekst, die wordt samengevoegd en teruggegeven als één string. Indien de inhoud niet succesvol kan worden opgehaald (bijvoorbeeld bij een fout in het verzoek), wordt er een foutmelding weergegeven en retourneert de functie `None`.

Deze aanpak maakt het mogelijk om gestructureerd en gedetailleerd de tekst van een artikel te verzamelen door meerdere secties en alinea’s van de pagina te doorzoeken en te extraheren.


In [290]:

# Function to scrape content from individual article
def get_article_content(link):
    # Voeg headers toe om toegang te krijgen tot de pagina
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Maak een GET-verzoek met de juiste headers
    response = requests.get(link, headers=headers)

    # Controleer of het verzoek succesvol was
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Zoek naar verschillende mogelijke secties van het artikel
        possible_content = soup.find_all(['div', 'section', 'article'])  # Meer tags proberen

        content = ""
        
        # Itereer over gevonden secties en probeer tekst te extraheren
        for content_section in possible_content:
            paragraphs = content_section.find_all('p')  # Zoek naar alinea's binnen de sectie
            for para in paragraphs:
                # Verkrijg de tekst en zoek naar sterke tekst als dat bestaat
                strong_text = para.find('strong')
                if strong_text:
                    content += strong_text.get_text(strip=True) + " "
                content += para.get_text(strip=True) + " "
        
        # Geef de content terug, indien gevonden
        return content.strip()
    else:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None


### Beschrijving van de functie `process_news`

De functie `process_news` wordt gebruikt om de verzamelde nieuwsartikelen verder te verwerken en aan te vullen met extra informatie, zoals de inhoud van het artikel en de sentimentanalyse. Het begint met het initialiseren van een lege lijst genaamd `processed_news`, die zal worden gevuld met verrijkte gegevens van de artikelen.

Eerst wordt de `SentimentIntensityAnalyzer` van de NLTK-bibliotheek geïnitialiseerd, evenals de `WordNetlemmatizer` voor het lemmatizeren van tokens. Vervolgens doorloopt de functie elk artikel in de meegegeven lijst `news_data`.

Voor elk artikel worden de titel, de link en de publicatiedatum opgehaald. Daarna wordt de inhoud van het artikel opgehaald via de eerder gedefinieerde `get_article_content`-functie. Als de inhoud van het artikel niet beschikbaar is, wordt de titel gebruikt als alternatief voor de sentimentanalyse.

De tekst (inhoud of titel) wordt vervolgens geanalyseerd met de `SentimentIntensityAnalyzer`, die een sentimentscore retourneert, die aangeeft of de tekst positief, negatief of neutraal is. Daarnaast wordt de tekst opgedeeld in tokens (woorden) via de functie `word_tokenize` en vervolgens lemmatized voor verdere tekstverwerking of analyse.

De verrijkte gegevens, inclusief de titel, link, inhoud, tokens, sentimentanalyse en publicatiedatum, worden vervolgens toegevoegd aan de lijst `processed_news`.

Na het doorlopen van alle artikelen, wordt de lijst `processed_news` geretourneerd, die nu alle benodigde informatie bevat voor verdere analyse of verwerking.


In [291]:
# Process and enrich news with content and sentiment
def process_news(news_data):
    processed_news = []

    # Initialize SentimentIntensityAnalyzer and WordNetLemmatizer
    sia = SentimentIntensityAnalyzer()
    lemmatizer = WordNetLemmatizer()

    for news in news_data:
        title = news['title']
        link = news['link']
        date = news['date']  # Verkrijg de datum hier

        # Scrape the article content
        content = get_article_content(link)

        # If no content is available, use the title for tokenization and sentiment analysis
        text_to_analyze = content if content else title

        # Sentiment Analysis
        sentiment = sia.polarity_scores(text_to_analyze)

        # Tokenization
        tokens = word_tokenize(text_to_analyze)

        # Lemmatization
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Append enriched data, inclusief de datum
        processed_news.append({
            'title': title,
            'link': link,
            'content': content,
            'tokens': lemmatized_tokens,
            'sentiment': sentiment,
            'date': date  # Voeg de datum toe aan de verwerkte data
        })

    return processed_news


In [292]:
from dateutil.parser import parse
import pandas as pd

# Base URL for Financial Post S&P 500 search
base_url = "https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc"

# Step 1: Scrape news articles to get links

news_data = scrape_news_financialpost(base_url, until_date="2025-10-01")

# Step 2: Process news articles (scrape content, analyze sentiment)
processed_news = process_news(news_data)

# Step 3: Convert to DataFrame
df_news = pd.DataFrame(processed_news)

# Debugging: Show the first few rows of the DataFrame
display(df_news)

Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=0
Error parsing date with dateutil: 17 minutes ago. Trying convert_date function.
Error parsing date with dateutil: 19 hours ago. Trying convert_date function.
Error parsing date with dateutil: 21 hours ago. Trying convert_date function.
Error parsing date with dateutil: 23 hours ago. Trying convert_date function.
Error parsing date with dateutil: 1 day ago. Trying convert_date function.
Error parsing date with dateutil: 1 day ago. Trying convert_date function.
Error parsing date with dateutil: 1 day ago. Trying convert_date function.
Error parsing date with dateutil: 1 day ago. Trying convert_date function.
Error parsing date with dateutil: 1 day ago. Trying convert_date function.
Error parsing date with dateutil: 2 days ago. Trying convert_date function.
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=10
Error parsing date with 

Unnamed: 0,title,link,content,tokens,sentiment,date
0,Impax CEO Says Sudden Loss of $6 Billion Manda...,https://financialpost.com/pmn/business-pmn/imp...,The chief executive officer of Impax Asset Man...,"[The, chief, executive, officer, of, Impax, As...","{'neg': 0.049, 'neu': 0.801, 'pos': 0.151, 'co...",2025-01-22
1,Stocks March Higher With Netflix Poised for Re...,https://financialpost.com/pmn/business-pmn/asi...,US equity futures signaled that the strong Wal...,"[US, equity, future, signaled, that, the, stro...","{'neg': 0.026, 'neu': 0.837, 'pos': 0.137, 'co...",2025-01-22
2,Top Wall Street Banks Rake In ECM Revenue as I...,https://financialpost.com/pmn/business-pmn/top...,Wall Street’s biggest banks anticipate there’s...,"[Wall, Street, ’, s, biggest, bank, anticipate...","{'neg': 0.014, 'neu': 0.858, 'pos': 0.128, 'co...",2025-01-21
3,"Tesla slides, space stocks soar after Trump's ...",https://financialpost.com/news/tesla-slides-sp...,Could have wide-ranging impacts on markets Aut...,"[Could, have, wide-ranging, impact, on, market...","{'neg': 0.023, 'neu': 0.827, 'pos': 0.151, 'co...",2025-01-21
4,Trump tariff threats widen gap between U.S. an...,https://financialpost.com/commodities/energy/o...,U.S. energy stocks outperformed their northern...,"[U.S., energy, stock, outperformed, their, nor...","{'neg': 0.022, 'neu': 0.844, 'pos': 0.134, 'co...",2025-01-21
...,...,...,...,...,...,...
69,Corporate earnings growth faces uncertainty am...,https://financialpost.com/pmn/corporate-earnin...,Author of the article: You can save this artic...,"[Author, of, the, article, :, You, can, save, ...","{'neg': 0.044, 'neu': 0.789, 'pos': 0.167, 'co...",2025-01-02
70,Four considerations for your investments in th...,https://financialpost.com/investing/4-consider...,If you do not know how much you are paying in ...,"[If, you, do, not, know, how, much, you, are, ...","{'neg': 0.023, 'neu': 0.849, 'pos': 0.128, 'co...",2025-01-02
71,Here's what the CAPE ratio is signalling about...,https://financialpost.com/investing/what-cape-...,Noah Solomon: Investors are standing at a cros...,"[Noah, Solomon, :, Investors, are, standing, a...","{'neg': 0.04, 'neu': 0.83, 'pos': 0.131, 'comp...",2025-01-02
72,Wall Street Rattled by Rough Start to New Year...,https://financialpost.com/pmn/business-pmn/asi...,Major US benchmarks extended a selloff for a f...,"[Major, US, benchmark, extended, a, selloff, f...","{'neg': 0.047, 'neu': 0.831, 'pos': 0.122, 'co...",2025-01-01


The issue seems to be with the `isinstance` check in the `scrape_news_financialpost` function. The `until_date` is already a `datetime` object, but the check is not correctly handling it.

Let's fix the `scrape_news_financialpost` function to correctly handle the `until_date` parameter:





In this updated version, the `scrape_news_financialpost` function correctly handles the `until_date` parameter by ensuring it is a `datetime.date` object. The `convert_date` function is used to handle relative dates like "7 days ago" and convert them to absolute dates. This should resolve the issues you were encountering.

The error message indicates that there is an issue with how the `until_date` is being used within the `scrape_news_financialpost` function. Specifically, it seems like there is an `isinstance` check that is not correctly handling the `until_date`.

Let's update the `scrape_news_financialpost` function to ensure it correctly handles the `until_date` parameter. Here's an example of how you might modify the function:





In this example, the `scrape_news_financialpost` function checks if `until_date` is a `datetime` object and raises a `ValueError` if it is not. It then uses the `until_date` to filter out articles that are published after the specified date.

Make sure to adjust the scraping logic according to your actual implementation. If you provide the actual implementation of `scrape_news_financialpost`, I can give more specific guidance.

In [293]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Combineer de content en title als een enkele tekst
corpus = [f"{news['title']} {news['content']}" for news in processed_news]

# Zet de tekst om naar een TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')  # Verhoog max_features om meer woorden te krijgen
X_tfidf = vectorizer.fit_transform(corpus)

# Verkrijg de feature-namen (de belangrijkste woorden)
feature_names = vectorizer.get_feature_names_out()

# Zet de TF-IDF waarden om naar een array
tfidf_values = X_tfidf.toarray()

# Maak een DataFrame met de TF-IDF waarden, waarbij de kolommen de woorden zijn
df_tfidf = pd.DataFrame(tfidf_values, columns=feature_names)

# Voeg de titels toe aan de DataFrame als een extra kolom
df_tfidf['title'] = [news['title'] for news in processed_news]

# Toon de DataFrame
display(df_tfidf)


Unnamed: 0,account,article,articles,canada,comments,continue,create,experience,follow,freehere,...,latest,news,orsign,postmedia,read,reading,registering,save,sign,title
0,0.456647,0.319653,0.228323,0.152216,0.228323,0.182659,0.152216,0.152216,0.213102,0.167437,...,0.243545,0.273988,0.167437,0.213102,0.197880,0.152216,0.167437,0.243545,0.152216,Impax CEO Says Sudden Loss of $6 Billion Manda...
1,0.412543,0.333207,0.238005,0.190404,0.238005,0.190404,0.158670,0.158670,0.158670,0.174537,...,0.253872,0.253872,0.174537,0.222138,0.206271,0.158670,0.174537,0.253872,0.158670,Stocks March Higher With Netflix Poised for Re...
2,0.437733,0.353553,0.252538,0.202031,0.252538,0.202031,0.168359,0.168359,0.168359,0.185195,...,0.168359,0.168359,0.185195,0.134687,0.218866,0.168359,0.185195,0.269374,0.168359,Top Wall Street Banks Rake In ECM Revenue as I...
3,0.407846,0.329414,0.235296,0.282355,0.235296,0.188237,0.156864,0.156864,0.156864,0.172550,...,0.156864,0.282355,0.172550,0.219610,0.203923,0.156864,0.172550,0.250982,0.156864,"Tesla slides, space stocks soar after Trump's ..."
4,0.429532,0.346930,0.247807,0.297368,0.247807,0.165205,0.165205,0.165205,0.165205,0.181725,...,0.165205,0.165205,0.181725,0.132164,0.214766,0.165205,0.181725,0.264327,0.165205,Trump tariff threats widen gap between U.S. an...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,0.443296,0.358047,0.255748,0.170499,0.255748,0.170499,0.170499,0.170499,0.170499,0.187548,...,0.170499,0.170499,0.187548,0.136399,0.221648,0.170499,0.187548,0.272798,0.170499,Corporate earnings growth faces uncertainty am...
70,0.456066,0.243235,0.228033,0.334448,0.258437,0.152022,0.152022,0.152022,0.152022,0.167224,...,0.152022,0.273640,0.167224,0.212831,0.197629,0.152022,0.167224,0.243235,0.182426,Four considerations for your investments in th...
71,0.415960,0.255975,0.239977,0.255975,0.239977,0.191982,0.159985,0.159985,0.159985,0.175983,...,0.159985,0.287972,0.175983,0.223978,0.207980,0.159985,0.175983,0.255975,0.223978,Here's what the CAPE ratio is signalling about...
72,0.412543,0.333207,0.238005,0.158670,0.238005,0.158670,0.158670,0.158670,0.158670,0.174537,...,0.253872,0.253872,0.174537,0.222138,0.206271,0.190404,0.174537,0.253872,0.190404,Wall Street Rattled by Rough Start to New Year...


In [299]:
from datetime import datetime, timedelta
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup

class NewsScraper:
    def __init__(self, base_url, until_date=None, days_ago=None):
        """
        Initializes the web scraping object with the given base URL and optional date parameters.

        Args:
            base_url (str): The base URL for the web scraping.
            until_date (str, optional): The end date for the web scraping in 'YYYY-MM-DD' format. Defaults to None.
            days_ago (int, optional): The number of days ago from today to set the end date. Defaults to None.

        Attributes:
            base_url (str): The base URL for the web scraping.
            until_date (str): The calculated end date for the web scraping.
            news_data (list): A list to store the scraped news data.
        """
        self.base_url = base_url
        self.until_date = self.set_until_date(until_date, days_ago)
        self.news_data = []

    def set_until_date(self, until_date, days_ago):
        """
        Set the 'until_date' based on the provided parameters.

        Parameters:
        until_date (str or datetime.date or None): The target date until which to set. 
            If a string is provided, it should be in 'YYYY-MM-DD' format.
            If None, the date will be calculated based on 'days_ago'.
        days_ago (int or None): The number of days ago from today to set the 'until_date'.
            If None, defaults to 30 days ago.

        Returns:
        datetime.date: The calculated 'until_date'.

        Raises:
        ValueError: If 'until_date' is a string but not in the correct 'YYYY-MM-DD' format.
        TypeError: If 'until_date' is not a string or datetime.date object.

        Notes:
        - If both 'until_date' and 'days_ago' are None, the function defaults to 30 days ago.
        - If an error occurs, the function prints an error message and defaults to 30 days ago.
        """
        try:
            if until_date is None and days_ago is not None:
                return datetime.now().date() - timedelta(days=days_ago)
            elif until_date is None:
                return datetime.now().date() - timedelta(days=30)
            else:
                if isinstance(until_date, str):
                    try:
                        return datetime.strptime(until_date, '%Y-%m-%d').date()
                    except ValueError:
                        raise ValueError("until_date is not in the correct format (expected 'YYYY-MM-DD').")
                elif not isinstance(until_date, datetime.date):
                    raise TypeError("until_date must be a string in 'YYYY-MM-DD' format or a datetime.date object.")
        except Exception as e:
            print(f"Error setting until_date: {e}")
            return datetime.now().date() - timedelta(days=30)

    def convert_date(self, date_str):
        """
        Converts a relative or absolute date string into a date object.
        The function handles relative date strings such as "2 days ago", "3 hours ago",
        "15 minutes ago", and "1 week ago". It also handles absolute date strings in the
        format "Month Day, Year" (e.g., "January 1, 2020").
        Args:
            date_str (str): The date string to convert.
        Returns:
            date: A date object representing the converted date, or None if the conversion fails.
        Raises:
            ValueError: If the date string is in an unrecognized format.
            Exception: For any other exceptions that occur during conversion.
        """
        try:
            if "day" in date_str:
                days_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(days=days_ago)).date()
            elif "hour" in date_str:
                hours_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(hours=hours_ago)).date()
            elif "minute" in date_str:
                minutes_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(minutes=minutes_ago)).date()
            elif "week" in date_str:
                weeks_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(weeks=weeks_ago)).date()

            try:
                return datetime.strptime(date_str.strip(), '%B %d, %Y').date()
            except ValueError as ve:
                print(f"ValueError while parsing date: {date_str}. Error: {ve}")
                return None
        except Exception as e:
            print(f"Error converting date: {date_str}. Exception: {e}")
            return None

    def scrape_news(self):
        """
        Scrapes news articles from the specified base URL with pagination.
        This method fetches news articles by sending HTTP GET requests to the base URL with an offset parameter.
        It parses the HTML content to extract article details such as title, link, excerpt, and date.
        The method continues to paginate until no more articles are found or an article older than the specified `until_date` is encountered.
        Returns:
            list: A list of dictionaries, each containing the following keys:
                - 'title' (str): The title of the article.
                - 'link' (str): The full URL to the article.
                - 'excerpt' (str): A brief excerpt of the article.
                - 'date' (datetime.date): The publication date of the article.
        Raises:
            ValueError: If the date string cannot be parsed and `convert_date` method also fails to convert it.
        Notes:
            - The method uses a User-Agent header to mimic a browser request.
            - The method prints debug information such as the URL being fetched and any errors encountered during parsing.
            - The method stops pagination if no more articles are found or if an article is older than the specified `until_date`.
        """
        offset = 0
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        
        while True:
            url = f"{self.base_url}&from={offset}"
            print(f"Fetching URL: {url}")
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('a', {'class': 'article-card__link'})
            
            if not articles:
                print("No more articles found. Stopping pagination.")
                break

            for article in articles:
                title_tag = article.find('h3', {'class': 'article-card__headline'})
                excerpt_tag = article.find('p', {'class': 'article-card__excerpt'})
                meta_bottom_tag = article.find_next('div', {'class': 'article-card__meta-bottom'})
                date_tag = meta_bottom_tag.find('span', {'class': 'article-card__time-clamp'}) if meta_bottom_tag else None

                if not date_tag:
                    print("HTML of the article without a date:")
                    print(article.prettify())
                    continue

                title = title_tag.get_text(strip=True) if title_tag else None
                link = article['href'] if article.has_attr('href') else None
                excerpt = excerpt_tag.get_text(strip=True) if excerpt_tag else None
                date_str = date_tag.get_text(strip=True) if date_tag else None

                date = None
                if date_str:
                    try:
                        date = parse(date_str).date()
                    except ValueError:
                        date = self.convert_date(date_str)
                        if date is None:
                            print(f"Error parsing date with convert_date: {date_str}. Skipping this article.")
                            continue

                if self.until_date and date and isinstance(date, datetime):
                    date = date.date()
                if self.until_date and date < self.until_date:
                    print(f"Article is older than {self.until_date}. Stopping.")
                    return self.news_data

                if title and link:
                    self.news_data.append({
                        'title': title,
                        'link': f"https://financialpost.com{link}",
                        'excerpt': excerpt,
                        'date': date
                    })

            offset += 10

        return self.news_data

    def get_article_content(self, link):
        """
        Retrieves the content of an article from the given URL.
        This method sends a GET request to the specified link with a custom User-Agent header.
        If the request is successful (status code 200), it parses the HTML content using BeautifulSoup
        and extracts text from all <p> tags within <div>, <section>, or <article> tags. If a <p> tag
        contains a <strong> tag, the text within the <strong> tag is given priority.
        Args:
            link (str): The URL of the article to retrieve.
        Returns:
            str: The extracted article content as a single string, or None if the request failed.
        Raises:
            requests.exceptions.RequestException: If there is an issue with the network request.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(link, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            possible_content = soup.find_all(['div', 'section', 'article'])

            content = ""
            for content_section in possible_content:
                paragraphs = content_section.find_all('p')
                for para in paragraphs:
                    strong_text = para.find('strong')
                    if strong_text:
                        content += strong_text.get_text(strip=True) + " "
                    content += para.get_text(strip=True) + " "
            
            return content.strip()
        else:
            print(f"Failed to retrieve the article. Status code: {response.status_code}")
            return None

    def process_news(news_data):
        processed_news = []

        # Initialize SentimentIntensityAnalyzer and WordNetLemmatizer
        sia = SentimentIntensityAnalyzer()
        lemmatizer = WordNetLemmatizer()

        for news in news_data:
            title = news['title']
            link = news['link']
            date = news['date']  # Verkrijg de datum hier

            # Scrape the article content
            content = get_article_content(link)

            # If no content is available, use the title for tokenization and sentiment analysis
            text_to_analyze = content if content else title

            # Sentiment Analysis
            sentiment = sia.polarity_scores(text_to_analyze)

            # Tokenization
            tokens = word_tokenize(text_to_analyze)

            # Lemmatization
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

            # Append enriched data, inclusief de datum
            processed_news.append({
                'title': title,
                'link': link,
                'content': content,
                'tokens': lemmatized_tokens,
                'sentiment': sentiment,
                'date': date  # Voeg de datum toe aan de verwerkte data
            })

        return processed_news
    
    def run(self):
        """
        Execute the entire workflow in the correct sequence.
        """
        news_data = self.scrape_news()
        processed_news = process_news(news_data)
        return processed_news
        
# Voorbeeld van hoe de class te gebruiken
scraper = NewsScraper(base_url='https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc', until_date='2025-01-01')
news_data = scraper.scrape_news()

NewsScraper.run(base_url='https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc', until_date='2025-01-01')

Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=0
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=10
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=20
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=30
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=40
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=50
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=60
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=70
Article is older than 2025-01-01. Stopping.
