In [59]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime, timedelta
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [60]:
# Functie om relatieve datums zoals "x days ago" om te zetten naar een absolute datum
def convert_relative_to_absolute(date_str):
    if "day" in date_str:
        try:
            # Extract the number of days
            days_ago = int(date_str.split()[0])
            # Calculate the date
            return datetime.now().date() - timedelta(days=days_ago)
        except ValueError:
            return None
    return None

In [61]:
# Scraper functie voor nieuwsartikelen
def scrape_news_financialpost(base_url, until_date=None):
    news_data = []
    offset = 0  # Start offset for pagination
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    
    while True:
        url = f"{base_url}&from={offset}"
        print(f"Fetching URL: {url}")
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all article containers
        articles = soup.find_all('a', {'class': 'article-card__link'})  # Change this if necessary
        
        if not articles:
            print("No more articles found. Stopping pagination.")
            break  # Stop if no articles are found on this page

        for article in articles:
            title_tag = article.find('h3', {'class': 'article-card__headline'})
            excerpt_tag = article.find('p', {'class': 'article-card__excerpt'})
            
            # Find the date in the correct container (article-card__meta-bottom)
            meta_bottom_tag = article.find_next('div', {'class': 'article-card__meta-bottom'})
            date_tag = meta_bottom_tag.find('span', {'class': 'article-card__time-clamp'}) if meta_bottom_tag else None

            # Debugging: Print het artikel als de datum niet wordt gevonden
            if not date_tag:
                print("HTML of the article without a date:")
                print(article.prettify())
                continue  # Skip articles without a date

            title = title_tag.get_text(strip=True) if title_tag else None
            link = article['href'] if article.has_attr('href') else None
            excerpt = excerpt_tag.get_text(strip=True) if excerpt_tag else None
            date_str = date_tag.get_text(strip=True) if date_tag else None  # Extracts the date text directly

            # Parse the date if available
            date = None
            if date_str:
                # Eerst proberen de relatieve datum om te zetten, zoals "6 days ago"
                date = convert_relative_to_absolute(date_str)
                if not date:
                    # Als het geen relatieve datum is, probeer dan een absolute datum te parsen
                    try:
                        date = datetime.strptime(date_str, '%B %d, %Y').date()
                    except ValueError:
                        print(f"Error parsing date: {date_str}. Skipping this article.")
                        continue  # Skip articles met een niet-parseerbare datum

            # Skip articles if they are older than until_date
            if until_date and date and date < until_date:
                print(f"Article is older than {until_date}. Stopping.")
                return news_data

            # Add valid articles to the list
            if title and link:
                news_data.append({
                    'title': title,
                    'link': f"https://financialpost.com{link}",
                    'excerpt': excerpt,
                    'date': date  # Voeg de datum toe aan de verzamelde data
                })

        # Pagination logic: increase offset by 10
        offset += 10

    return news_data

In [62]:

# Function to scrape content from individual article
def get_article_content(link):
    # Voeg headers toe om toegang te krijgen tot de pagina
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Maak een GET-verzoek met de juiste headers
    response = requests.get(link, headers=headers)

    # Controleer of het verzoek succesvol was
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Zoek naar verschillende mogelijke secties van het artikel
        possible_content = soup.find_all(['div', 'section', 'article'])  # Meer tags proberen

        content = ""
        
        # Itereer over gevonden secties en probeer tekst te extraheren
        for content_section in possible_content:
            paragraphs = content_section.find_all('p')  # Zoek naar alinea's binnen de sectie
            for para in paragraphs:
                # Verkrijg de tekst en zoek naar sterke tekst als dat bestaat
                strong_text = para.find('strong')
                if strong_text:
                    content += strong_text.get_text(strip=True) + " "
                content += para.get_text(strip=True) + " "
        
        # Geef de content terug, indien gevonden
        return content.strip()
    else:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None


In [63]:
# Process and enrich news with content and sentiment
def process_news(news_data):
    processed_news = []

    for news in news_data:
        title = news['title']
        link = news['link']
        date = news['date']  # Verkrijg de datum hier

        # Scrape the article content
        content = get_article_content(link)

        # If no content is available, use the title for tokenization and sentiment analysis
        text_to_analyze = content if content else title

        # Sentiment Analysis
        sentiment = sia.polarity_scores(text_to_analyze)

        # Tokenization (optional)
        tokens = word_tokenize(text_to_analyze)

        # Append enriched data, inclusief de datum
        processed_news.append({
            'title': title,
            'link': link,
            'content': content,
            'tokens': tokens,
            'sentiment': sentiment,
            'date': date  # Voeg de datum toe aan de verwerkte data
        })

    return processed_news


In [64]:
# Base URL for Financial Post S&P 500 search
base_url = "https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc"

# Define the until_date if needed (e.g., articles only from the last year)
until_date = datetime.now().date() - pd.Timedelta(days=30)

# Step 1: Scrape news articles to get links
news_data = scrape_news_financialpost(base_url, until_date=until_date)

# Step 2: Process news articles (scrape content, analyze sentiment)
processed_news = process_news(news_data)

# Step 3: Convert to DataFrame
df_news = pd.DataFrame(processed_news)

# Debugging: Show the first few rows of the DataFrame
display(df_news)

Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=0
Error parsing date: 2 hours ago. Skipping this article.
Error parsing date: 5 hours ago. Skipping this article.
Error parsing date: 7 hours ago. Skipping this article.
Error parsing date: 13 hours ago. Skipping this article.
Error parsing date: 22 hours ago. Skipping this article.
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=10
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=20
Error parsing date: January 14, 2025. Skipping this article.
Error parsing date: January 14, 2025. Skipping this article.
Error parsing date: January 14, 2025. Skipping this article.
Error parsing date: January 14, 2025. Skipping this article.
Error parsing date: January 14, 2025. Skipping this article.
Error parsing date: January 14, 2025. Skipping this article.
Fetching URL: https://financialpost

Unnamed: 0,title,link,content,tokens,sentiment,date
0,European Stocks Muted; Easing Tariff Threat Bo...,https://financialpost.com/pmn/business-pmn/eur...,"European stocks ended Monday steady, erasing e...","[European, stocks, ended, Monday, steady, ,, e...","{'neg': 0.011, 'neu': 0.855, 'pos': 0.134, 'co...",2025-01-20
1,Azimut and KGHM Drill High-Grade Nickel-PGE Mi...,https://financialpost.com/globe-newswire/azimu...,Author of the article: You can save this artic...,"[Author, of, the, article, :, You, can, save, ...","{'neg': 0.018, 'neu': 0.882, 'pos': 0.1, 'comp...",2025-01-20
2,Downbeat UK Earnings Updates Dim Hopes for Dom...,https://financialpost.com/pmn/business-pmn/dow...,The UK’s FTSE 100 Index may be at an all-time ...,"[The, UK, ’, s, FTSE, 100, Index, may, be, at,...","{'neg': 0.047, 'neu': 0.831, 'pos': 0.122, 'co...",2025-01-20
3,"Trump's win sparked market euphoria, but the b...",https://financialpost.com/news/trumps-sparked-...,"Tariffs are the biggest risk, sparking fears t...","[Tariffs, are, the, biggest, risk, ,, sparking...","{'neg': 0.058, 'neu': 0.805, 'pos': 0.137, 'co...",2025-01-19
4,Is the U.S. stock market in a bubble? It depen...,https://financialpost.com/investing/us-stock-m...,Some say ‘American exceptionalism’ justifies e...,"[Some, say, ‘, American, exceptionalism, ’, ju...","{'neg': 0.034, 'neu': 0.827, 'pos': 0.139, 'co...",2025-01-19
5,Rhyolite Ridge Lithium-Boron Project Closes Up...,https://financialpost.com/pmn/business-wire-ne...,Author of the article: You can save this artic...,"[Author, of, the, article, :, You, can, save, ...","{'neg': 0.029, 'neu': 0.828, 'pos': 0.144, 'co...",2025-01-17
6,Stalled-Out Tech Stocks Leave 'Other 493' to D...,https://financialpost.com/pmn/business-pmn/sta...,Some less-loved sectors of US equities are tak...,"[Some, less-loved, sectors, of, US, equities, ...","{'neg': 0.01, 'neu': 0.805, 'pos': 0.185, 'com...",2025-01-17
7,Canada a 'good place to hide' if U.S. stocks drop,https://financialpost.com/investing/canada-goo...,Concerns about a U.S. equity bubble concentrat...,"[Concerns, about, a, U.S., equity, bubble, con...","{'neg': 0.027, 'neu': 0.839, 'pos': 0.134, 'co...",2025-01-17
8,Nasdaq Futures Jump 1% as Fed Rate-Cut Bets Re...,https://financialpost.com/pmn/business-pmn/us-...,US stock futures rose strongly on Friday as th...,"[US, stock, futures, rose, strongly, on, Frida...","{'neg': 0.025, 'neu': 0.838, 'pos': 0.136, 'co...",2025-01-17
9,Three scenarios of how the capital gains incre...,https://financialpost.com/personal-finance/tax...,Here's who may be affected and the monetary tr...,"[Here, 's, who, may, be, affected, and, the, m...","{'neg': 0.029, 'neu': 0.815, 'pos': 0.156, 'co...",2025-01-17


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Combineer de content en title als een enkele tekst
corpus = [f"{news['title']} {news['content']}" for news in processed_news]

# Zet de tekst om naar een TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')  # Verhoog max_features om meer woorden te krijgen
X_tfidf = vectorizer.fit_transform(corpus)

# Verkrijg de feature-namen (de belangrijkste woorden)
feature_names = vectorizer.get_feature_names_out()

# Zet de TF-IDF waarden om naar een array
tfidf_values = X_tfidf.toarray()

# Maak een DataFrame met de TF-IDF waarden, waarbij de kolommen de woorden zijn
df_tfidf = pd.DataFrame(tfidf_values, columns=feature_names)

# Voeg de titels toe aan de DataFrame als een extra kolom
df_tfidf['title'] = [news['title'] for news in processed_news]

# Toon de DataFrame
display(df_tfidf)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,100,2024,2025,account,article,canada,cent,ci,city,comment,...,sign,signing,site,stock,subscribe,update,vanguard,way,year,title
0,0.071729,0.0,0.043037,0.37299,0.444718,0.172149,0.0,0.0,0.143458,0.146416,...,0.215186,0.0,0.114766,0.093843,0.172149,0.073208,0.0,0.0,0.06227,European Stocks Muted; Easing Tariff Threat Bo...
1,0.083235,0.135657,0.083235,0.309157,0.36861,0.142688,0.0,0.0,0.118906,0.121358,...,0.118906,0.160653,0.095125,0.0,0.118906,0.084951,0.0,0.085822,0.025806,Azimut and KGHM Drill High-Grade Nickel-PGE Mi...
2,0.194827,0.0,0.060883,0.316593,0.377477,0.121767,0.0,0.0,0.121767,0.124277,...,0.121767,0.164517,0.097413,0.047792,0.121767,0.149133,0.0,0.087887,0.052854,Downbeat UK Earnings Updates Dim Hopes for Dom...
3,0.048556,0.0,0.048556,0.252489,0.301045,0.1748,0.53818,0.0,0.097111,0.099114,...,0.097111,0.131205,0.077689,0.152461,0.097111,0.049557,0.0,0.093455,0.084305,"Trump's win sparked market euphoria, but the b..."
4,0.039952,0.068371,0.055933,0.207752,0.207752,0.127848,0.442823,0.0,0.079905,0.081552,...,0.135838,0.107958,0.063924,0.261349,0.079905,0.040776,0.0,0.076896,0.138735,Is the U.S. stock market in a bubble? It depen...
5,0.087767,0.071522,0.037614,0.325991,0.388681,0.125381,0.0,0.0,0.125381,0.15356,...,0.125381,0.1694,0.175533,0.0,0.125381,0.089576,0.0,0.090496,0.108847,Rhyolite Ridge Lithium-Boron Project Closes Up...
6,0.070476,0.040202,0.197332,0.366474,0.43695,0.140952,0.0,0.0,0.140952,0.143858,...,0.169142,0.0,0.112761,0.036881,0.140952,0.071929,0.0,0.033911,0.137659,Stalled-Out Tech Stocks Leave 'Other 493' to D...
7,0.090886,0.0,0.038951,0.337576,0.402495,0.298625,0.095939,0.0,0.129837,0.132514,...,0.129837,0.0,0.10387,0.237812,0.129837,0.066257,0.0,0.0,0.112715,Canada a 'good place to hide' if U.S. stocks drop
8,0.11243,0.03563,0.062461,0.324798,0.387259,0.124922,0.0,0.0,0.124922,0.152998,...,0.124922,0.168781,0.099938,0.114405,0.124922,0.063749,0.0,0.120219,0.081336,Nasdaq Futures Jump 1% as Fed Rate-Cut Bets Re...
9,0.049702,0.368571,0.089463,0.258449,0.258449,0.099404,0.514158,0.0,0.099404,0.101453,...,0.139165,0.134303,0.079523,0.0,0.099404,0.050727,0.0,0.071746,0.021574,Three scenarios of how the capital gains incre...
