In [125]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import pandas as pd

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Wille\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [126]:
# Function to scrape news articles (links and titles)
def scrape_news(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all article links with the correct class
    articles = soup.find_all('a', {'class': 'Card-title'})
    news_data = []
    
    for article in articles:
        title = article.get_text()
        link = article['href']
        # Only add absolute links
        if link.startswith("http"):
            news_data.append({'title': title, 'link': link})
    
    return news_data


In [127]:
# Function to scrape timestamp from individual article page
def get_timestamp(article_url):
    response = requests.get(article_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the <time> tag with the timestamp
    time_tag = soup.find('time')
    if time_tag:
        return time_tag.get_text()  # Extract the text from the tag
    return None  # Return None if no timestamp is found

In [128]:
# Function to scrape article content from individual article page
def get_article_content(article_url):
    response = requests.get(article_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Zoek naar alle 'group' divs in de body van het artikel
    all_groups = soup.find_all('div', {'class': 'group'})
    
    content = ''
    for group in all_groups:
        # Zoek alle <p>-tags binnen de <div class="group">
        paragraphs = group.find_all('p')

        # Verwijder ongewenste elementen zoals afbeeldingen en divs
        for unwanted in group.find_all(['img', 'div']):
            unwanted.decompose()  # Verwijdert ongewenste elementen

        # Combineer de tekst van alle <p>-tags
        content += " ".join([para.get_text().strip() for para in paragraphs if para.get_text().strip()])

    return content.strip() if content else None


In [129]:
# SentimentIntensityAnalyzer voor sentimentanalyse
sia = SentimentIntensityAnalyzer()

# Function to process news and enrich with timestamps/content
def process_news(news_data):
    processed_news = []
    
    for news in news_data:
        title = news['title']
        link = news['link']
        
        # Scrape the timestamp
        timestamp = get_timestamp(link)  # Reuse your timestamp scraping function
        
        # Scrape the article content
        content = get_article_content(link)
        
        # If no content is available, use the title for tokenization and sentiment analysis
        text_to_analyze = content if content else title
        
        # Tokenization
        tokens = word_tokenize(text_to_analyze)
        
        # Sentiment Analysis
        sentiment = sia.polarity_scores(text_to_analyze)
        
        # Append enriched data
        processed_news.append({
            'title': title,
            'link': link,
            'timestamp': timestamp,
            'content': content,
            'tokens': tokens,
            'sentiment': sentiment
        })
    
    return processed_news

In [130]:
# URL of the news site
url = 'https://www.cnbc.com/markets/'

# Scrape and process news articles
news_data = scrape_news(url)
processed_news = process_news(news_data)

# Convert to DataFrame for further processing
df_news = pd.DataFrame(processed_news)

# Display the first few rows
display(df_news)

Unnamed: 0,title,link,timestamp,content,tokens,sentiment
0,"The Week That Was: January 17, 2025",https://www.cnbc.com/video/2025/01/17/the-week...,,,"[The, Week, That, Was, :, January, 17, ,, 2025]","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,Traders bet on more Bank of England rate cuts ...,https://www.cnbc.com/2025/01/17/bank-of-englan...,"Published Fri, Jan 17 20257:45 AM EST",LONDON — Traders bet on more Bank of England r...,"[LONDON, —, Traders, bet, on, more, Bank, of, ...","{'neg': 0.07, 'neu': 0.854, 'pos': 0.076, 'com..."
2,10-year Treasury note yield is flat Friday aft...,https://www.cnbc.com/2025/01/17/us-treasury-yi...,"Published Fri, Jan 17 20254:19 AM EST",Treasury yields were little changed on Friday ...,"[Treasury, yields, were, little, changed, on, ...","{'neg': 0.043, 'neu': 0.883, 'pos': 0.075, 'co..."
3,India is staring at an 'oil shock' as U.S. san...,https://www.cnbc.com/2025/01/17/us-sanctions-o...,"Published Fri, Jan 17 20251:06 AM EST",India's days of buying cheap Russian oil could...,"[India, 's, days, of, buying, cheap, Russian, ...","{'neg': 0.09, 'neu': 0.86, 'pos': 0.05, 'compo..."
4,"FTC sues Deere, alleging equipment repair 'mon...",https://www.cnbc.com/2025/01/15/ftc-sues-deere...,"Published Wed, Jan 15 20251:39 PM EST",The Federal Trade Commission has sued agricult...,"[The, Federal, Trade, Commission, has, sued, a...","{'neg': 0.074, 'neu': 0.87, 'pos': 0.056, 'com..."
...,...,...,...,...,...,...
57,Watch Friday's full episode of the Halftime Re...,https://www.cnbc.com/video/2025/01/17/watch-fr...,,,"[Watch, Friday, 's, full, episode, of, the, Ha...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
58,Trade Tracker: Kevin Simpson buys more Meta,https://www.cnbc.com/video/2025/01/17/trade-tr...,,,"[Trade, Tracker, :, Kevin, Simpson, buys, more...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
59,Calls of the Day: Robinhood and Salesforce,https://www.cnbc.com/video/2025/01/17/calls-of...,,,"[Calls, of, the, Day, :, Robinhood, and, Sales...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
60,"Trade Tracker: Brenda Vingiello sells AMD, Mon...",https://www.cnbc.com/video/2025/01/17/trade-tr...,,,"[Trade, Tracker, :, Brenda, Vingiello, sells, ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine article titles into a single corpus
corpus = df_news['title']

# Vectorize the text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10, stop_words='english')  # Limit to 10 features and remove common stopwords
tfidf_features = tfidf_vectorizer.fit_transform(corpus)

# Maak een unieke prefix voor de TF-IDF kolommen
tfidf_df = pd.DataFrame(tfidf_features.toarray(), 
                        columns=[f"tfidf_{word}" for word in tfidf_vectorizer.get_feature_names_out()])

# Concatenate zonder duplicatie
df_news = pd.concat([df_news, tfidf_df], axis=1)

# Controleer de DataFrame
display(df_news)


Unnamed: 0,title,link,timestamp,content,tokens,sentiment,tfidf_2025,tfidf_biggest,tfidf_china,tfidf_earnings,tfidf_making,tfidf_markets,tfidf_moves,tfidf_sector,tfidf_stocks,tfidf_week
0,"The Week That Was: January 17, 2025",https://www.cnbc.com/video/2025/01/17/the-week...,,,"[The, Week, That, Was, :, January, 17, ,, 2025]","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.633681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.773595
1,Traders bet on more Bank of England rate cuts ...,https://www.cnbc.com/2025/01/17/bank-of-englan...,"Published Fri, Jan 17 20257:45 AM EST",LONDON — Traders bet on more Bank of England r...,"[LONDON, —, Traders, bet, on, more, Bank, of, ...","{'neg': 0.07, 'neu': 0.854, 'pos': 0.076, 'com...",1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,10-year Treasury note yield is flat Friday aft...,https://www.cnbc.com/2025/01/17/us-treasury-yi...,"Published Fri, Jan 17 20254:19 AM EST",Treasury yields were little changed on Friday ...,"[Treasury, yields, were, little, changed, on, ...","{'neg': 0.043, 'neu': 0.883, 'pos': 0.075, 'co...",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,India is staring at an 'oil shock' as U.S. san...,https://www.cnbc.com/2025/01/17/us-sanctions-o...,"Published Fri, Jan 17 20251:06 AM EST",India's days of buying cheap Russian oil could...,"[India, 's, days, of, buying, cheap, Russian, ...","{'neg': 0.09, 'neu': 0.86, 'pos': 0.05, 'compo...",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,"FTC sues Deere, alleging equipment repair 'mon...",https://www.cnbc.com/2025/01/15/ftc-sues-deere...,"Published Wed, Jan 15 20251:39 PM EST",The Federal Trade Commission has sued agricult...,"[The, Federal, Trade, Commission, has, sued, a...","{'neg': 0.074, 'neu': 0.87, 'pos': 0.056, 'com...",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,Watch Friday's full episode of the Halftime Re...,https://www.cnbc.com/video/2025/01/17/watch-fr...,,,"[Watch, Friday, 's, full, episode, of, the, Ha...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
58,Trade Tracker: Kevin Simpson buys more Meta,https://www.cnbc.com/video/2025/01/17/trade-tr...,,,"[Trade, Tracker, :, Kevin, Simpson, buys, more...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
59,Calls of the Day: Robinhood and Salesforce,https://www.cnbc.com/video/2025/01/17/calls-of...,,,"[Calls, of, the, Day, :, Robinhood, and, Sales...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
60,"Trade Tracker: Brenda Vingiello sells AMD, Mon...",https://www.cnbc.com/video/2025/01/17/trade-tr...,,,"[Trade, Tracker, :, Brenda, Vingiello, sells, ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
