In [7]:
from datetime import datetime, timedelta
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

class NewsScraper:
    def __init__(self, base_url, until_date=None, days_ago=None):
        """
        Initializes the web scraping object with the given base URL and optional date parameters.

        Args:
            base_url (str): The base URL for the web scraping.
            until_date (str, optional): The end date for the web scraping in 'YYYY-MM-DD' format. Defaults to None.
            days_ago (int, optional): The number of days ago from today to set the end date. Defaults to None.

        Attributes:
            base_url (str): The base URL for the web scraping.
            until_date (str): The calculated end date for the web scraping.
            news_data (list): A list to store the scraped news data.
        """
        self.base_url = base_url
        self.until_date = self.set_until_date(until_date, days_ago)
        self.news_data = []

    def set_until_date(self, until_date, days_ago):
        """
        Set the 'until_date' based on the provided parameters.

        Parameters:
        until_date (str or datetime.date or None): The target date until which to set. 
            If a string is provided, it should be in 'YYYY-MM-DD' format.
            If None, the date will be calculated based on 'days_ago'.
        days_ago (int or None): The number of days ago from today to set the 'until_date'.
            If None, defaults to 30 days ago.

        Returns:
        datetime.date: The calculated 'until_date'.

        Raises:
        ValueError: If 'until_date' is a string but not in the correct 'YYYY-MM-DD' format.
        TypeError: If 'until_date' is not a string or datetime.date object.

        Notes:
        - If both 'until_date' and 'days_ago' are None, the function defaults to 30 days ago.
        - If an error occurs, the function prints an error message and defaults to 30 days ago.
        """
        try:
            if until_date is None and days_ago is not None:
                return datetime.now().date() - timedelta(days=days_ago)
            elif until_date is None:
                return datetime.now().date() - timedelta(days=30)
            else:
                if isinstance(until_date, str):
                    try:
                        return datetime.strptime(until_date, '%Y-%m-%d').date()
                    except ValueError:
                        raise ValueError("until_date is not in the correct format (expected 'YYYY-MM-DD').")
                elif not isinstance(until_date, datetime.date):
                    raise TypeError("until_date must be a string in 'YYYY-MM-DD' format or a datetime.date object.")
        except Exception as e:
            print(f"Error setting until_date: {e}")
            return datetime.now().date() - timedelta(days=30)

    def convert_date(self, date_str):
        """
        Converts a relative or absolute date string into a date object.
        The function handles relative date strings such as "2 days ago", "3 hours ago",
        "15 minutes ago", and "1 week ago". It also handles absolute date strings in the
        format "Month Day, Year" (e.g., "January 1, 2020").
        Args:
            date_str (str): The date string to convert.
        Returns:
            date: A date object representing the converted date, or None if the conversion fails.
        Raises:
            ValueError: If the date string is in an unrecognized format.
            Exception: For any other exceptions that occur during conversion.
        """
        try:
            if "day" in date_str:
                days_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(days=days_ago)).date()
            elif "hour" in date_str:
                hours_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(hours=hours_ago)).date()
            elif "minute" in date_str:
                minutes_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(minutes=minutes_ago)).date()
            elif "week" in date_str:
                weeks_ago = int(date_str.split()[0])
                return (datetime.now() - timedelta(weeks=weeks_ago)).date()

            try:
                return datetime.strptime(date_str.strip(), '%B %d, %Y').date()
            except ValueError as ve:
                print(f"ValueError while parsing date: {date_str}. Error: {ve}")
                return None
        except Exception as e:
            print(f"Error converting date: {date_str}. Exception: {e}")
            return None

    def scrape_news(self):
        """
        Scrapes news articles from the specified base URL with pagination.
        This method fetches news articles by sending HTTP GET requests to the base URL with an offset parameter.
        It parses the HTML content to extract article details such as title, link, excerpt, and date.
        The method continues to paginate until no more articles are found or an article older than the specified `until_date` is encountered.
        Returns:
            list: A list of dictionaries, each containing the following keys:
                - 'title' (str): The title of the article.
                - 'link' (str): The full URL to the article.
                - 'excerpt' (str): A brief excerpt of the article.
                - 'date' (datetime.date): The publication date of the article.
        Raises:
            ValueError: If the date string cannot be parsed and `convert_date` method also fails to convert it.
        Notes:
            - The method uses a User-Agent header to mimic a browser request.
            - The method prints debug information such as the URL being fetched and any errors encountered during parsing.
            - The method stops pagination if no more articles are found or if an article is older than the specified `until_date`.
        """
        offset = 0
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        
        while True:
            url = f"{self.base_url}&from={offset}"
            print(f"Fetching URL: {url}")
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('a', {'class': 'article-card__link'})
            
            if not articles:
                print("No more articles found. Stopping pagination.")
                break

            for article in articles:
                title_tag = article.find('h3', {'class': 'article-card__headline'})
                excerpt_tag = article.find('p', {'class': 'article-card__excerpt'})
                meta_bottom_tag = article.find_next('div', {'class': 'article-card__meta-bottom'})
                date_tag = meta_bottom_tag.find('span', {'class': 'article-card__time-clamp'}) if meta_bottom_tag else None

                if not date_tag:
                    print("HTML of the article without a date:")
                    print(article.prettify())
                    continue

                title = title_tag.get_text(strip=True) if title_tag else None
                link = article['href'] if article.has_attr('href') else None
                excerpt = excerpt_tag.get_text(strip=True) if excerpt_tag else None
                date_str = date_tag.get_text(strip=True) if date_tag else None

                date = None
                if date_str:
                    try:
                        date = parse(date_str).date()
                    except ValueError:
                        date = self.convert_date(date_str)
                        if date is None:
                            print(f"Error parsing date with convert_date: {date_str}. Skipping this article.")
                            continue

                if self.until_date and date and isinstance(date, datetime):
                    date = date.date()
                if self.until_date and date < self.until_date:
                    print(f"Article is older than {self.until_date}. Stopping.")
                    return self.news_data

                if title and link:
                    self.news_data.append({
                        'title': title,
                        'link': f"https://financialpost.com{link}",
                        'excerpt': excerpt,
                        'date': date
                    })

            offset += 10

        return self.news_data

    def get_article_content(self, link):
        """
        Retrieves the content of an article from the given URL.
        This method sends a GET request to the specified link with a custom User-Agent header.
        If the request is successful (status code 200), it parses the HTML content using BeautifulSoup
        and extracts text from all <p> tags within <div>, <section>, or <article> tags. If a <p> tag
        contains a <strong> tag, the text within the <strong> tag is given priority.
        Args:
            link (str): The URL of the article to retrieve.
        Returns:
            str: The extracted article content as a single string, or None if the request failed.
        Raises:
            requests.exceptions.RequestException: If there is an issue with the network request.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(link, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            possible_content = soup.find_all(['div', 'section', 'article'])

            content = ""
            for content_section in possible_content:
                paragraphs = content_section.find_all('p')
                for para in paragraphs:
                    strong_text = para.find('strong')
                    if strong_text:
                        content += strong_text.get_text(strip=True) + " "
                    content += para.get_text(strip=True) + " "
            
            return content.strip()
        else:
            print(f"Failed to retrieve the article. Status code: {response.status_code}")
            return None

    def process_news(self, news_data):
        processed_news = []


        for news in news_data:
            title = news['title']
            link = news['link']
            date = news['date']  # Verkrijg de datum hier

            # Scrape the article content
            content = self.get_article_content(link)

            # Append enriched data, inclusief de datum
            processed_news.append({
                'title': title,
                'link': link,
                'content': content,
                'date': date  # Voeg de datum toe aan de verwerkte data
            })

        return processed_news
    def extract_features(self, news_data):
        """
        Converts news content into numerical features for stock prediction.

        Args:
            news_data (list): A list of dictionaries containing news data with 'content' and 'title'.

        Returns:
            list: A list of dictionaries with numerical features extracted from the news content.
        """
        sia = SentimentIntensityAnalyzer()
        lemmatizer = WordNetLemmatizer()
        feature_data = []

        for news in news_data:
            title = news.get('title', '')
            content = news.get('content', '')

            # Use content if available, otherwise fall back to title
            text = content if content else title

            # Sentiment scores
            sentiment = sia.polarity_scores(text)

            # Tokenization and lemmatization
            tokens = word_tokenize(text)
            lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]

            # Word frequency (top 3 most common words)
            word_counts = Counter(lemmatized_tokens)
            top_words = word_counts.most_common(3)

            # Feature dictionary
            features = {
                'positive_sentiment': sentiment['pos'],
                'negative_sentiment': sentiment['neg'],
                'neutral_sentiment': sentiment['neu'],
                'compound_sentiment': sentiment['compound'],
                'word_count': len(tokens),
                'top_word_1': top_words[0][0] if len(top_words) > 0 else None,
                'top_word_2': top_words[1][0] if len(top_words) > 1 else None,
                'top_word_3': top_words[2][0] if len(top_words) > 2 else None,
                'date': news.get('date', None)
            }

            feature_data.append(features)

        return feature_data

    def run(self):
        """
        Execute the entire workflow: scrape news, process, and extract features.

        Returns:
            list: A list of dictionaries containing numerical features for stock prediction.
        """
        news_data = self.scrape_news()
        processed_news = self.process_news(news_data)
        feature_data = self.extract_features(processed_news)
        return feature_data

# Example run
scraper = NewsScraper(base_url='https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc', until_date='2025-01-01')
features = scraper.run()
display(features)
        


Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=0
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=10
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=20
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=30
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=40
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=50
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=60
Fetching URL: https://financialpost.com/search/?search_text=S%26P+500&date_range=-3650d&sort=desc&from=70
Article is older than 2025-01-01. Stopping.


[{'positive_sentiment': 0.139,
  'negative_sentiment': 0.018,
  'neutral_sentiment': 0.843,
  'compound_sentiment': 0.9999,
  'word_count': 4623,
  'top_word_1': '.',
  'top_word_2': ',',
  'top_word_3': 'the',
  'date': datetime.date(2025, 1, 23)},
 {'positive_sentiment': 0.151,
  'negative_sentiment': 0.049,
  'neutral_sentiment': 0.801,
  'compound_sentiment': 0.9999,
  'word_count': 4352,
  'top_word_1': '.',
  'top_word_2': 'the',
  'top_word_3': ',',
  'date': datetime.date(2025, 1, 22)},
 {'positive_sentiment': 0.137,
  'negative_sentiment': 0.026,
  'neutral_sentiment': 0.837,
  'compound_sentiment': 0.9998,
  'word_count': 3036,
  'top_word_1': '.',
  'top_word_2': 'the',
  'top_word_3': 'to',
  'date': datetime.date(2025, 1, 22)},
 {'positive_sentiment': 0.128,
  'negative_sentiment': 0.014,
  'neutral_sentiment': 0.858,
  'compound_sentiment': 0.9997,
  'word_count': 2322,
  'top_word_1': '.',
  'top_word_2': ',',
  'top_word_3': 'the',
  'date': datetime.date(2025, 1, 22)},


---

## 1. CPI (Consumer Price Index)

### What It Is
- The CPI measures the average change in prices paid by consumers for a basket of goods and services over time.
- It is a key indicator of inflation and cost-of-living changes.

### Why It’s Important
- CPI helps economists, policymakers, and investors understand inflation trends.
- Rising CPI indicates inflation, meaning the purchasing power of money decreases.
- Changes in CPI can influence interest rates, stock market trends, and economic policies.

### How It’s Calculated
1. Identify a fixed basket of goods and services typically consumed by households.
2. Track the price of each item in the basket over time.
3. Calculate the cost of the basket for a given period.
4. Compare the cost to a base year and express the change as a percentage.

Mathematically:  

$$
CPI = \frac{
    \sum_{i=1}^{n} 
    \left( \frac{
        \text{Price of Basket in Current Year}_i
    }{
        \text{Price of Basket in Base Year}_i
    } \right)
}{n} \times 100
$$

Where:
- **Cost of Basket at Current Period**: Sum of current prices of all items in the basket.
- **Cost of Basket at Base Period**: Sum of prices of the same items in the base year.

---
Fernando, J. (2024b, 24 oktober). Consumer Price Index (CPI): What It Is and How It's Used. Investopedia. https://www.investopedia.com/terms/c/consumerpriceindex.asp 