In [41]:
import sqlite3
import pandas as pd
import numpy as np
from newsapi import newsapi_client
import os
import yfinance as yf

Data Collection & API set up

In [92]:
# Read the NewsAPI key from the environment so that no credential is stored in the notebook.
# NOTE(review): a literal key used to be committed on the next line; treat it as leaked and rotate it.
api_key = os.getenv('NEWSAPI_KEY')
if not api_key:
    # Fail early with a clear message instead of sending an unauthenticated request later.
    raise RuntimeError("NEWSAPI_KEY environment variable is not set")
auth = newsapi_client.NewsApiClient(api_key=api_key)

In [94]:
# Fetch the five most relevant English "Trump" articles for three consecutive
# one-day windows, newest window first (news1 = 11-12..11-13, news3 = 11-10..11-11).
news1, news2, news3 = (
    auth.get_everything(
        q="Trump",
        from_param=window_start,
        to=window_end,
        language="en",
        sort_by="relevancy",
        page_size=5,
    )
    for window_start, window_end in (
        ("2024-11-12", "2024-11-13"),
        ("2024-11-11", "2024-11-12"),
        ("2024-11-10", "2024-11-11"),
    )
)

In [96]:
# Download 15-minute bars for 2024-11-12 for both tickers, in the same order as before.
aapl, msft = (
    yf.download(tickers=ticker, start='2024-11-12', end='2024-11-13', interval='15m')
    for ticker in ('AAPL', 'MSFT')
)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [98]:
class DataLake:
    """
    Password-gated store for datasets.

    Raw payloads are kept in memory only (``raw_data``). Processed DataFrames
    are kept in memory (``processed_data``) and also written to a SQLite
    database file so they can be queried with SQL.
    """

    def __init__(self,db_name="datalake.db"):
        """
        :param db_name: Path of the SQLite database file used for processed datasets.
        """
        self.raw_data = {}  # dataset name -> raw payload, stored as given
        self.processed_data = {}  # dataset name -> DataFrame stored during this session
        self.db_name = db_name

    def access_control(func):
        """
        Decorator that asks for a password via ``input`` before each call.

        On a match the wrapped method runs and its result is returned; otherwise
        the string "You have no access to this DataLake!" is returned and the
        wrapped method is not called.

        The expected password is read from the ``DATALAKE_PASSWORD`` environment
        variable and falls back to "1234" for backward compatibility.
        NOTE(review): the fallback is still a hard-coded credential; drop it
        once every environment sets the variable.
        """
        import functools  # local import: keeps the block self-contained

        @functools.wraps(func)  # preserve the wrapped method's name and docstring
        def wrapper(*args, **kwargs):
            password = os.environ.get("DATALAKE_PASSWORD", "1234")
            input_pw = input("Type in your password")
            if input_pw == password:
                return func(*args, **kwargs)
            return "You have no access to this DataLake!"
        return wrapper

    @access_control
    def store_data(self, dataset_name, data, processed=False): # adding data
        """
        Add a dataset to the lake.

        :param dataset_name: Key (and, for processed data, SQLite table name) of the dataset.
        :param data: Payload to store. With ``processed=True`` it must provide
                     ``to_sql`` (a pandas DataFrame).
        :param processed: When True, write to SQLite and mirror in ``processed_data``;
                          otherwise keep the payload in ``raw_data``.
        :return: None on success (or the access-denied string from the decorator).
        """
        if processed:
            conn = sqlite3.connect(self.db_name)
            try:
                # `with conn` commits or rolls back; it does not close the connection.
                with conn:
                    # NOTE(review): index=False drops the DataFrame index; for the yfinance
                    # frames stored here that is presumably the bar timestamp - confirm intended.
                    if dataset_name not in self.processed_data:
                        # First store in this session: replace any table left from earlier runs.
                        data.to_sql(dataset_name, conn, index=False, if_exists='replace')
                        self.processed_data[dataset_name] = data
                    else:  # Append new data if the dataset was already stored in this session
                        data.to_sql(dataset_name, conn, index=False, if_exists='append')
                        self.processed_data[dataset_name] = pd.concat([self.processed_data[dataset_name], data])
            finally:
                conn.close()
        elif dataset_name not in self.raw_data:
            self.raw_data[dataset_name] = data
        else:  # extend rather than overwrite an existing raw dataset
            import copy

            # Append onto a shallow copy so that the object originally passed in by
            # the caller (stored by reference above) is not mutated by `+=`.
            merged = copy.copy(self.raw_data[dataset_name])
            merged += data
            self.raw_data[dataset_name] = merged

    @access_control
    def retrieve_data(self, dataset_name, processed=False, sql_query = None): # for data filtering and extraction
        """
        Fetch a dataset from the lake.

        :param dataset_name: Name of the dataset (table name for processed data).
        :param processed: When True, read from SQLite; otherwise return the raw payload.
        :param sql_query: Optional SQL to run instead of ``SELECT *`` on the dataset's table.
                          It is executed as given, so it must come from a trusted source.
        :return: For processed data, the query result as a DataFrame, or None if the
                 query fails (the error is printed). For raw data, the stored payload.
        :raises KeyError: If ``processed`` is False and the raw dataset does not exist.
        """
        if not processed:  # raw data is returned exactly as stored
            return self.raw_data[dataset_name]

        if sql_query is None:
            # Quote the identifier so names containing spaces or quotes form a valid
            # statement and cannot inject extra SQL.
            quoted_name = '"' + str(dataset_name).replace('"', '""') + '"'
            sql_query = f"SELECT * FROM {quoted_name}"

        conn = sqlite3.connect(self.db_name)
        try:
            return pd.read_sql_query(sql_query, conn)
        except Exception as e:
            # Best-effort read: report the problem and signal failure with None.
            print(f"Error: {e}")
            return None
        finally:
            conn.close()
        
    
    

In [52]:
# Smoke-test the lake; every call below triggers the password prompt.
test = DataLake()

# Raw NewsAPI article lists (kept in memory), with a read-back of the first one.
test.store_data(dataset_name="test", data=news1['articles'])
data = test.retrieve_data(dataset_name="test")
test.store_data(dataset_name="test1", data=news2['articles'])
test.store_data(dataset_name="test2", data=news3['articles'])

# Price frames go to SQLite as processed datasets.
test.store_data(dataset_name="aapl", data=aapl, processed=True)
test.store_data(dataset_name="msft", data=msft, processed=True)

Type in your password 1234
Type in your password 1234
Type in your password 1234
Type in your password 1234
Type in your password 1234
Type in your password 1234


In [54]:
# NOTE(review): "FROM aapl,msft" has no join condition, so SQLite pairs every
# qualifying aapl row with every qualifying msft row (see the repeated aapl
# values in the output below). Confirm whether a keyed join was intended.
test.retrieve_data("aapl",processed=True,sql_query="SELECT * FROM aapl,msft WHERE aapl.Volume<1000000 AND msft.Volume<1000000 ")

Type in your password 1234


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Open.1,High.1,Low.1,Close.1,Adj Close.1,Volume.1
0,223.914993,224.235001,223.690002,224.169998,224.169998,863846,419.540710,420.010010,418.559998,418.744995,418.744995,716553
1,223.914993,224.235001,223.690002,224.169998,224.169998,863846,418.799988,420.260010,417.709991,420.200012,420.200012,613220
2,223.914993,224.235001,223.690002,224.169998,224.169998,863846,420.200012,420.299988,418.809998,418.989990,418.989990,400057
3,223.914993,224.235001,223.690002,224.169998,224.169998,863846,418.980011,419.965393,418.750000,419.730011,419.730011,346740
4,223.914993,224.235001,223.690002,224.169998,224.169998,863846,419.765015,420.000000,418.820007,419.000214,419.000214,450805
...,...,...,...,...,...,...,...,...,...,...,...,...
355,224.500000,224.820007,224.460007,224.600006,224.600006,763948,423.309998,423.399994,422.899994,423.265015,423.265015,236394
356,224.500000,224.820007,224.460007,224.600006,224.600006,763948,423.282501,423.839996,423.079987,423.709991,423.709991,292904
357,224.500000,224.820007,224.460007,224.600006,224.600006,763948,423.739990,424.439911,423.630096,424.007690,424.007690,369294
358,224.500000,224.820007,224.460007,224.600006,224.600006,763948,423.950012,424.049988,423.300293,423.552002,423.552002,558675


In [56]:
class DataCategory:
    """A named group of datasets, each mapped to a metadata dictionary."""

    def __init__(self, name):
        """
        :param name: Name of the category.
        """
        self.name = name
        # dataset name -> metadata dictionary
        self.datasets = {}

    def add_dataset(self, dataset_name, metadata=None):
        """
        Register a dataset in this category; an already registered name is left untouched.

        :param dataset_name: Name of the dataset.
        :param metadata: Metadata dictionary (e.g., description, parameters, etc.).
                         A missing or empty value is stored as an empty dictionary.
        """
        if dataset_name in self.datasets:
            return
        self.datasets[dataset_name] = metadata if metadata else {}

    def search(self, keyword):
        """
        Case-insensitive substring search over dataset names and metadata values.

        :param keyword: Keyword to look for in dataset names or metadata.
        :return: List of ``(dataset_name, metadata)`` tuples for every match,
                 in registration order.
        """
        hits = []
        for name, meta in self.datasets.items():
            needle = keyword.lower()
            found = needle in name.lower()
            if not found:
                # Metadata values are compared through their string form.
                found = any(needle in str(entry).lower() for entry in meta.values())
            if found:
                hits.append((name, meta))
        return hits


class DataCatalog:
    """
    Groups datasets into named categories with metadata for discovery, and
    forwards storage and retrieval to an underlying DataLake.
    """

    def __init__(self, data_lake):
        """
        Create an empty catalog bound to a data lake.

        :param data_lake: An instance of the DataLake class.
        """
        self.data_lake = data_lake  # storage backend used for the actual data
        self.categories = {}  # category name -> DataCategory

    def add_category(self, category_name):
        """
        Create a category if it does not exist yet.

        :param category_name: Name of the category.
        """
        if category_name in self.categories:
            return
        self.categories[category_name] = DataCategory(category_name)

    def add_dataset(self, category_name, dataset_name, data, metadata=None, processed=False):
        """
        Register a dataset under a category and store its data in the lake.

        :param category_name: Category to file the dataset under (created on demand).
        :param dataset_name: Name of the dataset.
        :param data: The data handed to the lake's ``store_data``.
        :param metadata: Optional metadata dictionary recorded in the category.
        :param processed: Forwarded to the lake's ``store_data``.
        """
        if category_name not in self.categories:
            self.add_category(category_name)

        # Record the metadata first, then hand the payload to the lake.
        target = self.categories[category_name]
        target.add_dataset(dataset_name, metadata)
        self.data_lake.store_data(dataset_name, data, processed=processed)

    def list_datasets(self, category_name): # all tables in the category
        """
        List the datasets of one category.

        :param category_name: Name of the category.
        :return: List of ``{"dataset_name", "metadata"}`` dictionaries, or a
                 "not found" message string when the category is unknown.
        """
        if category_name not in self.categories:
            return f"Category '{category_name}' not found."

        listing = []
        for name, metadata in self.categories[category_name].datasets.items():
            listing.append({"dataset_name": name, "metadata": metadata})
        return listing

    def search_data(self, keyword):
        """
        Search every category for datasets matching a keyword.

        :param keyword: Keyword passed to each category's ``search``.
        :return: List of ``{"category", "dataset_name", "metadata"}`` dictionaries.
        """
        return [
            {
                "category": category_name,
                "dataset_name": dataset_name,
                "metadata": metadata
            }
            for category_name, category in self.categories.items()
            for dataset_name, metadata in category.search(keyword)  # matches on table name or metadata
        ]

    def retrieve_dataset(self, dataset_name, processed=False, sql_query=None):
        """
        Fetch a dataset from the lake; all arguments are forwarded unchanged.

        :return: Whatever the lake's ``retrieve_data`` returns.
        """
        lake = self.data_lake
        return lake.retrieve_data(dataset_name, processed=processed, sql_query=sql_query)


In [58]:
# Build the catalog on top of the DataLake instance created earlier (`test`);
# the bare name on the last line just echoes the object's repr as cell output.
data_catalog = DataCatalog(test)
data_catalog

<__main__.DataCatalog at 0x13b031d90>

In [60]:
# Register both price frames under "Equities"; each call stores the frame in the
# lake as processed data and therefore prompts for the password.
data_catalog.add_dataset(
    category_name="Equities",
    dataset_name="aapl",
    data=aapl,
    metadata=dict(
        description="Apple stock data",
        parameters=["Adj Close", "Close", "High", "Low", "Open", "Volume"],
    ),
    processed=True,
)
data_catalog.add_dataset(
    category_name="Equities",
    dataset_name="msft",
    data=msft,
    metadata=dict(
        description="Microsoft stock data",
        parameters=["Adj Close", "Close", "High", "Low", "Open", "Volume"],
    ),
    processed=True,
)

Type in your password 1234
Type in your password 1234


In [62]:
# Show every dataset registered under one category.
print(
    "Datasets in 'Equities':",
    data_catalog.list_datasets(category_name="Equities"),
)

# Keyword search across all categories (names and metadata).
print(
    "Search results for 'Apple':",
    data_catalog.search_data(keyword="Apple"),
)

# Pull a processed dataset through the catalog with a custom SQL filter.
# The dataset was registered as "aapl"; the upper-case table name used here
# still resolved in the recorded run below.
query_result = data_catalog.retrieve_dataset(
    dataset_name="AAPL",
    processed=True,
    sql_query="SELECT * FROM AAPL WHERE Volume < 1000000",
)
print("Query result:\n", query_result)

Datasets in 'Equities': [{'dataset_name': 'aapl', 'metadata': {'description': 'Apple stock data', 'parameters': ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']}}, {'dataset_name': 'msft', 'metadata': {'description': 'Microsoft stock data', 'parameters': ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']}}]
Search results for 'Apple': [{'category': 'Equities', 'dataset_name': 'aapl', 'metadata': {'description': 'Apple stock data', 'parameters': ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']}}]


Type in your password 1234


Query result:
           Open        High         Low       Close   Adj Close  Volume
0   223.914993  224.235001  223.690002  224.169998  224.169998  863846
1   224.999893  225.199997  224.830002  224.919998  224.919998  802025
2   224.794998  224.839996  223.884995  223.919998  223.919998  896875
3   223.919998  224.399994  223.860001  224.130005  224.130005  771170
4   224.139999  224.149994  223.386002  223.419907  223.419907  712087
5   223.419998  223.897507  223.354996  223.863007  223.863007  641269
6   223.860001  224.470001  223.779999  224.470001  224.470001  689858
7   224.449997  224.589996  224.285004  224.440002  224.440002  640233
8   224.445007  224.539993  224.139999  224.149796  224.149796  650355
9   224.139999  224.669998  224.139999  224.619995  224.619995  738805
10  224.619995  224.759903  224.419998  224.619904  224.619904  529620
11  224.630005  224.649994  224.460007  224.550995  224.550995  427092
12  224.570007  224.729904  224.520096  224.529999  224.529999


## Framework Components

### BaseDataModel


`BaseDataModel` is an abstract class that provides common attributes and methods to be inherited by more specialized data models. It includes:

- **Attributes:**
  - `timestamp`: Records the time associated with the data. It is a required argument; if the supplied value is not a `datetime` instance, the current time is used instead.
  - `symbol`: An optional identifier, such as a stock ticker symbol.

- **Methods:**
  - `is_recent`: Checks if the `timestamp` is within a specified number of recent days.
  - `is_above_threshold`: Determines if a given value exceeds a specified threshold.

### DataWorkbench


`DataWorkbench` serves as a centralized repository for managing multiple datasets and their associated metadata. Its primary responsibilities include:

- **Data Storage:** Maintains datasets in a dictionary (`data_storage`) keyed by unique dataset names.
- **Metadata Storage:** Keeps metadata related to each dataset in a separate dictionary (`metadata_storage`).
- **Data Operations:**
  - **Storing Data:** Adds new datasets along with optional metadata.
  - **Retrieving Data:** Fetches datasets and their metadata by name.
  - **Transforming Data:** Applies transformation functions to datasets and stores the results.
  - **Aggregating Data:** Performs group-by aggregations based on specified columns and functions.
  - **Statistical Analysis:** Computes basic statistics (e.g., mean, standard deviation) for datasets.

### IntradayDataModel

`IntradayDataModel` is tailored for handling intraday stock data, providing specialized methods to process and analyze such data effectively. Key functionalities include:

- **Aggregation by Interval:** Resamples stock data into specified time intervals (e.g., hourly) to compute aggregated statistics like average closing price and total volume.
- **VWAP Calculation:** Computes the Volume Weighted Average Price, offering a measure that accounts for both price and trading volume.
- **Moving Average Calculation:** Determines the moving average of closing prices over a defined window, useful for trend analysis.

### NewsDataModel

`NewsDataModel` is specialized for handling news articles data, offering functionalities that bridge news sentiment with temporal analysis. Its main features include:

- **Sentiment Filtering:** Selects news articles that meet or exceed a specified sentiment score, aiding in focusing on more impactful news.
- **Grouping by Date:** Aggregates news articles by their publication dates, calculating both the count of articles and the average sentiment score for each date.
- **Sentiment Trend Analysis:** Examines how sentiment scores evolve over time, providing insights into trends and shifts in public sentiment.

---

## Utility Functions


The `display_dataframe` function is a simple utility designed to print the name of a DataFrame followed by its first few rows. This enhances readability when outputting multiple DataFrames by providing clear section headers.

---

## Mock Data Creation

Given that real-time data fetching (especially for future dates) isn't feasible in this context, mock data is generated to simulate realistic scenarios for both news articles and stock data.

### Mock NewsAPI Responses


Three responses (`news1`, `news2`, `news3`) contain a list of articles with attributes such as:

- `publishedAt`: The publication timestamp.
- `title`: The headline of the article.
- `description`: A brief summary.
- `url`: A link to the full article.

### Extracting News Data

The list comprehension in the test cell further below iterates over each news response and its articles, extracting the relevant fields to construct a consolidated pandas DataFrame (`news_data`). For simplicity, fixed values are assigned to `sentiment_score` and `relevance`.

### Mock Stock Data


Two pandas DataFrames (`aapl_data` and `msft_data`) are created to simulate stock data for Apple Inc. (AAPL) and Microsoft Corporation (MSFT). Each DataFrame includes:

- `Open`: Opening price for the interval.
- `High`: Highest price within the interval.
- `Low`: Lowest price within the interval.
- `Close`: Closing price for the interval.
- `Volume`: Trading volume within the interval.
- `timestamp`: Date and time of the data point, generated at 15-minute intervals starting from November 12, 2024, at 09:30 AM.

---

## Data Processing Workflow

### Storing Data

An instance of `DataWorkbench` is created to manage the datasets. The mock news and stock data are stored in the workbench with unique identifiers:

- `"news_data"`: Contains the consolidated news articles.
- `"aapl_data"`: Holds Apple stock data.
- `"msft_data"`: Contains Microsoft stock data.

### Aggregating Stock Data


An instance of `IntradayDataModel` is created for AAPL with the current timestamp. The `aggregate_by_interval` method is then invoked to resample the AAPL stock data into 1-hour intervals, computing the average closing price and total volume for each hour.

### Calculating VWAP

**Overview:**

The `calculate_vwap` method computes the Volume Weighted Average Price (VWAP) for the entire AAPL dataset. VWAP provides a measure of the average price at which a stock has traded throughout the day, weighted by trading volume.

### Analyzing News Data



An instance of `NewsDataModel` is created, after which the following analyses are performed:

1. **Filtering:** Selects news articles with a `sentiment_score` of 0.4 or higher.
2. **Grouping:** Aggregates articles by their publication dates, calculating the number of articles and the average sentiment score for each date.
3. **Trend Analysis:** Examines how the average sentiment score changes over different dates, identifying trends in public sentiment.

---

## Displaying Results


The `display_dataframe` utility function is employed to print the names and the initial rows of the processed DataFrames. Additionally, the VWAP value for AAPL is printed separately for clarity. The outputs include:

- **Aggregated AAPL Data:** Resampled stock data at 1-hour intervals.
- **VWAP for AAPL:** The Volume Weighted Average Price.
- **Filtered News Data:** News articles meeting the sentiment threshold.
- **Grouped News by Date:** Aggregated article counts and average sentiment scores per date.
- **Sentiment Trend:** Evolution of average sentiment scores over time.

---

## Sample Output


**Explanation of Output:**

1. **Aggregated AAPL Data:**
   - Shows the average closing price and total trading volume for each 1-hour interval.
   
2. **VWAP for AAPL:**
   - Displays the Volume Weighted Average Price, calculated as 155.0 based on the mock data.
   
3. **Filtered News Data:**
   - Lists news articles that have a sentiment score of 0.4 or higher.
   
4. **Grouped News by Date:**
   - Provides the count of articles and average sentiment score for each publication date.
   
5. **Sentiment Trend:**
   - Illustrates the stability of sentiment scores over the analyzed dates.


In [130]:
import pandas as pd
from datetime import datetime
import yfinance as yf

# BaseDataModel for shared attributes and methods
class BaseDataModel:
    """
    BaseDataModel serves as the foundational class for data models,
    providing shared attributes and utility methods that can be
    inherited by more specialized data models.
    """

    def __init__(self, timestamp, symbol=None):
        """
        Initializes the BaseDataModel with a timestamp and an optional symbol.

        :param timestamp: The timestamp associated with the data model.
                          If not a datetime object, defaults to the current datetime.
        :param symbol: (Optional) A symbol identifier (e.g., stock ticker).
        """
        # Anything that is not a datetime instance is replaced by "now" (naive, local time)
        self.timestamp = timestamp if isinstance(timestamp, datetime) else datetime.now()
        # Assign the symbol if provided; else, default to None
        self.symbol = symbol

    def is_recent(self, days=7):
        """
        Determines if the timestamp is within a recent range specified by 'days'.

        The comparison uses whole elapsed days (``timedelta.days``), so a
        timestamp in the future also counts as recent.

        :param days: The number of days to consider as the "recent" threshold.
        :return: Boolean indicating if the timestamp is within the recent range.
        """
        # Take "now" in the timestamp's own timezone: subtracting a naive datetime
        # from a timezone-aware one raises TypeError. For naive timestamps tzinfo
        # is None and this is plain datetime.now().
        now = datetime.now(self.timestamp.tzinfo)
        delta = now - self.timestamp
        # Return True if the whole-day difference is less than or equal to 'days'
        return delta.days <= days

    def is_above_threshold(self, value, threshold):
        """
        Checks if a given value exceeds a specified threshold.

        :param value: The value to be compared against the threshold.
        :param threshold: The threshold value to compare with.
        :return: Boolean indicating if 'value' is strictly greater than 'threshold'.
        """
        return value > threshold

# Optimized DataWorkbench
class DataWorkbench:
    """
    In-memory hub that keeps named datasets together with their metadata and
    offers transform, group-by aggregation and summary-statistics helpers.
    """

    def __init__(self):
        """
        Start with no datasets and no metadata.
        """
        self.data_storage = {}      # dataset name -> dataset
        self.metadata_storage = {}  # dataset name -> metadata dictionary

    def store_data(self, dataset_name, data, metadata=None):
        """
        Save a dataset under a name, replacing any previous entry with that name.

        :param dataset_name: A unique identifier for the dataset.
        :param data: The dataset to keep (e.g., pandas DataFrame).
        :param metadata: (Optional) Extra information about the dataset; a missing
                         or empty value is stored as an empty dictionary.
        """
        self.metadata_storage[dataset_name] = metadata if metadata else {}
        self.data_storage[dataset_name] = data

    def retrieve_data(self, dataset_name):
        """
        Look up a dataset by name.

        :param dataset_name: The unique identifier of the dataset to retrieve.
        :return: The dataset if found; otherwise, None.
        """
        return self.data_storage.get(dataset_name)

    def get_metadata(self, dataset_name):
        """
        Look up the metadata recorded for a dataset.

        :param dataset_name: The unique identifier of the dataset.
        :return: Metadata dictionary if found; otherwise, an empty dictionary.
        """
        return self.metadata_storage.get(dataset_name, {})

    def _require_data(self, dataset_name):
        """Return the named dataset, or raise ValueError when it is not stored."""
        found = self.retrieve_data(dataset_name)
        if found is None:
            raise ValueError(f"Dataset {dataset_name} not found")
        return found

    def transform_data(self, dataset_name, transformation_func):
        """
        Run a transformation over a stored dataset and keep the result as
        ``<dataset_name>_transformed``.

        :param dataset_name: The name of the dataset to transform.
        :param transformation_func: Callable taking the dataset and returning the transformed dataset.
        :return: The transformed dataset.
        :raises ValueError: If the specified dataset is not found.
        """
        source = self._require_data(dataset_name)
        result = transformation_func(source)
        self.store_data(f"{dataset_name}_transformed", result)
        return result

    def aggregate_data(self, dataset_name, group_by_column, agg_funcs):
        """
        Group a stored dataset by a column, aggregate it, and keep the result as
        ``<dataset_name>_aggregated``.

        :param dataset_name: The name of the dataset to aggregate.
        :param group_by_column: The column name to group the data by.
        :param agg_funcs: A dictionary specifying the aggregation functions for each column.
                          Example: {'Close': 'mean', 'Volume': 'sum'}
        :return: The aggregated dataset.
        :raises ValueError: If the specified dataset is not found.
        """
        source = self._require_data(dataset_name)
        groups = source.groupby(group_by_column)
        result = groups.agg(agg_funcs)
        self.store_data(f"{dataset_name}_aggregated", result)
        return result

    def get_statistics(self, dataset_name):
        """
        Summarise a stored dataset with its ``describe()`` method.

        :param dataset_name: The name of the dataset to analyze.
        :return: The result of calling ``describe()`` on the dataset.
        :raises ValueError: If the specified dataset is not found.
        """
        return self._require_data(dataset_name).describe()

# Optimized Quant Data Models
class IntradayDataModel(BaseDataModel):
    """
    IntradayDataModel extends BaseDataModel to handle intraday stock data.
    It provides methods for aggregating data by time intervals,
    calculating Volume Weighted Average Price (VWAP), and computing moving averages.
    """

    def __init__(self, timestamp, price, volume, symbol):
        """
        Initializes the IntradayDataModel with additional attributes for price and volume.

        :param timestamp: The timestamp associated with the data.
        :param price: The price information; stored as given (may be None).
        :param volume: The volume information; stored as given (may be None).
        :param symbol: The stock symbol (e.g., 'AAPL').
        """
        # Initialize the base attributes using the superclass constructor
        super().__init__(timestamp, symbol)
        self.price = price
        self.volume = volume

    def aggregate_by_interval(self, data, interval):
        """
        Aggregates intraday stock data by specified time intervals.

        The input frame is left untouched: the datetime conversion and the
        re-indexing are done on a copy, so the same frame can be aggregated
        again or reused by other methods.

        :param data: A pandas DataFrame containing intraday stock data.
                     Must include 'timestamp', 'Close' and 'Volume' columns.
        :param interval: The time interval for aggregation (e.g., '1H' for 1 hour).
                         It is lower-cased before being passed to ``resample``.
        :return: A pandas DataFrame with 'timestamp', mean 'Close' and summed 'Volume'.
        :raises ValueError: If the 'timestamp' column is missing or cannot be converted to datetime.
        """
        if 'timestamp' not in data.columns:
            raise ValueError("The 'timestamp' column is missing from the dataset.")
        try:
            timestamps = pd.to_datetime(data['timestamp'])
        except Exception as e:
            # Surface conversion problems as the documented ValueError
            raise ValueError(f"Error in converting 'timestamp' to datetime: {e}") from e

        # assign() and set_index() both return new frames, so `data` is not modified
        indexed = data.assign(timestamp=timestamps).set_index('timestamp')
        # Resample on the datetime index and aggregate per interval
        aggregated = indexed.resample(interval.lower()).agg({
            'Close': 'mean',   # average closing price within the interval
            'Volume': 'sum'    # total volume within the interval
        })
        # Turn the 'timestamp' index back into a regular column
        return aggregated.reset_index()

    def calculate_vwap(self, data):
        """
        Calculates the Volume Weighted Average Price (VWAP) for the given data,
        using the 'Close' price of each row weighted by its 'Volume'.

        :param data: A pandas DataFrame containing 'Close' and 'Volume' columns.
        :return: sum(Close * Volume) / sum(Volume), or 0 when the total volume is 0.
        """
        numerator = (data['Close'] * data['Volume']).sum()
        denominator = data['Volume'].sum()
        # Avoid dividing by zero when no volume was traded (or the frame is empty)
        vwap = numerator / denominator if denominator != 0 else 0
        return vwap

    def calculate_moving_average(self, data, window):
        """
        Calculates the moving average of the 'Close' price over a specified window.

        Note that the 'Moving_Average' column is added to the frame passed in
        (it is modified in place) and that same frame is returned.

        :param data: A pandas DataFrame containing a 'Close' column.
        :param window: The size of the rolling window (e.g., 5 for a 5-period moving average).
        :return: The DataFrame with an additional 'Moving_Average' column.
        """
        # Rolling mean of 'Close'; the first window-1 rows are NaN by default
        data['Moving_Average'] = data['Close'].rolling(window=window).mean()
        return data

class NewsDataModel(BaseDataModel):
    """
    NewsDataModel extends BaseDataModel to handle news articles data.
    It provides methods for filtering by sentiment, grouping by date,
    and analyzing sentiment trends over time.
    """

    def __init__(self, timestamp, headline, sentiment_score, relevance):
        """
        Initializes the NewsDataModel with additional attributes for headlines,
        sentiment scores, and relevance.

        :param timestamp: The timestamp when the news article was published.
        :param headline: The headline of the news article.
        :param sentiment_score: The sentiment score assigned to the article.
        :param relevance: The relevance score of the article.
        """
        # Initialize the base attributes using the superclass constructor (no symbol)
        super().__init__(timestamp)
        self.headline = headline
        self.sentiment_score = sentiment_score
        self.relevance = relevance

    def filter_by_sentiment(self, data, threshold):
        """
        Filters news articles based on a minimum sentiment score threshold.

        :param data: A pandas DataFrame containing news data with a 'sentiment_score' column.
        :param threshold: The minimum sentiment score required for an article to be included.
        :return: A pandas DataFrame containing only the articles that meet or exceed the threshold.
        """
        # Boolean mask keeps rows with sentiment_score >= threshold
        filtered_data = data[data['sentiment_score'] >= threshold]
        return filtered_data

    def group_by_date(self, data):
        """
        Groups news articles by their publication date, calculating the count of articles
        and the average sentiment score for each date.

        The input frame is not modified; the derived 'date' column lives only on
        an internal copy.

        :param data: A pandas DataFrame containing news data with 'headline', 'timestamp',
                     and 'sentiment_score' columns.
        :return: A pandas DataFrame with 'date', 'article_count', and 'sentiment_score' columns.
        :raises ValueError: If required columns are missing or contain null values.
        """
        if 'headline' not in data.columns or data['headline'].isnull().any():
            raise ValueError("The 'headline' column is missing or contains null values.")
        if 'sentiment_score' not in data.columns or data['sentiment_score'].isnull().any():
            raise ValueError("The 'sentiment_score' column is missing or contains null values.")
        # 'timestamp' is required as well; report it like the other missing columns
        if 'timestamp' not in data.columns:
            raise ValueError("The 'timestamp' column is missing from the dataset.")

        # assign() returns a new frame, so the caller's DataFrame gets no 'date' column
        dated = data.assign(date=pd.to_datetime(data['timestamp']).dt.date)
        grouped = dated.groupby('date').agg({
            'headline': 'count',               # number of articles per date
            'sentiment_score': 'mean'          # average sentiment score per date
        }).rename(columns={'headline': 'article_count'})

        # Reset the index to convert 'date' back to a column
        return grouped.reset_index()

    def analyze_sentiment_trend(self, data):
        """
        Analyzes the trend of sentiment scores over time by computing the average sentiment
        score for each date.

        The input frame is not modified.

        :param data: A pandas DataFrame containing news data with 'timestamp' and 'sentiment_score' columns.
        :return: A pandas DataFrame with 'date' and 'sentiment_score' columns representing the trend.
        """
        # Derive the calendar date on a copy rather than on the caller's frame
        dated = data.assign(date=pd.to_datetime(data['timestamp']).dt.date)
        trend = dated.groupby('date')['sentiment_score'].mean().reset_index()
        return trend

# Example display function replacement
def display_dataframe(name, dataframe):
    """
    Prints a titled preview of a DataFrame.

    A blank line and ``name`` followed by a colon are written first, then the
    result of ``dataframe.head()`` is printed underneath.

    :param name: Label printed as the heading for the preview.
    :param dataframe: The pandas DataFrame whose leading rows are shown.
    """
    heading = f"\n{name}:"
    print(heading)
    # Only the head is printed so large frames do not flood the cell output.
    preview = dataframe.head()
    print(preview)



In [132]:
# Test using sample `newsapi` and `yfinance` data as specified above
# Extract relevant data from NewsAPI responses
# This creates a pandas DataFrame with the necessary columns for processing
news_data = pd.DataFrame([
    {
        'timestamp': article['publishedAt'],
        'headline': article['title'],
        'sentiment_score': 0.5,  # Assigning a fixed sentiment score for mock data
        'relevance': 0.8          # Assigning a fixed relevance score for mock data
    }
    for news in [news1, news2, news3] for article in news['articles']
])

# Initialize the DataWorkbench instance
workbench = DataWorkbench()

# Store the mock datasets in the workbench with respective dataset names
workbench.store_data("news_data", news_data)   # Store news articles data
workbench.store_data("aapl_data", aapl_data)   # Store Apple stock data
workbench.store_data("msft_data", msft_data)   # Store Microsoft stock data

# Instantiate the data models with current timestamps and relevant symbols
# Note: Price and volume are set to None initially as they can be derived from the data
intraday_model = IntradayDataModel(
    timestamp=datetime.now(),
    price=None,
    volume=None,
    symbol="AAPL"  # Symbol for Apple Inc.
)

news_model = NewsDataModel(
    timestamp=datetime.now(),
    headline=None,
    sentiment_score=None,
    relevance=None
)

# Perform operations using Quant Data Models
#
# Every model call below receives its own .copy() of the dataset. The model
# methods may add helper columns (e.g. 'date') to the frame they are given;
# passing copies keeps `news_data` / `aapl_data` -- the same objects handed to
# the workbench above -- unchanged, and makes each step independent of the
# order in which the others ran.

# 1. Aggregate AAPL data by 1-hour intervals
aapl_aggregated = intraday_model.aggregate_by_interval(aapl_data.copy(), "1H")

# 2. Calculate VWAP (Volume Weighted Average Price) for AAPL
aapl_vwap = intraday_model.calculate_vwap(aapl_data.copy())

# 3. Analyze news data by filtering, grouping, and trend analysis

# a. Filter news articles with sentiment score >= 0.4
filtered_news = news_model.filter_by_sentiment(news_data.copy(), threshold=0.4)

# b. Group news articles by their publication date, counting articles and averaging sentiment
grouped_news = news_model.group_by_date(news_data.copy())

# c. Analyze the trend of sentiment scores over time
sentiment_trend = news_model.analyze_sentiment_trend(news_data.copy())

# Display the processed results using the display_dataframe function

# Display the aggregated AAPL stock data
display_dataframe("Aggregated AAPL Data", aapl_aggregated)

# Display the calculated VWAP for AAPL
print(f"\nVWAP for AAPL: {aapl_vwap}\n")

# Display the filtered news articles based on sentiment threshold
display_dataframe("Filtered News Data", filtered_news)

# Display the grouped news data by date with article counts and average sentiment
display_dataframe("Grouped News by Date", grouped_news)

# Display the sentiment trend over time
display_dataframe("Sentiment Trend", sentiment_trend)



Aggregated AAPL Data:
            timestamp  Close  Volume
0 2024-11-12 09:00:00  151.5    2100
1 2024-11-12 10:00:00  155.0    4650
2 2024-11-12 11:00:00  159.0    5500

VWAP for AAPL: 156.25102040816327


Filtered News Data:
              timestamp                         headline  sentiment_score  \
0  2024-11-14T10:00:00Z        AAPL releases new product              0.5   
1  2024-11-14T12:00:00Z        MSFT acquires new company              0.5   
2  2024-11-13T09:30:00Z     Market reacts to tech stocks              0.5   
3  2024-11-12T15:45:00Z  Economic indicators show growth              0.5   
4  2024-11-12T16:00:00Z         AAPL stock hits new high              0.5   

   relevance  
0        0.8  
1        0.8  
2        0.8  
3        0.8  
4        0.8  

Grouped News by Date:
         date  article_count  sentiment_score
0  2024-11-12              2              0.5
1  2024-11-13              1              0.5
2  2024-11-14              2              0.5

Sentiment T