# Import libraries

In [24]:
import polars as pl
import requests
import os

# Fetch News Using Alpha Vantage

In [22]:
def build_endpoint(stock_ticker, api_key, start_date, end_date=None):
    """
    Constructs the API endpoint for fetching news data.

    Args:
        stock_ticker: Ticker symbol (or comma-separated symbols) to query.
        api_key: Alpha Vantage API key.
        start_date: Start date as a YYYYMMDD string; "T0130" is appended to it.
        end_date: Optional end date as a YYYYMMDD string; "T0130" is appended
            to it. When falsy, no ``time_to`` parameter is sent.

    Returns:
        The full request URL as a string.
    """
    from urllib.parse import urlencode

    # Insertion order is kept so the URL matches the historical layout:
    # function, tickers, time_from, [time_to], limit, apikey.
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": stock_ticker,
        "time_from": f"{start_date}T0130",
    }
    if end_date:
        params["time_to"] = f"{end_date}T0130"
    params["limit"] = "1000"
    params["apikey"] = api_key

    # Percent-encode values so characters such as "&", "=" or spaces cannot
    # corrupt the query string; ":" and "," stay literal because they are
    # harmless in a query and keep ticker lists readable.
    query = urlencode(params, safe=":,")
    return f"https://www.alphavantage.co/query?{query}"

def fetch_news_data(stock_ticker, api_key, start_date, end_date=None, timeout=30):
    """
    Makes an API request and returns the response data.

    Args:
        stock_ticker: Ticker symbol to query.
        api_key: Alpha Vantage API key.
        start_date: Start date as a YYYYMMDD string.
        end_date: Optional end date as a YYYYMMDD string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload, or None when the request fails, the
        response is not valid JSON, or the payload carries no "feed" entry.
    """
    endpoint = build_endpoint(stock_ticker, api_key, start_date, end_date)

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(endpoint, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain
        # the request URL, which embeds the API key.
        print(f"Failed to fetch data for {stock_ticker}: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return None

    try:
        payload = response.json()
    except ValueError:
        print(f"Failed to fetch data for {stock_ticker}: response was not valid JSON")
        return None

    # NOTE(review): the API appears to answer HTTP 200 with a message-only
    # body (no "feed") when a call limit is hit -- confirm against the
    # Alpha Vantage docs. Surface it instead of silently yielding no rows.
    if isinstance(payload, dict) and "feed" not in payload:
        print(f"No news feed returned for {stock_ticker}: {payload}")
        return None

    return payload

def convert_to_dataframe(news_items):
    """
    Builds a Polars DataFrame from the "feed" entries of an API response.

    Args:
        news_items: Mapping that may hold a "feed" list of article records.

    Returns:
        A DataFrame of the feed records with an extra "date" column taken
        from the first 10 characters of "time_published", or an empty
        DataFrame when the feed is missing or empty.
    """
    feed = news_items.get("feed", [])
    if not feed:
        return pl.DataFrame()

    # NOTE(review): if "time_published" is formatted YYYYMMDDTHHMMSS, the
    # first 10 characters include "T" and an hour digit -- confirm whether
    # 8 was intended before relying on this column as a pure date.
    published_date = pl.col("time_published").str.slice(0, 10).alias("date")
    return pl.DataFrame(feed).with_columns(published_date)

def aggregate_news_data(stock_ticker_list, api_key, start_date, end_date=None):
    """
    Fetches and aggregates news data for multiple stock tickers.

    Args:
        stock_ticker_list: Iterable of ticker symbols to query one by one.
        api_key: Alpha Vantage API key.
        start_date: Start date as a YYYYMMDD string.
        end_date: Optional end date as a YYYYMMDD string.

    Returns:
        One DataFrame with the rows of every ticker stacked vertically, or an
        empty DataFrame when no ticker produced any rows.
    """
    frames = []
    for stock_ticker in stock_ticker_list:
        news_data = fetch_news_data(stock_ticker, api_key, start_date, end_date)
        if not news_data:
            continue

        df = convert_to_dataframe(news_data)
        # convert_to_dataframe returns a column-less frame when a response
        # has no articles; such a frame adds no rows and its schema does not
        # match the populated frames, so keep it out of the vertical concat.
        if df.height > 0:
            frames.append(df)

    return pl.concat(frames, how='vertical') if frames else pl.DataFrame()

def get_news():
    """
    Downloads news for a fixed list of top S&P stocks and saves the result.

    Prompts for an Alpha Vantage API key, fetches news starting 2023-01-01,
    prepends the previously saved data from
    '../Data/aggr_news_previous.json' when that file exists, and writes the
    outcome to '../Data/aggr_news_final.json'.
    """
    # Top stocks in S & P
    tickers = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'META', 'TSLA', 'LLY', 'JPM', 'WMT']

    # NOTE - Over a few days, we collected and stored data in separate files
    # due to the API's daily call limits. The commented code details our
    # approach for achieving this.
    #   api_key = input ("Please provide your Alpha Vantage API key :")
    #   news_2022 = aggregate_news_data(stock_ticker_list, api_key, '20220101')
    #   news_2023 = aggregate_news_data(stock_ticker_list, api_key, '20230101')
    #   aggr_news_previous = pl.concat([news_2022, news_2023])
    #   aggr_news_previous.write_json('../Data/aggr_news_previous.json')

    key = input("Please provide your Alpha Vantage API key :")
    latest_news = aggregate_news_data(tickers, key, '20230101')

    try:
        earlier_news = pl.read_json('../Data/aggr_news_previous.json')
    except FileNotFoundError:
        # No earlier snapshot on disk: the fresh download is the whole result.
        combined_news = latest_news
    else:
        combined_news = pl.concat([earlier_news, latest_news])

    combined_news.write_json('../Data/aggr_news_final.json')
    
def get_news_yearly(base_dir, year):
    """
    Downloads one calendar year of news for a fixed ticker list and saves it.

    Prompts for an Alpha Vantage API key, fetches news for each ticker and
    writes the combined result to '<base_dir>/aggr_news_<year>.json'.

    NOTE - data was collected over a few days and stored in separate per-year
    files because of the API's daily call limits.

    Args:
        base_dir: Directory the JSON file is written to.
        year: Four-digit year (YYYY), as a string or int. Surrounding
            whitespace is ignored.

    Raises:
        ValueError: If ``year`` is not exactly four ASCII digits.
    """
    from datetime import date

    # The year comes straight from input(); validate it before it is baked
    # into the request date and the output filename.
    year_text = str(year).strip()
    if len(year_text) != 4 or not (year_text.isascii() and year_text.isdigit()):
        raise ValueError(f"year must be in YYYY format, got {year!r}")

    stock_ticker_list = ['MSFT', 'AAPL', 'NVDA', 'GOOG', 'AMZN', 'META', 'TSLA', 'LLY', 'JPM', 'AVGO', 'WMT']

    # Bound the window to the requested year. Per the Alpha Vantage
    # NEWS_SENTIMENT documentation, a request without time_to runs up to the
    # present, so files for past years would otherwise contain later articles.
    # For the current (or a future) year no upper bound is sent.
    next_year = int(year_text) + 1
    end_date = f'{next_year}0101' if next_year <= date.today().year else None

    api_key = input("Please provide your Alpha Vantage API key :")
    news_for_year = aggregate_news_data(stock_ticker_list, api_key, f'{year_text}0101', end_date)

    news_for_year.write_json(os.path.join(base_dir, f'aggr_news_{year_text}.json'))
    

In [26]:
# Directory where the per-year JSON files are stored.
base_dir = '../Data'
# Strip stray whitespace from the typed value so it cannot leak into the
# request date or the output filename built from it.
year = input ("Please provide the year in YYYY format:").strip()
get_news_yearly(base_dir, year)

Please provide the year in YYYY : 2024


In [29]:
# Concatenate the saved data
def concat_json(base_dir, list_json_files, save_filename):
    """
    Concatenates several saved JSON news files into a single JSON file.

    Args:
        base_dir: Directory that holds the input files and receives the output.
        list_json_files: File names (relative to ``base_dir``) to read, in the
            order their rows should appear.
        save_filename: Name of the output file, written inside ``base_dir``.
    """
    # Read everything first and concatenate once, rather than re-copying the
    # accumulated frame on every iteration.
    frames = [
        pl.read_json(os.path.join(base_dir, json_file))
        for json_file in list_json_files
    ]
    aggr_news_final = pl.concat(frames) if frames else pl.DataFrame()

    # Honor the caller-supplied name; it was previously ignored in favor of a
    # hard-coded 'aggr_news_final.json'.
    aggr_news_final.write_json(os.path.join(base_dir, save_filename))

In [30]:
# Per-year files produced by get_news_yearly, merged in chronological order.
list_json_files = [f'aggr_news_{saved_year}.json' for saved_year in (2022, 2023, 2024)]
concat_json(base_dir, list_json_files, save_filename="aggr_news_final.json")