In [2]:
import json
import os
import time
from collections import defaultdict
from functools import partial, wraps
from typing import NamedTuple, Optional, TypedDict

import finnhub
import numpy as np
import pandas as pd
import yfinance

In [3]:
# The Finnhub API key is read from the environment so the secret never ends up in
# the notebook or its history. Export FINNHUB_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked/rotated in the Finnhub dashboard.
_finnhub_api_key = os.environ.get('FINNHUB_API_KEY')
if not _finnhub_api_key:
    raise RuntimeError('FINNHUB_API_KEY environment variable is not set')
finnhub_client = finnhub.Client(api_key=_finnhub_api_key)

In [4]:
def with_rate_limiter(max_per_second: int):
    '''
    Build a decorator that throttles a callable to at most `max_per_second` calls per second.

    The decorated callable sleeps just long enough that consecutive calls are at least
    `1 / max_per_second` seconds apart. The interval is measured from the moment the
    previous call *finished* (whether it returned or raised), and initially from the
    moment the callable was decorated.

    :param max_per_second: maximum number of calls allowed per second; must be positive
    :return: a decorator that wraps a callable with the rate limit
    :raises ValueError: if `max_per_second` is not positive
    '''
    if max_per_second <= 0:
        raise ValueError(f'max_per_second must be positive, got {max_per_second}')
    min_interval = 1. / float(max_per_second)

    def decorate(func):
        # Seeded at decoration time, so even the very first call is spaced
        # `min_interval` away from the decoration itself.
        # A monotonic clock keeps the spacing correct if the system clock is adjusted.
        last_time_called = time.monotonic()

        # `wraps` keeps the wrapped callable's name/docstring; it tolerates callables
        # (e.g. functools.partial objects) that lack those attributes.
        @wraps(func)
        def rate_limited_function(*args, **kwargs):
            nonlocal last_time_called
            left_to_wait = min_interval - (time.monotonic() - last_time_called)
            if left_to_wait > 0:
                time.sleep(left_to_wait)
            try:
                return func(*args, **kwargs)
            finally:
                # A failed call still consumed a request slot, so it must
                # push back the next call just like a successful one.
                last_time_called = time.monotonic()
        return rate_limited_function
    return decorate

In [16]:
# Date range (as %Y-%m-%d strings) used by the download and dataset cells below.
START_DATE = "2023-09-01"
END_DATE = "2023-10-31"

In [47]:
# Price history for AAPL over the configured date range.
df = yfinance.download('AAPL', START_DATE, END_DATE)

# Resample the adjusted close to weekly ('W') frequency with forward-fill, then
# pair each weekly point with the following one so every row describes one week
# (start/end date and start/end price).
weekly_adj_close = df['Adj Close'].resample('W').ffill()
weekly_stock_df = pd.DataFrame({
    'start_date': weekly_adj_close.index[:-1],
    'end_date': weekly_adj_close.index[1:],
    'start_price': weekly_adj_close.values[:-1],
    'end_price': weekly_adj_close.values[1:],
})
# Eyeball the raw download and the derived weekly frame.
print(df.head())
print(df.tail())
print(weekly_stock_df)

[*********************100%%**********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-09-01  189.490005  189.919998  188.279999  189.460007  189.210739   
2023-09-05  188.279999  189.979996  187.610001  189.699997  189.450409   
2023-09-06  188.399994  188.850006  181.470001  182.910004  182.669342   
2023-09-07  175.179993  178.210007  173.539993  177.559998  177.326385   
2023-09-08  178.350006  180.240005  177.789993  178.179993  177.945557   

               Volume  
Date                   
2023-09-01   45732600  
2023-09-05   45280000  
2023-09-06   81755800  
2023-09-07  112488800  
2023-09-08   65551300  
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-10-24  173.050003  173.669998  171.449997  173.440002  173.211807   
2023-10-25  171.880005

In [109]:
# Maximum number of Finnhub news requests per second; passed to `with_rate_limiter`
# inside `prepare_dataset`.
FINNHUB_RATE_LIMIT_PER_SECOND = 1
# Summaries starting with this text are dropped by `prepare_dataset`
# (presumably boilerplate returned when no real summary exists — TODO confirm).
# Keep the text verbatim, odd wording included: it is matched with `str.startswith`.
FINNHUB_SUMMARY_NA = 'Looking for stock market analysis and research with proves results?'

class Week(NamedTuple):
    '''One week of data attached to a `Quarter` by `prepare_dataset`.'''
    # Row of the weekly price frame: 'start_date', 'end_date', 'start_price', 'end_price'.
    stock_data: pd.Series
    # News fetched for the week, indexed by 'datetime' with 'headline' and 'summary' columns.
    news_data: pd.DataFrame

class Quarter(NamedTuple):
    '''One financial period together with the weekly data `prepare_dataset` assigned to it.'''
    # Period label of this quarter's financial data (index value of the financials frame).
    start: str
    # Period label of the following quarter (meant as an exclusive bound);
    # None for the most recent quarter.
    end: Optional[str] # exclusive
    # Metrics reported for the period (one row of the financials frame; may contain NaN).
    financial_data: pd.Series
    # Weeks belonging to this quarter; filled in place by `prepare_dataset`.
    weeks: list[Week]
    
class Dataset(TypedDict):
    '''Result of `prepare_dataset`: a plain dict holding the collected quarters.'''
    # Quarters in ascending period order (the financials frame is sorted by index).
    quarters: list[Quarter]

class InvalidDateRangeException(Exception):
    '''Raised by `prepare_dataset` when `start_date` is later than `end_date`.'''

def prepare_dataset(ticker: str, start_date: str, end_date: str) -> Dataset:
    '''
    Prepares (a) quarterly financial data, (b) weekly stock data, and (c) weekly news data for a given ticker.
    Collects data from `start_date` to `end_date` (inclusive).

    Financial periods come from Finnhub's quarterly series. Each period becomes a `Quarter`
    covering the half-open range [period, next period). Weekly price rows whose start date
    falls in that range are attached to the quarter together with the news Finnhub returns
    for that week. Weeks for which Finnhub returns no news at all are skipped.

    :param ticker: e.g. 'AAPL'
    :param start_date: %Y-%m-%d (inclusive)
    :param end_date: %Y-%m-%d (inclusive)
    :return: Dataset
    :raises InvalidDateRangeException: if `start_date` is later than `end_date`
    '''
    start_datetime, end_datetime = pd.to_datetime(start_date), pd.to_datetime(end_date)
    if start_datetime > end_datetime:
        raise InvalidDateRangeException(f'Invalid date range: {start_date} > {end_date}')

    results: list[Quarter] = []

    # Pivot Finnhub's {metric: [{'period': ..., 'v': ...}, ...]} layout into
    # one row per period with one column per metric.
    financial_data = finnhub_client.company_basic_financials(ticker, 'all')
    by_quarter = defaultdict(dict)
    for metric, values in financial_data['series']['quarterly'].items():
        for value in values:
            by_quarter[value['period']][metric] = value['v']
    financial_df = pd.DataFrame.from_dict(by_quarter, orient='index').sort_index()

    # yfinance treats `end` as exclusive, so ask for one extra day to honour the
    # inclusive `end_date` promised by this function.
    download_end = (end_datetime + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    stock_df: pd.DataFrame = yfinance.download(ticker, start=start_date, end=download_end)
    # resample to weekly data; last week might be incomplete,
    # so we use the last day's data for the whole week
    weekly_adj_close = stock_df['Adj Close'].resample('W').ffill()
    weekly_start_dates = weekly_adj_close.index[:-1]
    weekly_stock_df = pd.DataFrame({
        'start_date': weekly_start_dates.strftime('%Y-%m-%d'),
        'end_date': weekly_adj_close.index[1:].strftime('%Y-%m-%d'),
        'start_price': weekly_adj_close.values[:-1],
        'end_price': weekly_adj_close.values[1:],
    }, index=weekly_start_dates)

    # One limiter for the whole run: re-creating it per quarter would reset its
    # timing state and is loop-invariant work anyway.
    get_company_news_limited = with_rate_limiter(FINNHUB_RATE_LIMIT_PER_SECOND)(
        partial(finnhub_client.company_news, ticker))

    qrtr_starts = financial_df.index
    # Each quarter ends where the next one starts; the latest quarter is open-ended.
    qrtr_ends = list(qrtr_starts[1:]) + [None]
    for qrtr_start, qrtr_end in zip(qrtr_starts, qrtr_ends):
        # Period labels and the date arguments are compared as strings, which is
        # only valid because both are expected to be in %Y-%m-%d form.
        # `qrtr_end` is exclusive, so a quarter ending exactly at `start_date`
        # does not overlap the requested range.
        if qrtr_start > end_date or (qrtr_end and qrtr_end <= start_date):
            continue

        quarter = Quarter(
            start=qrtr_start,
            end=qrtr_end,
            financial_data=financial_df.loc[qrtr_start],
            weeks=[])

        # Half-open selection [qrtr_start, qrtr_end): a label slice would include
        # both endpoints and put a boundary week into two quarters.
        week_starts = weekly_stock_df.index
        in_quarter = week_starts >= qrtr_start
        if qrtr_end is not None:
            in_quarter = in_quarter & (week_starts < qrtr_end)

        for _, stock_ser in weekly_stock_df.loc[in_quarter].iterrows():
            news_df = pd.DataFrame.from_records(get_company_news_limited(
                _from=stock_ser['start_date'], to=stock_ser['end_date']))
            if news_df.empty:
                # TODO: log
                continue
            news_df = news_df[['datetime', 'headline', 'summary']].set_index('datetime')
            # Drop items without a usable summary (empty or the known placeholder text).
            has_summary = news_df['summary'].str.len() > 0
            is_placeholder = news_df['summary'].str.startswith(FINNHUB_SUMMARY_NA, na=False)
            news_df = news_df.loc[has_summary & ~is_placeholder]
            quarter.weeks.append(Week(stock_ser, news_df))

        results.append(quarter)
    return Dataset(quarters=results)

def _json_ready(data):
    '''Turn a pandas Series/DataFrame into a plain dict with missing values mapped to None.'''
    # Cast to object first: on a float dtype the None would be coerced back to NaN,
    # which `json.dump` would emit as the non-standard token `NaN`.
    return data.astype(object).where(data.notna(), None).to_dict()


def dataset_to_json(dataset: Dataset, save_to: str):
    '''
    Serialise a dataset to a JSON file.

    The file contains a single 'quarters' list. Every quarter carries its 'start', 'end',
    'financial_data' and a list of 'weeks', each with 'stock_data' and 'news_data'.
    Series/DataFrames are written in the default `to_dict()` layout (for the news frame:
    column -> {index value -> cell}); missing values are written as `null`.

    :param dataset: dataset as returned by `prepare_dataset`
    :param save_to: path of the JSON file to write; missing parent directories are created
    '''
    as_json = {
        'quarters': [
            {
                'start': quarter.start,
                'end': quarter.end,
                'financial_data': _json_ready(quarter.financial_data),
                'weeks': [
                    {
                        'stock_data': _json_ready(week.stock_data),
                        'news_data': _json_ready(week.news_data),
                    } for week in quarter.weeks
                ],
            } for quarter in dataset['quarters']
        ]
    }
    # Writing into a directory that does not exist yet would otherwise fail in `open`.
    parent_dir = os.path.dirname(save_to)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_to, 'w', encoding='utf-8') as f:
        json.dump(as_json, f, indent=2)

def preprompt(ticker: str) -> str:
    '''
    Render the "[Company Information]" preamble for a ticker.

    Looks up the company profile via `finnhub_client.company_profile2` and fills the
    template with the profile's fields (name, finnhubIndustry, ipo, marketCapitalization,
    currency, shareOutstanding, country, ticker, exchange).

    :param ticker: e.g. 'AAPL'
    :return: the formatted company description
    '''
    # Written line by line so the significant whitespace (the indented blank
    # second line and the trailing spaces) stays visible in the source.
    template = (
        '[Company Information]:\n'
        '    \n'
        '{name} is a leading entity in the {finnhubIndustry} sector. \n'
        'Incorporated and publicly traded since {ipo}, the company has established its reputation as one of the key players in the market. \n'
        'As of today, {name} has a market capitalization of {marketCapitalization:.2f} in {currency}, with {shareOutstanding:.2f} shares outstanding.\n'
        '\n'
        '{name} operates primarily in the {country}, trading under the ticker {ticker} on the {exchange}. \n'
        'As a dominant force in the {finnhubIndustry} space, the company continues to innovate and drive progress within the industry.'
    )
    company_profile = finnhub_client.company_profile2(symbol=ticker)
    return template.format(**company_profile)

def prompts(ticker: str, dataset: Dataset):
    '''
    Build one text prompt per week in the dataset.

    Each prompt states how the price moved over the week, lists the week's news
    (headline and summary) and ends with the non-missing financial metrics of the
    quarter the week belongs to.

    :param ticker: ticker symbol used in the prompt wording, e.g. 'AAPL'
    :param dataset: dataset as returned by `prepare_dataset`
    :return: list of prompt strings, ordered by quarter and then by week
    '''
    def describe_move(first, last):
        # Wording for the price change between the two ends of the week.
        if first < last:
            return 'increased'
        if first > last:
            return 'decreased'
        return 'remained unchanged'

    rendered = []
    for quarter in dataset['quarters']:
        # The financial section is the same for every week of the quarter.
        reported = quarter.financial_data.loc[quarter.financial_data.notna()]
        financial_lines = '\n'.join(f'{k}: {v}' for k, v in reported.items())

        for week in quarter.weeks:
            stock = week.stock_data
            start_date, end_date = stock['start_date'], stock['end_date']
            start_price, end_price = stock['start_price'], stock['end_price']
            term = describe_move(start_price, end_price)

            price_section = f'''From {start_date} to {end_date}, {ticker}'s stock price {term} from {start_price:.2f} to {end_price:.2f}.


Company news during this period are listed below:

'''
            news_section = '\n'.join(
                f'[Headline]: {a.headline}\n[Summary]: {a.summary}\n'
                for a in week.news_data.itertuples())
            financial_header = f'''


Some recent basic financials of {ticker}, reported at {quarter.start}, are presented below:

[Basic Financials]:
'''
            rendered.append(price_section + news_section + financial_header + financial_lines)
    return rendered

In [74]:
# Collect financials, weekly prices and news for AAPL over the configured range.
# Queries Finnhub and yfinance; news requests are rate limited, so this can take a while.
dataset = prepare_dataset('AAPL', START_DATE, END_DATE)

[*********************100%%**********************]  1 of 1 completed
True
True
True
True
True
True
True
True
True


In [107]:
# Persist the collected dataset as JSON under ./data/, named after the date range.
dataset_to_json(dataset, f'./data/{START_DATE}_{END_DATE}.json')

In [None]:
# Preview the first generated weekly prompt.
# (Disabled: preview of the company-profile preamble.)
# print(preprompt('AAPL'))
print(prompts('AAPL', dataset)[0])