## Data Acquisition

In [1]:
import os

import numpy as np
import pandas as pd
import requests

In [2]:
def request_news(apiKey: str, ticker: str, startDate: str, endDate: str):
    """
    Requests financial news for a specific company within a specific date range from the Finnhub API.

    Args:
        apiKey: Your Finnhub API Key
        ticker: The ticker symbol for the stock you are interested in
        startDate: the start date for the date range, formatted YYYY-MM-DD
        endDate: the end date for the date range, formatted YYYY-MM-DD

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status code
        requests.Timeout: if the server does not answer within the timeout
    """
    url = 'https://finnhub.io/api/v1/company-news'
    # Let requests build (and URL-encode) the query string instead of concatenating it by hand.
    params = {
        'symbol': ticker,
        'from': startDate,
        'to': endDate,
        'token': apiKey,
    }
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, params=params, timeout=30)
    # Fail here with a clear HTTP error rather than handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

def parse(response: list):
    """
    Builds a Pandas DataFrame from the news items of a company-news response.

    Args:
        response: The list of news items; every item must provide the keys
            'id', 'datetime', 'source' and 'summary'

    Returns a Pandas DataFrame with one row per item and the columns
    'id', 'datetime', 'source', 'summary' (in that order); other keys are ignored.
    """
    fields = ('id', 'datetime', 'source', 'summary')
    columns = {}
    for field in fields:
        # One list per output column, in the order the items were received.
        columns[field] = [item[field] for item in response]
    return pd.DataFrame(columns)

## API will cut off number of results if time period is too long, perform separate request for each month

# Read the key from the environment so a real key never has to be saved in the notebook;
# the placeholder is kept only as a fallback.
api_key = os.environ.get('FINNHUB_API_KEY', 'Your API Key')
ticker = 'RIVN'

# (start, end) pairs, one per calendar month, newest month first
dates = [
    ('2024-12-01', '2024-12-31'),
    ('2024-11-01', '2024-11-30'),
    ('2024-10-01', '2024-10-31'),
    ('2024-09-01', '2024-09-30'),
    ('2024-08-01', '2024-08-31'),
    ('2024-07-01', '2024-07-31')
    ]

rivian_monthly_df = [parse(request_news(apiKey = api_key, ticker = ticker, startDate = start, endDate = end)) for start, end in dates]
# ignore_index: every monthly frame is numbered from 0, so without it the row labels would repeat across months
rivian_df = pd.concat(rivian_monthly_df, axis = 0, ignore_index = True)

In [3]:
import yfinance as yf

# Price history for RIVN from yfinance, reduced to the columns left after dropping the ones listed below.
# NOTE(review): period='1y' is relative to the day the notebook is run, while the news
# requests above use fixed 2024 dates - confirm the two windows still overlap on a re-run.
ticker = yf.Ticker('RIVN')

historical = (
    ticker.history(period='1y')
    .reset_index()  # turn the index into a regular column
    .drop(columns=['High', 'Low', 'Volume', 'Dividends', 'Stock Splits'])
)

## Data Pre-Processing and Aggregation

In [4]:
# Interpret the raw values as seconds since the epoch (UTC) and express them in New York time
rivian_df['datetime'] = pd.to_datetime(rivian_df['datetime'], unit = 's', utc = True).dt.tz_convert("America/New_York")

# Keep only Yahoo-sourced articles that have a non-empty summary.
# .copy() makes the result an independent frame, so the assignment below does not write into a slice
# of the unfiltered data (avoids SettingWithCopyWarning).
rivian_df = rivian_df[(rivian_df['source'].isin(['Yahoo'])) & (rivian_df['summary'] != '')].copy()

# Move date of articles published after market-close (4pm ET) one day forward given new sentiment after market close will only impact following day

after_close = rivian_df['datetime'].dt.hour >= 16
rivian_df.loc[after_close, 'datetime'] = rivian_df.loc[after_close, 'datetime'] + pd.Timedelta(days = 1)

In [5]:
# Calculate log returns and log overnight returns
#   Returns[i]           = log(Close[i] / Close[i-1])
#   Overnight_Returns[i] = log(Open[i]  / Close[i-1])
# Vectorised with shift() instead of element-by-element Python loops.

prev_close = historical['Close'].shift(1)

returns = np.log(historical['Close'] / prev_close)
overnight_returns = np.log(historical['Open'] / prev_close)

# The first row has no previous close; it is set to 0 (slice assignment is a no-op on an empty frame)
returns.iloc[:1] = 0
overnight_returns.iloc[:1] = 0

historical['Returns'] = returns
historical['Overnight_Returns'] = overnight_returns


In [6]:
# Merge dataframes and forward fill to account for articles published over weekends/holidays

# Reduce both timestamp columns to plain calendar dates so they can be matched on equality
rivian_df['datetime'] = rivian_df['datetime'].dt.date
historical['Date'] = historical['Date'].dt.date

rivian_df = rivian_df.rename(columns={'datetime': 'Date'})

# Left merge keeps every article; articles dated on a day without a price row get NaN price columns
rivian_df = pd.merge(rivian_df, historical, how = 'left', on = 'Date')

# Forward fill the price columns only, so a missing article field is never overwritten
# with the value from a different article.
# NOTE(review): ffill copies from the preceding row, so which trading day a weekend/holiday
# article inherits depends on the row order of rivian_df - confirm that order is the intended one.
price_columns = [column for column in historical.columns if column != 'Date']
rivian_df[price_columns] = rivian_df[price_columns].ffill()

## Performing Sentiment Analysis 

In [7]:
from transformers import pipeline
import nltk

In [8]:
## Load the finBERT model first so that its own tokenizer can be used for the length filter

sent_pipeline = pipeline('sentiment-analysis', model = r'ProsusAI/finbert')

## Filter out responses with too many tokens for chosen model
## Lengths are measured with the model's tokenizer (special tokens included) instead of an nltk word count:
## the model tokenizer can split one word into several tokens, so a word count can under-estimate the input length.

max_tokens = 512
token_counts = [len(sent_pipeline.tokenizer(summary)['input_ids']) for summary in rivian_df['summary']]
# .copy() so the new columns below are added to an independent frame rather than a slice
rivian_df = rivian_df[np.asarray(token_counts, dtype = int) <= max_tokens].copy()

## Perform analysis using finBERT model, store labels and scores as columns in original dataframe

# Each call returns a list; its first element is read below for 'label' and 'score'
sentiment_scores = [sent_pipeline(summary) for summary in rivian_df['summary']]

rivian_df['sent_label'] = [result[0]['label'] for result in sentiment_scores]
rivian_df['sent_score'] = [result[0]['score'] for result in sentiment_scores]

### Export data to CSV

In [10]:
# Make sure the output folder exists first; to_csv does not create missing directories
os.makedirs('CSVs', exist_ok = True)
rivian_df.to_csv('CSVs/rivian_df.csv', index = False)

In [11]:
# Make sure the output folder exists first; to_csv does not create missing directories
os.makedirs('CSVs', exist_ok = True)
historical.to_csv('CSVs/historical.csv', index = False)