In [1]:
import pandas as pd 
from nltk.corpus import stopwords
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from numpy import argmax
from torch.nn.functional import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face model id of the checkpoint used by every sentiment cell below.
model_name = "ProsusAI/finbert"
# Load the sequence-classification model and the tokenizer from that same
# checkpoint so the token ids line up with the model's vocabulary.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# Load the scraped news table and tidy it up in one pass:
#   * strip stray whitespace from the column headers (the previous loop only
#     rebound its loop variable to the un-called `str.strip` method, so the
#     headers were never actually changed),
#   * drop the leftover index column written by an earlier `to_csv`,
#   * discard rows with any missing value and renumber the index from 0.
df = (
    pd.read_csv('FinalStockNews.csv')
    .rename(columns=str.strip)
    .drop(columns='Unnamed: 0')
    .dropna()
    .reset_index(drop=True)
)
df

Unnamed: 0,Unnamed: 0.1,ticker,title,date,link,articleInfo,paragraphList
0,0,AMZN,Amazon Analyst Predicts Lead in AI Revolution ...,2024-04-18T00:00:00,/news/stocks/amazon-analyst-predicts-lead-in-a...,Needham analyst Laura Martin reiterated Amazon...,['Needham\xa0analyst Laura Martin reiterated\x...
1,1,AMZN,Magnificent 7 Killers: 3 Growth Stocks Ready t...,2024-04-18T00:00:00,/news/stocks/magnificent-7-killers-3-growth-st...,Identifying the next growth stocks to challeng...,"['InvestorPlace - Stock Market News, Stock Adv..."
2,2,AMZN,Zoom Leverages 'AI-Powered Advancements' For G...,2024-04-18T00:00:00,/news/stocks/zoom-leverages-ai-powered-advance...,Zoom Video Communications Inc (NASDAQ:ZM) is h...,['Zoom Video Communications Inc (NASDAQ:ZM) is...
3,3,AMZN,"7 Must-Have AI Stocks, Fed Rate Cuts or No Fed...",2024-04-18T00:00:00,/news/stocks/7-must-have-ai-stocks-fed-rate-cu...,Markets have remained strong in 2024 primarily...,"['InvestorPlace - Stock Market News, Stock Adv..."
4,4,AMZN,7 Growth Stocks to Buy Now: Q2 Edition,2024-04-18T00:00:00,/news/stocks/7-growth-stocks-to-buy-now-q2-edi...,Investing in growth stocks can deliver outsize...,"['InvestorPlace - Stock Market News, Stock Adv..."
...,...,...,...,...,...,...,...
1566,1566,AAPL,Buy Rating for Apple Stock: Anticipated Growth...,2024-03-14T00:00:00,/news/stocks/buy-rating-for-apple-stock-antici...,Analyst Atif Malik of Citi maintained a Buy ra...,['Analyst Atif Malik of Citi maintained a Buy ...
1567,1567,AAPL,Tesla Worst Performing S&P 500 Stock Year-To-D...,2024-03-14T00:00:00,/news/stocks/tesla-worst-performing-s-p-500-st...,Electric vehicle leader Tesla Inc (NASDAQ:TSLA...,['Electric vehicle leader Tesla Inc (NASDAQ:TS...
1568,1568,AAPL,Apple Vision Pro's Code Hints At Potential Lau...,2024-03-14T00:00:00,/news/stocks/apple-vision-pro-s-code-hints-at-...,(RTTNews) - Tech giant Apple's (AAPL) sensatio...,"[""(RTTNews) - Tech giant Apple's (AAPL) sensat..."
1569,1569,AAPL,From The Oscars To The Stock Market: How Music...,2024-03-14T00:00:00,/news/etf/from-the-oscars-to-the-stock-market-...,Amidst the star-studded affair of the 96th Aca...,['Amidst the star-studded affair of the 96th A...


In [4]:
def getDfStock(ticker, startDate, endDate):
    """Return the news rows for one ticker inside an inclusive date range.

    Reads the notebook-level ``df``. As before, ``df['date']`` is converted to
    datetime in place, but only when it is not already a datetime column, so
    repeated calls do not re-parse the whole column.

    Args:
        ticker: Value matched exactly against ``df['ticker']``.
        startDate: First day to include; anything ``pd.to_datetime`` accepts.
        endDate: Last day to include; anything ``pd.to_datetime`` accepts.

    Returns:
        A new DataFrame (an independent copy, original index labels kept)
        holding the matching rows. Empty when nothing matches.
    """
    # Convert the bounds to Timestamps so they compare correctly with the column.
    start_date = pd.to_datetime(startDate)
    end_date = pd.to_datetime(endDate)

    # Parse the 'date' column once; later calls find it already converted.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'])

    # `between` is inclusive on both ends by default.
    in_range = df['date'].between(start_date, end_date)

    # Return a copy so callers can add columns without a SettingWithCopyWarning
    # and without any ambiguity about whether `df` itself is affected.
    return df.loc[(df['ticker'] == ticker) & in_range].copy()

In [5]:
# Check getDfStock: every returned row must carry the requested ticker and a
# date inside the requested (inclusive) range. The frame is still displayed
# afterwards for a visual check.
dfNew = getDfStock('AAPL', '2024-03-01', '2024-03-15')
assert (dfNew['ticker'] == 'AAPL').all()
assert dfNew['date'].between(pd.Timestamp('2024-03-01'), pd.Timestamp('2024-03-15')).all()
dfNew

Unnamed: 0,Unnamed: 0.1,ticker,title,date,link,articleInfo,paragraphList
1559,1559,AAPL,QQQI: The Undiscovered Nasdaq-100 Covered Call...,2024-03-15,/news/etf/qqqi-the-undiscovered-nasdaq-100-cov...,"A few months ago, I analyzed the total perform...","['A few months ago, I analyzed the total perfo..."
1560,1560,AAPL,The 3 Most Undervalued Large-Cap Stocks to Buy...,2024-03-15,/news/stocks/the-3-most-undervalued-large-cap-...,Value investing has long been a popular and su...,"['InvestorPlace - Stock Market News, Stock Adv..."
1561,1561,AAPL,Apple Acquires Canadian Startup DarwinAI: Repo...,2024-03-15,/news/stocks/apple-acquires-canadian-startup-d...,(RTTNews) - Tech giant Apple Inc.(AAPL) has re...,['(RTTNews) - Tech giant Apple Inc.(AAPL) has ...
1562,1562,AAPL,Nvidia Blazes Trail For 3 Further Phases Of AI...,2024-03-15,/news/stocks/nvidia-blazes-trail-for-3-further...,NVIDIA Corp (NASDAQ:NVDA) blazed the trail for...,['NVIDIA Corp (NASDAQ:NVDA) blazed the trail f...
1563,1563,AAPL,"Old Dogs, New Tricks: 3 Dow Stocks Poised for ...",2024-03-15,/news/stocks/old-dogs-new-tricks-3-dow-stocks-...,"Although the S&P 500 is at all-time highs, som...","['InvestorPlace - Stock Market News, Stock Adv..."
1564,1564,AAPL,Don’t Fight the Tide: Nvidia’s AI Momentum Is ...,2024-03-15,/news/stocks/dont-fight-the-tide-nvidias-ai-mo...,Nvidia (NASDAQ:NVDA) has been on an absolute t...,"['InvestorPlace - Stock Market News, Stock Adv..."
1565,1565,AAPL,'This Is A Long-Term Bull Market': Expert Ed Y...,2024-03-15,/news/stocks/this-is-a-long-term-bull-market-e...,"Ed Yardeni, the renowned economist and market ...","['Ed Yardeni, the renowned economist and marke..."
1566,1566,AAPL,Buy Rating for Apple Stock: Anticipated Growth...,2024-03-14,/news/stocks/buy-rating-for-apple-stock-antici...,Analyst Atif Malik of Citi maintained a Buy ra...,['Analyst Atif Malik of Citi maintained a Buy ...
1567,1567,AAPL,Tesla Worst Performing S&P 500 Stock Year-To-D...,2024-03-14,/news/stocks/tesla-worst-performing-s-p-500-st...,Electric vehicle leader Tesla Inc (NASDAQ:TSLA...,['Electric vehicle leader Tesla Inc (NASDAQ:TS...
1568,1568,AAPL,Apple Vision Pro's Code Hints At Potential Lau...,2024-03-14,/news/stocks/apple-vision-pro-s-code-hints-at-...,(RTTNews) - Tech giant Apple's (AAPL) sensatio...,"[""(RTTNews) - Tech giant Apple's (AAPL) sensat..."


In [6]:
def sliding_window(text, tokenizer, max_length=512, overlap=50):
    """Tokenize ``text`` and cut the token ids into overlapping windows.

    The text is encoded once (special tokens included). The ids are then
    sliced every ``max_length - overlap`` positions into windows of at most
    ``max_length`` ids, and each window is wrapped in
    ``tokenizer.cls_token_id`` / ``tokenizer.sep_token_id``.

    Args:
        text: The string to tokenize.
        tokenizer: Object providing ``encode_plus``, ``cls_token_id`` and
            ``sep_token_id``.
        max_length: Maximum number of encoded ids taken per window, before the
            wrapping CLS/SEP ids are added.
        overlap: Number of ids shared between consecutive windows.

    Returns:
        A list of windows, each a list of ints of length at most
        ``max_length + 2``.

    Raises:
        ValueError: If ``overlap >= max_length``; the window would never
            advance. Previously ``overlap > max_length`` silently produced
            an empty list.

    NOTE(review): the encoding already contains the tokenizer's special
    tokens, so the first window starts with two CLS ids and the last
    window(s) end with two SEP ids, and a window can be two ids longer than
    ``max_length``. The notebook's assertions pin exactly this output, so it
    is kept as is here; revisit together with those assertions.
    """
    step_size = max_length - overlap
    if step_size <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_length ({max_length})"
        )

    tokenized_text = tokenizer.encode_plus(text, return_tensors='pt', add_special_tokens=True)
    # Flatten explicitly: `.squeeze()` would collapse a one-token encoding to a
    # scalar, after which `len()` fails.
    input_ids = tokenized_text['input_ids'].reshape(-1).tolist()

    return [
        [tokenizer.cls_token_id] + input_ids[start:start + max_length] + [tokenizer.sep_token_id]
        for start in range(0, len(input_ids), step_size)
    ]


In [7]:
# unit test sliding window
# Each case is (text, max_length, overlap, expected windows). In the expected
# ids, 101 and 102 are the ids the tokenizer wraps around every window.
for _text, _max_length, _overlap, _expected in [
    (
        "This is the test", 2, 1,
        [[101, 101, 2023, 102],
         [101, 2023, 2003, 102],
         [101, 2003, 1996, 102],
         [101, 1996, 3231, 102],
         [101, 3231, 102, 102],
         [101, 102, 102]],
    ),
    (
        "The fox jumped over the lake and tackled Tom Brady", 6, 3,
        [[101, 101, 1996, 4419, 5598, 2058, 1996, 102],
         [101, 5598, 2058, 1996, 2697, 1998, 26176, 102],
         [101, 2697, 1998, 26176, 3419, 10184, 102, 102],
         [101, 3419, 10184, 102, 102]],
    ),
]:
    assert sliding_window(_text, tokenizer, _max_length, _overlap) == _expected

In [8]:
def process_chunks(chunks, tokenizer, model, max_length=512):
    """Classify every token window and report its top class and probability.

    Each window is turned into a ``(1, max_length)`` tensor: longer windows
    are cut on the right, shorter ones are right-padded with
    ``tokenizer.pad_token_id``. The model is called once per window.

    Args:
        chunks: Iterable of token-id lists (e.g. the output of
            ``sliding_window``).
        tokenizer: Object providing ``pad_token_id``.
        model: Callable taking the id tensor and returning an object with a
            ``logits`` attribute; it is switched to eval mode first.
        max_length: Fixed sequence length fed to the model. Defaults to 512,
            the value that was previously hard-coded.

    Returns:
        ``(sentiments, scores)``: two lists aligned with ``chunks``. These
        hold the argmax class index (plain ``int``) and the largest softmax
        probability (``float``) for each window.

    NOTE(review): no ``attention_mask`` is passed, so padding positions are
    attended to (transformers warns about this). The notebook's assertions
    pin the resulting numbers, so the call is kept as is here; masking the
    padding will change every score and needs those assertions refreshed.
    """
    model.eval()
    sentiments = []
    scores = []

    with torch.no_grad():
        for chunk in chunks:
            inputs = torch.tensor(chunk).unsqueeze(0)

            # Cut over-long windows explicitly. The previous code got the same
            # result as a side effect of passing a negative pad width.
            inputs = inputs[:, :max_length]
            pad_width = max_length - inputs.shape[1]
            inputs = torch.nn.functional.pad(inputs, (0, pad_width), value=tokenizer.pad_token_id)

            logits = model(inputs).logits
            # Plain int, consistent with the plain float stored for the score.
            sentiments.append(int(torch.argmax(logits, dim=1)[0]))
            scores.append(torch.softmax(logits, dim=1).max().item())

    return sentiments, scores


In [9]:
# unit tests for process_chunks
# Class indices are compared exactly; probabilities are compared with a
# tolerance because float results from the model can differ in the last digits
# between machines and library versions.
chunk1 = [[101, 101, 2023, 102],
 [101, 2023, 2003, 102],
 [101, 2003, 1996, 102],
 [101, 1996, 3231, 102],
 [101, 3231, 102, 102],
 [101, 102, 102]]
sentiments1, scores1 = process_chunks(chunk1, tokenizer, model)
assert list(sentiments1) == [2, 2, 2, 2, 2, 2]
np.testing.assert_allclose(
    scores1,
    [0.7957010269165039,
     0.8266777992248535,
     0.7992331385612488,
     0.7978023886680603,
     0.7663531303405762,
     0.7840284109115601],
    rtol=1e-4,
)

chunk2 = [[101, 101, 1996, 4419, 5598, 2058, 1996, 102],
 [101, 5598, 2058, 1996, 2697, 1998, 26176, 102],
 [101, 2697, 1998, 26176, 3419, 10184, 102, 102],
 [101, 3419, 10184, 102, 102]]
sentiments2, scores2 = process_chunks(chunk2, tokenizer, model)
assert list(sentiments2) == [2, 2, 2, 2]
np.testing.assert_allclose(
    scores2,
    [0.7723026871681213,
     0.7744370698928833,
     0.7692530155181885,
     0.7506882548332214],
    rtol=1e-4,
)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [10]:
def aggregate_results(sentiments, scores):
    """Collapse per-window predictions into one (sentiment, score) pair.

    Scores are grouped by their sentiment label and averaged per label. The
    label with the highest average score wins; on a tie the label seen first
    is returned. Note that the winner is chosen by average score, not by how
    many windows voted for it.

    Args:
        sentiments: Sequence of hashable labels, one per window.
        scores: Sequence of numbers aligned with ``sentiments``. Pairs are
            formed with ``zip``, so extra items in the longer sequence are
            ignored.

    Returns:
        ``(dominant_sentiment, average_score_of_that_sentiment)``.

    Raises:
        ValueError: If there is no (sentiment, score) pair to aggregate.
    """
    scores_by_sentiment = {}
    for sentiment, score in zip(sentiments, scores):
        scores_by_sentiment.setdefault(sentiment, []).append(score)

    # Fail with a clear message instead of the opaque error `max()` raises on
    # an empty mapping.
    if not scores_by_sentiment:
        raise ValueError("aggregate_results needs at least one (sentiment, score) pair")

    average_score_per_sentiment = {
        sentiment: sum(values) / len(values)
        for sentiment, values in scores_by_sentiment.items()
    }

    dominant_sentiment = max(average_score_per_sentiment, key=average_score_per_sentiment.get)
    return dominant_sentiment, average_score_per_sentiment[dominant_sentiment]


In [11]:
# unit tests for aggregate_results
# Case 1: a single label, so the result is the mean of all scores. The mean is
# compared with a tolerance because float summation can differ in the last bit
# between Python versions.
sent1 = [2, 2, 2, 2, 2, 2]
score1 = [0.7957010269165039,
  0.8266777992248535,
  0.7992331385612488,
  0.7978023886680603,
  0.7663531303405762,
  0.7840284109115601]
label1, avg1 = aggregate_results(sent1, score1)
assert label1 == 2
assert np.isclose(avg1, 0.7949659824371338)

# Case 2: mixed labels, so the label with the highest average score wins
# (label 0 averages 0.625, label 1 averages 0.875).
sent2 = [0, 0, 1]
score2 = [0.5, 0.75, 0.875]
assert aggregate_results(sent2, score2) == (1, 0.875)

In [12]:
def full_window(text, tokenizer, model):
    """Run the whole pipeline on one text.

    The text is windowed with ``sliding_window`` (default window settings),
    every window is classified with ``process_chunks``, and the per-window
    results are merged by ``aggregate_results``, whose return value is
    passed through unchanged.
    """
    windows = sliding_window(text, tokenizer)
    per_window = process_chunks(windows, tokenizer, model)
    return aggregate_results(*per_window)

In [13]:
# unit test full_window
# The class index is compared exactly; the score is compared with a tolerance
# because float results from the model can differ in the last digits between
# machines and library versions.
text1 = 'Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.'
label1, prob1 = full_window(text1, tokenizer, model)
assert label1 == 2
assert np.isclose(prob1, 0.8298267126083374, rtol=1e-4)

text2 = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout."
label2, prob2 = full_window(text2, tokenizer, model)
assert label2 == 2
assert np.isclose(prob2, 0.5958979725837708, rtol=1e-4)

In [14]:
def getSentimentScore(ticker, startDate, endDate):
    """Average sentiment of a ticker's articles over a date range.

    Every article summary (``articleInfo``) returned by ``getDfStock`` is
    classified with ``full_window``. Each predicted class is then weighted as
    positive -> 1, neutral -> 0.5, negative -> 0, and the weights are averaged.

    ``full_window`` returns an integer class index, not a label string. The
    index is therefore translated through ``model.config.id2label`` before
    the weights are applied. Mapping the raw indices against string keys, as
    was done before, turned every score into NaN.

    Args:
        ticker: Ticker symbol passed to ``getDfStock``.
        startDate: Inclusive start of the range, passed to ``getDfStock``.
        endDate: Inclusive end of the range, passed to ``getDfStock``.

    Returns:
        The mean weight as a float in [0, 1], or ``None`` when no article
        matches. A class whose label is not positive/neutral/negative is left
        out of the mean.
    """
    # Retrieve the subset DataFrame for the specified ticker and date range
    subDf = getDfStock(ticker, startDate, endDate)
    if subDf.empty:
        return None  # Handle case where no articles were found

    # Build class-index -> weight from the model's own label names, so the
    # mapping cannot drift from the checkpoint's label order.
    label_weights = {'positive': 1, 'neutral': 0.5, 'negative': 0}
    index_weights = {
        int(class_index): label_weights[str(label).lower()]
        for class_index, label in model.config.id2label.items()
        if str(label).lower() in label_weights
    }

    # Predicted class index per article. Kept in a separate Series rather than
    # written back into the filtered frame.
    class_indices = subDf['articleInfo'].apply(
        lambda articleInfo: full_window(articleInfo, tokenizer, model)[0]
    )
    sentiment_scores = class_indices.map(index_weights)

    return sentiment_scores.mean()


In [15]:
# Smoke run of getSentimentScore for AMZN over 2024-03-12..2024-03-15.
# The returned value is only displayed as the cell output; nothing is asserted.
getSentimentScore('AMZN', '2024-03-12', '2024-03-15')

In [16]:
# Classify every article summary once and store both parts of the result.
# full_window returns (class index, averaged score). Calling it separately for
# each column, as before, ran the whole model pipeline twice per article.
article_results = df['articleInfo'].apply(lambda news: full_window(news, tokenizer, model))
df['sentiment'] = [label for label, _ in article_results]
df['sentiment probability'] = [probability for _, probability in article_results]

Token indices sequence length is longer than the specified maximum sequence length for this model (828 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

In [None]:
# Count how many articles were assigned each predicted class index.
df['sentiment'].value_counts()

sentiment
2    774
0    485
1    305
Name: count, dtype: int64

In [None]:
# Spot-check the pipeline on the article summary at index label 0; the result
# is the (class index, averaged score) pair returned by aggregate_results.
full_window(df['articleInfo'][0], tokenizer, model)

(1, 0.941900759935379)