In [18]:
import pandas as pd 
from nltk.corpus import stopwords
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from numpy import argmax
from torch.nn.functional import softmax

In [22]:
# NOTE: this import repeats the one in the first cell; it is kept so the cell
# can be run on its own.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Identifier of the pretrained checkpoint handed to both `from_pretrained`
# calls below, so the model and tokenizer always come from the same source.
model_name = "ProsusAI/finbert"
# `model` and `tokenizer` are module-level globals: getSentimentScore reads them
# directly rather than taking them as arguments.
# NOTE(review): the recorded output of this cell is an ImportError stating that
# PyTorch was not found in the environment, so `model` was never created in
# that run -- install PyTorch before relying on the cells that follow.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [3]:
# Load the news articles table; later cells read it through the global `df`.
df = pd.read_csv('FinalStockNews.csv')

# Strip stray whitespace from the header names. The previous loop
# (`col = col.strip`) never called `strip` and only rebound the loop variable,
# so it left the column names untouched.
df.columns = df.columns.str.strip()

# Drop the leftover index column written by an earlier `to_csv`, discard rows
# with any missing value, and renumber the remaining rows from 0. Done as one
# non-mutating chain so re-running the cell always starts from the fresh read.
df = (
    df.drop(columns='Unnamed: 0')
      .dropna()
      .reset_index(drop=True)
)


Unnamed: 0,Unnamed: 0.1,ticker,title,date,link,articleInfo,paragraphList
0,0,AMZN,"Tech Layoffs, Remote Work Push Office Vacancie...",2024-04-16T00:00:00,/news/etf/tech-layoffs-remote-work-push-office...,The commercial real estate industry continues ...,['The commercial real estate industry continue...
1,1,AMZN,The 3 Best Stocks to Buy With Your Tax Refund ...,2024-04-16T00:00:00,/news/stocks/the-3-best-stocks-to-buy-with-you...,The IRS typically gives tax refunds within 21 ...,"['InvestorPlace - Stock Market News, Stock Adv..."
2,2,AMZN,3 AI Stocks to Buy Now: Q2 Edition,2024-04-16T00:00:00,/news/stocks/3-ai-stocks-to-buy-now-q2-edition...,Just like electricity and the internet unleash...,"['InvestorPlace - Stock Market News, Stock Adv..."
3,3,AMZN,"Amazon, Meta, Uber Remain 'Top Overall Picks' ...",2024-04-16T00:00:00,/news/stocks/amazon-meta-uber-remain-top-overa...,As the first-quarter (Q1) 2024 earnings season...,['As the first-quarter (Q1) 2024 earnings seas...
4,4,AMZN,Amazon (AMZN) Receives a Buy from J.P. Morgan,2024-04-16T00:00:00,/news/stocks/amazon-amzn-receives-a-buy-from-j...,J.P. Morgan analyst Doug Anmuth maintained a B...,['J.P. Morgan analyst Doug Anmuth maintained a...
...,...,...,...,...,...,...,...
1559,1559,AAPL,Riding the Bull: 3 ETFs to Capitalize on the O...,2024-03-13T00:00:00,/news/stocks/riding-the-bull-3-etfs-to-capital...,There are many advantages to owing exchange-tr...,"['InvestorPlace - Stock Market News, Stock Adv..."
1560,1560,AAPL,Apple’s Bitter Bite: Is the Tech Giant’s Slowi...,2024-03-13T00:00:00,/news/stocks/apples-bitter-bite-is-the-tech-gi...,The market has shown extreme volatility for mo...,"['InvestorPlace - Stock Market News, Stock Adv..."
1561,1561,AAPL,SCHD ETF Spotlight: The Top 6 Stocks Inside Th...,2024-03-13T00:00:00,/news/stocks/schd-etf-spotlight-the-top-6-stoc...,Investing in the stock market is one of the su...,"['InvestorPlace - Stock Market News, Stock Adv..."
1562,1562,AAPL,Maintaining Market-Perform on Apple: A Balance...,2024-03-13T00:00:00,/news/stocks/maintaining-market-perform-on-app...,Bernstein analyst Toni Sacconaghi maintained a...,['Bernstein analyst Toni Sacconaghi maintained...


In [4]:
def getDfStock(ticker, startDate, endDate):
    """Return the articles for one ticker published within a date range.

    Reads the module-level ``df`` table. As a side effect, the ``date`` column
    of that table is converted to datetime the first time it is needed.

    Args:
        ticker: Value compared for equality against the ``ticker`` column.
        startDate: Inclusive lower bound; anything ``pd.to_datetime`` accepts.
        endDate: Inclusive upper bound; anything ``pd.to_datetime`` accepts.

    Returns:
        A new DataFrame with the matching rows (original index labels kept).
        It is an independent copy, so callers may add or modify columns
        without touching ``df`` and without SettingWithCopyWarning.
    """
    # Normalise the bounds so strings, dates and Timestamps all compare alike.
    start_date = pd.to_datetime(startDate)
    end_date = pd.to_datetime(endDate)

    # Parse the 'date' column only when it is not datetime yet; re-parsing an
    # already converted column on every call is wasted work.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'])

    # `between` is inclusive on both ends, matching the >= / <= comparison.
    in_range = (df['ticker'] == ticker) & df['date'].between(start_date, end_date)

    # Explicit copy: the caller assigns derived columns to the result.
    return df.loc[in_range].copy()

In [6]:
def sliding_window(text, tokenizer, max_length=512, overlap=50):
    """Split ``text`` into overlapping chunks of token IDs.

    The text is tokenized once *without* special tokens. Each chunk is then
    wrapped in exactly one ``cls_token_id`` / ``sep_token_id`` pair, so no
    chunk is longer than ``max_length`` and none carries duplicated special
    tokens.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``,
            ``cls_token_id`` and ``sep_token_id``.
        max_length: Maximum chunk length, special tokens included.
        overlap: Number of content tokens shared by consecutive chunks.

    Returns:
        A non-empty list of chunks, each a list of token IDs. Empty text gives
        a single chunk holding only the two special tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens, or
            ``overlap`` is negative or too large for the window to advance.
    """
    # Two slots of every chunk are reserved for the [CLS] and [SEP] markers.
    body_length = max_length - 2
    # With the defaults each window advances by 460 content tokens.
    step_size = body_length - overlap
    if body_length <= 0 or overlap < 0 or step_size <= 0:
        raise ValueError(
            "max_length must exceed 2 and overlap must satisfy "
            "0 <= overlap < max_length - 2"
        )

    # Special tokens are added per chunk below, not here.
    input_ids = list(tokenizer.encode(text, add_special_tokens=False))
    total_length = len(input_ids)

    chunks = []
    start = 0
    while True:
        end = start + body_length
        chunks.append(
            [tokenizer.cls_token_id] + input_ids[start:end] + [tokenizer.sep_token_id]
        )
        # Stop once the window reaches the end of the text; a further step
        # would only emit a chunk already contained in this one.
        if end >= total_length:
            break
        start += step_size

    return chunks


In [7]:
def process_chunks(chunks, tokenizer, model):
    """Classify each token-ID chunk and report the winning class per chunk.

    Every chunk is sent through the model as a batch of one, so no padding is
    needed. An explicit all-ones attention mask is passed. The previous
    version padded to 512 without a mask, so the model attended to the pad
    tokens as if they were text.

    Args:
        chunks: Iterable of token-ID lists.
        tokenizer: Unused; kept so existing callers keep working.
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            tensors and returning an object with a ``logits`` attribute of
            shape ``(1, num_classes)``. It is switched to eval mode.

    Returns:
        ``(sentiments, scores)``: two parallel lists holding, for each chunk,
        the integer index of the largest logit and the softmax probability of
        that class.
    """
    # Evaluation mode for inference.
    model.eval()

    # Longest sequence the model accepts, read from its config when available;
    # 512 is the value this notebook previously hard-coded.
    config = getattr(model, 'config', None)
    max_len = getattr(config, 'max_position_embeddings', None) or 512

    sentiments = []
    scores = []

    # No gradients are needed for inference.
    with torch.no_grad():
        for chunk in chunks:
            # Truncate over-long chunks explicitly (the old code did this only
            # by accident through negative padding), then add a batch axis.
            input_ids = torch.tensor(chunk[:max_len], dtype=torch.long).unsqueeze(0)
            attention_mask = torch.ones_like(input_ids)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Predicted class index and the probability assigned to it.
            sentiment = int(torch.argmax(logits, dim=1)[0].item())
            score = torch.softmax(logits, dim=1).max().item()

            sentiments.append(sentiment)
            scores.append(score)

    return sentiments, scores


In [8]:
def aggregate_results(sentiments, scores):
    """Pick the sentiment whose chunks have the highest mean score.

    Args:
        sentiments: Per-chunk sentiment labels (any hashable values).
        scores: Per-chunk scores, paired positionally with ``sentiments``.

    Returns:
        ``(dominant_sentiment, mean_score)`` for the label with the highest
        mean score. On a tie, the label that appeared first wins.

    Raises:
        ValueError: If there are no (sentiment, score) pairs.
    """
    # Bucket the scores by label, in order of first appearance.
    grouped = {}
    for label, confidence in zip(sentiments, scores):
        grouped.setdefault(label, []).append(confidence)

    # Reduce every bucket to its arithmetic mean.
    mean_by_label = {
        label: sum(values) / len(values) for label, values in grouped.items()
    }

    # The winner is the label with the largest mean.
    best_label = max(mean_by_label, key=mean_by_label.get)
    return best_label, mean_by_label[best_label]

In [9]:
def full_window(text, tokenizer, model):
    """Run the chunk -> classify -> aggregate pipeline on one text.

    Args:
        text: The text to analyse.
        tokenizer: Passed to ``sliding_window`` and ``process_chunks``.
        model: Passed to ``process_chunks``.

    Returns:
        Whatever ``aggregate_results`` returns for the per-chunk results: the
        dominant sentiment and its average score.
    """
    # Chunk the text and classify every chunk in one go.
    per_chunk_sentiments, per_chunk_scores = process_chunks(
        sliding_window(text, tokenizer), tokenizer, model
    )
    # Collapse the per-chunk results into a single verdict.
    return aggregate_results(per_chunk_sentiments, per_chunk_scores)

In [10]:
def getSentimentScore(ticker, startDate, endDate):
    """Average sentiment of a ticker's articles over a date range.

    Each article's ``articleInfo`` text is classified with ``full_window``
    using the module-level ``tokenizer`` and ``model``. The result is scored
    as positive -> 1, neutral -> 0.5, negative -> 0.

    ``full_window`` reports the winning class as an integer index, so the
    index is translated to its label through ``model.config.id2label`` before
    scoring. Mapping the raw index straight onto the string-keyed score table
    turned every article into NaN.

    Args:
        ticker: Ticker passed to ``getDfStock``.
        startDate: Inclusive start of the range, passed to ``getDfStock``.
        endDate: Inclusive end of the range, passed to ``getDfStock``.

    Returns:
        The mean score over the matching articles, or ``None`` when no article
        matches. Articles whose label is not positive/neutral/negative are
        ignored by the mean.
    """
    subDf = getDfStock(ticker, startDate, endDate)

    # No articles for this ticker/date range.
    if len(subDf) == 0:
        return None

    # Class-index -> label table exposed by the model config, if any.
    id2label = getattr(getattr(model, 'config', None), 'id2label', None) or {}

    def _label_of(article_text):
        # Classify one article and return its lower-cased label name.
        sentiment = full_window(article_text, tokenizer, model)[0]
        if not isinstance(sentiment, str):
            sentiment = id2label.get(int(sentiment), sentiment)
        return str(sentiment).lower()

    # Kept in a local Series rather than written onto the filtered frame,
    # which avoids SettingWithCopyWarning.
    labels = subDf['articleInfo'].apply(_label_of)

    sentiment_scores = labels.map({'positive': 1, 'neutral': 0.5, 'negative': 0})

    return sentiment_scores.mean()
