# Combined Workflow
Notes: 
* Continued issues accessing articles behind a paywall or external source (e.g., https://finance.yahoo.com/news/top-midday-stories-pepsico-buy-160405890.html)

In [None]:
import re
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests
import yfinance as yf

import pandas as pd
import numpy as np
import math

# TODO: Compare using requests vs using drivers

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

from seleniumbase import Driver


In [None]:
# Get website html data 
def get_basesoup(driver, url, wait=False, until_class='ClassOfMyElement'):
    driver.get(url)

    # Wait for page and element to completely load
    if wait:
        delay = 3 # seconds
        try:
            WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, until_class)))
        except TimeoutException:
            print("Loading took too much time!")
    
    basesoup=BeautifulSoup(driver.page_source,"html.parser")
    return basesoup

In [None]:
def get_news_info(url):
    '''
    Return a list of strings for a given url, 
    where each string is a sentence in the linked article.
    '''
    soup = get_basesoup(driver, url)

    try:
        article = soup.find("div", class_="article-wrap no-bb")
        
        if not article:
            print(f"No articles found on page {url}")
            return []

        # cover_wrap = article.find("div", class_="cover-wrap yf-1p8y0lh")
        # title = cover_wrap.find("h1", class_="cover-title yf-1p8y0lh")
             
        body_wrap = article.find("div", class_="body-wrap yf-i23rhs")
        body = body_wrap.find("div", class_="body yf-5ef8bf")
        text = body.find_all("p", class_="yf-1pe5jgt")
        # for paragraph in text:
        #     print(paragraph.text.strip())
        
        return [paragraph.text.strip() for paragraph in text]
            
        
    except:
        print(f"Error accessing articles on page {url}")
        return []
    
    

In [None]:
def get_urls_yfinance(ticker):
    news = yf.Ticker(ticker).news
    urls = {dictionary['link'] for dictionary in news}
    return urls

In [None]:
# Scrape all related articles
def get_list_all_articles_text_data(urls):
    article_texts = []
    
    for url in urls:
        article_texts.append(get_news_info(url)) #webscraping step

    return article_texts

In [None]:
#!pip install transformers torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import yfinance as yf

In [None]:
def combine_sentences(text_data):
    output_text = ""
    for sentence in text_data:
        output_text += " " + sentence
    return output_text

In [None]:
def use_finbert(text_data):
    text = combine_sentences(text_data)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the logits (raw output predictions)
    logits = outputs.logits
    
    # Convert logits to probabilities using softmax
    probs = torch.nn.functional.softmax(logits, dim=-1)
    
    # Extract probabilities and predicted sentiment class
    predicted_class = torch.argmax(probs).item()  # 0: negative, 1: neutral, 2: positive
    confidence = torch.max(probs).item()

    # Sentiment mapping
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    
    # Get the predicted sentiment label
    predicted_sentiment = sentiment_labels[predicted_class]
    
    # Set a confidence threshold (e.g., 70%)
    confidence_threshold = 0.7
    
    # Output result with a check on confidence
    if confidence >= confidence_threshold:
        print(f"Sentiment: {predicted_sentiment} (Confidence: {confidence:.2f})")
    else:
        print("Sentiment prediction not reliable enough based on confidence threshold.")
    return [predicted_sentiment, confidence]

In [None]:
def get_sentiment_scores_finbert(article_texts):
    cnt_neu = 0
    cnt_pos = 0
    cnt_neg = 0
    total_confidence = 0
    for text in article_texts:
        sentiment, confidence = use_finbert(text)
        #print(text)
        if sentiment == "Neutral":
            cnt_neu += 1
        elif sentiment == "Positive":
            cnt_pos += 1
        elif sentiment == "Negative":
            cnt_neg += 1
        total_confidence += confidence
    return [cnt_neu, cnt_pos, cnt_neg, total_confidence/len(article_texts)]

In [None]:
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
import json

In [2]:
def get_gemini_sentiment_score_one_article(api_key_gemini, text):
    genai.configure(api_key=api_key_gemini)
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content("Please conduct sentiment analysis on the following articles of interest. Here is the text: "+text+
                                  '''Output text should be in JSON format with no extra information or text. Do NOT include extra formatting.
                                  Your response should start with { and end with }. Do not include `.
                                  Include categories neutral-sentiment,''' +
                                  "positive-sentiment, negative-sentiment, summary and stock-tickers. The sentiment categories should include "+
                                  "an integer from 0 to 9, where 0 means that the text doesn't fit that category and 9 means it fits well." +
                                  "The summary should be a once-sentence summary about the text. Stock-tickers should be the tickers of stocks related" +
                                  "to the articles." +
                                  '''If no article is given, output an empty 
                                  json "{}" only. Here is an example of required formatting: ''' +

                                  ''' {"neutral-sentiment": #,
                                    "npositive-sentiment": #,
                                    "negative-sentiment": #,
                                    "related-stocks": ["ABC", "DEF", "GHI"] +
                                    "nsummary : "Include a 2-sentence summary of the article text here." +
                                    }''')
    response_string = response.text
    if response_string == "{}":
        return None
    try:
        response_json = json.loads(response_string)
    except json.JSONDecodeError:
        return {}
    return response_json

In [None]:
def get_all_gemini_sentiment_scores(article_texts):
    avg_neu = 0
    avg_pos = 0
    avg_neg = 0
    api_key_gemini= "AIzaSyAqV8jVx8ah9Iv_04t1NcGdCjQoaAm8Uu4"
    num_articles = len(article_texts)
    for article in article_texts:
        text_data = combine_sentences(article)
        sentiment_scores = get_gemini_sentiment_score_one_article(api_key_gemini, text_data)
        if not sentiment_scores:
            continue
        else:
            avg_neu += sentiment_scores["neutral-sentiment"]
            avg_pos += sentiment_scores["positive-sentiment"]
            avg_neg += sentiment_scores["negative-sentiment"]
    return [avg_neu/num_articles, avg_pos/num_articles, avg_neg/num_articles]

In [None]:
#Look into: Long Short Term Memory (LSTM) algorithm
#https://www.nature.com/articles/s41599-024-02807-x
#running window (e,g, 5 or 10) -> prediction is following day
#prediction could be 1 day after or 1 month?

#making dataframe

#stable and larger companies features (maybe 2 extra columns)
#confidence data
#storing in a database (at a later point)
#validating model ourselves
#real-time data might be harder to use (delay)

#actually apply resulting model
from datetime import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd

In [None]:
def make_dataframe(ticker):
    data = yf.download(ticker, start = datetime.now(), end = datetime.now())
    df = pd.DataFrame(data)
    urls = get_urls_yfinance(ticker)
    article_texts = get_list_all_articles_text_data(urls)
    
    sentiment_labels_finbert = ["neutral-count-finbert", "positive-count-finbert","negative-count-finbert","average-confidence-finbert"]
    sentiment_scores_finbert = get_sentiment_scores_finbert(article_texts)
    for i in range(len(sentiment_labels_finbert)):
      df[sentiment_labels_finbert[i]] = sentiment_scores_finbert[i]

    sentiment_labels_gemini = ["average-neutral-score-gemini", "average-positive-score-gemini","average-negative-score-gemini"]
    sentiment_scores_gemini = get_all_gemini_sentiment_scores(article_texts)
    for i in range(len(sentiment_labels_gemini)):
        df[sentiment_labels_gemini[i]] = sentiment_scores_gemini[i]
        
    df["prediction-label"] = ''
    return df

In [None]:
def update_db():
    connection_string = "mongodb+srv://varshaathreya:P9OTU6PVHDG1CITH@cluster0.luavu.mongodb.net/"

    # Step 1: Connect to MongoDB Atlas
    client = MongoClient(connection_string)
    
    # Step 2: Select the database and collection
    db = client["predictive-analysis-dataset"]  # Replace with your database name
    collection = db["stocks"]  # Replace with your collection name
    
    # Step 3: Read CSV into a DataFrame
    csv_file_path = "stock_data.csv"  # Path to your CSV file
    df = pd.read_csv(csv_file_path)
    
    # Step 4: Convert DataFrame to List of Dictionaries
    data = df.to_dict(orient="records")  # Converts rows into a list of dictionaries
    
    # Step 5: Insert data into MongoDB Atlas collection
    collection.insert_many(data)
    
    print("Data successfully imported to MongoDB Atlas!")

In [None]:
# Set up web driver
options = webdriver.ChromeOptions()
driver = Driver(uc=True, incognito=True)

In [None]:
#make_dataframe('TSLA') #only necessary if stock_data file has not been created
stock_ticker_list = ['TSLA', 'AAPL', 'LCID', 'PFE', 'VZ', 'NVDA', 'JNJ', 'T', 'RTX', 'MDT', 'GOOGL', 'BSX', 'META']
for stock in stock_ticker_list:
    curr_df = make_dataframe(stock)
    curr_df.to_csv('stock_data.csv', mode = 'a', header = False)
update_db()

In [None]:
driver.quit()

In [2]:
!pip install scikit-learn
!pip install torch




In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Define the LSTM model class
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Define a fully connected (linear) layer for output
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        # LSTM outputs
        lstm_out, (hn, cn) = self.lstm(x)  # hn is the hidden state from the last LSTM layer
        
        # We take the output from the last time step
        out = self.fc(hn[-1])  # Get the last hidden state from the last layer
        return out

# Hyperparameters
input_size = 8  # Number of features in each time step (can vary based on your dataset)
hidden_size = 50  # Number of LSTM units in each layer
output_size = 1  # For regression, change this for classification (e.g., number of classes)
num_layers = 1  # Number of LSTM layers
batch_size = 32
seq_length = 20  # Length of each input sequence
num_epochs = 100
learning_rate = 0.001

# Generate some synthetic data for demonstration (e.g., time-series regression)
# Let's generate a random sequence with 1000 samples of length 20
df = pd.read_csv('stock_data.csv')
df = df[df["ticker"]=="AAPL"]
df = df.select_dtypes(include=[np.number])

# Convert data into sequences for LSTM input
def create_sequences(data, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data.iloc[i:i+seq_length, 0].values  # Accessing the first column (numerical data)
        y = data.iloc[i+seq_length, 0]  # The value at the next timestep
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

# Set sequence length
SEQ_LENGTH = 8

# Create sequences
X, y = create_sequences(df, SEQ_LENGTH)


# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)


# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, shuffle=False)

# Create DataLoader for batching
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


# Instantiate the model
model = LSTMModel(input_size, hidden_size, output_size, num_layers)

# Loss function and optimizer
criterion = nn.MSELoss()  # Using Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    outputs = model(X_train)  # Get model predictions
    loss = criterion(outputs, y_train)  # Calculate the loss
    
    # Backward pass and optimization
    optimizer.zero_grad()  # Clear previous gradients
    loss.backward()  # Compute gradients
    optimizer.step()  # Update model parameters
    
    # Print loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
        
# After training, you can make predictions like this:
model.eval()  # Set model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    test_input = torch.randn(1, seq_length, input_size)  # A single sample
    prediction = model(test_input)
    print("Prediction:", prediction.item())

Epoch [10/100], Loss: 51731.2656
Epoch [20/100], Loss: 51615.4414
Epoch [30/100], Loss: 51527.5703
Epoch [40/100], Loss: 51452.8320
Epoch [50/100], Loss: 51378.1016
Epoch [60/100], Loss: 51303.8242
Epoch [70/100], Loss: 51230.0195
Epoch [80/100], Loss: 51156.6211
Epoch [90/100], Loss: 51083.5586
Epoch [100/100], Loss: 51010.7734
Prediction: 0.3531138300895691


  return F.mse_loss(input, target, reduction=self.reduction)
