In [14]:
import csv
import math
import os
import re
from time import sleep

import numpy as np
import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

In [None]:
# MongoDB setup
# The connection string embeds a username and password, so it is read from the
# environment rather than being stored in the notebook. Any credential that was
# previously committed here should be rotated.
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    # Fail early with an actionable message instead of a late connection error.
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )
client = MongoClient(connection_string)
db = client['predictive-analysis-dataset']
collection = db['articles']

In [16]:
# Web driver setup
# NOTE(review): `options` is created but never passed to Driver(...) below, and no
# other visible cell reads it — confirm whether it is still needed.
options = webdriver.ChromeOptions()
# Shared browser session; get_news_info() reads this module-level `driver`.
# NOTE(review): uc / incognito are seleniumbase Driver flags — presumably
# undetected-chromedriver mode and a private window; verify against its docs.
driver = Driver(uc=True, incognito=True)

In [17]:
def get_basesoup(driver, url, wait=False, until_class='ClassOfMyElement', delay=3):
    """Load ``url`` in ``driver`` and return the page source parsed by BeautifulSoup.

    Args:
        driver: Browser driver exposing ``get(url)`` and ``page_source``.
        url: Address of the page to load.
        wait: When true, block until an element with class ``until_class`` is
            present, or until ``delay`` seconds have elapsed.
        until_class: Class name of the element to wait for (only used when
            ``wait`` is true).
        delay: Maximum number of seconds to wait for ``until_class``.

    Returns:
        A ``BeautifulSoup`` tree (``html.parser``) built from whatever page
        source the driver holds after the optional wait. A timeout is reported
        with a printed message and the current page source is still returned.
    """
    driver.get(url)
    if wait:
        try:
            WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, until_class))
            )
        except TimeoutException:
            # Best effort: fall through and parse the page as it currently is.
            print("Loading took too much time!")
    return BeautifulSoup(driver.page_source, "html.parser")

In [18]:
def get_news_info(url):
    """Scrape the paragraphs of the article at ``url``.

    The page is loaded through the module-level ``driver`` via
    ``get_basesoup``. Any failure — including a failure to load the page — is
    reported with a printed message and yields an empty list, so one bad link
    does not abort a loop that scrapes many articles.

    Args:
        url: Address of the article page.

    Returns:
        A list with the stripped text of each matching ``<p>`` element, or an
        empty list when the expected containers are missing or an error occurs.
    """
    try:
        # Fetching is inside the try block so navigation errors get the same
        # best-effort handling as parsing errors.
        soup = get_basesoup(driver, url, wait=True, until_class="body-wrap")

        article = soup.find("div", class_="article-wrap no-bb")
        if not article:
            print(f"No article found on page {url}")
            return []

        # NOTE(review): the "yf-..." class suffixes look build-generated; confirm
        # they are stable, since a change makes these lookups return nothing.
        body_wrap = article.find("div", class_="body-wrap yf-i23rhs")
        if not body_wrap:
            return []

        body = body_wrap.find("div", class_="body yf-5ef8bf")
        paragraphs = body.find_all("p", class_="yf-1pe5jgt") if body else []
        return [paragraph.text.strip() for paragraph in paragraphs]
    except Exception as e:
        print(f"Error accessing articles on page {url}: {e}")
        return []

In [19]:
# Companies whose news should be collected
tickers = ["NVDA", "TSLA"]

# One {"ticker", "news"} entry per company, filled in below
data = []

for ticker in tickers:
    scraped_articles = []

    for article in yf.Ticker(ticker).news:
        # Keep every field except the bulky 'thumbnail' payload
        record = {field: value for field, value in article.items() if field != 'thumbnail'}

        # Articles without a link cannot be scraped and are left out
        link = record.get('link')
        if not link:
            continue

        record['text'] = get_news_info(link)
        scraped_articles.append(record)

    data.append({"ticker": ticker, "news": scraped_articles})

In [20]:
# Displays the complete nested structure, including every article's paragraph
# list; slice it (e.g. data[:1]) for a shorter preview.
data

[{'ticker': 'NVDA',
  'news': [{'uuid': '0c044df3-713e-31d1-9f9b-ee8359c9603a',
    'title': 'How major US stock indexes fared Tuesday, 11/19/2024',
    'publisher': 'Associated Press Finance',
    'link': 'https://finance.yahoo.com/news/major-us-stock-indexes-fared-211720658.html',
    'providerPublishTime': 1732051040,
    'type': 'STORY',
    'relatedTickers': ['COMP', '^GSPC', '^DJI', '^RUT', 'NVDA'],
    'text': ['Nvidia and other tech companies pulled U.S. stock indexes higher after they stumbled in the morning on worries about escalations in the Russia-Ukraine war.',
     'The S&P 500 rose 0.4% Tuesday. The Nasdaq composite also erased an early loss to turn 1% higher, while the Dow Jones Industrial Average slipped 0.3%.',
     'Nvidia rallied ahead of its profit report for the latest quarter, which is coming on Wednesday. Strategists say it appears to be the most anticipated event left on the calendar this year for Wall Street. Walmart rose after topping profit forecasts, while 

In [21]:
# Function to handle nested lists and flatten text
def flatten_text(text_data):
    """Collapse scraped text into a single space-separated string.

    Args:
        text_data: Usually a list of paragraph strings. Lists may be nested to
            any depth; non-string leaves are converted with ``str`` and ``None``
            leaves are skipped. Any non-list value is converted with ``str``.

    Returns:
        The leaves joined with single spaces (``""`` for an empty list), or
        ``str(text_data)`` when ``text_data`` is not a list.
    """
    if not isinstance(text_data, list):
        return str(text_data)  # Fallback for non-list input

    def _leaves(items):
        # Depth-first walk so paragraph order is preserved at any nesting level.
        for item in items:
            if isinstance(item, list):
                yield from _leaves(item)
            elif item is not None:
                # str() keeps join from raising on numbers or other non-strings.
                yield str(item)

    return ' '.join(_leaves(text_data))

In [22]:
# Function to flatten the data
def flatten_news_data(data):
    """Turn the per-ticker news structure into a flat list of row dicts.

    Args:
        data: List of ``{'ticker': ..., 'news': [...]}`` entries, where each
            news item is a dict of article fields plus the scraped ``'text'``.

    Returns:
        A list with one dict per article, keyed by the column names ``Ticker``,
        ``UUID``, ``Title``, ``Publisher``, ``Link``, ``Provider Publish Time``,
        ``Type``, ``Related Tickers`` and ``Text``. Fields missing from an
        article become ``None`` (or ``''`` for ``Related Tickers`` and ``Text``)
        instead of raising ``KeyError``, so one incomplete article does not
        abort the whole conversion.
    """
    flattened_data = []
    for entry in data:
        ticker = entry['ticker']
        for news_item in entry.get('news') or []:
            # `or []` also covers an explicit None value for the key.
            related = news_item.get('relatedTickers') or []
            flattened_data.append({
                'Ticker': ticker,
                'UUID': news_item.get('uuid'),
                'Title': news_item.get('title'),
                'Publisher': news_item.get('publisher'),
                'Link': news_item.get('link'),
                'Provider Publish Time': news_item.get('providerPublishTime'),
                'Type': news_item.get('type'),
                'Related Tickers': ', '.join(str(symbol) for symbol in related),
                'Text': flatten_text(news_item.get('text', []))
            })
    return flattened_data


In [23]:
# One flat record per (ticker, article) pair, then a DataFrame with one row per record.
flattened_data = flatten_news_data(data)
df = pd.DataFrame(flattened_data)

In [24]:
# Displays the whole frame; df.head() keeps the rendered output short as more
# tickers are added.
df

Unnamed: 0,Ticker,UUID,Title,Publisher,Link,Provider Publish Time,Type,Related Tickers,Text
0,NVDA,0c044df3-713e-31d1-9f9b-ee8359c9603a,"How major US stock indexes fared Tuesday, 11/1...",Associated Press Finance,https://finance.yahoo.com/news/major-us-stock-...,1732051040,STORY,"COMP, ^GSPC, ^DJI, ^RUT, NVDA",Nvidia and other tech companies pulled U.S. st...
1,NVDA,620874b0-e0e9-30f7-93e8-2a72cac665e2,Dow Jones Clings To A Loss As AI Stock Soars O...,Investor's Business Daily,https://finance.yahoo.com/m/620874b0-e0e9-30f7...,1732050821,STORY,"VIK, WMT, SMCI, SYM, NVDA",The Dow Jones couldn't get into positive terri...
2,NVDA,52cdce5c-127b-3689-9459-c0a6ef4e9b53,Why Nvidia Stock Rallied (Again) on Tuesday,Motley Fool,https://finance.yahoo.com/m/52cdce5c-127b-3689...,1732050633,STORY,NVDA,Shares of Nvidia (NASDAQ: NVDA) surged higher ...
3,NVDA,5b883d06-f7ba-38ae-bee6-4aa81e40ca76,"Nasdaq, S&P 500 rising ahead of tomorrow's Nvi...",Yahoo Finance Video,https://finance.yahoo.com/video/nasdaq-p-500-r...,1732050425,VIDEO,"^SP500EW, ^SPXEW, NVDA, ^DJI, ^GSPC, ^IXIC, ^NDX",The Nasdaq Composite (^IXIC) cap Tuesday's ses...
4,NVDA,fefc565d-2407-3fb3-a443-f1fbadc39391,Nvidia Stock Is Gaining Today -- Is It Too Lat...,Motley Fool,https://finance.yahoo.com/m/fefc565d-2407-3fb3...,1732050420,STORY,NVDA,Nvidia (NASDAQ: NVDA) stock is climbing in Tue...
5,NVDA,165c9f8a-9cc5-3384-a80f-4ebc9a7c2614,"Palantir, The Newest S&P 500 Stock, Is Leaving...",Investor's Business Daily,https://finance.yahoo.com/m/165c9f8a-9cc5-3384...,1732049743,STORY,"NVDA, GEV, PLTR, VST, AXON",Palantir joined the S&P 500 in September. It's...
6,NVDA,0c94bf2b-6e83-3631-bbe0-e89adeb73038,Nvidia Stock Gains Ahead of Earnings as Analys...,Investopedia,https://finance.yahoo.com/m/0c94bf2b-6e83-3631...,1732046957,STORY,NVDA,"Shares of Nvidia climbed Tuesday, a day ahead..."
7,NVDA,8a34bfd9-ebea-3146-b0b8-154a0abe8545,Stocks Shake Off War Angst Before Nvidia’s Res...,Bloomberg,https://finance.yahoo.com/news/asian-stocks-se...,1732050510,STORY,"NVDA, SMCI",(Bloomberg) -- Stocks rebounded in a volatile ...
8,TSLA,653caacb-8356-3ecb-8817-cab656d36f0e,Musk-Led Panel Considers Mobile Tax-Filing App...,GuruFocus.com,https://finance.yahoo.com/news/musk-led-panel-...,1732047344,STORY,TSLA,"Tesla (TSLA, Financials) CEO Elon Musk and Viv..."
9,TSLA,85ac63d3-14bd-31e6-99fd-5b9ca5a56df3,EVgo Expands Charging Network in Midwest with ...,GuruFocus.com,https://finance.yahoo.com/news/evgo-expands-ch...,1732045340,STORY,TSLA,"EVgo (EVGO, Financials) announced plans to exp..."


In [25]:
# List of plain dicts, one per DataFrame row, passed to collection.insert_many below.
data_to_insert = df.to_dict(orient='records')

In [26]:
# Insert data into MongoDB
# NOTE: insert_many appends unconditionally, so re-running this cell stores the
# same articles again; de-duplicate (e.g. on UUID) if the notebook is re-run.
if not data_to_insert:
    # insert_many rejects an empty document list; report that nothing was
    # scraped instead of a misleading "Failed to insert" message.
    print("No articles to insert into MongoDB.")
else:
    try:
        collection.insert_many(data_to_insert)
        print("Data successfully inserted into MongoDB.")
    except Exception as e:
        print(f"Failed to insert data into MongoDB: {e}")

Data successfully inserted into MongoDB.


In [44]:
# Optional verification: uncomment to print every document stored in the collection.
# for document in collection.find():
#     print(document)

In [None]:
# Close the browser session opened in the web driver setup cell.
# NOTE(review): this only runs if execution reaches this cell; if an earlier cell
# raises, the browser stays open until quit() is called manually.
driver.quit()