# Stock News Research Project

In [3]:
import pandas as pd
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import os
from transformers import pipeline
from newsapi import NewsApiClient

import oledll

Initially, we will try to summarise a single article.

In [5]:
# Yahoo Finance article used as the single-article test case. The backslashes
# continue one string literal across lines, so the URL contains no line breaks.
url = "https://finance.yahoo.com/\
news/nvidias-earnings-could-turn-around-the-ailing-ai-trade-173554788.html?\
guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer\
_sig=AQAAAMJFUcemFWCY8V_GfUzI-R56iOx9aSxiSdwYQy5cU_f1Cak6m-KjD1HnzRvf1hPC8Ttd16dfaO7aDuCuaCjzoq\
P5KJhNqrL9aa_EIQLo8oUWidQsxOd26ntczzbGfmDZYMfllQdaVUoY76qqs4p_misoSnysjR57cGP9qNEuS76c"

# Without a timeout requests waits indefinitely on an unresponsive server, which
# would hang the cell; raise_for_status() makes an HTTP error fail loudly here
# instead of letting an error page be parsed and summarised as if it were the article.
r = requests.get(url, timeout=10)
r.raise_for_status()

# Parse the HTML and keep every <p> element; later cells read `paragraphs`.
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [6]:
paragraphs[0].text

'The artificial intelligence trade has been losing its luster as of late. Shares of Alphabet (GOOG, GOOGL), Amazon (AMZN), and Microsoft (MSFT), three of AI’s biggest players, are down over the last month, with Google parent Alphabet dropping 14%, Amazon off about 8%, and Microsoft falling more than 7% as of Thursday.'

In [7]:
# Collect the text of each <p> element and join the pieces with single spaces
# to form one article string.
text = [paragraph.text for paragraph in paragraphs]
article = ' '.join(text)

In [8]:
# Replace every sentence terminator with the same '<eos>' marker, then split on
# that marker. The terminator characters themselves are dropped from the text.
for terminator in ('.', '!', '?'):
    article = article.replace(terminator, '<eos>')
sentences = article.split('<eos>')

In [9]:
sentences[0]

'The artificial intelligence trade has been losing its luster as of late'

In [10]:
# Greedily pack whole sentences into chunks of at most `max_chunk` words.
max_chunk = 500
current_chunk = 0  # index of the chunk currently being filled
chunks = []        # word lists while packing; joined into strings at the end

for sentence in sentences:
    # Splitting on a literal space (not on any whitespace) keeps empty tokens,
    # so runs of spaces survive the join below.
    words = sentence.split(' ')
    if len(chunks) != current_chunk + 1:
        # No chunk exists yet at the current index: start one.
        chunks.append(words)
    elif len(chunks[current_chunk]) + len(words) <= max_chunk:
        # The sentence still fits in the chunk being filled.
        chunks[current_chunk].extend(words)
    else:
        # The chunk would overflow: move on to a new one.
        current_chunk += 1
        chunks.append(words)

# Turn each word list back into a single string, in place.
for chunk_id, chunk_words in enumerate(chunks):
    chunks[chunk_id] = ' '.join(chunk_words)

In [11]:
chunks[0], chunks[1]

("The artificial intelligence trade has been losing its luster as of late  Shares of Alphabet (GOOG, GOOGL), Amazon (AMZN), and Microsoft (MSFT), three of AI’s biggest players, are down over the last month, with Google parent Alphabet dropping 14%, Amazon off about 8%, and Microsoft falling more than 7% as of Thursday  The stock moves come after the companies, along with fellow hyperscaler Meta (META), confirmed they’ll continue to pour billions of dollars into building out their AI infrastructure over the coming quarters — without providing much insight into when they’re going to turn all of that spending into revenue  That, along with the recent market turmoil, has put a damper on AI company stocks  But the most important component of the AI trade, Nvidia (NVDA), still has to report its earnings  The chip company’s performance could turn around the AI trade more than any hyperscaler  Unlike those software firms, revenue hasn’t been a problem for Nvidia  Still, if it falls short of Wa

In [12]:
# Load the tokenizer and the conditional-generation model for the checkpoint
# named in `model_name`.
# NOTE(review): the recorded output warns that the encoder/decoder
# `embed_positions` weights were newly initialised rather than loaded from the
# checkpoint — confirm this is harmless for this model before trusting summaries.
model_name = 'human-centered-summarization/financial-summarization-pegasus'
tokenizer = PegasusTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Summarise each chunk on its own, then stitch the partial summaries together.
summaries = []
for chunk in chunks:
    # Encode to a PyTorch tensor; inputs longer than 500 tokens are truncated.
    input_ids = tokenizer.encode(chunk, return_tensors = 'pt', max_length=500, truncation=True)
    # Beam search with 5 beams; max_length caps the generated sequence at 500.
    output = model.generate(input_ids, max_length = 500, num_beams = 5 )
    # Only the first returned sequence is decoded; special tokens are removed.
    summary = tokenizer.decode(output[0], skip_special_tokens = True)
    summaries.append(summary)

# One string for the whole article, partial summaries separated by '. '.
final_summary = '. '.join(summaries)

In [14]:
final_summary

'Chips giant Nvidia to report second quarter earnings on Friday. Wall Street is expecting huge revenue growth, but it won’t last. Shares of the chipmaker are down more than 20% this year amid investor concern'

### Note 

The above method would not work when trying to request individual links from Google News.

In [24]:
# Load the tokenizer and model used by the batch summarisation below.
# NOTE(review): this repeats, with the same `model_name`, the load done in an
# earlier cell; it looks redundant if that cell has already been run.
model_name = 'human-centered-summarization/financial-summarization-pegasus'
tokenizer = PegasusTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Function to fetch article content from a URL
def fetch_article(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely, which would stall
            a batch run on a single unresponsive site.

    Returns:
        The text of every ``<p>`` element in document order, joined with
        single spaces. An empty string when the page has no ``<p>`` elements.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.

    Note:
        The HTTP status code is not checked, so an error page is parsed like
        any other page.
    """
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    return ' '.join(paragraph.text for paragraph in soup.find_all('p'))

# Function to split text into manageable chunks
def split_text_into_chunks(text, max_chunk_size):
    """Split text into chunks of whole sentences, each at most ``max_chunk_size`` words.

    Sentences are delimited by ``.``, ``!`` and ``?``; the delimiters are
    dropped. Sentences are packed greedily, in order, into the current chunk
    until adding the next one would exceed ``max_chunk_size`` words.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of non-empty chunk strings with words joined by single spaces.
        A single sentence longer than ``max_chunk_size`` is kept whole in a
        chunk of its own. Returns ``[]`` when ``text`` contains no words.
    """
    text = text.replace('.', '<eos>').replace('!', '<eos>').replace('?', '<eos>')
    sentences = text.split('<eos>')

    chunks = []
    current_chunk = []

    for sentence in sentences:
        words = sentence.split()
        if not words:
            # Blank fragments (e.g. after a trailing full stop) carry no content.
            continue
        # Flush only a non-empty chunk. Flushing unconditionally used to emit
        # an empty '' chunk whenever an over-long sentence arrived while the
        # current chunk was still empty.
        if current_chunk and len(current_chunk) + len(words) > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
        current_chunk.extend(words)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Function to summarize a chunk of text
def summarize_chunk(chunk, tokenizer, model, max_input_tokens=500,
                    max_summary_tokens=500, num_beams=5):
    """Summarize one chunk of text with a seq2seq model.

    Args:
        chunk: The text to summarize.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``.
        max_input_tokens: Inputs longer than this many tokens are truncated.
        max_summary_tokens: Passed to ``generate`` as ``max_length``.
        num_beams: Number of beams passed to ``generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.

    The defaults reproduce the previously hard-coded values, so existing
    three-argument calls behave exactly as before.
    """
    input_ids = tokenizer.encode(
        chunk,
        return_tensors='pt',
        max_length=max_input_tokens,
        truncation=True,
    )
    output = model.generate(input_ids, max_length=max_summary_tokens, num_beams=num_beams)
    # Only the first returned sequence is decoded.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Function to summarize an entire article from a URL
def summarize_article(url, tokenizer, model, max_chunk_size=500):
    """Download the article at ``url`` and return a summary of it.

    The article text is split into chunks of at most ``max_chunk_size`` words,
    each chunk is summarized separately, and the partial summaries are joined
    with ``'. '``. If splitting yields no chunks the result is an empty string.
    """
    article_text = fetch_article(url)
    partial_summaries = []
    for piece in split_text_into_chunks(article_text, max_chunk_size):
        partial_summaries.append(summarize_chunk(piece, tokenizer, model))
    return '. '.join(partial_summaries)

# Initialize NewsAPI client
# The key is read from the API_KEY environment variable rather than being
# written into the notebook.
# NOTE(review): os.getenv returns None when API_KEY is unset, and that None is
# passed straight to NewsApiClient — confirm the variable is set before running.
api_key = os.getenv('API_KEY')
newsapi = NewsApiClient(api_key=api_key)

# Function to get news URLs and dates for a ticker
def get_news_urls(ticker):
    """Return ``(url, publishedAt)`` pairs for news matching ``ticker``.

    Queries the module-level ``newsapi`` client for English-language articles
    with a page size of 10. A response without an ``'articles'`` key yields an
    empty list.
    """
    response = newsapi.get_everything(q=ticker, language='en', page_size=10)
    pairs = []
    for item in response.get('articles', []):
        pairs.append((item['url'], item['publishedAt']))
    return pairs

# List of monitored tickers
# Symbols used as the NewsAPI search query, one request per symbol.
monitored_tickers = ['AMZN', 'GOOGL', 'META', 'NVDA', 'BTC', 'ETH', 'SPX', 'VOO']

# One dict per summarised article; converted to a DataFrame after the loop.
results = []

# NOTE(review): nothing here catches exceptions, so a failure while fetching or
# summarising any single article stops the run before `df` is built.
for ticker in monitored_tickers:
    print(f"\nFetching news for ticker: {ticker}")
    articles_info = get_news_urls(ticker)
    
    # Each entry is a (url, publication date) pair.
    for url, date in articles_info:
        print(f"Summarizing article: {url}")
        summary = summarize_article(url, tokenizer, model)
        results.append({'ticker': ticker, 'url': url, 'date': date, 'summary': summary})

# Build the frame once from the accumulated records.
df = pd.DataFrame(results)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fetching news for ticker: AMZN
Summarizing article: https://www.techmeme.com/240801/p55
Summarizing article: https://biztoc.com/x/b3d5270106fa527b
Summarizing article: https://www.forbes.com/sites/greatspeculations/2024/08/06/why-amazon-stock-fell-9-in-a-day/
Summarizing article: https://qz.com/google-schmidt-goog-alphabet-remote-work-ai-1851622832
Summarizing article: https://www.forbes.com/sites/greatspeculations/2024/08/01/amazon-stock-is-beating-sp500-in-ytd-returns-what-to-expect-from-q2-results/
Summarizing article: https://qz.com/google-ai-remote-work-eric-schmidt-1851621932
Summarizing article: https://qz.com/microsoft-apple-google-earnings-ai-hype-real-analyst-1851619646
Summarizing article: https://qz.com/google-alphabet-goog-antitrust-lawsuit-doj-search-1851621717
Summarizing article: https://qz.com/rivian-electric-vans-amazon-parts-shortage-1851624370
Summarizing article: https://finance.yahoo.com/news/meli-vs-amzn-e-commerce-220424659.html

Fetching news for ticker: GOOGL

In [28]:
# Relative output directory, written with a Windows-style backslash separator.
# NOTE(review): the path is Windows-specific and presumably the directory must
# already exist — confirm before running on another machine.
filepath = r'Stock News Research\Data'
# index = False keeps the DataFrame index out of the CSV file.
df.to_csv(filepath+'\\news_research.csv', index = False)

In [267]:
# Reload the saved summaries from the CSV file; relies on `filepath` already
# being defined in the kernel.
news_data = pd.read_csv(filepath+'\\news_research.csv')

In [269]:
news_data

Unnamed: 0,ticker,url,date,summary
0,AMZN,https://www.techmeme.com/240801/p55,2024-08-02T00:15:01Z,Check back often for the most up-to-date content.
1,AMZN,https://biztoc.com/x/b3d5270106fa527b,2024-08-02T11:46:33Z,Manchester City travel to west London to start...
2,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-06T13:00:56Z,Trefis High Quality Portfolio has outperformed...
3,AMZN,https://qz.com/google-schmidt-goog-alphabet-re...,2024-08-15T12:50:00Z,Eric Schmidt says ‘people work like hell’ on r...
4,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-01T10:00:00Z,Online retailer is expected to report strong s...
...,...,...,...,...
75,VOO,http://www.etf.com/sections/news/etf-wrap-vang...,2024-08-05T12:00:00Z,
76,VOO,https://finance.yahoo.com/news/millennial-inve...,2024-08-11T20:00:09Z,‘Investing let me put money away to purchase m...
77,VOO,https://www.bbc.com/news/articles/cd179e8g9j3o,2024-07-29T21:01:29Z,Suspects dressed in black wearing masks opened...
78,VOO,https://www.thestar.com.my/news/nation/2024/07...,2024-07-20T16:00:00Z,At least 50 flights were affected by Friday’s ...


In [296]:
# Work on a copy so the frame as loaded from the CSV (`news_data`) stays intact.
news_data_2 = news_data.copy()

In [298]:
news_data_2.isnull().sum()

ticker      0
url         0
date        0
summary    28
dtype: int64

In [300]:
# Drop the rows whose summary is missing; only the 'summary' column is checked.
news_data_2 = news_data_2.dropna(subset=['summary'], axis = 0)

In [302]:
news_data_2.isnull().sum()

ticker     0
url        0
date       0
summary    0
dtype: int64

In [304]:
news_data_2.duplicated().value_counts()

False    49
True      3
Name: count, dtype: int64

In [306]:
news_data_2

Unnamed: 0,ticker,url,date,summary
0,AMZN,https://www.techmeme.com/240801/p55,2024-08-02T00:15:01Z,Check back often for the most up-to-date content.
1,AMZN,https://biztoc.com/x/b3d5270106fa527b,2024-08-02T11:46:33Z,Manchester City travel to west London to start...
2,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-06T13:00:56Z,Trefis High Quality Portfolio has outperformed...
3,AMZN,https://qz.com/google-schmidt-goog-alphabet-re...,2024-08-15T12:50:00Z,Eric Schmidt says ‘people work like hell’ on r...
4,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-01T10:00:00Z,Online retailer is expected to report strong s...
5,AMZN,https://qz.com/google-ai-remote-work-eric-schm...,2024-08-14T14:39:00Z,"‘People work like hell,’ ex-CEO says. Google h..."
6,AMZN,https://qz.com/microsoft-apple-google-earnings...,2024-08-12T16:09:00Z,Wedbush sees ‘a tidal wave of spending is comi...
7,AMZN,https://qz.com/google-alphabet-goog-antitrust-...,2024-08-14T12:04:00Z,Justice Department says Google abused its domi...
8,AMZN,https://qz.com/rivian-electric-vans-amazon-par...,2024-08-16T16:34:14Z,EV maker is boosting production next year to m...
9,AMZN,https://finance.yahoo.com/news/meli-vs-amzn-e-...,2024-07-18T22:04:24Z,MercadoLibre is trading at a premium to Amazon...


In [308]:
# keep=False flags every member of a duplicate group (not just the repeats),
# so all copies can be inspected side by side.
duplicates = news_data_2[news_data_2.duplicated(keep=False)]

In [310]:
duplicates

Unnamed: 0,ticker,url,date,summary
22,META,https://removed.com,1970-01-01T00:00:00Z,Check here for the latest news and videos from...
23,META,https://removed.com,1970-01-01T00:00:00Z,Check here for the latest news and videos from...
24,META,https://removed.com,1970-01-01T00:00:00Z,Check here for the latest news and videos from...
28,META,https://removed.com,1970-01-01T00:00:00Z,Check here for the latest news and videos from...


In [312]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each (the drop_duplicates default).
news_data_2 = news_data_2.drop_duplicates()

In [314]:
news_data_2

Unnamed: 0,ticker,url,date,summary
0,AMZN,https://www.techmeme.com/240801/p55,2024-08-02T00:15:01Z,Check back often for the most up-to-date content.
1,AMZN,https://biztoc.com/x/b3d5270106fa527b,2024-08-02T11:46:33Z,Manchester City travel to west London to start...
2,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-06T13:00:56Z,Trefis High Quality Portfolio has outperformed...
3,AMZN,https://qz.com/google-schmidt-goog-alphabet-re...,2024-08-15T12:50:00Z,Eric Schmidt says ‘people work like hell’ on r...
4,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-01T10:00:00Z,Online retailer is expected to report strong s...
5,AMZN,https://qz.com/google-ai-remote-work-eric-schm...,2024-08-14T14:39:00Z,"‘People work like hell,’ ex-CEO says. Google h..."
6,AMZN,https://qz.com/microsoft-apple-google-earnings...,2024-08-12T16:09:00Z,Wedbush sees ‘a tidal wave of spending is comi...
7,AMZN,https://qz.com/google-alphabet-goog-antitrust-...,2024-08-14T12:04:00Z,Justice Department says Google abused its domi...
8,AMZN,https://qz.com/rivian-electric-vans-amazon-par...,2024-08-16T16:34:14Z,EV maker is boosting production next year to m...
9,AMZN,https://finance.yahoo.com/news/meli-vs-amzn-e-...,2024-07-18T22:04:24Z,MercadoLibre is trading at a premium to Amazon...


In [327]:
# Drop the row whose index label is 79.
# NOTE(review): the label is hard-coded and no reason is recorded. This cell's
# execution count (327) is also higher than those of the two cells that follow
# it, so the notebook was not run top to bottom — confirm the intended order.
news_data_2 = news_data_2.drop(index =79)

In [323]:
# Keep only rows whose summary is something other than a lone '.'.
news_data_2 = news_data_2[news_data_2['summary'] != '.']

In [329]:
# Remove entries whose URL is https://removed.com; in the duplicates listing
# these rows carry a 1970-01-01 date.
news_data_2 = news_data_2[news_data_2['url'] != 'https://removed.com']

In [333]:
# Display the frame with a fresh 0..n-1 index.
# NOTE(review): the result is not assigned, so `news_data_2` itself keeps its
# original index labels after this cell.
news_data_2.reset_index(drop = True)

Unnamed: 0,ticker,url,date,summary
0,AMZN,https://www.techmeme.com/240801/p55,2024-08-02T00:15:01Z,Check back often for the most up-to-date content.
1,AMZN,https://biztoc.com/x/b3d5270106fa527b,2024-08-02T11:46:33Z,Manchester City travel to west London to start...
2,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-06T13:00:56Z,Trefis High Quality Portfolio has outperformed...
3,AMZN,https://qz.com/google-schmidt-goog-alphabet-re...,2024-08-15T12:50:00Z,Eric Schmidt says ‘people work like hell’ on r...
4,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-01T10:00:00Z,Online retailer is expected to report strong s...
5,AMZN,https://qz.com/google-ai-remote-work-eric-schm...,2024-08-14T14:39:00Z,"‘People work like hell,’ ex-CEO says. Google h..."
6,AMZN,https://qz.com/microsoft-apple-google-earnings...,2024-08-12T16:09:00Z,Wedbush sees ‘a tidal wave of spending is comi...
7,AMZN,https://qz.com/google-alphabet-goog-antitrust-...,2024-08-14T12:04:00Z,Justice Department says Google abused its domi...
8,AMZN,https://qz.com/rivian-electric-vans-amazon-par...,2024-08-16T16:34:14Z,EV maker is boosting production next year to m...
9,AMZN,https://finance.yahoo.com/news/meli-vs-amzn-e-...,2024-07-18T22:04:24Z,MercadoLibre is trading at a premium to Amazon...


In [356]:
# Independent copy of the cleaned frame for the steps that follow.
df2 = news_data_2.copy()

In [376]:
df2['summary'].iloc[42]

'‘Investing let me put money away to purchase my first home,’ she says.'

In [386]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    words = text.split()
    return len(words)

# Number of whitespace-separated words in each summary.
df2['word_count'] = df2['summary'].map(count_words)


In [400]:
# Drop the row whose index label is 14.
# NOTE(review): hard-coded label with no recorded reason — confirm which
# article this removes and why.
df2 = df2.drop(index=14)

In [404]:
# Display the frame with a fresh 0..n-1 index.
# NOTE(review): the result is not assigned, so `df2` keeps its original index
# labels after this cell.
df2.reset_index(drop = True)

Unnamed: 0,ticker,url,date,summary,word_count
0,AMZN,https://www.techmeme.com/240801/p55,2024-08-02T00:15:01Z,Check back often for the most up-to-date content.,8
1,AMZN,https://biztoc.com/x/b3d5270106fa527b,2024-08-02T11:46:33Z,Manchester City travel to west London to start...,13
2,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-06T13:00:56Z,Trefis High Quality Portfolio has outperformed...,26
3,AMZN,https://qz.com/google-schmidt-goog-alphabet-re...,2024-08-15T12:50:00Z,Eric Schmidt says ‘people work like hell’ on r...,19
4,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-01T10:00:00Z,Online retailer is expected to report strong s...,38
5,AMZN,https://qz.com/google-ai-remote-work-eric-schm...,2024-08-14T14:39:00Z,"‘People work like hell,’ ex-CEO says. Google h...",15
6,AMZN,https://qz.com/microsoft-apple-google-earnings...,2024-08-12T16:09:00Z,Wedbush sees ‘a tidal wave of spending is comi...,25
7,AMZN,https://qz.com/google-alphabet-goog-antitrust-...,2024-08-14T12:04:00Z,Justice Department says Google abused its domi...,14
8,AMZN,https://qz.com/rivian-electric-vans-amazon-par...,2024-08-16T16:34:14Z,EV maker is boosting production next year to m...,21
9,AMZN,https://finance.yahoo.com/news/meli-vs-amzn-e-...,2024-07-18T22:04:24Z,MercadoLibre is trading at a premium to Amazon...,23


In [2]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, pipeline

# Text-classification pipeline built around the checkpoint named below.
# NOTE(review): no `device` argument is passed; the recorded output warns that
# an accelerator is available but the model will run on CPU.
sentiment_pipeline = pipeline('text-classification', model='mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis')




Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [410]:
def get_sentiment_info(text):
    """Classify ``text`` with the module-level ``sentiment_pipeline``.

    Returns:
        A ``(label, score)`` tuple taken from the first result the pipeline
        returns.
    """
    prediction = sentiment_pipeline(text)[0]
    label = prediction['label']
    score = prediction['score']
    return label, score


# Classify every summary. Wrapping the (label, score) tuple in pd.Series makes
# apply return one column per element, which fills the two new columns.
df2[['sentiment', 'sentiment_score']] = df2['summary'].apply(lambda x: pd.Series(get_sentiment_info(x)))

In [412]:
df2

Unnamed: 0,ticker,url,date,summary,word_count,sentiment,sentiment_score
0,AMZN,https://www.techmeme.com/240801/p55,2024-08-02T00:15:01Z,Check back often for the most up-to-date content.,8,neutral,0.999315
1,AMZN,https://biztoc.com/x/b3d5270106fa527b,2024-08-02T11:46:33Z,Manchester City travel to west London to start...,13,neutral,0.999745
2,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-06T13:00:56Z,Trefis High Quality Portfolio has outperformed...,26,positive,0.999725
3,AMZN,https://qz.com/google-schmidt-goog-alphabet-re...,2024-08-15T12:50:00Z,Eric Schmidt says ‘people work like hell’ on r...,19,neutral,0.999693
4,AMZN,https://www.forbes.com/sites/greatspeculations...,2024-08-01T10:00:00Z,Online retailer is expected to report strong s...,38,positive,0.999696
5,AMZN,https://qz.com/google-ai-remote-work-eric-schm...,2024-08-14T14:39:00Z,"‘People work like hell,’ ex-CEO says. Google h...",15,negative,0.985674
6,AMZN,https://qz.com/microsoft-apple-google-earnings...,2024-08-12T16:09:00Z,Wedbush sees ‘a tidal wave of spending is comi...,25,positive,0.999402
7,AMZN,https://qz.com/google-alphabet-goog-antitrust-...,2024-08-14T12:04:00Z,Justice Department says Google abused its domi...,14,neutral,0.995198
8,AMZN,https://qz.com/rivian-electric-vans-amazon-par...,2024-08-16T16:34:14Z,EV maker is boosting production next year to m...,21,positive,0.99959
9,AMZN,https://finance.yahoo.com/news/meli-vs-amzn-e-...,2024-07-18T22:04:24Z,MercadoLibre is trading at a premium to Amazon...,23,neutral,0.999571


In [414]:
df2.sentiment.value_counts()

sentiment
neutral     23
positive    14
negative     7
Name: count, dtype: int64