### Stock Market Crawler: Yahoo Finance

In [78]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import os
from transformers import pipeline
import re
import csv

In [79]:
# NOTE(review): this runs after `transformers` was imported in the first cell, so any
# setting read at import time would not see it. Also confirm the installed
# transformers/huggingface_hub release actually reads HF_MIRROR (HF_ENDPOINT appears to
# be the documented endpoint override) — TODO verify.
os.environ["HF_MIRROR"] = "https://huggingface.co/mirrors"  # Using different mirror URL to escape bugs

In [80]:
# Defining Model
# Checkpoint id passed to both from_pretrained calls so tokenizer and model come from the same source.
name = "human-centered-summarization/financial-summarization-pegasus"
# Both objects are module-level: the summarization cells and NLP_summarize() below
# read `tokenizer` and `model` directly rather than receiving them as arguments.
tokenizer = PegasusTokenizer.from_pretrained(name)
model = PegasusForConditionalGeneration.from_pretrained(name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
# Download one Yahoo Finance article and collect its <p> tags for the demo cells below.
url = "https://finance.yahoo.com/news/term-labs-launches-fixed-rate-130117305.html"
# requests applies no timeout by default, so a stalled connection would block the kernel indefinitely.
r = requests.get(url, timeout=10)
# Stop here on an HTTP error instead of silently parsing (and later summarizing) an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')


In [82]:
print("No. of Paragraphs:", len(paragraphs))
print(paragraphs[3])

No. of Paragraphs: 14
<p><span>The Term team channeled the U.S. Treasury for inspiration, specifically its auction calendar model, which matches borrowers and lenders who have set predetermined rates. The collateral is then locked up in a <span class="link"><a class="link" data-ylk="slk:smart contract;elm:context_link;itc:0" href="https://decrypt.co/resources/smart-contracts" rel="nofollow noopener" target="_blank">smart contract</a></span> for the duration of the auction, aka the loan’s term. </span></p>


In [83]:
# Reduce the scraped <p> tags to plain text, then keep only the first 400
# space-separated tokens (presumably to bound the model input — confirm).
text = [tag.text for tag in paragraphs]
joined = ' '.join(text)
words = joined.split(' ')[:400]
full_article = ' '.join(words)
full_article

'Term Labs today launches its fixed-rate lending protocol on Ethereum. Term Finance is a milestone towards the development of a more mature crypto market, one with rules more familiar to TradFi lenders and borrowers than current Decentralized Finance (DeFi) protocols.  Variable rates are the current rule in DeFi, making it riskier for large professional portfolios to get on board. Interest rates on protocols like Aave and Compound are calculated according to supply and demand. So, if a crypto whale deposits a boatload of funds into a lending pool, rates across the board will plummet in accordance with the abundance of supply, and vice versa.  The Term team channeled the U.S. Treasury for inspiration, specifically its auction calendar model, which matches borrowers and lenders who have set predetermined rates. The collateral is then locked up in a smart contract for the duration of the auction, aka the loan’s term.  Tether Reports $850 Million Q2 Profit, $72 Billion Exposure to US Treas

In [84]:
# Tokenization and summary generation
# truncation=True clips the encoded article to the tokenizer's configured maximum length.
# The word-based cut above does not bound the token count, and an over-long sequence
# can exceed what the model accepts.
input_ids = tokenizer.encode(full_article, return_tensors='pt', truncation=True)
# Beam search (5 beams) capped at 55 generated tokens.
generation = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# generate() returns a batch; the single input article is at index 0.
summary = tokenizer.decode(generation[0], skip_special_tokens=True)
summary

'Term Finance’s auction calendar model matches borrowers and lenders. Co-founder says platform will be multi-chain soon'

### Pipeline

In [85]:
stock_tickers = ['PFE', 'TSLA', 'BTC', 'ZI']

In [86]:
def crawl_to_retrieve_urls(ticker):
    """Run a Google News search for "yahoo finance <ticker>" and return the raw link targets.

    Args:
        ticker: Stock symbol to search for, e.g. ``'PFE'``.

    Returns:
        A list with the ``href`` value of every ``<a>`` tag on the results page
        that carries one, in page order. The values are unfiltered: they include
        Google navigation links and relative paths, and need further cleaning
        before they can be fetched.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            answer within the timeout.
    """
    r = requests.get(
        "https://www.google.com/search",
        # Let requests build the query string so symbols containing reserved
        # characters (e.g. '^GSPC', 'BTC-USD') are escaped instead of corrupting the URL.
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
        # requests has no default timeout; without one a stalled connection hangs the kernel.
        timeout=10,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those raises KeyError.
    return [link['href'] for link in soup.find_all('a', href=True)]

In [87]:
urls = {ticker:crawl_to_retrieve_urls(ticker) for ticker in stock_tickers}
urls

{'PFE': ['/?sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQOwgC',
  '/search?q=yahoo+finance+PFE&tbm=nws&ie=UTF-8&gbv=1&sei=xEvJZP7rE4DfseMPmNOnkAM',
  '/search?q=yahoo+finance+PFE&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQ_AUIBSgA',
  '/search?q=yahoo+finance+PFE&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQ_AUIBygC',
  '/search?q=yahoo+finance+PFE&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQ_AUICCgD',
  'https://maps.google.com/maps?q=yahoo+finance+PFE&um=1&ie=UTF-8&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQ_AUICSgE',
  '/search?q=yahoo+finance+PFE&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQ_AUICigF',
  '/search?q=yahoo+finance+PFE&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+PFE&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwj-3fHCiLyAAxWAb2wGHZjpCTIQpwUIDQ',
  '/search?q=yahoo+finance+PFE&ie=U

In [88]:
bad_words = ['preferences', 'accounts', 'support', 'maps', 'policies']

In [89]:
def parse_urls(urls, bad_words):
    """Extract the distinct absolute article URLs from raw search-result hrefs.

    Args:
        urls: Iterable of href strings (absolute URLs, or redirect paths that
            embed one, such as ``/url?q=https://...&sa=U``).
        bad_words: Substrings that disqualify an href when found anywhere in it.

    Returns:
        A list of unique URLs in first-seen order. Each URL is the first
        ``http(s)://`` run of non-whitespace characters in the href, cut at the
        first ``&`` to drop trailing tracking parameters.
    """
    # A dict is used as an ordered set: it de-duplicates while keeping first-seen
    # order, so repeated runs yield the same ordering (a plain set does not).
    unique = {}
    for url in urls:
        if 'https://' not in url:
            continue
        if any(bad_word in url for bad_word in bad_words):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # e.g. a bare 'https://' with nothing after the scheme — nothing to fetch.
            continue
        unique.setdefault(match.group(0).split('&')[0], None)
    return list(unique)

In [90]:
cleaned_urls = {ticker:parse_urls(urls[ticker], bad_words) for ticker in stock_tickers}
cleaned_urls

{'PFE': ['https://finance.yahoo.com/news/stock-futures-droop-amid-mixed-earnings-after-blazing-july-stock-market-news-today-121511468.html',
  'https://finance.yahoo.com/news/arvinas-pfizer-awarded-innovation-passport-203000619.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BPFE%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/pfizer-announces-executive-leadership-advance-104500957.html',
  'https://finance.yahoo.com/news/earn-500-month-pfizer-stock-120100836.html',
  'https://finance.yahoo.com/news/10-best-small-cap-pharma-185609107.html',
  'https://finance.yahoo.com/news/pfizer-q2-earnings-snapshot-105818783.html',
  'https://finance.yahoo.com/news/caribou-biosciences-announces-25-million-120000433.html',
  'https://finance.yahoo.com/news/drug-biotech-stocks-q2-earnings-135800432.html',
  'https://finance.yahoo.com/news/earnings-preview-pfizer-pfe-q2-140133393.html',
  'https://finance.yahoo.com/news/caribou-biosciences-raises-125m-via-152759589.html'],
 'TSL

In [91]:
def scrape_and_process(URLs, max_words=300):
    """Download each page and return the leading text of its ``<p>`` tags.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated tokens kept per article
            (default 300, the value previously hard-coded).

    Returns:
        A list of strings, one per URL and in the same order, so callers can
        pair results with their source URL by position.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            answer within the timeout.
    """
    articles = []
    for url in URLs:
        # requests has no default timeout; without one a single stalled host hangs the whole run.
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Split on a single space to keep the original tokenization, then cap the length.
        words = ' '.join(text).split(' ')[:max_words]
        articles.append(' '.join(words))
    return articles

In [92]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in stock_tickers}
articles

{'PFE': ["Stock futures had a lazy start to August, pulling back from a July rally amid a flurry of mixed earnings. Futures on the S&P 500 (^GSPC) dropped 0.3%, while those on the Dow Jones Industrial Average (^DJI) lost 0.1%. The tech-heavy Nasdaq Composite (^IXIC) slumped 0.5% before the open. A plethora of earnings added mixed sentiments to what has been a bullish mood for most of 2023 so far. Of note, Caterpillar (CAT) warned of a coming slowdown in its business; Pfizer (PFE) trimmed the upper end of its revenue forecast; Uber (UBER) popped after posting a surprise profit. These earnings serve as the appetizer for the main event: second quarter results from Apple (AAPL) and Amazon (AMZN) that are due out Thursday. Both stocks are up more than 50% so far this year. Investors are also looking ahead to key pieces of data out this week, most notably the jobs report in the US on Friday. Monday's finish capped a strong month of July for all the major indexes. The S&P 500 and Nasdaq both 

In [93]:
def NLP_summarize(articles):
    """Summarize each article with the module-level ``tokenizer`` and ``model``.

    Args:
        articles: Iterable of article texts.

    Returns:
        A list of summary strings, one per article and in the same order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the encoded text to the tokenizer's configured maximum
        # length. The word cap applied upstream does not bound the token count, and an
        # over-long sequence can exceed what the model accepts.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search (5 beams) capped at 55 generated tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        # generate() returns a batch; the single input article is at index 0.
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [94]:
summaries = {ticker:NLP_summarize(articles[ticker]) for ticker in stock_tickers}
summaries

{'PFE': ['Caterpillar, Pfizer, Uber post mixed results. Apple, Amazon due to report second quarter results later this week',
  'Innovation Passport awarded for vepdegestrant for locally-advanced breast cancer.',
  'All images are copyrighted.',
  'Chris Boshoff appointed Chief Oncology Research and Development Officer.',
  'Shares are trading at 52-week lows after weight-loss drug failed in trial. Pfizer’s dividend yield stands out as 4.56%',
  'BioNTech, Moderna, Radius Health are among the top five stocks to buy.',
  'Drugmaker sees full-year revenue in the range of $67 billion to $70 billion.',
  'Company is advancing its allogeneic CAR-T cell therapy pipeline. New investor joins Caribou’s Scientific Advisory Board',
  'Pfizer, Merck, Vertex, Exelixis, Incyte to report second-quarter results.',
  'Drugmaker is expected to report quarterly earnings of $0.60 per share.',
  'Data set includes all 16 patients treated in dose escalation. Upsized public offering priced at $6.5 per share w

In [95]:
# Pin the checkpoint and revision that the bare pipeline('sentiment-analysis') call
# resolved to (per its own warning), so results do not change if the library's
# default model changes.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [96]:
# !pip install xformers

In [97]:
sentiment(summaries['BTC'])

[{'label': 'NEGATIVE', 'score': 0.9880996346473694},
 {'label': 'NEGATIVE', 'score': 0.9959627985954285},
 {'label': 'NEGATIVE', 'score': 0.9997323155403137},
 {'label': 'NEGATIVE', 'score': 0.9268648624420166},
 {'label': 'NEGATIVE', 'score': 0.999359667301178},
 {'label': 'POSITIVE', 'score': 0.9919583797454834},
 {'label': 'POSITIVE', 'score': 0.864882230758667},
 {'label': 'NEGATIVE', 'score': 0.9994691014289856},
 {'label': 'NEGATIVE', 'score': 0.9241886138916016},
 {'label': 'NEGATIVE', 'score': 0.9983410835266113},
 {'label': 'POSITIVE', 'score': 0.9985836744308472}]

In [98]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in stock_tickers}
scores

{'PFE': [{'label': 'NEGATIVE', 'score': 0.9992376565933228},
  {'label': 'POSITIVE', 'score': 0.9437476992607117},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9966739416122437},
  {'label': 'NEGATIVE', 'score': 0.9856473207473755},
  {'label': 'POSITIVE', 'score': 0.9992000460624695},
  {'label': 'NEGATIVE', 'score': 0.8220949769020081},
  {'label': 'POSITIVE', 'score': 0.9980408549308777},
  {'label': 'NEGATIVE', 'score': 0.9565234184265137},
  {'label': 'NEGATIVE', 'score': 0.9894277453422546},
  {'label': 'NEGATIVE', 'score': 0.9935374855995178}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9685801267623901},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9731018543243408},
  {'label': 'NEGATIVE', 'score': 0.9726715087890625},
  {'label': 'POSITIVE', 'score': 0.998879611492157},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9915708303451538},
  {'label': '

In [107]:
print(summaries['PFE'][3], scores['PFE'][3]['label'], scores['PFE'][3]['score'])


Chris Boshoff appointed Chief Oncology Research and Development Officer. POSITIVE 0.9966739416122437


In [108]:
scores['BTC'][0]['score']

0.9880996346473694

In [109]:
summaries

{'PFE': ['Caterpillar, Pfizer, Uber post mixed results. Apple, Amazon due to report second quarter results later this week',
  'Innovation Passport awarded for vepdegestrant for locally-advanced breast cancer.',
  'All images are copyrighted.',
  'Chris Boshoff appointed Chief Oncology Research and Development Officer.',
  'Shares are trading at 52-week lows after weight-loss drug failed in trial. Pfizer’s dividend yield stands out as 4.56%',
  'BioNTech, Moderna, Radius Health are among the top five stocks to buy.',
  'Drugmaker sees full-year revenue in the range of $67 billion to $70 billion.',
  'Company is advancing its allogeneic CAR-T cell therapy pipeline. New investor joins Caribou’s Scientific Advisory Board',
  'Pfizer, Merck, Vertex, Exelixis, Incyte to report second-quarter results.',
  'Drugmaker is expected to report quarterly earnings of $0.60 per share.',
  'Data set includes all 16 patients treated in dose escalation. Upsized public offering priced at $6.5 per share w

In [110]:
scores

{'PFE': [{'label': 'NEGATIVE', 'score': 0.9992376565933228},
  {'label': 'POSITIVE', 'score': 0.9437476992607117},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9966739416122437},
  {'label': 'NEGATIVE', 'score': 0.9856473207473755},
  {'label': 'POSITIVE', 'score': 0.9992000460624695},
  {'label': 'NEGATIVE', 'score': 0.8220949769020081},
  {'label': 'POSITIVE', 'score': 0.9980408549308777},
  {'label': 'NEGATIVE', 'score': 0.9565234184265137},
  {'label': 'NEGATIVE', 'score': 0.9894277453422546},
  {'label': 'NEGATIVE', 'score': 0.9935374855995178}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9685801267623901},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9731018543243408},
  {'label': 'NEGATIVE', 'score': 0.9726715087890625},
  {'label': 'POSITIVE', 'score': 0.998879611492157},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9915708303451538},
  {'label': '

In [111]:
cleaned_urls

{'PFE': ['https://finance.yahoo.com/news/stock-futures-droop-amid-mixed-earnings-after-blazing-july-stock-market-news-today-121511468.html',
  'https://finance.yahoo.com/news/arvinas-pfizer-awarded-innovation-passport-203000619.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BPFE%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/pfizer-announces-executive-leadership-advance-104500957.html',
  'https://finance.yahoo.com/news/earn-500-month-pfizer-stock-120100836.html',
  'https://finance.yahoo.com/news/10-best-small-cap-pharma-185609107.html',
  'https://finance.yahoo.com/news/pfizer-q2-earnings-snapshot-105818783.html',
  'https://finance.yahoo.com/news/caribou-biosciences-announces-25-million-120000433.html',
  'https://finance.yahoo.com/news/drug-biotech-stocks-q2-earnings-135800432.html',
  'https://finance.yahoo.com/news/earnings-preview-pfizer-pfe-q2-140133393.html',
  'https://finance.yahoo.com/news/caribou-biosciences-raises-125m-via-152759589.html'],
 'TSL

In [112]:
def format_data(summaries, scores, urls):
    """Flatten the per-ticker results into one row per article.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, positionally aligned with ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of source URLs, positionally aligned
            with ``summaries[ticker]``.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows, grouped by
        ticker in the iteration order of ``summaries``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or ``urls``.
        IndexError: If ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    # Iterate the tickers actually present in the input rather than a notebook-level
    # global, so the function depends only on its arguments.
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        for position, summary in enumerate(ticker_summaries):
            score = ticker_scores[position]
            output.append([
                ticker,
                summary,
                score['label'],
                score['score'],
                ticker_urls[position],
            ])
    return output

In [113]:
formatted_data = format_data(summaries, scores, cleaned_urls)
formatted_data

[['PFE',
  'Caterpillar, Pfizer, Uber post mixed results. Apple, Amazon due to report second quarter results later this week',
  'NEGATIVE',
  0.9992376565933228,
  'https://finance.yahoo.com/news/stock-futures-droop-amid-mixed-earnings-after-blazing-july-stock-market-news-today-121511468.html'],
 ['PFE',
  'Innovation Passport awarded for vepdegestrant for locally-advanced breast cancer.',
  'POSITIVE',
  0.9437476992607117,
  'https://finance.yahoo.com/news/arvinas-pfizer-awarded-innovation-passport-203000619.html'],
 ['PFE',
  'All images are copyrighted.',
  'NEGATIVE',
  0.9880996346473694,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BPFE%26tbm%3Dnws%26pccc%3D1'],
 ['PFE',
  'Chris Boshoff appointed Chief Oncology Research and Development Officer.',
  'POSITIVE',
  0.9966739416122437,
  'https://finance.yahoo.com/news/pfizer-announces-executive-leadership-advance-104500957.html'],
 ['PFE',
  'Shares are trading at 52-week lows after weight-loss drug failed in trial. Pfiz

In [121]:
import csv

In [122]:
titles = ["Ticker Name", "News Summary", "Sentiment", "Sentiment Rating", "News URL"]
# Write data to the CSV file
# newline='' leaves line endings to the csv module. The explicit utf-8 encoding keeps
# non-ASCII characters in the summaries (e.g. curly apostrophes) from depending on the
# platform's default codec.
with open('stocksummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Write the titles first
    csv_writer.writerow(titles)
    # Write the data rows
    csv_writer.writerows(formatted_data)