### 1. Install and Import Baseline Dependencies

In [37]:
!pip install -q transformers sentencepiece

In [38]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

### 2. Setup Summarization Model

In [39]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

### 3. Summarize a Single Article

In [40]:
url = 'https://finance.yahoo.com/news/1-cop27-nuclear-boss-doesnt-100157501.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [41]:
paragraphs

[<p>(Recasts, adds background, details)</p>,
 <p>By Richard Valdmanis and Timothy Gardner</p>,
 <p>SHARM EL-SHEIKH, Egypt, Nov 10 (Reuters) - Recycling radioactive waste from nuclear power has security and cost challenges but the U.N. International Atomic Energy Agency (IAEA) would be able to monitor the process should more countries take that path, the IAEA head said this week.</p>,
 <p>Increased efforts to fight climate change and soaring power costs in parts of the world have renewed interest in nuclear power, raising the likelihood of an expansion of the industry after years of low investment because of safety concerns.</p>,
 <p>The administration of U.S. President Joe Biden, for example, sees nuclear energy as a critical in tackling emissions in the world's second-biggest greenhouse gas producer, and is exploring recycling as a way to boost domestic supplies of nuclear fuel and reduce waste.</p>,
 <p>When asked about nuclear reprocessing, IAEA Director General Rafael Grossi downpl

In [42]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [43]:
len(words)

400

In [44]:
ARTICLE

'(Recasts, adds background, details) By Richard Valdmanis and Timothy Gardner SHARM EL-SHEIKH, Egypt, Nov 10 (Reuters) - Recycling radioactive waste from nuclear power has security and cost challenges but the U.N. International Atomic Energy Agency (IAEA) would be able to monitor the process should more countries take that path, the IAEA head said this week. Increased efforts to fight climate change and soaring power costs in parts of the world have renewed interest in nuclear power, raising the likelihood of an expansion of the industry after years of low investment because of safety concerns. The administration of U.S. President Joe Biden, for example, sees nuclear energy as a critical in tackling emissions in the world\'s second-biggest greenhouse gas producer, and is exploring recycling as a way to boost domestic supplies of nuclear fuel and reduce waste. When asked about nuclear reprocessing, IAEA Director General Rafael Grossi downplayed the chances it would become a reality in m

In [45]:
input_ids = tokenizer(ARTICLE, return_tensors = 'pt').input_ids
output = model.generate(input_ids, max_length=32, num_beams=5, early_stopping=True) # max_length: maximum length of summary that we want
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [46]:
summary

'Reprocessing is a ‘very difficult technology,’ IAEA head says'

### 4. Building a News and Sentiment Pipeline

In [47]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

##### 4.1 Search For Stock News using Google and Yahoo Finance

In [48]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+()&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [49]:
raw_urls = {ticker: search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQOwgC',
  '/search?q=yahoo+finance+()&tbm=nws&ie=UTF-8&gbv=1&sei=ZhFtY761NbCk5NoPhoGYuA8',
  '/search?q=yahoo+finance+()&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQ_AUIBSgA',
  '/search?q=yahoo+finance+()&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQ_AUIBigB',
  '/search?q=yahoo+finance+()&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQ_AUICCgD',
  'https://maps.google.com/maps?q=yahoo+finance+()&um=1&ie=UTF-8&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQ_AUICSgE',
  '/search?q=yahoo+finance+()&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQ_AUICigF',
  '/search?q=yahoo+finance+()&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+()&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQpwUIDQ',
  '/search?q=yahoo+finance+()&ie=UTF-8&tbm=

In [50]:
raw_urls['GME'][0]

'/?sa=X&ved=0ahUKEwi-gKD276P7AhUwElkFHYYABvcQOwgC'

##### 4.2 Strip out unwanted URLS

In [51]:
import re

exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [52]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [53]:
strip_unwanted_urls(raw_urls['GME'], exclude_list) # example for one ticker GME

['https://news.yahoo.com/stock-market-news-live-updates-november-9-2022-125636212.html',
 'https://news.yahoo.com/is-it-the-right-time-to-buy-a-home-experts-weigh-in-162745305.html',
 'https://finance.yahoo.com/news/amazon-becomes-world-first-public-191725519.html',
 'https://news.yahoo.com/bullish-indicator-morning-brief-110059642.html',
 'https://news.yahoo.com/stock-market-news-live-updates-november-8-2022-125124481.html',
 'https://news.yahoo.com/midterm-elections-stock-market-112234022.html',
 'https://news.yahoo.com/chart-stock-markets-next-problem-185510276.html',
 'https://news.yahoo.com/meta-stock-boost-from-layoffs-192331851.html',
 'https://news.yahoo.com/recession-talk-could-be-overblown-morning-brief-103035628.html',
 'https://news.yahoo.com/stock-market-minimal-impact-from-midterm-elections-104546073.html']

In [54]:
cleaned_urls = {ticker: strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://news.yahoo.com/stock-market-news-live-updates-november-9-2022-125636212.html',
  'https://news.yahoo.com/is-it-the-right-time-to-buy-a-home-experts-weigh-in-162745305.html',
  'https://finance.yahoo.com/news/amazon-becomes-world-first-public-191725519.html',
  'https://news.yahoo.com/bullish-indicator-morning-brief-110059642.html',
  'https://news.yahoo.com/stock-market-news-live-updates-november-8-2022-125124481.html',
  'https://news.yahoo.com/midterm-elections-stock-market-112234022.html',
  'https://news.yahoo.com/chart-stock-markets-next-problem-185510276.html',
  'https://news.yahoo.com/meta-stock-boost-from-layoffs-192331851.html',
  'https://news.yahoo.com/recession-talk-could-be-overblown-morning-brief-103035628.html',
  'https://news.yahoo.com/stock-market-minimal-impact-from-midterm-elections-104546073.html'],
 'TSLA': ['https://news.yahoo.com/stock-market-news-live-updates-november-9-2022-125636212.html',
  'https://news.yahoo.com/is-it-the-right-time-to-b

##### 4.3 Search and Scrape Cleaned URLs

In [55]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [56]:
articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ["U.S. stocks plummeted Wednesday after three days of gains, as investors mulled over a mixed verdict from the midterm election results and highly anticipated inflation data looms on the docket. The S&P 500 (^GSPC) shed over 2%, while the Dow Jones Industrial Average (^DJI) fell by nearly 650 points, or roughly 2%. The technology-heavy Nasdaq Composite (^IXIC) dragged down by almost 2.5%, or 260 points. Investors’ optimism during stocks' three-day rally was built on expectations that Republicans would gain ground and create gridlock in Washington. But the Republican red wave failed to materialize in the U.S. midterms. Democrats managed to flip a crucial Senate seat, with John Fetterman beating Mehmet Oz in the Pennsylvania race. As of late Wednesday, both House and Senate control remains in the balance. Georgia's U.S. Senate race, meanwhile, is heading to a runoff, with neither major candidate on track to win a majority of votes. The year after midterm elections tends to see th

In [57]:
len(articles['TSLA'])

10

In [58]:
articles['TSLA'][0]

"U.S. stocks plummeted Wednesday after three days of gains, as investors mulled over a mixed verdict from the midterm election results and highly anticipated inflation data looms on the docket. The S&P 500 (^GSPC) shed over 2%, while the Dow Jones Industrial Average (^DJI) fell by nearly 650 points, or roughly 2%. The technology-heavy Nasdaq Composite (^IXIC) dragged down by almost 2.5%, or 260 points. Investors’ optimism during stocks' three-day rally was built on expectations that Republicans would gain ground and create gridlock in Washington. But the Republican red wave failed to materialize in the U.S. midterms. Democrats managed to flip a crucial Senate seat, with John Fetterman beating Mehmet Oz in the Pennsylvania race. As of late Wednesday, both House and Senate control remains in the balance. Georgia's U.S. Senate race, meanwhile, is heading to a runoff, with neither major candidate on track to win a majority of votes. The year after midterm elections tends to see the highest

##### 4.4 Summarize all Articles

In [59]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens = True)
        summaries.append(summary)
    return summaries

In [60]:
summaries = {ticker: summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Democrats flip crucial Senate seat, but control remains in balance. Consumer price index expected to fall to 7.9% in October',
  "Mortgage rates are high, but experts say it's time to buy. Experts say prices will go back to pre-pandemic 2020 levels",
  'Shares fall 4.3% on Wednesday, pushing market value below $1 trillion. Co-founder Jeff Bezos’ fortune has dwindled to $109 billion',
  'BofA says clients sold a net $6 billion in U.S. stocks last week. Two-year U.S. bonds yield more than five-year bonds',
  'Mid-term elections will have ‘modest’ impact on FX markets, JPMorgan says. Corporate earnings, inflation data also on investors’ radars',
  'Republicans seen as more pro-business. S&P 500 has historically outperformed in one and three month periods',
  'S&P 500 earnings estimates for the fourth quarter have dropped 4.9%. Higher rates, inflation seen exerting greatest influence on markets',
  'Cost-cutting move should boost earnings per share, Jefferies says. Meta said Wedn

### 5. Adding Sentiment Analysis

In [61]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [62]:
scores = {ticker: sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9853833913803101},
  {'label': 'NEGATIVE', 'score': 0.9821999073028564},
  {'label': 'NEGATIVE', 'score': 0.9995256662368774},
  {'label': 'NEGATIVE', 'score': 0.8551507592201233},
  {'label': 'POSITIVE', 'score': 0.9949733018875122},
  {'label': 'NEGATIVE', 'score': 0.9824764728546143},
  {'label': 'NEGATIVE', 'score': 0.9894402623176575},
  {'label': 'NEGATIVE', 'score': 0.9434362649917603},
  {'label': 'NEGATIVE', 'score': 0.9988952875137329},
  {'label': 'POSITIVE', 'score': 0.8238267302513123}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9853833913803101},
  {'label': 'NEGATIVE', 'score': 0.9821999073028564},
  {'label': 'NEGATIVE', 'score': 0.9995256662368774},
  {'label': 'NEGATIVE', 'score': 0.8551507592201233},
  {'label': 'POSITIVE', 'score': 0.9949733018875122},
  {'label': 'NEGATIVE', 'score': 0.9824764728546143},
  {'label': 'NEGATIVE', 'score': 0.9894402623176575},
  {'label': 'NEGATIVE', 'score': 0.9434362649917603},
  {'label': 

In [63]:
print(summaries['BTC'][0], scores['BTC'][0]['label'], scores['BTC'][0]['score'])

Democrats flip crucial Senate seat, but control remains in balance. Consumer price index expected to fall to 7.9% in October NEGATIVE 0.9853833913803101


### 6. Exporting Results to CSV

In [64]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [65]:
import os
os.chdir('/content/drive/MyDrive/Data Science WorkSpace/Data Science Projects/Automating Stocks and Crypto News Sentiment Analysis')
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Apple, Facebook parent expected to begin large-scale layoffs. Carvana shares sink after Morgan Stanley cuts price target',
  'NEGATIVE',
  0.999729573726654,
  'https://news.yahoo.com/stock-market-news-live-updates-november-7-2022-110101325.html'],
 ['GME',
  "Mortgage rates are high, but experts say it's time to buy. Experts say prices will go back to pre-pandemic 2020 levels",
  'NEGATIVE',
  0.9821999073028564,
  'https://news.yahoo.com/is-it-the-right-time-to-buy-a-home-experts-weigh-in-162745305.html'],
 ['GME',
  'Democrats flip crucial Senate seat, but control remains in balance. Consumer price index expected to fall to 7.9% in October',
  'NEGATIVE',
  0.9853833913803101,
  'https://news.yahoo.com/stock-market-news-live-updates-november-9-2022-125636212.html'],
 ['GME',
  'BofA says clients sold a net $6 billion in U.S. stocks last week. Two-year U.S. bonds yield more than five-year bonds',
  'NEGATIVE',
  0.8551507592201233,
  'https://news.yahoo.com/bullish-indica

[['GME',
  'Democrats flip crucial Senate seat, but control remains in balance. Consumer price index expected to fall to 7.9% in October',
  'NEGATIVE',
  0.9853833913803101,
  'https://news.yahoo.com/stock-market-news-live-updates-november-9-2022-125636212.html'],
 ['GME',
  "Mortgage rates are high, but experts say it's time to buy. Experts say prices will go back to pre-pandemic 2020 levels",
  'NEGATIVE',
  0.9821999073028564,
  'https://news.yahoo.com/is-it-the-right-time-to-buy-a-home-experts-weigh-in-162745305.html'],
 ['GME',
  'Shares fall 4.3% on Wednesday, pushing market value below $1 trillion. Co-founder Jeff Bezos’ fortune has dwindled to $109 billion',
  'NEGATIVE',
  0.9995256662368774,
  'https://finance.yahoo.com/news/amazon-becomes-world-first-public-191725519.html'],
 ['GME',
  'BofA says clients sold a net $6 billion in U.S. stocks last week. Two-year U.S. bonds yield more than five-year bonds',
  'NEGATIVE',
  0.8551507592201233,
  'https://news.yahoo.com/bullish-

In [66]:
final_output[11]

['TSLA',
 "Mortgage rates are high, but experts say it's time to buy. Experts say prices will go back to pre-pandemic 2020 levels",
 'NEGATIVE',
 0.9821999073028564,
 'https://news.yahoo.com/is-it-the-right-time-to-buy-a-home-experts-weigh-in-162745305.html']

In [67]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [68]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Democrats flip crucial Senate seat, but control remains in balance. Consumer price index expected to fall to 7.9% in October',
  'NEGATIVE',
  0.9853833913803101,
  'https://news.yahoo.com/stock-market-news-live-updates-november-9-2022-125636212.html'],
 ['GME',
  "Mortgage rates are high, but experts say it's time to buy. Experts say prices will go back to pre-pandemic 2020 levels",
  'NEGATIVE',
  0.9821999073028564,
  'https://news.yahoo.com/is-it-the-right-time-to-buy-a-home-experts-weigh-in-162745305.html'],
 ['GME',
  'Shares fall 4.3% on Wednesday, pushing market value below $1 trillion. Co-founder Jeff Bezos’ fortune has dwindled to $109 billion',
  'NEGATIVE',
  0.9995256662368774,
  'https://finance.yahoo.com/news/amazon-becomes-world-first-public-191725519.html'],
 ['GME',
  'BofA says clients sold a net $6 billion in U.S. stocks last week. Two-year U.S. bonds yield more than five-year bonds',
  'NEGATIVE',
  

In [69]:
import csv
with open('assetsummaries.csv', mode= 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)