In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [2]:
# Hub identifier of the Pegasus checkpoint used for every summary in this notebook.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the tokenizer and the conditional-generation model for that same checkpoint;
# both globals are reused by the summarisation code further down.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [5]:
# Single article used to demonstrate the summarisation flow end to end.
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
# Always pass a timeout: without one, requests waits indefinitely on a stalled
# server and the notebook cell never finishes.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
# Every <p> element on the page is treated as article text.
paragraphs = soup.find_all('p')

In [7]:
# Pull the text out of each <p> tag collected above.
text = [tag.text for tag in paragraphs]
# Glue the paragraphs together, split on single spaces, and keep only the
# first 400 pieces so the model input stays short.
words = ' '.join(text).split(' ')
words = words[:400]
# Re-assemble the capped word list into the string handed to the tokenizer.
ARTICLE = ' '.join(words)

In [8]:
# Encode the article as a PyTorch tensor of token ids. The word cap applied to
# ARTICLE does not bound the *token* count, so truncate explicitly to the
# model's position limit; otherwise a long article can fail inside generate().
input_ids = tokenizer.encode(
    ARTICLE,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.max_position_embeddings,
)
# Beam search (5 beams) for a summary of at most 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


### 4. Building a News and Sentiment Pipeline

In [9]:
# Tickers the pipeline below is run for; each one is substituted into the
# news-search query and used as a key in the per-ticker result dicts.
monitored_tickers = ['GME','TSLA','BTC']

#### 4.1. Search for Stock News using Google and Yahoo Finance

In [13]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return the raw hrefs found on a Google News search for a ticker.

    Args:
        ticker: Symbol or search term inserted into the
            "yahoo finance <ticker>" news query.
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        A list with the ``href`` value of every ``<a>`` tag on the result
        page that has one. The list is unfiltered and still contains
        navigation and redirect links.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from urllib.parse import quote_plus

    # URL-encode the term so tickers containing spaces, '&', etc. do not
    # corrupt the query string; plain symbols such as 'GME' are unchanged.
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(quote_plus(str(ticker)))
    # Without a timeout the request can block forever on a stalled connection.
    r = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors with no href attribute, which would otherwise
    # raise KeyError in the lookup below.
    atags = soup.find_all('a', href=True)
    hrefs = [link['href'] for link in atags]
    return hrefs

In [21]:
# testing example
#search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format('GME')

In [22]:
#search_url

In [18]:
# Map each monitored ticker to the unfiltered hrefs from its news search
# (one search request per ticker).
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}

In [23]:
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQPAgE',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=HgJkYeq1Msjn_Qaa_4PYBA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQ_AUIBygA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQ_AUICSgC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQ_AUICigD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQ_AUICygE',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang_1ar&lr=lang_ar&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwjqvOuKhMLzAhXIc98KHZr_AEsQpwUIDw',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws

In [20]:
#testing
#raw_urls['GME']

#### 4.2. Strip out unwanted URLs

In [24]:
import re

In [25]:
# Substrings that disqualify a link: strip_unwanted_urls drops any URL
# containing one of these words.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [26]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, de-duplicated article URLs from raw search hrefs.

    Args:
        urls: Iterable of href strings (e.g. ``/url?q=https://...&sa=U``).
        exclude_list: Substrings; any href containing one of them is dropped.

    Returns:
        A list of unique URLs in first-seen order. Each URL is the first
        ``http(s)://`` run in the href, cut at the first ``&`` so trailing
        tracking parameters are removed.
    """
    val = []
    for url in urls:
        # Only hrefs that embed an absolute https link are candidates.
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # A bare 'https://' with nothing after it has no usable target;
            # skip it instead of failing on an empty match list.
            continue
        val.append(match.group(0).split('&')[0])
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is reproducible between runs (a set would reorder arbitrarily).
    return list(dict.fromkeys(val))

In [29]:
#testing function strip_unwanted_urls
#strip_unwanted_urls(raw_urls['GME'],exclude_list)

#### 4.3. Search and Scrape Cleaned URLs

In [32]:
# Reduce each ticker's raw search hrefs to its filtered, de-duplicated URLs.
cleaned_urls = {
    ticker: strip_unwanted_urls(raw_urls[ticker], exclude_list)
    for ticker in monitored_tickers
}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gainey-mckenna-egleston-announces-class-144000626.html',
  'https://investorplace.com/2021/10/what-cei-stock-investors-are-saying-after-camber-energys-red-hot-week/',
  'https://finance.yahoo.com/news/the-rise-of-social-trading-how-the-internet-is-changing-investing-153643100.html',
  'https://finance.yahoo.com/news/michael-burry-says-received-subpoena-195853985.html',
  'https://finance.yahoo.com/news/ken-griffin-robinhood-strike-back-180254628.html',
  'https://finance.yahoo.com/news/game-stop-stock-is-enduring-an-awful-september-180120497.html',
  'https://finance.yahoo.com/news/significant-role-of-retail-investing-here-to-stay-suzanne-shank-164155648.html',
  'https://finance.yahoo.com/news/gensler-zeroes-in-on-citadel-securities-as-sec-considers-payment-for-order-flow-ban-191715790.html',
  'https://finance.yahoo.com/news/gamestop-gme-hire-500-staff-135001043.html',
  'https://finance.yahoo.com/news/the-game-stop-phenomenon-triggers-potenti

In [37]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return its paragraph text, capped in length.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per article
            (default 350, the cap this notebook has always used).
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A list of strings, one per URL and in the same order, each holding
        the joined text of the page's ``<p>`` elements truncated to
        ``max_words`` words.
    """
    ARTICLES = []
    for url in URLs:
        # A timeout stops one unresponsive site from hanging the whole run.
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Split on single spaces (not arbitrary whitespace) so the re-joined
        # text keeps the original spacing between the retained words.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [39]:
# Scrape every cleaned URL for each ticker. This assignment must stay active:
# the summarisation cell reads `articles`, and with the line commented out
# nothing defines it on a fresh kernel (Restart & Run All fails with NameError).
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}


#### 4.4. Summarise all Articles

In [42]:
def summarize(articles):
    """Generate one summary per article with the notebook's global model.

    Args:
        articles: Iterable of article strings.

    Returns:
        A list of summary strings, one per input article and in the same
        order.
    """
    # The word cap applied while scraping does not bound the token count, so
    # truncate to the model's position limit; otherwise a long article can
    # fail inside generate().
    max_input_tokens = model.config.max_position_embeddings
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(
            article,
            return_tensors='pt',
            truncation=True,
            max_length=max_input_tokens,
        )
        # Beam search (5 beams) for a summary of at most 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [43]:
# Summarise every scraped article, keyed by ticker (summarize runs the model
# once per article).
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Complaint alleges that defendants made false and misleading statements and/or lacked a reasonable basis.',
  'Energy and power solutions provider has been on a wild ride. Social media communities think it could go to the moon',
  'Social trading apps are competing for a slice of the pie.',
  'Michael Burry says he received a subpoena from SEC. Rally in video-game retailer’s shares sparked meme-stock frenzy',
  'Retail investors say Citadel Securities pressured them. Miami says claims in lawsuit are ‘unsubstantiated’',
  'Shares of the self-described tech company are down 12% so far in September. One could be punishing meme trader crowd for lack of clarity',
  "‘Retail factor is here to stay,' says Siebert Williams Shank. Off-exchange trading made up 47.2% of equity volume in January 2021",
  '‘I’m pro competition,’ SEC chair says. Payments for order flow may make markets less efficient',
  'Plans to hire nearly 500 employees at new customer service center in Pembroke Pines, F

In [44]:
#testing with BTC
summaries['BTC']

['Signals this week from U.S. regulators boosted sentiment.',
 'Analysts expect trading volumes to remain elevated',
 'The preferred view remains BTC is in a new bull run to new all-time highs. Bitcoin daily charts with detailed EWP count and technical indicators',
 'Largest cryptocurrency is trading at $44,000 after plunging to as low as $42,500.',
 '$1.6 billion purchase was made on a central exchange. Some see it as a signal of bullishness, others say it’s just a case of ‘buying on the cheap’',
 'Ethereum is meant to be more of a gas to power applications, Grayscale CEO says.',
 'Vitalik Buterin says making it mandatory is contrary to ideals of freedom. El Salvador has been slow to adopt the digital currency',
 'Data show large bitcoin purchases on centralized exchanges. But independent blockchain data analyst says buying mostly came from Coinbase',
 'Largest cryptocurrency is up more than 15% in the past week.',
 'Hedge fund manager has been ringing the alarm on crypto. Shiba inu h

### 5. Adding Sentiment Analysis

In [45]:
from transformers import pipeline
# Build a sentiment-analysis pipeline, reused below for every summary.
# NOTE(review): no model is passed, so the library chooses the checkpoint for
# this task — consider pinning one explicitly for reproducible scores.
sentiment = pipeline('sentiment-analysis')

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [47]:
sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.9887939691543579},
 {'label': 'POSITIVE', 'score': 0.8669217228889465},
 {'label': 'NEGATIVE', 'score': 0.7618804574012756},
 {'label': 'NEGATIVE', 'score': 0.9983111023902893},
 {'label': 'NEGATIVE', 'score': 0.9990121722221375},
 {'label': 'NEGATIVE', 'score': 0.9987099766731262},
 {'label': 'NEGATIVE', 'score': 0.9957336783409119},
 {'label': 'NEGATIVE', 'score': 0.9950243234634399},
 {'label': 'NEGATIVE', 'score': 0.7754116654396057},
 {'label': 'NEGATIVE', 'score': 0.9789251089096069}]

In [51]:
# Run the sentiment pipeline over each ticker's list of summaries; every result
# is a dict with 'label' and 'score' keys, in the same order as the summaries.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}

In [57]:
# testing that the sentiment analysis works with the summaries
#print(summaries['BTC'][0], scores['BTC'][0]['label'],scores['BTC'][0]['score'])

In [54]:
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9994003176689148},
  {'label': 'POSITIVE', 'score': 0.9961391091346741},
  {'label': 'NEGATIVE', 'score': 0.9865431785583496},
  {'label': 'NEGATIVE', 'score': 0.9614684581756592},
  {'label': 'NEGATIVE', 'score': 0.9942743182182312},
  {'label': 'NEGATIVE', 'score': 0.9997784495353699},
  {'label': 'POSITIVE', 'score': 0.9101598262786865},
  {'label': 'NEGATIVE', 'score': 0.9988982677459717},
  {'label': 'POSITIVE', 'score': 0.9902737140655518},
  {'label': 'NEGATIVE', 'score': 0.9995399713516235}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9954897165298462},
  {'label': 'POSITIVE', 'score': 0.801917552947998},
  {'label': 'NEGATIVE', 'score': 0.9975821375846863},
  {'label': 'NEGATIVE', 'score': 0.9670628309249878},
  {'label': 'POSITIVE', 'score': 0.9962459206581116},
  {'label': 'POSITIVE', 'score': 0.9839545488357544},
  {'label': 'NEGATIVE', 'score': 0.9911378026008606},
  {'label': 'POSITIVE', 'score': 0.99591463804245},
  {'label': 'PO

In [58]:
#testing BTC scores
#scores['BTC'][0]['score']

### 6. Exporting Results to CSV

In [60]:
summaries

{'GME': ['Complaint alleges that defendants made false and misleading statements and/or lacked a reasonable basis.',
  'Energy and power solutions provider has been on a wild ride. Social media communities think it could go to the moon',
  'Social trading apps are competing for a slice of the pie.',
  'Michael Burry says he received a subpoena from SEC. Rally in video-game retailer’s shares sparked meme-stock frenzy',
  'Retail investors say Citadel Securities pressured them. Miami says claims in lawsuit are ‘unsubstantiated’',
  'Shares of the self-described tech company are down 12% so far in September. One could be punishing meme trader crowd for lack of clarity',
  "‘Retail factor is here to stay,' says Siebert Williams Shank. Off-exchange trading made up 47.2% of equity volume in January 2021",
  '‘I’m pro competition,’ SEC chair says. Payments for order flow may make markets less efficient',
  'Plans to hire nearly 500 employees at new customer service center in Pembroke Pines, F

In [61]:
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9994003176689148},
  {'label': 'POSITIVE', 'score': 0.9961391091346741},
  {'label': 'NEGATIVE', 'score': 0.9865431785583496},
  {'label': 'NEGATIVE', 'score': 0.9614684581756592},
  {'label': 'NEGATIVE', 'score': 0.9942743182182312},
  {'label': 'NEGATIVE', 'score': 0.9997784495353699},
  {'label': 'POSITIVE', 'score': 0.9101598262786865},
  {'label': 'NEGATIVE', 'score': 0.9988982677459717},
  {'label': 'POSITIVE', 'score': 0.9902737140655518},
  {'label': 'NEGATIVE', 'score': 0.9995399713516235}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9954897165298462},
  {'label': 'POSITIVE', 'score': 0.801917552947998},
  {'label': 'NEGATIVE', 'score': 0.9975821375846863},
  {'label': 'NEGATIVE', 'score': 0.9670628309249878},
  {'label': 'POSITIVE', 'score': 0.9962459206581116},
  {'label': 'POSITIVE', 'score': 0.9839545488357544},
  {'label': 'NEGATIVE', 'score': 0.9911378026008606},
  {'label': 'POSITIVE', 'score': 0.99591463804245},
  {'label': 'PO

In [62]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gainey-mckenna-egleston-announces-class-144000626.html',
  'https://investorplace.com/2021/10/what-cei-stock-investors-are-saying-after-camber-energys-red-hot-week/',
  'https://finance.yahoo.com/news/the-rise-of-social-trading-how-the-internet-is-changing-investing-153643100.html',
  'https://finance.yahoo.com/news/michael-burry-says-received-subpoena-195853985.html',
  'https://finance.yahoo.com/news/ken-griffin-robinhood-strike-back-180254628.html',
  'https://finance.yahoo.com/news/game-stop-stock-is-enduring-an-awful-september-180120497.html',
  'https://finance.yahoo.com/news/significant-role-of-retail-investing-here-to-stay-suzanne-shank-164155648.html',
  'https://finance.yahoo.com/news/gensler-zeroes-in-on-citadel-securities-as-sec-considers-payment-for-order-flow-ban-191715790.html',
  'https://finance.yahoo.com/news/gamestop-gme-hire-500-staff-135001043.html',
  'https://finance.yahoo.com/news/the-game-stop-phenomenon-triggers-potenti

In [66]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker results into one row per article.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with 'label' and
            'score' keys, aligned index-for-index with ``summaries``.
        urls: Dict mapping ticker -> list of source URLs, aligned the same way.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows, grouped by
        ticker in the iteration order of ``summaries``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or
            ``urls``.
        IndexError: If ``scores`` or ``urls`` holds fewer entries for a
            ticker than ``summaries`` does.
    """
    output = []
    # Iterate the tickers actually passed in rather than a notebook-level
    # global, so the function depends only on its own arguments.
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        # Index explicitly (not zip) so a short scores/urls list fails loudly
        # instead of silently dropping rows.
        for counter, summary_text in enumerate(ticker_summaries):
            output.append([
                ticker,
                summary_text,
                ticker_scores[counter]['label'],
                ticker_scores[counter]['score'],
                ticker_urls[counter],
            ])
    return output

In [67]:
# Flatten summaries, sentiment results and source URLs into one row per article.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Complaint alleges that defendants made false and misleading statements and/or lacked a reasonable basis.',
  'NEGATIVE',
  0.9994003176689148,
  'https://finance.yahoo.com/news/gainey-mckenna-egleston-announces-class-144000626.html'],
 ['GME',
  'Energy and power solutions provider has been on a wild ride. Social media communities think it could go to the moon',
  'POSITIVE',
  0.9961391091346741,
  'https://investorplace.com/2021/10/what-cei-stock-investors-are-saying-after-camber-energys-red-hot-week/'],
 ['GME',
  'Social trading apps are competing for a slice of the pie.',
  'NEGATIVE',
  0.9865431785583496,
  'https://finance.yahoo.com/news/the-rise-of-social-trading-how-the-internet-is-changing-investing-153643100.html'],
 ['GME',
  'Michael Burry says he received a subpoena from SEC. Rally in video-game retailer’s shares sparked meme-stock frenzy',
  'NEGATIVE',
  0.9614684581756592,
  'https://finance.yahoo.com/news/michael-burry-says-received-subpoena-195853985.htm

In [68]:
# Prepend the CSV header row. Guarded so that re-running this cell does not
# stack duplicate header rows onto the same list.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)

In [69]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Complaint alleges that defendants made false and misleading statements and/or lacked a reasonable basis.',
  'NEGATIVE',
  0.9994003176689148,
  'https://finance.yahoo.com/news/gainey-mckenna-egleston-announces-class-144000626.html'],
 ['GME',
  'Energy and power solutions provider has been on a wild ride. Social media communities think it could go to the moon',
  'POSITIVE',
  0.9961391091346741,
  'https://investorplace.com/2021/10/what-cei-stock-investors-are-saying-after-camber-energys-red-hot-week/'],
 ['GME',
  'Social trading apps are competing for a slice of the pie.',
  'NEGATIVE',
  0.9865431785583496,
  'https://finance.yahoo.com/news/the-rise-of-social-trading-how-the-internet-is-changing-investing-153643100.html'],
 ['GME',
  'Michael Burry says he received a subpoena from SEC. Rally in video-game retailer’s shares sparked meme-stock frenzy',
  'NEGATIVE',
  0.9614684581756592,
  'https://finance.yahoo.com/n

In [71]:
import csv
# Write all rows (header first) to a CSV file in the working directory.
# newline='' lets the csv module control line endings. The explicit UTF-8
# encoding keeps non-ASCII characters in the summaries (e.g. curly quotes)
# from depending on the platform's default encoding.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)