# **Install and Import Dependencies**

In [1]:
!pip install transformers




In [2]:
% pip install sentencepiece



In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# **Setup Summarization Model**

In [4]:
# Hugging Face Hub identifier of the Pegasus checkpoint used for summarization.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the tokenizer and the seq2seq model from the same checkpoint; the
# progress output below is the first-time download of their files.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1341.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1436.0, style=ProgressStyle(description…




# **Summarize a Single Article**

In [5]:
# Download one example article and collect every <p> element from its HTML.
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# `paragraphs` is a list of tags; the text is extracted in a later cell.
paragraphs = soup.find_all('p')

In [6]:
paragraphs[0].text

'Reporting live from Parliament House, sign up to get our Budget Breakdown.'

In [7]:
# Flatten the paragraph tags to plain strings.
text = [paragraph.text for paragraph in paragraphs]
# Keep only the first 400 space-separated pieces to bound the model input size.
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [8]:
# Tokenize the article into a PyTorch tensor of token ids.
# NOTE(review): no truncation is requested here, so the 400-word cap above is
# the only guard on input length - confirm it stays within the model's limit.
input_ids = tokenizer.encode(ARTICLE, return_tensors ='pt')
# Beam search with 5 beams, output capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [9]:
summary

'Tensions between U.S. and China may have already had blowback. Tesla has been successful in China in recent years'

# **4. Building a News and Sentiment Pipeline**

In [10]:
# Symbols the pipeline searches, scrapes, summarizes and scores; used as the
# key set of every per-ticker dict built in the cells below.
monitored_tickers = ['GME', 'TSLA', 'BTC', 'ETH']

4.1. Search for Stock News using Google and Yahoo Finance

In [11]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return the href of every link on a Google News search page for a ticker.

    The search query is "yahoo finance <ticker>" with ``tbm=nws``.

    Args:
        ticker: Symbol to search for, e.g. ``'TSLA'``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        list[str]: Raw ``href`` values in page order. No filtering is done
        here, so the list also contains navigation and relative links.
    """
    # Passing the query through ``params`` lets requests URL-encode the ticker
    # rather than splicing it into the URL unescaped.
    r = requests.get(
        "https://www.google.com/search",
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those with
    # link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [None]:
# One search per monitored ticker: maps ticker -> list of unfiltered hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

In [None]:
raw_urls['ETH']

4.2. Strip out unwanted URLs

In [14]:
import re

In [15]:
# Substrings that mark a link as unwanted; any href containing one of these
# is dropped by strip_unwanted_urls.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, unique article URLs from raw hrefs.

    An href is kept only if it contains ``'https://'`` and none of the
    substrings in ``exclude_list``. From each kept href the first
    ``http(s)://...`` run is extracted and cut at the first ``'&'``.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings that disqualify an href.

    Returns:
        list[str]: Unique URLs in first-seen order, so the result is the same
        on every run for the same input.
    """
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            # The 'https://' check above guarantees at least one match.
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would make the order vary between interpreter runs.
    return list(dict.fromkeys(val))

In [None]:
# Filter each ticker's raw hrefs: maps ticker -> list of unique article URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

4.3. Search and Scrape Cleaned URLs

In [18]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return its paragraph text, capped in length.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated pieces kept per page.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        list[str]: One text string per input URL, in the same order, so
        results can be matched back to their URLs by position.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Only <p> elements are treated as article body text.
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        # Cap the length so the text stays manageable for the summarizer.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [None]:
# Scrape every cleaned URL: maps ticker -> list of article texts, positionally
# aligned with cleaned_urls[ticker].
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

In [None]:
articles['ETH'][2]

4.4. Summarise all Articles

In [21]:
def summarize(articles, max_length=55, num_beams=5):
    """Summarize each article with the notebook-level Pegasus model.

    Relies on the module-level ``tokenizer`` and ``model`` created in the
    model setup cell.

    Args:
        articles: Iterable of article text strings.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        list[str]: One decoded summary per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips inputs that tokenize to more ids than the
        # tokenizer's configured maximum, instead of handing the model an
        # over-long sequence.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        output = model.generate(input_ids, max_length=max_length, num_beams=num_beams, early_stopping=True)
        # generate() returns a batch; there is a single sequence per article.
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [22]:
# Summarize every scraped article: maps ticker -> list of summary strings,
# positionally aligned with articles[ticker].
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'BTC': ['Largest token is trading in a tight range between $56,400 and $59,700. Ether is also in a tight range between $64,000 and $64,000',
  'Fidelity’s Wise Origin Bitcoin Trust would be listed. Cboe seeking to list Wise Origin Bitcoin Trust',
  'Scammers claim to have given away nearly $100,000 in crypto. Bitcoin remains in tight trading range between $56,400 and $59,700',
  'Is Bitcoin a good investment or a dangerous speculative bubble? Experts on both sides of the coin give their views',
  'Kevin Simpson, Founder and Chief Investment Officer at Capital Wealth Planning.',
  'Digital currency was invented under the pseudonym Satoshi Nakamoto.',
  'Largest cryptocurrency is trading slightly above $55,000.',
  'Dogecoin fell through 23.6% FIB of $0.5691, 38.2% FIB of $0.4618.',
  'Second-largest crypto is up four-fold in 2021 vs. Bitcoin. Wedbush sees crypto boom playing out for next decade',
  'Largest cryptocurrency continues to trade lower than April’s high. Ethereum has more th

In [23]:
summaries['ETH']

['Dogecoin plunges as Elon Musk calls it ‘a hustle’ on SNL.',
 'Ethereum has gained more than 436% this year. Bitcoin has fallen below $20,000 for the first time since December',
 'Musk’s tweet on Sunday failed to lift the coin. Dogecoin has been rallying since the start of the year',
 'Scammers claim to have given away nearly $100,000 in crypto. Bitcoin remains in tight trading range between $56,400 and $59,700',
 'Ether’s daily trading volume jumped to $2.7 billion in 2021.',
 'CBD Hemp corporation collaborates with Epazz to distribute its tokens. CryObo to help IEOs manage their crypto assets',
 'Kevin Simpson, Founder and Chief Investment Officer at Capital Wealth Planning.',
 'Public testnet of OMGX launched today by Enya and OMG Network. Smart contracts, token staking and cross-chain liquidity to be supported',
 'Analytics suggest more Ether is being held in DeFi smart contracts.',
 'Second-largest crypto is up four-fold in 2021 vs. Bitcoin. Wedbush sees crypto boom playing out f

# **5. Adding Sentiment Analysis**

In [24]:
from transformers import pipeline
# Name the checkpoint explicitly so results do not change if the library's
# default model for this task changes; this is the documented default for
# 'sentiment-analysis'.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…




In [25]:
sentiment(summaries['ETH'])

[{'label': 'NEGATIVE', 'score': 0.9818911552429199},
 {'label': 'NEGATIVE', 'score': 0.9982873797416687},
 {'label': 'NEGATIVE', 'score': 0.99958735704422},
 {'label': 'NEGATIVE', 'score': 0.9969335198402405},
 {'label': 'NEGATIVE', 'score': 0.6294972896575928},
 {'label': 'NEGATIVE', 'score': 0.9920616149902344},
 {'label': 'POSITIVE', 'score': 0.9861441850662231},
 {'label': 'NEGATIVE', 'score': 0.9659869074821472},
 {'label': 'NEGATIVE', 'score': 0.9885581135749817},
 {'label': 'POSITIVE', 'score': 0.6359964609146118}]

In [26]:
# Score every summary: maps ticker -> list of {'label', 'score'} dicts,
# positionally aligned with summaries[ticker].
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'BTC': [{'label': 'NEGATIVE', 'score': 0.9869668483734131},
  {'label': 'NEGATIVE', 'score': 0.9771203398704529},
  {'label': 'NEGATIVE', 'score': 0.9969335198402405},
  {'label': 'NEGATIVE', 'score': 0.9512415528297424},
  {'label': 'POSITIVE', 'score': 0.9861441850662231},
  {'label': 'NEGATIVE', 'score': 0.98663729429245},
  {'label': 'NEGATIVE', 'score': 0.9675812125205994},
  {'label': 'NEGATIVE', 'score': 0.9996721744537354},
  {'label': 'POSITIVE', 'score': 0.6359964609146118},
  {'label': 'NEGATIVE', 'score': 0.9988906979560852}],
 'ETH': [{'label': 'NEGATIVE', 'score': 0.9818911552429199},
  {'label': 'NEGATIVE', 'score': 0.9982873797416687},
  {'label': 'NEGATIVE', 'score': 0.99958735704422},
  {'label': 'NEGATIVE', 'score': 0.9969335198402405},
  {'label': 'NEGATIVE', 'score': 0.6294972896575928},
  {'label': 'NEGATIVE', 'score': 0.9920616149902344},
  {'label': 'POSITIVE', 'score': 0.9861441850662231},
  {'label': 'NEGATIVE', 'score': 0.9659869074821472},
  {'label': 'NEGA

In [27]:
print(summaries['ETH'][3], scores['ETH'][3]['label'], scores['ETH'][3]['score'])

Scammers claim to have given away nearly $100,000 in crypto. Bitcoin remains in tight trading range between $56,400 and $59,700 NEGATIVE 0.9969335198402405


In [32]:
scores['ETH'][0]['score']

0.9818911552429199

# **6. Exporting Results to CSV**

In [None]:
summaries

In [None]:
scores

In [None]:
cleaned_urls

In [36]:
range(len(summaries['ETH']))

range(0, 10)

In [37]:
summaries['ETH'][3]

'Scammers claim to have given away nearly $100,000 in crypto. Bitcoin remains in tight trading range between $56,400 and $59,700'

In [38]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker results into rows suitable for CSV export.

    Tickers are taken from the keys of ``summaries`` (in dict order), so the
    function depends only on its arguments.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, aligned by position with ``summaries[ticker]``.
        urls: dict mapping ticker -> list of URLs, aligned the same way.

    Returns:
        list[list]: Rows of ``[ticker, summary, label, score, url]``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or
            ``urls``.
        IndexError: If a ticker has fewer scores or URLs than summaries.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        needed = len(ticker_summaries)
        # Fail with a clear message rather than partway through building rows.
        if len(ticker_scores) < needed or len(ticker_urls) < needed:
            raise IndexError(
                "{}: {} summaries but {} scores and {} urls".format(
                    ticker, needed, len(ticker_scores), len(ticker_urls)
                )
            )
        # zip stops at the shortest input, i.e. at the number of summaries.
        for summary, score, url in zip(ticker_summaries, ticker_scores, ticker_urls):
            output.append([ticker, summary, score['label'], score['score'], url])
    return output

In [None]:
# Combine summaries, sentiment scores and source URLs into one list of rows.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

In [40]:
header = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
# Guard the insert so re-running this cell does not add a second header row.
if not final_output or final_output[0] != header:
    final_output.insert(0, header)

In [None]:
final_output

In [44]:
import csv
# newline='' leaves line endings to the csv module. The explicit UTF-8
# encoding is needed because summaries contain non-ASCII characters (e.g.
# curly quotes) that a platform-default encoding may fail to write.
with open('ASSETsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # final_output already starts with the header row.
    csv_writer.writerows(final_output)