In [127]:
# Install baseline dependancies.
!pip install transformers
!pip install SentencePiece



In [128]:
# Import dependancies
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import re

In [129]:
# Setting up Summarization model.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [130]:
# Auto summarize a single article.
# url = "https://finance.yahoo.com/news/uber-eats-ai-chatbot-offer-175411015.html"
# url = "https://finance.yahoo.com/news/nvidias-forward-pe-ratio-tumbles-181637108.html"
url = "https://news.yahoo.com/finance/news/malaysia-offers-incentives-country-gardens-034908823.html"
r = requests.get(url)
soup = BeautifulSoup(r.text,'html.parser')
paragraphs = soup.find_all('p')

In [131]:
paragraphs

[<p>By A. Ananthalakshmi and Yantoultra Ngui</p>,
 <p>KUALA LUMPUR/SINGAPORE (Reuters) -Embattled Chinese developer Country Garden said on Monday its $100-billion project in Malaysia was proceeding as planned and it had sufficient assets, despite concerns about its financial strength amid debt woes.</p>,
 <p>The comment by China's largest private developer came after it missed two dollar coupon payments this month totaling $22.5 million, fuelling fears that the country's property debt crisis could hamper a broader economic recovery and spill overseas.</p>,
 <p>"Our company's projects in Malaysia are operating normally and the sales performance is strong," the developer's Singapore and Malaysia unit said in a statement, adding that its overall operation in the region was "safe and stable."</p>,
 <p>"Various debt management measures are considered to actively resolve the pressure of periodic liquidity, to ensure the company's long-term future development," it added, without elaborating.<

In [132]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [133]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids,max_length=55,num_beams=5,early_stopping=True)
summary = tokenizer.decode(output[0],skip_special_tokens=True)

In [134]:
summary

'Country Garden has missed two dollar coupon this month. Malaysia unit is servicing loans, Bank Negara says'

In [135]:
# Taking user inputs
# input_stocks = []
# num_stocks = int(input("Enter number of stocks to monitor : "))
# for i in range(0,num_stocks):
#   stock_name = input("Enter stock {} : ".format(i+1))
#   input_stocks.append(stock_name)

In [136]:
# input_stocks

In [137]:
# Building a news and sentiment pipeline.
monitored_stocks = ['BTC','Tesla','Accenture','Amazon']
# monitored_stocks = ['Amazon', 'Tesla', 'Accenture', 'Bitcoin', 'Apple']
# monitored_stocks = input_stocks

In [138]:
# Search for stock news using google and yahoo finance

def search_for_stock_news_url(stock):
  search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(stock)
  r = requests.get(search_url);
  soup = BeautifulSoup(r.text,'html.parser')
  atags = soup.find_all('a')
  hrefs = [link['href'] for link in atags]
  return hrefs

In [139]:
raw_urls = {stock:search_for_stock_news_url(stock) for stock in monitored_stocks}
raw_urls

{'BTC': ['/?sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQOwgC',
  '/search?q=yahoo+finance+BTC&tbm=nws&sca_esv=562916950&ie=UTF-8&gbv=1&sei=O933ZPq9E9m42roPqZaEwA8',
  '/search?q=yahoo+finance+BTC&sca_esv=562916950&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQ_AUIBSgA',
  '/search?q=yahoo+finance+BTC&sca_esv=562916950&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQ_AUIBygC',
  '/search?q=yahoo+finance+BTC&sca_esv=562916950&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQ_AUICCgD',
  '/search?q=yahoo+finance+BTC&sca_esv=562916950&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+BTC&um=1&ie=UTF-8&sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQ_AUICigF',
  '/search?q=yahoo+finance+BTC&sca_esv=562916950&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwi6x-nb8ZSBAxVZnFYBHSkLAfgQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+BTC&sca_esv=56291695

In [140]:
# Removing unwanted urls

excluded_list = ['maps','policies','accounts','preferences','support','google']
def strip_unwanted_urls(urls,excluded_list):
  val = []
  for url in urls:
    if 'https://' in url and not any(exclude_word in url for exclude_word in excluded_list):
      res = re.findall(r'(https?://\S+)',url)[0].split('&')[0]
      val.append(res)
  return list(set(val))



In [141]:
cleaned_urls = {stock:strip_unwanted_urls(raw_urls[stock],excluded_list) for stock in monitored_stocks}

In [142]:
cleaned_urls

{'BTC': ['https://finance.yahoo.com/news/grayscale-prods-sec-to-take-action-as-bitcoin-rally-fades-204200696.html',
  'https://finance.yahoo.com/news/cleanspark-releases-august-2023-bitcoin-130000381.html',
  'https://finance.yahoo.com/news/new-volatility-for-bitcoin-ends-summer-slumber-for-crypto-investors-160538908.html',
  'https://hk.finance.yahoo.com/news/%25E9%2589%2585%25E4%25BA%25A8%25E8%25B2%25B7%25E5%25B9%25A3%25E9%2580%259F%25E5%25A0%25B1-%25E6%25AF%2594%25E7%2589%25B9%25E5%25B9%25A3-btc-24%25E5%25B0%258F%25E6%2599%2582%25E6%2588%2590%25E4%25BA%25A4%25E9%2587%258F%25E8%25B6%2585%25E9%2581%258E6-85%25E5%2584%2584%25E7%25BE%258E%25E5%2585%2583-021515508.html',
  'https://hk.finance.yahoo.com/news/%25E9%2589%2585%25E4%25BA%25A8%25E8%25B2%25B7%25E5%25B9%25A3%25E9%2580%259F%25E5%25A0%25B1-%25E6%25AF%2594%25E7%2589%25B9%25E5%25B9%25A3-btc-24%25E5%25B0%258F%25E6%2599%2582%25E6%2588%2590%25E4%25BA%25A4%25E9%2587%258F%25E8%25B6%2585%25E9%2581%258E7-42%25E5%2584%2584%25E7%25BE%258E%25

In [143]:
def scrape_info(urls):
  articles=[]
  for url in urls:
    r=requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    paragraphs=soup.find_all('p')
    text=[paragraph.text for paragraph in paragraphs]
    words=' '.join(text).split(' ')[:350]
    article=' '.join(words)
    articles.append(article)
  return articles

In [144]:
articles = {stock:scrape_info(cleaned_urls[stock]) for stock in monitored_stocks}

In [145]:
articles

{'BTC': ['A lawyer working for Grayscale Investments asked to meet with the Securities and Exchange Commission "as soon as practical" in a Tuesday letter that prodded the regulator to approve the conversion of Grayscale’s bitcoin trust into a spot bitcoin exchange-traded fund. The attorney from David Polk & Wardwell LLP asked for the meeting to "discuss the way forward" following a critical decision last month from a three-judge panel of the District of Columbia Court of Appeals that injected a new wave of optimism into crypto markets. The judges concluded that the SEC was "arbitrary and capricious" when it denied Grayscale’s conversion application in 2022 after previously approving ETF products that held bitcoin futures contracts. A spot bitcoin ETF would allow investors to get exposure to the world’s largest cryptocurrency without having to own it, possibly expanding mainstream acceptance of digital assets. In the hours following the court decision, bitcoin rose 8% and briefly touche

In [146]:
# summarize these articles

def summarize(articles):
  summaries = []
  for article in articles:

    input_ids = tokenizer.encode(article, return_tensors='pt')
    output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    summaries.append(summary)

  return summaries


In [147]:
# articles['BTC'][5]
# for article in articles['BTC']:
#     print("A : ",article)
#     input_ids = tokenizer.encode(article, return_tensors='pt')
#     output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
#     summary = tokenizer.decode(output[0], skip_special_tokens=True)
#     print("S :",summary)
#     print("--------------")

In [148]:
summaries = {stock:summarize(articles[stock]) for stock in monitored_stocks}
summaries

{'BTC': ['‘We believe the trust’s nearly one million investors deserve this fair playing field,’ lawyer says. SEC has until mid-October to request re-hearing of decision',
  'Sales of BTC equated to proceeds of approximately $1.2 million. Expansion in Washington state now at full operational hashrate of 9.3 EH/s',
  'Largest one-day drop in this year’s crypto market. Bitcoin briefly dropped below $25,000 on Thursday',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Largest cryptocurrency’s price sank to a two-month low this week. ‘Institutional investors are getting optimistic,’ IntoTheBlock says',
  'It’s a strategic marketing move that’s subtly rewriting the Bitcoin narrative.',
  'We are aware of the issue and are working to resolve it.',
  'Margined contracts now account for 33% of total futures open interest. Volatility-boosting liquidations seen with increased use',
  'We are aware of the issue and are

In [149]:
from transformers import pipeline

In [150]:
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [151]:
scores = {stock:sentiment(summaries[stock]) for stock in monitored_stocks}

In [152]:
scores

{'BTC': [{'label': 'NEGATIVE', 'score': 0.9881656169891357},
  {'label': 'NEGATIVE', 'score': 0.9866843819618225},
  {'label': 'NEGATIVE', 'score': 0.9980260133743286},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9988779425621033},
  {'label': 'POSITIVE', 'score': 0.9950358271598816},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.8145719170570374},
  {'label': 'POSITIVE', 'score': 0.9979088306427002}],
 'Tesla': [{'label': 'NEGATIVE', 'score': 0.9976634979248047},
  {'label': 'POSITIVE', 'score': 0.9923650622367859},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9983747005462646},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9898334741592407},
  {'label': 'NEGATIVE', 'score': 0.5887808799743652},
  {'label':

In [153]:
def create_final_output(summaries,scores,urls):
  output = []
  for stock in monitored_stocks:
    for counter in range(len(summaries[stock])):
      temp_list = [
          stock,
          summaries[stock][counter],
          scores[stock][counter]['label'],
          scores[stock][counter]['score'],
          urls[stock][counter]
      ]
      output.append(temp_list)
  return output

In [154]:
final_op = create_final_output(summaries,scores,cleaned_urls)

In [155]:
final_op

[['BTC',
  '‘We believe the trust’s nearly one million investors deserve this fair playing field,’ lawyer says. SEC has until mid-October to request re-hearing of decision',
  'NEGATIVE',
  0.9881656169891357,
  'https://finance.yahoo.com/news/grayscale-prods-sec-to-take-action-as-bitcoin-rally-fades-204200696.html'],
 ['BTC',
  'Sales of BTC equated to proceeds of approximately $1.2 million. Expansion in Washington state now at full operational hashrate of 9.3 EH/s',
  'NEGATIVE',
  0.9866843819618225,
  'https://finance.yahoo.com/news/cleanspark-releases-august-2023-bitcoin-130000381.html'],
 ['BTC',
  'Largest one-day drop in this year’s crypto market. Bitcoin briefly dropped below $25,000 on Thursday',
  'NEGATIVE',
  0.9980260133743286,
  'https://finance.yahoo.com/news/new-volatility-for-bitcoin-ends-summer-slumber-for-crypto-investors-160538908.html'],
 ['BTC',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://hk.finance

In [156]:
final_op.insert(0,['Stock','Summary','Label','Confidence','URLs'])

In [157]:
final_op

[['Stock', 'Summary', 'Label', 'Confidence', 'URLs'],
 ['BTC',
  '‘We believe the trust’s nearly one million investors deserve this fair playing field,’ lawyer says. SEC has until mid-October to request re-hearing of decision',
  'NEGATIVE',
  0.9881656169891357,
  'https://finance.yahoo.com/news/grayscale-prods-sec-to-take-action-as-bitcoin-rally-fades-204200696.html'],
 ['BTC',
  'Sales of BTC equated to proceeds of approximately $1.2 million. Expansion in Washington state now at full operational hashrate of 9.3 EH/s',
  'NEGATIVE',
  0.9866843819618225,
  'https://finance.yahoo.com/news/cleanspark-releases-august-2023-bitcoin-130000381.html'],
 ['BTC',
  'Largest one-day drop in this year’s crypto market. Bitcoin briefly dropped below $25,000 on Thursday',
  'NEGATIVE',
  0.9980260133743286,
  'https://finance.yahoo.com/news/new-volatility-for-bitcoin-ends-summer-slumber-for-crypto-investors-160538908.html'],
 ['BTC',
  'We are aware of the issue and are working to resolve it.',
  '

In [158]:
import csv

In [159]:
with open('analysis.csv', mode='w', newline='') as f:
  csv_writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  csv_writer.writerows(final_op)

In [161]:
import pandas as pd
op = pd.read_csv('analysis.csv')

In [163]:
op

Unnamed: 0,Stock,Summary,Label,Confidence,URLs
0,BTC,‘We believe the trust’s nearly one million inv...,NEGATIVE,0.988166,https://finance.yahoo.com/news/grayscale-prods...
1,BTC,Sales of BTC equated to proceeds of approximat...,NEGATIVE,0.986684,https://finance.yahoo.com/news/cleanspark-rele...
2,BTC,Largest one-day drop in this year’s crypto mar...,NEGATIVE,0.998026,https://finance.yahoo.com/news/new-volatility-...
3,BTC,We are aware of the issue and are working to r...,POSITIVE,0.997909,https://hk.finance.yahoo.com/news/%25E9%2589%2...
4,BTC,We are aware of the issue and are working to r...,POSITIVE,0.997909,https://hk.finance.yahoo.com/news/%25E9%2589%2...
5,BTC,Largest cryptocurrency’s price sank to a two-m...,NEGATIVE,0.998878,https://finance.yahoo.com/news/large-bitcoin-h...
6,BTC,It’s a strategic marketing move that’s subtly ...,POSITIVE,0.995036,https://finance.yahoo.com/news/bitcoin-etf-app...
7,BTC,We are aware of the issue and are working to r...,POSITIVE,0.997909,https://hk.finance.yahoo.com/news/%25E9%2589%2...
8,BTC,Margined contracts now account for 33% of tota...,POSITIVE,0.814572,https://finance.yahoo.com/news/bitcoins-margin...
9,BTC,We are aware of the issue and are working to r...,POSITIVE,0.997909,https://hk.finance.yahoo.com/news/%25E9%2589%2...
