In [1]:
# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen
from urllib.request import Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from config import my_tickers, my_headers

In [2]:
# Base URL of a FinViz quote page; the ticker symbol is appended to it.
finviz_url = 'https://finviz.com/quote.ashx?t='

# Ticker symbols to process, taken from the local config module.
tickers = my_tickers

# Number of recent headlines to preview per ticker.
news_number = 3

# Filled by the download loop: ticker symbol -> parsed news table.
news_tables = dict()

In [4]:
# Download the FinViz quote page of every ticker and keep its news table.
for ticker in tickers:
    url = finviz_url + ticker
    req = Request(url=url, headers=my_headers)
    # Open the response as a context manager so the HTTP connection is
    # closed as soon as the page has been parsed (it was left open before).
    with urlopen(req) as resp:
        # Parse the raw response into an HTML tree.
        html = BeautifulSoup(resp, features="lxml")
    # The headlines live in the element with id="news-table".
    # NOTE: find() returns None when the page has no such element.
    news_table = html.find(id='news-table')
    # Each ticker gets its own news table.
    news_tables[ticker] = news_table

# Show the table fetched for the last ticker as a quick sanity check.
news_table

<table border="0" cellpadding="1" cellspacing="0" class="fullview-news-outer" id="news-table" width="100%">
<tr><td align="right" style="white-space:nowrap" width="130">Jul-03-22 07:29PM  </td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://www.wsj.com/articles/bezos-criticizes-bidens-call-for-gas-stations-to-cut-prices-11656873702?siteid=yhoof2" target="_blank">Bezos Criticizes Bidens Call for Gas Stations to Cut Prices</a></div><div class="news-link-right"><span style="color:#aa6dc0;font-size:9px"> The Wall Street Journal</span></div></div></td></tr>
<tr><td align="right" width="130">07:03PM  </td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://finance.yahoo.com/news/oil-stabilizes-traders-weigh-recession-230348893.html" target="_blank">Oil Stabilizes as Traders Weigh Recession Concerns, Tight Supply</a></div><div class="news-link-right"><span styl

In [5]:
# Preview the scraped data: for every ticker, print the most recent headlines
# found in its FinViz news table (fetched earlier with urllib and parsed with
# BeautifulSoup). At most `news_number` headlines are printed per ticker.
for ticker in tickers:
    # A ticker without a stored table is reported and skipped on its own,
    # instead of a swallowed KeyError silently ending the loop for every
    # ticker that follows it.
    table = news_tables.get(ticker)
    if table is None:
        print('No news table available for {}'.format(ticker))
        continue

    print ('\n')
    print ('Recent News Headlines for {}: '.format(ticker))

    shown = 0
    for table_row in table.find_all('tr'):
        # Checked before printing so that news_number <= 0 prints nothing.
        if shown >= news_number:
            break
        # Rows lacking a link or a timestamp cell carry no headline; skip
        # them rather than failing with AttributeError on None.
        if table_row.a is None or table_row.td is None:
            continue
        a_text = table_row.a.text
        td_text = table_row.td.text.strip()
        print(a_text,'(',td_text,')')
        shown += 1



Recent News Headlines for AAPL: 
Theyll chicken out: Fund legend Rick Rule says the Fed wont keep hiking rates aggressively, even to prevent amazing damage. Here are 3 spots he likes for your money ( Jul-03-22 04:35PM )
Worried About Personal Data Leaks? Heres How to Lock Down Your Phone ( 10:00AM )
Learn This Investing Lesson, And You'll Have the Key to Success ( 05:45AM )


Recent News Headlines for TSLA: 
Dow Jones Futures Fall: Don't Feed The Bear; BYD Leaves Tesla In The Dust ( Jul-03-22 07:49PM )
California Governor Newsom Runs Ads in Florida Attacking GOP ( 12:23PM )
Teslas second-quarter sales drop amid supply-chain and other pandemic-linked problems ( 11:40AM )


Recent News Headlines for AMZN: 
Bezos Criticizes Bidens Call for Gas Stations to Cut Prices ( Jul-03-22 07:29PM )
Oil Stabilizes as Traders Weigh Recession Concerns, Tight Supply ( 07:03PM )
WeShop launches as first community-owned shopping platform with IPO in sight ( 07:01PM )


In [7]:
# Sentiment analysis needs the data in tabular form, so walk the collected
# news tables and build one [ticker, date, time, headline] record per row.
parsed_news = []
for file_name, news_table in news_tables.items():
    # Nothing to parse when no table was stored for this ticker.
    if news_table is None:
        continue

    ticker = file_name.split('_')[0]
    # Reset per table: a row showing only a time reuses the date of the
    # closest preceding row of the SAME table. Without the reset the first
    # rows could inherit the previous ticker's date (or hit a NameError).
    date = None

    for x in news_table.find_all('tr'):
        # Skip rows that have no headline link or no timestamp cell.
        if x.a is None or x.td is None:
            continue
        date_scrape = x.td.text.split()
        # Skip rows whose timestamp cell is empty.
        if not date_scrape:
            continue

        text = x.a.get_text()
        if len(date_scrape) == 1:
            # Only a time is present; keep the date carried over from above.
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        parsed_news.append([ticker, date, time, text])

parsed_news

[['AAPL',
  'Jul-03-22',
  '04:35PM',
  'Theyll chicken out: Fund legend Rick Rule says the Fed wont keep hiking rates aggressively, even to prevent amazing damage. Here are 3 spots he likes for your money'],
 ['AAPL',
  'Jul-03-22',
  '10:00AM',
  'Worried About Personal Data Leaks? Heres How to Lock Down Your Phone'],
 ['AAPL',
  'Jul-03-22',
  '05:45AM',
  "Learn This Investing Lesson, And You'll Have the Key to Success"],
 ['AAPL',
  'Jul-02-22',
  '02:00PM',
  'Here Are 2 of the Best Stocks to Buy if the U.S. Avoids a Recession'],
 ['AAPL',
  'Jul-02-22',
  '09:00AM',
  'Will the Apple and MLS Deal Attract a New Generation?'],
 ['AAPL',
  'Jul-02-22',
  '07:00AM',
  'Its so horrible that I want to buy it  Jim Cramer likes these 2 crushed tech stocks that are still posting white-hot revenue growth'],
 ['AAPL',
  'Jul-02-22',
  '07:00AM',
  'Is Apple Dictating the Future of Sports Streaming?'],
 ['AAPL',
  'Jul-02-22',
  '06:32AM',
  'More Americans Are Using Digital Wallets Than Ev

In [8]:
# Score every headline with NLTK's sentiment analyzer. The polarity score
# ranges from -1 (highly negative) to 1 (highly positive).
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)

# One dict of scores per headline, in row order.
scores = [analyzer.polarity_scores(headline) for headline in news['Headline']]
df_scores = pd.DataFrame(scores)

# Index-aligned join; the suffix is only applied to score columns whose name
# clashes with an existing column.
news = news.join(df_scores, rsuffix='_right')

df_scores

Unnamed: 0,neg,neu,pos,compound
0,0.152,0.635,0.213,0.2960
1,0.167,0.833,0.000,-0.2960
2,0.000,0.730,0.270,0.5719
3,0.241,0.535,0.225,0.1779
4,0.000,0.762,0.238,0.3612
...,...,...,...,...
295,0.196,0.804,0.000,-0.2960
296,0.000,1.000,0.000,0.0000
297,0.126,0.686,0.189,0.2732
298,0.000,1.000,0.000,0.0000


In [9]:
# Show the headlines together with their sentiment score columns.
news

Unnamed: 0,Ticker,Date,Time,Headline,neg,neu,pos,compound
0,AAPL,Jul-03-22,04:35PM,Theyll chicken out: Fund legend Rick Rule says...,0.152,0.635,0.213,0.2960
1,AAPL,Jul-03-22,10:00AM,Worried About Personal Data Leaks? Heres How t...,0.167,0.833,0.000,-0.2960
2,AAPL,Jul-03-22,05:45AM,"Learn This Investing Lesson, And You'll Have t...",0.000,0.730,0.270,0.5719
3,AAPL,Jul-02-22,02:00PM,Here Are 2 of the Best Stocks to Buy if the U....,0.241,0.535,0.225,0.1779
4,AAPL,Jul-02-22,09:00AM,Will the Apple and MLS Deal Attract a New Gene...,0.000,0.762,0.238,0.3612
...,...,...,...,...,...,...,...,...
295,AMZN,Jun-29-22,11:26AM,CEO resignations: 668 executives have left the...,0.196,0.804,0.000,-0.2960
296,AMZN,Jun-29-22,10:53AM,2 Reasons MercadoLibre Can Bounce Back Big Time,0.000,1.000,0.000,0.0000
297,AMZN,Jun-29-22,10:29AM,"McCormick cuts outlook, General Mills tops ear...",0.126,0.686,0.189,0.2732
298,AMZN,Jun-29-22,09:54AM,Ecuador President Survives Impeachment Attempt...,0.000,1.000,0.000,0.0000


In [10]:
# The data is now ready to be manipulated and viewed.
# Convert the Date strings to date objects, then build one DataFrame per
# ticker holding its headlines and their scores. Later cells use these to
# compute the mean sentiment of each ticker over all parsed news.
news['Date'] = pd.to_datetime(news['Date']).dt.date

unique_ticker = news['Ticker'].unique().tolist()

news_dict = {}
for name in unique_ticker:
    # Rows of `news` that belong to this ticker.
    news_dict[name] = news.loc[news['Ticker'] == name]

news

Unnamed: 0,Ticker,Date,Time,Headline,neg,neu,pos,compound
0,AAPL,2022-07-03,04:35PM,Theyll chicken out: Fund legend Rick Rule says...,0.152,0.635,0.213,0.2960
1,AAPL,2022-07-03,10:00AM,Worried About Personal Data Leaks? Heres How t...,0.167,0.833,0.000,-0.2960
2,AAPL,2022-07-03,05:45AM,"Learn This Investing Lesson, And You'll Have t...",0.000,0.730,0.270,0.5719
3,AAPL,2022-07-02,02:00PM,Here Are 2 of the Best Stocks to Buy if the U....,0.241,0.535,0.225,0.1779
4,AAPL,2022-07-02,09:00AM,Will the Apple and MLS Deal Attract a New Gene...,0.000,0.762,0.238,0.3612
...,...,...,...,...,...,...,...,...
295,AMZN,2022-06-29,11:26AM,CEO resignations: 668 executives have left the...,0.196,0.804,0.000,-0.2960
296,AMZN,2022-06-29,10:53AM,2 Reasons MercadoLibre Can Bounce Back Big Time,0.000,1.000,0.000,0.0000
297,AMZN,2022-06-29,10:29AM,"McCormick cuts outlook, General Mills tops ear...",0.126,0.686,0.189,0.2732
298,AMZN,2022-06-29,09:54AM,Ecuador President Survives Impeachment Attempt...,0.000,1.000,0.000,0.0000


In [11]:
# Distinct ticker symbols found in the news frame.
unique_ticker

['AAPL', 'TSLA', 'AMZN']

In [12]:
# Per-ticker slices of the news frame, keyed by ticker symbol.
news_dict

{'AAPL':    Ticker        Date     Time  \
 0    AAPL  2022-07-03  04:35PM   
 1    AAPL  2022-07-03  10:00AM   
 2    AAPL  2022-07-03  05:45AM   
 3    AAPL  2022-07-02  02:00PM   
 4    AAPL  2022-07-02  09:00AM   
 ..    ...         ...      ...   
 95   AAPL  2022-06-29  10:10AM   
 96   AAPL  2022-06-29  09:39AM   
 97   AAPL  2022-06-29  09:00AM   
 98   AAPL  2022-06-29  08:14AM   
 99   AAPL  2022-06-29  08:06AM   
 
                                              Headline    neg    neu    pos  \
 0   Theyll chicken out: Fund legend Rick Rule says...  0.152  0.635  0.213   
 1   Worried About Personal Data Leaks? Heres How t...  0.167  0.833  0.000   
 2   Learn This Investing Lesson, And You'll Have t...  0.000  0.730  0.270   
 3   Here Are 2 of the Best Stocks to Buy if the U....  0.241  0.535  0.225   
 4   Will the Apple and MLS Deal Attract a New Gene...  0.000  0.762  0.238   
 ..                                                ...    ...    ...    ...   
 95  FCC commissi

In [15]:
# Collect the mean compound sentiment of every configured ticker.
# `values` is built in the same order as `tickers`, one entry per ticker.
values = []
for ticker in tickers:
    # A configured ticker may have produced no parsed headlines, in which
    # case it has no entry in `news_dict`. Record NaN instead of raising
    # KeyError so `values` stays aligned with `tickers`.
    dataframe = news_dict.get(ticker)
    if dataframe is None:
        print('No parsed news for {}'.format(ticker))
        values.append(float('nan'))
        continue

    # Index by ticker and drop the headline text for a compact preview.
    dataframe = dataframe.set_index('Ticker').drop(columns=['Headline'])
    print ('\n')
    print (dataframe.head())

    # Mean compound score, rounded to two decimals.
    mean = round(dataframe['compound'].mean(), 2)
    values.append(mean)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
AAPL    2022-07-03  04:35PM  0.152  0.635  0.213    0.2960
AAPL    2022-07-03  10:00AM  0.167  0.833  0.000   -0.2960
AAPL    2022-07-03  05:45AM  0.000  0.730  0.270    0.5719
AAPL    2022-07-02  02:00PM  0.241  0.535  0.225    0.1779
AAPL    2022-07-02  09:00AM  0.000  0.762  0.238    0.3612


              Date     Time    neg    neu    pos  compound
Ticker                                                    
TSLA    2022-07-03  07:49PM  0.000  1.000  0.000    0.0000
TSLA    2022-07-03  12:23PM  0.273  0.727  0.000   -0.4588
TSLA    2022-07-03  11:40AM  0.375  0.625  0.000   -0.5859
TSLA    2022-07-03  10:56AM  0.000  0.741  0.259    0.6369
TSLA    2022-07-03  09:58AM  0.000  1.000  0.000    0.0000


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AMZN    2022-07-03  07:29PM  0.360  0.640  0.000  

In [16]:
# Rank the tickers by their mean sentiment, most positive first.
df = (
    pd.DataFrame(list(zip(tickers, values)), columns=['Ticker', 'Mean Sentiment'])
    .set_index('Ticker')
    .sort_values('Mean Sentiment', ascending=False)
)
print ('\n')
print (df)



        Mean Sentiment
Ticker                
AMZN              0.10
AAPL             -0.03
TSLA             -0.06
