## Web Scraping of Real-Time News Headlines from Finviz

In [8]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt


def get_parsed_news(tickers, n):
    """Scrape the Finviz quote page of each ticker and collect its dated headlines.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols; each one is appended to the Finviz quote URL.
    n : int
        Accepted for backward compatibility but not used: every qualifying
        row of the news table is returned, not only the first ``n``.

    Returns
    -------
    list of [ticker, date, time, title]
        One entry per news row whose first cell contains at least two
        whitespace-separated tokens (taken as date and time). Rows whose
        cell holds a single token (time only) are skipped, as are rows
        without a link or without a cell, and tickers whose page has no
        element with id 'news-table'.

    Notes
    -----
    Errors raised by ``urlopen`` (network failure, HTTP error status)
    propagate to the caller.
    """
    # Define the URL for Finviz
    finviz_url = 'https://finviz.com/quote.ashx?t='

    news_tables = {}

    # Loop through each ticker to fetch news data
    for ticker in tickers:
        req = Request(url=finviz_url + ticker, headers={'user-agent': 'sentiment-app'})

        # Parse inside the `with` so the HTTP response is always closed,
        # even if parsing raises.
        with urlopen(req) as response:
            html = BeautifulSoup(response, features='html.parser')

        # find() yields None when the page has no news table
        news_tables[ticker] = html.find(id='news-table')

    parsed_news = []

    # Loop through news tables and extract relevant information
    for ticker, news_table in news_tables.items():
        if news_table is None:
            # Nothing to parse for this ticker; keep going with the others.
            continue

        for row in news_table.find_all('tr'):
            anchor_tag = row.a
            first_cell = row.td

            # Spacer/ad rows may lack a link or a cell; checking both before
            # touching `.text` avoids an AttributeError on None.
            if anchor_tag is None or first_cell is None:
                continue

            time_data = first_cell.text.strip().split()
            if len(time_data) < 2:
                # Only rows carrying both a date and a time are kept.
                continue

            parsed_news.append([ticker, time_data[0], time_data[1], anchor_tag.get_text()])

    return parsed_news

# Plotting mean sentiment scores (kept disabled: `df` is only created in a later cell)
# plt.figure(figsize=(10, 8))
# mean_df = df.groupby(['ticker', 'date'])['compound'].mean().unstack()
# mean_df.plot(kind='bar')
# plt.title('Mean Sentiment Score by Ticker and Date')
# plt.xlabel('Date')
# plt.ylabel('Mean Sentiment Score')
# plt.legend(title='Ticker')
# plt.show()

In [9]:
# Tickers to scrape, plus the article-count argument handed to the scraper
tickers = [
    'GOOG', 'AMZN', 'AAPL', 'MSFT', 'TSLA', 'NFLX', 'META',
    'GME', 'PANW', 'BUD', 'WMT', 'W', 'NVDA',
]
n = 3

# Scrape the headlines for every ticker, then dump the raw rows
parsed_news = get_parsed_news(tickers=tickers, n=n)
print(parsed_news)



In [10]:
# Build the headline table, attach VADER's compound score, and parse the dates
vader = SentimentIntensityAnalyzer()
df = pd.DataFrame(parsed_news, columns=['ticker', 'date', 'time', 'title'])

# Score every title first, then keep only the 'compound' entry of each result
title_scores = df['title'].apply(vader.polarity_scores)
df['compound'] = title_scores.apply(lambda scores: scores['compound'])

# Turn the scraped date strings into datetime values
df['date'] = df['date'].pipe(pd.to_datetime)
print(df)

    ticker       date     time  \
0     GOOG 2023-08-20  02:55PM   
1     GOOG 2023-08-19  07:04PM   
2     GOOG 2023-08-18  08:50PM   
3     GOOG 2023-08-17  07:51PM   
4     GOOG 2023-08-16  11:26PM   
..     ...        ...      ...   
220   NVDA 2023-08-19  10:05AM   
221   NVDA 2023-08-18  07:10PM   
222   NVDA 2023-08-17  06:01PM   
223   NVDA 2023-08-16  09:02PM   
224   NVDA 2023-08-15  05:19PM   

                                                 title  compound  
0    Americas Tech Giants Rush to Comply With New C...    0.0000  
1    Mohnish Pabrais 10 Biggest Investments in 10 Y...    0.0000  
2    UPDATE 2-California regulator probes crashes i...    0.0000  
3    Netflix's Top Stories: Cloud Gaming and Strang...    0.2023  
4    Alphabets Verily Plans Cost Cuts Amid Pressure...   -0.5267  
..                                                 ...       ...  
220  What Nvidia, Intel, AMD, and TSMC Stock Invest...    0.0000  
221                       Why Are Stocks Moving Lower? 

In [13]:
# Store the scraped rows as CSV (no index column) under presentation-style headers
df11 = pd.DataFrame(
    data=parsed_news,
    columns=['Stock', 'Date', 'Time', 'Headlines'],
)
df11.to_csv(path_or_buf='../data/stock_headlines.csv', index=False)


In [16]:
# Read the saved headlines back; the bare name on the last line renders the frame
df_ = pd.read_csv(filepath_or_buffer="../data/stock_headlines.csv")
df_

Unnamed: 0,Stock,Date,Time,Headlines
0,GOOG,Aug-20-23,02:55PM,Americas Tech Giants Rush to Comply With New C...
1,GOOG,Aug-19-23,07:04PM,Mohnish Pabrais 10 Biggest Investments in 10 Y...
2,GOOG,Aug-18-23,08:50PM,UPDATE 2-California regulator probes crashes i...
3,GOOG,Aug-17-23,07:51PM,Netflix's Top Stories: Cloud Gaming and Strang...
4,GOOG,Aug-16-23,11:26PM,Alphabets Verily Plans Cost Cuts Amid Pressure...
...,...,...,...,...
220,NVDA,Aug-19-23,10:05AM,"What Nvidia, Intel, AMD, and TSMC Stock Invest..."
221,NVDA,Aug-18-23,07:10PM,Why Are Stocks Moving Lower?
222,NVDA,Aug-17-23,06:01PM,"Are AI job wages of up to $900,000 justified?"
223,NVDA,Aug-16-23,09:02PM,NVIDIA's Top Stories: How It's Beating Competi...


## Performing Sentiment Analysis 


In [17]:
# Score every headline with VADER and attach the score columns to the news table
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)

# One polarity dict per headline, in row order
scores = [analyzer.polarity_scores(headline) for headline in news['Headline']]
df_scores = pd.DataFrame(scores)

# Index-aligned join; rsuffix only applies if a score column name clashes
news = news.join(df_scores, rsuffix='_right')

In [18]:
# View Data
# Reduce the scraped date values to plain calendar dates
news['Date'] = pd.to_datetime(news.Date).dt.date

# One sub-frame per ticker that actually has rows in `news`
unique_ticker = news['Ticker'].unique().tolist()
news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}

# A ticker listed in `tickers` but absent from `news` has no entry in
# `news_dict`; it is skipped with a notice instead of aborting the cell
# with a KeyError.
scored_tickers = []
values = []
for ticker in tickers:
    if ticker not in news_dict:
        print('\n')
        print('No headlines found for {}; skipping.'.format(ticker))
        continue

    dataframe = news_dict[ticker].set_index('Ticker').drop(columns=['Headline'])
    print('\n')
    print(dataframe.head())

    # Mean compound score over all of this ticker's headlines, 2 decimals
    mean = round(dataframe['compound'].mean(), 2)
    scored_tickers.append(ticker)
    values.append(mean)

# Summary table: one row per scored ticker, most positive first
df = pd.DataFrame(list(zip(scored_tickers, values)), columns=['Ticker', 'Mean Sentiment'])
df = df.set_index('Ticker')
df = df.sort_values('Mean Sentiment', ascending=False)
print('\n')
print(df)



              Date     Time    neg    neu    pos  compound
Ticker                                                    
GOOG    2023-08-20  02:55PM  0.000  1.000  0.000    0.0000
GOOG    2023-08-19  07:04PM  0.000  1.000  0.000    0.0000
GOOG    2023-08-18  08:50PM  0.000  1.000  0.000    0.0000
GOOG    2023-08-17  07:51PM  0.000  0.795  0.205    0.2023
GOOG    2023-08-16  11:26PM  0.268  0.732  0.000   -0.5267


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AMZN    2023-08-20  02:35PM  0.000  0.658  0.342    0.3818
AMZN    2023-08-19  11:15AM  0.000  0.639  0.361    0.7845
AMZN    2023-08-18  07:10PM  0.355  0.645  0.000   -0.2960
AMZN    2023-08-17  11:12PM  0.000  1.000  0.000    0.0000
AMZN    2023-08-16  08:42PM  0.000  0.444  0.556    0.8176


              Date     Time    neg    neu    pos  compound
Ticker                                                    
AAPL    2023-08-20  04:23PM  0.000  0.735  0.265  

In [21]:
# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen, Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Parameters
n = 3  # number of article headlines displayed per ticker
tickers = ['AAPL', 'TSLA', 'AMZN']

# Get Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url=url, headers={'user-agent': 'sentiment-app'})
    # Parse inside the `with` so the HTTP response is closed even if
    # parsing raises; the original left it open.
    with urlopen(req) as resp:
        html = BeautifulSoup(resp, features="lxml")
    # find() yields None when the page has no element with this id
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

# Print the first n headlines of each ticker together with the text of the
# row's first cell (the timestamp shown in the news table).
for ticker in tickers:
    ticker_table = news_tables.get(ticker)

    print('\n')
    if ticker_table is None:
        # Either the ticker was never fetched or its page had no news table;
        # report it instead of silently abandoning the remaining tickers.
        print('No news table found for {}.'.format(ticker))
        continue

    print('Recent News Headlines for {}: '.format(ticker))

    shown = 0
    for table_row in ticker_table.find_all('tr'):
        anchor = table_row.a
        cell = table_row.td
        # Rows without a link or a cell carry no headline; the original
        # truthiness test on the row itself never filtered them out.
        if anchor is None or cell is None:
            continue

        print(anchor.text, '(', cell.text.strip(), ')')

        shown += 1
        if shown == n:
            break

# Iterate through the news tables and build [ticker, date, time, headline] rows
parsed_news = []
for file_name, news_table in news_tables.items():
    if news_table is None:
        # No news table was found for this ticker
        continue

    ticker = file_name.split('_')[0]

    # Rows that show only a time reuse the date of the closest dated row
    # above them. Reset per ticker so a date never leaks in from the
    # previous ticker's table.
    date = None

    for x in news_table.find_all('tr'):
        text1 = x.a
        cell = x.td
        # Skip rows that carry no headline link or no timestamp cell
        if text1 is None or cell is None:
            continue

        date_scrape = cell.text.split()
        if not date_scrape:
            continue

        if len(date_scrape) == 1:
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        text = text1.get_text()

        # Append only for rows that yielded a headline; the original appended
        # for every row and so repeated stale values for link-less rows.
        parsed_news.append([ticker, date, time, text])
        
# Sentiment Analysis
# Score every headline with VADER and attach the score columns to the news table
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)

# One polarity dict per headline, in row order
scores = [analyzer.polarity_scores(headline) for headline in news['Headline']]
df_scores = pd.DataFrame(scores)

# Index-aligned join; rsuffix only applies if a score column name clashes
news = news.join(df_scores, rsuffix='_right')


# View Data
# Reduce the scraped date values to plain calendar dates
news['Date'] = pd.to_datetime(news.Date).dt.date

# One sub-frame per ticker that actually has rows in `news`
unique_ticker = news['Ticker'].unique().tolist()
news_dict = {name: news.loc[news['Ticker'] == name] for name in unique_ticker}

# A ticker listed in `tickers` but absent from `news` has no entry in
# `news_dict`; it is skipped with a notice instead of aborting the cell
# with a KeyError.
scored_tickers = []
values = []
for ticker in tickers:
    if ticker not in news_dict:
        print('\n')
        print('No headlines found for {}; skipping.'.format(ticker))
        continue

    dataframe = news_dict[ticker].set_index('Ticker').drop(columns=['Headline'])
    print('\n')
    print(dataframe.head())

    # Mean compound score over all of this ticker's headlines, 2 decimals
    mean = round(dataframe['compound'].mean(), 2)
    scored_tickers.append(ticker)
    values.append(mean)

# Summary table: one row per scored ticker, most positive first
df = pd.DataFrame(list(zip(scored_tickers, values)), columns=['Ticker', 'Mean Sentiment'])
df = df.set_index('Ticker')
df = df.sort_values('Mean Sentiment', ascending=False)
print('\n')
print(df)



Recent News Headlines for AAPL: 
10 Stocks ChatGPT Said Will Make Me Rich in 10 Years ( Aug-20-23 04:23PM )
3 Top Stocks To Buy if There Is a Market Pullback ( 01:00PM )
3 Tech Stocks With More Potential Than Any Cryptocurrency ( 07:55AM )


Recent News Headlines for TSLA: 
Michael Burrys Top 10 Stock Picks For Q3 ( Aug-20-23 09:00AM )
Texas mandates new EV charging stations use the Tesla NACS plug ( 09:00AM )
Cathie Wood Just Sold Tesla and Shopify -- and Bought This Potentially Explosive Growth Stock ( 07:35AM )


Recent News Headlines for AMZN: 
Warren Buffetts 11 Growth Stock Picks ( Aug-20-23 02:35PM )
3 Top Stocks To Buy if There Is a Market Pullback ( 01:00PM )
Did Amazon Just Do Shopify a Huge Favor? ( 11:45AM )


              Date     Time  neg    neu    pos  compound
Ticker                                                  
AAPL    2023-08-20  04:23PM  0.0  0.735  0.265    0.5574
AAPL    2023-08-20  01:00PM  0.0  0.816  0.184    0.2023
AAPL    2023-08-20  07:55AM  0.0  1.00