## 1. Import Dependencies

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup as BS
import requests as req

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Jupyter settings
pd.set_option('display.max_colwidth', None)

## 2. Scrape + Load Data

In [4]:
from bs4 import BeautifulSoup
import requests

url = "https://www.reuters.com/markets/"
response = requests.get(url)

soup = BeautifulSoup(response.text, "html.parser")

data = []

for x in soup.find_all('h3'):
    if len(x.text) > 15:
        headline = x.text.strip()
        data.append(headline)
        
df = pd.DataFrame(data, columns=["Headline"])

# Remove common endings 
common_endings = [', article with image', ', article with gallery', ' - sources', ' -sources', '- report']
for i in common_endings:
    df['Headline'] = df['Headline'].str.replace(i, '')

display(df)

Unnamed: 0,Headline
0,Exclusive: First Republic considers downsizing if capital raise fails
1,"SVB Financial Group accuses FDIC of cutting it off from cash, article with video"
2,"First Republic's future to be discussed as major bank CEOs meet, sources say"
3,Zimbabwe's new 300 MW coal-fired plant starts feeding into grid
4,Exclusive: EU drafts plan to allow e-fuel combustion engine cars
5,"Putin says Chinese proposal could be basis for peace in Ukraine, article with video"
6,Demand for transatlantic flights soars as Americans can't get enough of Europe


In [5]:
from itertools import chain

# Shuffle and extract headlines into new list
headlines_array = np.array(df)
np.random.shuffle(headlines_array)
headlines_list = (headlines_array[:]).tolist()

print(headlines_list)

[["Demand for transatlantic flights soars as Americans can't get enough of Europe"], ["Zimbabwe's new 300 MW coal-fired plant starts feeding into grid"], ['Exclusive: First Republic considers downsizing if capital raise fails'], ['Exclusive: EU drafts plan to allow e-fuel combustion engine cars'], ['SVB Financial Group accuses FDIC of cutting it off from cash, article with video'], ['Putin says Chinese proposal could be basis for peace in Ukraine, article with video'], ["First Republic's future to be discussed as major bank CEOs meet, sources say"]]


## 3. Implementation of FINBERT

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [7]:
inputs = tokenizer(headlines_list, padding = True, truncation = True, return_tensors='pt', is_split_into_words=True)
print(inputs)

{'input_ids': tensor([[  101,  5157,  2005, 26617,  7599,  2061, 11650,  2004,  4841,  2064,
          1005,  1056,  2131,  2438,  1997,  2885,   102,     0,     0,     0],
        [  101, 11399,  1005,  1055,  2047,  3998, 12464,  5317,  1011,  5045,
          3269,  4627,  8521,  2046,  8370,   102,     0,     0,     0,     0],
        [  101,  7262,  1024,  2034,  3072, 10592, 12482,  6026,  2065,  3007,
          5333, 11896,   102,     0,     0,     0,     0,     0,     0,     0],
        [  101,  7262,  1024,  7327, 28967,  2933,  2000,  3499,  1041,  1011,
          4762, 16513,  3194,  3765,   102,     0,     0,     0,     0,     0],
        [  101, 17917,  2497,  3361,  2177, 26960,  2015,  1042, 14808,  1997,
          6276,  2009,  2125,  2013,  5356,  1010,  3720,  2007,  2678,   102],
        [  101, 22072,  2758,  2822,  6378,  2071,  2022,  3978,  2005,  3521,
          1999,  5924,  1010,  3720,  2007,  2678,   102,     0,     0,     0],
        [  101,  2034,  3072,  1

In [8]:
# Inference
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([7, 3])


In [9]:
# Postprocessing with softmax
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0176, 0.9505, 0.0319],
        [0.3517, 0.0119, 0.6364],
        [0.0279, 0.3533, 0.6187],
        [0.1958, 0.0098, 0.7944],
        [0.0189, 0.7687, 0.2124],
        [0.1026, 0.0191, 0.8783],
        [0.0353, 0.0201, 0.9446]], grad_fn=<SoftmaxBackward0>)


In [10]:
# Model classes
model.config.id2label

{0: 'positive', 1: 'negative', 2: 'neutral'}

In [11]:
# Formatting results as pandas dataframe

# Headline sentiment
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Headline':headlines_list,
        'Positive':positive,
        'Negative':negative,
        'Neutral':neutral}

df = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

display(df)

Unnamed: 0,Headline,Positive,Negative,Neutral
0,[Demand for transatlantic flights soars as Americans can't get enough of Europe],0.017596,0.950474,0.031929
1,[Zimbabwe's new 300 MW coal-fired plant starts feeding into grid],0.35173,0.011906,0.636364
2,[Exclusive: First Republic considers downsizing if capital raise fails],0.027914,0.35334,0.618747
3,[Exclusive: EU drafts plan to allow e-fuel combustion engine cars],0.195809,0.009805,0.794386
4,"[SVB Financial Group accuses FDIC of cutting it off from cash, article with video]",0.018868,0.768729,0.212404
5,"[Putin says Chinese proposal could be basis for peace in Ukraine, article with video]",0.102637,0.01911,0.878253
6,"[First Republic's future to be discussed as major bank CEOs meet, sources say]",0.035345,0.02005,0.944605
