In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import datetime
import gc
import logging
import time

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
pd.set_option('display.max_colwidth', None)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())

True
True


In [2]:
partner_headlines = pd.read_csv('./archive/raw_partner_headlines.csv')
benzinga_headlines = pd.read_csv('./archive/raw_analyst_ratings.csv')
headlines = pd.concat([partner_headlines, benzinga_headlines]).drop('Unnamed: 0', axis = 1)
headlines


Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5…… Million of Senior Notes,http://www.gurufocus.com/news/1153187/agilent-technologies-announces-pricing-of-500-million-of-senior-notes,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?,http://www.zacks.com/stock/news/931205/agilent-a-gears-up-for-q2-earnings-whats-in-the-cards?cid=CS-BENZ-FT-analyst_blog|earnings_preview-931205,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds,http://www.gurufocus.com/news/1138923/jp-morgan-asset-management-announces-liquidation-of-six-exchangetraded-funds,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...",http://www.gurufocus.com/news/1138704/pershing-square-capital-management-lp-buys-agilent-technologies-inc-the-howard-hughes-corp-lowes-inc-sells-chipotle-mexican-grill-inc,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral,http://www.gurufocus.com/news/1134012/agilent-awards-trilogy-sciences-with-a-golden-ticket-at-labcentral,GuruFocus,2020-05-12 00:00:00,A
...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/top-narrow-based-indexes-for-august-29,Monica Gerson,2011-08-29 00:00:00,ZX
1407324,Recap: Wednesday's Top Percentage Gainers and Losers,https://www.benzinga.com/news/earnings/11/06/1193660/recap-wednesdays-top-percentage-gainers-and-losers,Benjamin Lee,2011-06-22 00:00:00,ZX
1407325,UPDATE: Oppenheimer Color on China Zenix Auto Initiation,https://www.benzinga.com/analyst-ratings/analyst-color/11/06/1186890/update-oppenheimer-color-on-china-zenix-auto-initiation,BenzingaStaffL,2011-06-21 00:00:00,ZX
1407326,"Oppenheimer Initiates China Zenix At Outperform, $8 PT",https://www.benzinga.com/analyst-ratings/price-target/11/06/1186025/oppenheimer-initiates-china-zenix-at-outperform-8-pt,Joe Young,2011-06-21 00:00:00,ZX


In [33]:
one_yr_headlines = headlines.loc[(headlines['date'] >= "2014-01-01") & (headlines['date'] < "2015-01-01")]
one_yr_headlines

Unnamed: 0,headline,url,publisher,date,stock
710,Affymetrix (AFFX) Touches 52-Week High on Bullish Trends - Analyst Blog,http://www.zacks.com/stock/news/158725/affymetrix-affx-touches-52-week-high-on-bullish-trends,Zacks,2014-12-29 00:00:00,A
711,Electronic Arts Inc. (EA) Rises: Stock Adds 5.4% in Session - Tale of the Tape,http://www.zacks.com/stock/news/156255/electronic-arts-inc-ea-rises-54,Zacks,2014-12-04 00:00:00,A
712,"Agilent Beats on Q4 Earnings, Guides Above Expectations - Analyst Blog",http://www.zacks.com/stock/news/154614/agilent-beats-on-q4-earnings-guides-above-expectations,Zacks,2014-11-19 00:00:00,A
713,AGILENT TECHNOLOGIES,http://news.investors.com/111714-726871-AGILENT-TECHNOLOGIES.htm?ven=benzingacp&src=aurlaam,Investor's Business Daily,2014-11-17 00:00:00,A
714,Will Agilent Technologies (A) Miss Q4 Earnings Estimates? - Analyst Blog,http://www.zacks.com/stock/news/154124/will-agilent-technologies-a-miss-q4-earnings-estimates,Zacks,2014-11-14 00:00:00,A
...,...,...,...,...,...
1407303,5 Consumer Goods Stocks With The Lowest PEG Ratio,https://www.benzinga.com/trading-ideas/14/09/4818208/5-consumer-goods-stocks-with-the-lowest-peg-ratio,Lisa Levin,2014-09-02 00:00:00,ZX
1407304,China Zenix Auto International Limited Reports Q2 EPS of $0.16; Revenue of $154.70M,https://www.benzinga.com/news/earnings/14/08/4795109/china-zenix-auto-international-limited-reports-q2-eps-of-0-16-revenue-of,Paul Quintaro,2014-08-21 00:00:00,ZX
1407305,China Zenix Auto Reports Q1 EPS of $0.04; Revenue of $144.10M,https://www.benzinga.com/news/earnings/14/05/4557998/china-zenix-auto-reports-q1-eps-of-0-04-revenue-of-144-10m,Charles Gross,2014-05-15 00:00:00,ZX
1407306,China Zenix Names Martin Cheung as CFO,https://www.benzinga.com/news/14/03/4369482/china-zenix-names-martin-cheung-as-cfo,Charles Gross,2014-03-06 00:00:00,ZX


In [5]:
sp_100_data = pd.read_html('https://en.wikipedia.org/wiki/S%26P_100')
sp_100_cmpys = sp_100_data[2]
sp_100 = sp_100_cmpys['Symbol'].to_list()
sp_100


['AAPL',
 'ABBV',
 'ABT',
 'ACN',
 'ADBE',
 'AIG',
 'AMD',
 'AMGN',
 'AMT',
 'AMZN',
 'AVGO',
 'AXP',
 'BA',
 'BAC',
 'BK',
 'BKNG',
 'BLK',
 'BMY',
 'BRK.B',
 'C',
 'CAT',
 'CHTR',
 'CL',
 'CMCSA',
 'COF',
 'COP',
 'COST',
 'CRM',
 'CSCO',
 'CVS',
 'CVX',
 'DE',
 'DHR',
 'DIS',
 'DOW',
 'DUK',
 'EMR',
 'F',
 'FDX',
 'GD',
 'GE',
 'GILD',
 'GM',
 'GOOG',
 'GOOGL',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'INTU',
 'JNJ',
 'JPM',
 'KHC',
 'KO',
 'LIN',
 'LLY',
 'LMT',
 'LOW',
 'MA',
 'MCD',
 'MDLZ',
 'MDT',
 'MET',
 'META',
 'MMM',
 'MO',
 'MRK',
 'MS',
 'MSFT',
 'NEE',
 'NFLX',
 'NKE',
 'NVDA',
 'ORCL',
 'PEP',
 'PFE',
 'PG',
 'PM',
 'PYPL',
 'QCOM',
 'RTX',
 'SBUX',
 'SCHW',
 'SO',
 'SPG',
 'T',
 'TGT',
 'TMO',
 'TMUS',
 'TSLA',
 'TXN',
 'UNH',
 'UNP',
 'UPS',
 'USB',
 'V',
 'VZ',
 'WFC',
 'WMT',
 'XOM']

In [35]:
sp_100_one_yr = one_yr_headlines.loc[one_yr_headlines['stock'].isin(sp_100)]

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert", token = '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')
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", token = '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').to("mps")

In [8]:
class NewsHeadlines(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length = 512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt').to("mps")
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze()
        }

In [38]:
dataset = NewsHeadlines(sp_100_one_yr['headline'].to_list(), tokenizer)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False)

# Perform inference in batches
outputs = []
for batch in dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)[0]
        probabilities = torch.softmax(logits, dim=1)
    outputs.append(probabilities)

In [39]:
outputs

[tensor([[0.0404, 0.4184, 0.5412],
         [0.7593, 0.0085, 0.2322],
         [0.0348, 0.0223, 0.9429],
         [0.0327, 0.0407, 0.9265],
         [0.5460, 0.0109, 0.4431],
         [0.2123, 0.0092, 0.7785],
         [0.0756, 0.0185, 0.9059],
         [0.0460, 0.2377, 0.7164]], device='mps:0'),
 tensor([[0.0789, 0.1585, 0.7626],
         [0.9442, 0.0293, 0.0264],
         [0.7603, 0.0097, 0.2300],
         [0.1085, 0.0110, 0.8805],
         [0.1039, 0.0171, 0.8790],
         [0.0613, 0.0125, 0.9262],
         [0.9218, 0.0434, 0.0347],
         [0.4009, 0.4406, 0.1585]], device='mps:0'),
 tensor([[0.0439, 0.0166, 0.9395],
         [0.0170, 0.9018, 0.0813],
         [0.6933, 0.0949, 0.2118],
         [0.0104, 0.9701, 0.0195],
         [0.8323, 0.0943, 0.0734],
         [0.2805, 0.5424, 0.1772],
         [0.9365, 0.0220, 0.0415],
         [0.6362, 0.0262, 0.3376]], device='mps:0'),
 tensor([[0.8991, 0.0149, 0.0860],
         [0.0367, 0.0657, 0.8976],
         [0.9416, 0.0304, 0.0281],
 

In [40]:
predictions = torch.cat(outputs, dim = 0)
predictions

tensor([[0.0404, 0.4184, 0.5412],
        [0.7593, 0.0085, 0.2322],
        [0.0348, 0.0223, 0.9429],
        ...,
        [0.2433, 0.0362, 0.7205],
        [0.0361, 0.0614, 0.9025],
        [0.0099, 0.9603, 0.0298]], device='mps:0')

In [41]:
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()


table = {'Headline':sp_100_one_yr['headline'].to_list(),
         'Ticker': sp_100_one_yr['stock'].to_list(),
         'Date': sp_100_one_yr['date'].to_list(),
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Ticker", "Date", "Positive", "Negative", "Neutral"])
df

Unnamed: 0,Headline,Ticker,Date,Positive,Negative,Neutral
0,SEI Investments Sees Mounting Expenses: Should You Hold? - Analyst Blog,BLK,2014-12-30 00:00:00,0.040438,0.418375,0.541186
1,"Principal Financial Fee-Based Model Thrives, Ups Capital Plan - Analyst Blog",BLK,2014-12-29 00:00:00,0.759318,0.008476,0.232206
2,Affiliated Managers Closes AQR Capital Investment Deal - Analyst Blog,BLK,2014-12-29 00:00:00,0.034803,0.022315,0.942882
3,Central Bankers The Movie - December 23 2014,BLK,2014-12-23 00:00:00,0.032739,0.040712,0.926548
4,Does Ameriprise's Revenue Growth Indicate Better Future? - Analyst Blog,BLK,2014-12-23 00:00:00,0.546031,0.010887,0.443082
...,...,...,...,...,...,...
9730,Benzinga Weekly Preview: All Eyes on Bank Earnings,WFC,2014-01-11 00:00:00,0.019091,0.052738,0.928170
9731,3 Reasons to Invest like a Sniper in Park National,WFC,2014-01-10 00:00:00,0.061492,0.016042,0.922466
9732,Bernie Madoff Proves the Value of Dividend Stocks,WFC,2014-01-08 00:00:00,0.243263,0.036237,0.720500
9733,How To React Instead Of Predicting This Earnings Season,WFC,2014-01-06 00:00:00,0.036130,0.061408,0.902462


In [42]:
df['Tone'] = df[['Positive', 'Negative', 'Neutral']].idxmax(axis = 1).replace({'Negative': -1, "Positive": 1, "Neutral": 0})
df

  df['Tone'] = df[['Positive', 'Negative', 'Neutral']].idxmax(axis = 1).replace({'Negative': -1, "Positive": 1, "Neutral": 0})


Unnamed: 0,Headline,Ticker,Date,Positive,Negative,Neutral,Tone
0,SEI Investments Sees Mounting Expenses: Should You Hold? - Analyst Blog,BLK,2014-12-30 00:00:00,0.040438,0.418375,0.541186,0
1,"Principal Financial Fee-Based Model Thrives, Ups Capital Plan - Analyst Blog",BLK,2014-12-29 00:00:00,0.759318,0.008476,0.232206,1
2,Affiliated Managers Closes AQR Capital Investment Deal - Analyst Blog,BLK,2014-12-29 00:00:00,0.034803,0.022315,0.942882,0
3,Central Bankers The Movie - December 23 2014,BLK,2014-12-23 00:00:00,0.032739,0.040712,0.926548,0
4,Does Ameriprise's Revenue Growth Indicate Better Future? - Analyst Blog,BLK,2014-12-23 00:00:00,0.546031,0.010887,0.443082,1
...,...,...,...,...,...,...,...
9730,Benzinga Weekly Preview: All Eyes on Bank Earnings,WFC,2014-01-11 00:00:00,0.019091,0.052738,0.928170,0
9731,3 Reasons to Invest like a Sniper in Park National,WFC,2014-01-10 00:00:00,0.061492,0.016042,0.922466,0
9732,Bernie Madoff Proves the Value of Dividend Stocks,WFC,2014-01-08 00:00:00,0.243263,0.036237,0.720500,0
9733,How To React Instead Of Predicting This Earnings Season,WFC,2014-01-06 00:00:00,0.036130,0.061408,0.902462,0


In [43]:
df['sentiment_embedding'] = np.tanh((np.log(df['Positive'] / df['Negative'])/ df['Neutral']))
df

Unnamed: 0,Headline,Ticker,Date,Positive,Negative,Neutral,Tone,sentiment_embedding
0,SEI Investments Sees Mounting Expenses: Should You Hold? - Analyst Blog,BLK,2014-12-30 00:00:00,0.040438,0.418375,0.541186,0,-0.999645
1,"Principal Financial Fee-Based Model Thrives, Ups Capital Plan - Analyst Blog",BLK,2014-12-29 00:00:00,0.759318,0.008476,0.232206,1,1.000000
2,Affiliated Managers Closes AQR Capital Investment Deal - Analyst Blog,BLK,2014-12-29 00:00:00,0.034803,0.022315,0.942882,0,0.439302
3,Central Bankers The Movie - December 23 2014,BLK,2014-12-23 00:00:00,0.032739,0.040712,0.926548,0,-0.230986
4,Does Ameriprise's Revenue Growth Indicate Better Future? - Analyst Blog,BLK,2014-12-23 00:00:00,0.546031,0.010887,0.443082,1,1.000000
...,...,...,...,...,...,...,...,...
9730,Benzinga Weekly Preview: All Eyes on Bank Earnings,WFC,2014-01-11 00:00:00,0.019091,0.052738,0.928170,0,-0.798606
9731,3 Reasons to Invest like a Sniper in Park National,WFC,2014-01-10 00:00:00,0.061492,0.016042,0.922466,0,0.897004
9732,Bernie Madoff Proves the Value of Dividend Stocks,WFC,2014-01-08 00:00:00,0.243263,0.036237,0.720500,0,0.989921
9733,How To React Instead Of Predicting This Earnings Season,WFC,2014-01-06 00:00:00,0.036130,0.061408,0.902462,0,-0.528277


In [39]:
means_df = df.groupby(['Ticker', 'Date']).agg({'Tone': 'mean', 'sentiment_embedding': 'mean'}).reset_index()
means_df

Unnamed: 0,Ticker,Date,Tone,sentiment_embedding
0,ABBV,2014-01-15 00:00:00,1.0,1.000000e+00
1,ABBV,2014-01-27 00:00:00,-1.0,-9.977198e-01
2,ABBV,2014-01-31 00:00:00,0.0,6.362652e-07
3,ABBV,2014-02-05 00:00:00,0.0,9.741661e-01
4,ABBV,2014-02-21 00:00:00,1.0,1.000000e+00
...,...,...,...,...
4770,WFC,2014-12-17 00:00:00,0.0,9.992707e-01
4771,WFC,2014-12-22 00:00:00,0.0,-9.235528e-06
4772,WFC,2014-12-24 00:00:00,-0.5,-2.755384e-03
4773,WFC,2014-12-29 00:00:00,0.0,9.999519e-01


In [41]:
len(means_df['Ticker'].unique())

57

In [44]:
df.to_csv('./2014_news_sentiment_data.csv')

In [9]:
sp_100_full_hls = headlines.loc[headlines['stock'].isin(sp_100)]

In [12]:
@timeout(100)
def get_batch_logits(batch, model):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)[0]
        probabilities = torch.softmax(logits, dim=1)
    return probabilities

In [None]:
dataset = NewsHeadlines(sp_100_full_hls['headline'].to_list(), tokenizer)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False)

# Perform inference in batches
outputs = []
i = 1
logging.info("Dataset Loaded")
for batch in dataloader:
    try:
        probabilities = get_batch_logits(batch, model)
        outputs.append(probabilities)
        logging.info(f"Batch {i} completed")
    except TimeoutError:
        logging.into(f"Batch {i} timed out")
    i += 1


2024-04-15 15:24:45,072 - INFO - Dataset Loaded
2024-04-15 15:24:45,466 - INFO - Batch 1 completed
2024-04-15 15:24:45,770 - INFO - Batch 2 completed
2024-04-15 15:24:46,104 - INFO - Batch 3 completed
2024-04-15 15:24:46,420 - INFO - Batch 4 completed
2024-04-15 15:24:46,720 - INFO - Batch 5 completed
2024-04-15 15:24:47,048 - INFO - Batch 6 completed
2024-04-15 15:24:47,351 - INFO - Batch 7 completed
2024-04-15 15:24:47,650 - INFO - Batch 8 completed
2024-04-15 15:24:47,940 - INFO - Batch 9 completed
2024-04-15 15:24:48,234 - INFO - Batch 10 completed
2024-04-15 15:24:48,529 - INFO - Batch 11 completed
2024-04-15 15:24:48,823 - INFO - Batch 12 completed
2024-04-15 15:24:49,117 - INFO - Batch 13 completed
2024-04-15 15:24:49,411 - INFO - Batch 14 completed
2024-04-15 15:24:49,703 - INFO - Batch 15 completed
2024-04-15 15:24:49,999 - INFO - Batch 16 completed
2024-04-15 15:24:50,293 - INFO - Batch 17 completed
2024-04-15 15:24:50,587 - INFO - Batch 18 completed
2024-04-15 15:24:50,880 -

2024-04-15 16:26:58,484 - INFO - Batch 12363 completed
2024-04-15 16:26:58,776 - INFO - Batch 12364 completed
2024-04-15 16:26:59,074 - INFO - Batch 12365 completed
2024-04-15 16:26:59,370 - INFO - Batch 12366 completed
2024-04-15 16:26:59,668 - INFO - Batch 12367 completed
2024-04-15 16:26:59,962 - INFO - Batch 12368 completed
2024-04-15 16:27:00,259 - INFO - Batch 12369 completed
2024-04-15 16:27:00,550 - INFO - Batch 12370 completed
2024-04-15 16:27:00,842 - INFO - Batch 12371 completed
2024-04-15 16:27:01,134 - INFO - Batch 12372 completed
2024-04-15 16:27:01,425 - INFO - Batch 12373 completed
2024-04-15 16:27:01,715 - INFO - Batch 12374 completed
2024-04-15 16:27:02,009 - INFO - Batch 12375 completed
2024-04-15 16:27:02,303 - INFO - Batch 12376 completed
2024-04-15 16:27:02,596 - INFO - Batch 12377 completed
2024-04-15 16:27:02,887 - INFO - Batch 12378 completed
2024-04-15 16:27:03,488 - INFO - Batch 12380 completed
2024-04-15 16:27:03,815 - INFO - Batch 12381 completed
2024-04-15

In [11]:
from functools import wraps
import errno
import os
import signal

class TimeoutError(Exception):
    pass

def timeout(seconds=100, error_message=os.strerror(errno.ETIME)):
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        def wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result

        return wraps(func)(wrapper)

    return decorator