## This notebook contains demo for our fine-tuned Analyst Tone model. We fine-tuned FinBERT model on 10,000 manually annotated analyst statements. You can use this script and infer sentiment on your customerized dataset.

In [119]:
# download pre-trained and fine-tuned weights, unzip to the working directory
# https://gohkust-my.sharepoint.com/:u:/g/personal/imyiyang_ust_hk/EQJGiEOkhIlBqlW63TbKA3gBCYgDDcHlBCB7VTXIUMmyiA

In [120]:
import sys 
print(sys.version)

3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


In [121]:
import os
import copy
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
#from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertConfig
from bertModel import BertClassification, dense_opt
from datasets import text_dataset, financialPhraseBankDataset
import argparse
from sklearn.metrics import f1_score

In [122]:
labels = {0:'neutral', 1:'positive',2:'negative'}
num_labels= len(labels)
vocab ="finance-uncased"
vocab_path = '/Users/svetlana/Downloads/analyst_tone-2/vocab' 
pretrained_weights_path ='/Users/svetlana/Downloads/analyst_tone-2/pretrained_weights'
fine_tuned_weight_path = '/Users/svetlana/Downloads/analyst_tone-2/fine_tuned.pth'   
max_seq_length=256
device=torch.device("cpu")

In [123]:
model = BertClassification(weight_path = pretrained_weights_path, 
                           num_labels=num_labels, vocab=vocab)

  nn.init.xavier_normal(self.classifier.weight)


In [124]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:
bert.embeddings.word_embeddings.weight 	 torch.Size([30873, 768])
bert.embeddings.position_embeddings.weight 	 torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight 	 torch.Size([2, 768])
bert.embeddings.LayerNorm.weight 	 torch.Size([768])
bert.embeddings.LayerNorm.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight 	 torch.Size([768])
bert.encoder.layer.0.attention.outp

bert.encoder.layer.9.attention.self.key.bias 	 torch.Size([768])
bert.encoder.layer.9.attention.self.value.weight 	 torch.Size([768, 768])
bert.encoder.layer.9.attention.self.value.bias 	 torch.Size([768])
bert.encoder.layer.9.attention.output.dense.weight 	 torch.Size([768, 768])
bert.encoder.layer.9.attention.output.dense.bias 	 torch.Size([768])
bert.encoder.layer.9.attention.output.LayerNorm.weight 	 torch.Size([768])
bert.encoder.layer.9.attention.output.LayerNorm.bias 	 torch.Size([768])
bert.encoder.layer.9.intermediate.dense.weight 	 torch.Size([3072, 768])
bert.encoder.layer.9.intermediate.dense.bias 	 torch.Size([3072])
bert.encoder.layer.9.output.dense.weight 	 torch.Size([768, 3072])
bert.encoder.layer.9.output.dense.bias 	 torch.Size([768])
bert.encoder.layer.9.output.LayerNorm.weight 	 torch.Size([768])
bert.encoder.layer.9.output.LayerNorm.bias 	 torch.Size([768])
bert.encoder.layer.10.attention.self.query.weight 	 torch.Size([768, 768])
bert.encoder.layer.10.attention.s

In [125]:
model.load_state_dict(torch.load(fine_tuned_weight_path, map_location=device))
model.to(device)

BertClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )


# 0 is neutral, 1 is positive, and 2 is negative 

In [126]:
import glob
import csv
import pandas as pd
text_path = '/Users/svetlana/Desktop/Creation/*'
files = glob.glob(text_path)

In [127]:
files

['/Users/svetlana/Desktop/Creation/MUR 10102018.txt',
 '/Users/svetlana/Desktop/Creation/ADM 29102018.txt',
 '/Users/svetlana/Desktop/Creation/LOW 24092009.txt',
 '/Users/svetlana/Desktop/Creation/GBX 26102018 .txt',
 '/Users/svetlana/Desktop/Creation/CME 12012021.txt',
 '/Users/svetlana/Desktop/Creation/RLGY 15022017.txt',
 '/Users/svetlana/Desktop/Creation/VTR 06112020.txt',
 '/Users/svetlana/Desktop/Creation/WMT 11102018.txt',
 '/Users/svetlana/Desktop/Creation/MDLZ 02072015.txt',
 '/Users/svetlana/Desktop/Creation/PLD 05112018 .txt']

In [128]:
sentences = []
for file in files:
    with open(file) as f:
        #sentence = f.read().strip().split('\n')
        sentence = f.read()
    sentences.append(sentence)

In [129]:
sentences

['EL DORADO, Ark.--(BUSINESS WIRE)--Oct. 10, 2018-- Murphy Oil Corporation (NYSE: MUR) announced today that its wholly owned subsidiary, Murphy Exploration & Production Company - USA, has entered into a definitive agreement to form a new joint venture company with Petrobras America Inc. (“PAI”), a subsidiary of Petrobras (NYSE: PBR). The joint venture company will be comprised of Gulf of Mexico producing assets from Murphy and PAI with Murphy overseeing the operations. The transaction will have an effective date of October 1, 2018 and is expected to close by year-end 2018. \nBoth companies will contribute all their current producing Gulf of Mexico assets to the joint venture, which will be owned 80 percent by Murphy and 20 percent by PAI. The transaction excludes exploration blocks from both companies, with the exception of PAI’s blocks that hold deep exploration rights. Murphy will pay cash consideration of $900 million to PAI, subject to normal closing adjustments. Additionally, PAI 

In [130]:
import pandas as pd
df = pd.DataFrame(sentences)
df

Unnamed: 0,0
0,"EL DORADO, Ark.--(BUSINESS WIRE)--Oct. 10, 201..."
1,﻿Archer Daniels Midland Company (NYSE: ADM) an...
2,"MOORESVILLE, N.C., Aug. 24 /PRNewswire-FirstCa..."
3,"﻿LAKE OSWEGO, Ore., Oct. 26, 2018 /PRNewswire/..."
4,"﻿CME Group (NASDAQ: CME), the world's leading ..."
5,"MADISON, N.J. and CHICAGO, Feb. 15, 2017 /PRNe..."
6,"﻿CHICAGO--(BUSINESS WIRE)--Ventas, Inc. (NYSE:..."
7,"BENTONVILLE, Ark., Oct. 11, 2018 – Today, Walm..."
8,"Shanghai, July 2, 2015 – Mondelēz Internationa..."
9,"Nov 05, 2018 \nSAN FRANCISCO, Nov. 5, 2018 /PR..."


In [131]:
df.at[6,0]

'\ufeffCHICAGO--(BUSINESS WIRE)--Ventas, Inc. (NYSE: VTR) (“Ventas” or the “Company”), an S&P 500 company and one of the world’s leading owners of healthcare, senior housing and research & innovation properties, announced today it has formed a joint venture (the “JV”) with GIC. The JV will initially own four in-progress university-based Research & Innovation (“R&I”) development projects (the “Initial R&I JV Projects”) with total project costs estimated at $930 million. The JV may be expanded to include other pre-identified future R&I development projects.\n“We are excited to announce this attractive R&I development partnership with GIC, one of the world’s most respected real estate investors,” said Debra A. Cafaro, Ventas Chairman & CEO. “With this strategic partnership, we continue to diversify our capital sources, retain a majority interest in our ongoing R&I developments, accelerate additional projects from our pipeline of opportunities and enhance our liquidity and financial flexib

In [132]:
import transformers

In [133]:
from transformers import BertTokenizer

In [134]:
tokenizer = BertTokenizer(vocab_file = vocab_path, do_lower_case = True, 
                          do_basic_tokenize = True)

In [135]:
for sent in sentences:
    encoded_data = tokenizer.batch_encode_plus(
    sentences, add_special_tokens = True,
    return_attention_mask = True,
    max_length = max_seq_length,
    pad_to_max_length = True,
    return_tensors = 'pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [136]:
from torch import Tensor

In [137]:
labels = torch.empty(len(sentences), dtype=torch.long)

In [138]:
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']
labels = torch.tensor(labels)

  This is separate from the ipykernel package so we can avoid doing imports until


In [139]:
labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [140]:
dataset = TensorDataset(input_ids, attention_masks, labels)

In [141]:
batch_size = 32

In [142]:
dataloader = DataLoader(
    dataset, 
    sampler = SequentialSampler(dataset),
    batch_size=batch_size)

In [143]:
def predict(dataloader_):
    model.eval()
    predictions = []
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                 }
        with torch.no_grad():
            outputs = model(**inputs)
        
        outputs = F.softmax(outputs,dim=1)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        
        predictions.append(outputs)
        
    predictions = np.concatenate(predictions, axis = 0)
    
    return predictions

In [144]:
pred = predict(dataloader_ = dataloader)
pred

array([[9.99961257e-01, 3.83020051e-05, 5.08418395e-07],
       [2.75445014e-01, 7.24547803e-01, 7.13355075e-06],
       [1.51895314e-01, 8.48099291e-01, 5.35283652e-06],
       [9.35520649e-01, 6.44244477e-02, 5.49040778e-05],
       [5.07179209e-07, 9.99999523e-01, 1.42591778e-08],
       [9.99375403e-01, 6.07218943e-04, 1.74464185e-05],
       [2.20124647e-01, 7.79869616e-01, 5.77074843e-06],
       [2.54127372e-05, 9.99974489e-01, 1.09009946e-07],
       [9.99278605e-01, 7.19159783e-04, 2.21359687e-06],
       [7.80923432e-03, 9.92190063e-01, 7.44203078e-07]], dtype=float32)

In [145]:
len(pred)

10

In [146]:
positives = 0
negatives = 0
neutrals = 0
sentiment = []
for i in range(len(pred)):
    pred_labels_i = np.argmax(pred[i]).flatten()
    sentiment.append(pred_labels_i[0])
    if pred_labels_i[0] == 1:
        positives+=1
    elif pred_labels_i[0] == 2:
        negatives+=1
    elif pred_labels_i[0] == 0:
        neutrals+=1
print(positives, negatives, neutrals) 
print(positives / len(pred), negatives / len(pred), neutrals / len(pred))

6 0 4
0.6 0.0 0.4


In [147]:
sentiment

[0, 1, 1, 0, 1, 0, 1, 1, 0, 1]

In [148]:
df['sentiment'] = sentiment

In [149]:
df

Unnamed: 0,0,sentiment
0,"EL DORADO, Ark.--(BUSINESS WIRE)--Oct. 10, 201...",0
1,﻿Archer Daniels Midland Company (NYSE: ADM) an...,1
2,"MOORESVILLE, N.C., Aug. 24 /PRNewswire-FirstCa...",1
3,"﻿LAKE OSWEGO, Ore., Oct. 26, 2018 /PRNewswire/...",0
4,"﻿CME Group (NASDAQ: CME), the world's leading ...",1
5,"MADISON, N.J. and CHICAGO, Feb. 15, 2017 /PRNe...",0
6,"﻿CHICAGO--(BUSINESS WIRE)--Ventas, Inc. (NYSE:...",1
7,"BENTONVILLE, Ark., Oct. 11, 2018 – Today, Walm...",1
8,"Shanghai, July 2, 2015 – Mondelēz Internationa...",0
9,"Nov 05, 2018 \nSAN FRANCISCO, Nov. 5, 2018 /PR...",1


In [150]:
from collections import defaultdict
groups = defaultdict(list)
roots = []
tickers = []
dates = []
for file in files:
    basename = os.path.basename(file)
    root, extension = os.path.splitext(basename)
    roots.append(root)
    for root in roots:
        ticker = str(root).split()[0]
        date = str(root).split()[1]
    tickers.append(ticker)
    dates.append(date)

In [151]:
import datetime
import timedelta

In [152]:
df['ticker'] = tickers
df['dates'] = dates

In [161]:
df['date'] = pd.to_datetime(df['dates'], format="%d%m%Y", infer_datetime_format=True)
df = df.drop(columns = ['dates'])

In [162]:
from yahoo_fin import stock_info as si
from yahoo_fin.stock_info import *

In [163]:
price_data = si.get_data(ticker = 'MUR', start_date = '2018-10-10')['close']
price_data

2018-10-10    31.770000
2018-10-11    35.459999
2018-10-12    35.779999
2018-10-15    35.560001
2018-10-16    35.380001
                ...    
2021-02-26    16.330000
2021-03-01    16.780001
2021-03-02    16.510000
2021-03-03    17.459999
2021-03-04    18.830000
Name: close, Length: 603, dtype: float64

In [164]:
price_data_list = []
for ticker, date in zip(df.ticker, df.date): 
    price_data = si.get_data(ticker, start_date = date, 
                             end_date = date + timedelta.Timedelta(days=1))['close']
    price_data_list.append(price_data)

In [165]:
price = []
price_change = []
for i in price_data_list:
    price_only = str(i).split()[1]
    price.append(price_only)
    
print(price)

['31.77', '47.009998', '21.389999', '48.810001', '201.490005', '27.290001', '39.639999', '93.919998', '41.549999', '65.07']


In [166]:
df['price'] = pd.DataFrame(price)

In [167]:
df.rename(columns={'0':'text'})
df

Unnamed: 0,0,sentiment,ticker,date,price
0,"EL DORADO, Ark.--(BUSINESS WIRE)--Oct. 10, 201...",0,MUR,2018-10-10,31.77
1,﻿Archer Daniels Midland Company (NYSE: ADM) an...,1,ADM,2018-10-29,47.009998
2,"MOORESVILLE, N.C., Aug. 24 /PRNewswire-FirstCa...",1,LOW,2009-09-24,21.389999
3,"﻿LAKE OSWEGO, Ore., Oct. 26, 2018 /PRNewswire/...",0,GBX,2018-10-26,48.810001
4,"﻿CME Group (NASDAQ: CME), the world's leading ...",1,CME,2021-01-12,201.490005
5,"MADISON, N.J. and CHICAGO, Feb. 15, 2017 /PRNe...",0,RLGY,2017-02-15,27.290001
6,"﻿CHICAGO--(BUSINESS WIRE)--Ventas, Inc. (NYSE:...",1,VTR,2020-11-06,39.639999
7,"BENTONVILLE, Ark., Oct. 11, 2018 – Today, Walm...",1,WMT,2018-10-11,93.919998
8,"Shanghai, July 2, 2015 – Mondelēz Internationa...",0,MDLZ,2015-07-02,41.549999
9,"Nov 05, 2018 \nSAN FRANCISCO, Nov. 5, 2018 /PR...",1,PLD,2018-11-05,65.07
