## This notebook contains demo for our fine-tuned Analyst Tone model. We fine-tuned FinBERT model on 10,000 manually annotated analyst statements. You can use this script and infer sentiment on your customerized dataset.

In [1]:
# download pre-trained and fine-tuned weights, unzip to the working directory
# https://gohkust-my.sharepoint.com/:u:/g/personal/imyiyang_ust_hk/EQJGiEOkhIlBqlW63TbKA3gBCYgDDcHlBCB7VTXIUMmyiA

In [71]:
import sys 
print(sys.version)

3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


In [72]:
import os
import copy
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
#from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertConfig
from bertModel import BertClassification, dense_opt
from datasets import text_dataset, financialPhraseBankDataset
import argparse
from sklearn.metrics import f1_score

In [73]:
labels = {0:'neutral', 1:'positive',2:'negative'}
num_labels= len(labels)
vocab ="finance-uncased"
vocab_path = '/Users/svetlana/Downloads/analyst_tone-2/vocab' 
pretrained_weights_path ='/Users/svetlana/Downloads/analyst_tone-2/pretrained_weights'
fine_tuned_weight_path = '/Users/svetlana/Downloads/analyst_tone-2/fine_tuned.pth'   
max_seq_length=256
device=torch.device("cpu")

In [74]:
model = BertClassification(weight_path = pretrained_weights_path, 
                           num_labels=num_labels, vocab=vocab)

  nn.init.xavier_normal(self.classifier.weight)


In [75]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:
bert.embeddings.word_embeddings.weight 	 torch.Size([30873, 768])
bert.embeddings.position_embeddings.weight 	 torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight 	 torch.Size([2, 768])
bert.embeddings.LayerNorm.weight 	 torch.Size([768])
bert.embeddings.LayerNorm.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight 	 torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias 	 torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight 	 torch.Size([768])
bert.encoder.layer.0.attention.outp

bert.encoder.layer.9.attention.self.key.bias 	 torch.Size([768])
bert.encoder.layer.9.attention.self.value.weight 	 torch.Size([768, 768])
bert.encoder.layer.9.attention.self.value.bias 	 torch.Size([768])
bert.encoder.layer.9.attention.output.dense.weight 	 torch.Size([768, 768])
bert.encoder.layer.9.attention.output.dense.bias 	 torch.Size([768])
bert.encoder.layer.9.attention.output.LayerNorm.weight 	 torch.Size([768])
bert.encoder.layer.9.attention.output.LayerNorm.bias 	 torch.Size([768])
bert.encoder.layer.9.intermediate.dense.weight 	 torch.Size([3072, 768])
bert.encoder.layer.9.intermediate.dense.bias 	 torch.Size([3072])
bert.encoder.layer.9.output.dense.weight 	 torch.Size([768, 3072])
bert.encoder.layer.9.output.dense.bias 	 torch.Size([768])
bert.encoder.layer.9.output.LayerNorm.weight 	 torch.Size([768])
bert.encoder.layer.9.output.LayerNorm.bias 	 torch.Size([768])
bert.encoder.layer.10.attention.self.query.weight 	 torch.Size([768, 768])
bert.encoder.layer.10.attention.s

In [76]:
model.load_state_dict(torch.load(fine_tuned_weight_path, map_location=device))
model.to(device)

BertClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )


# 0 is neutral, 1 is positive, and 2 is negative 

In [77]:
import glob
text_path = '/Users/svetlana/Desktop/Creation/*'
files = glob.glob(text_path)

In [78]:
for file in files:
    with open(file) as f:
        sentences = f.readlines()

In [79]:
import pandas as pd
df = pd.DataFrame(sentences)
df

Unnamed: 0,0
0,"﻿Hitachi and SBI, the largest state-owned comm..."
1,enter into Joint Venture to accelerate Digital...
2,Partnership to establish a state-of-the-art ca...
3,and future ready digital payments platform\n
4,"MUMBAI, October 29, 2018 --- Hitachi, Ltd. (TS..."
5,It is planned that Hitachi Payments will inves...
6,Financial services market in India is making r...
7,"SBI, as the largest state-owned commercial ban..."
8,Hitachi Payments empowers financial institutio...
9,"Through this joint venture, Hitachi enters a n..."


In [80]:
sentences = df[0].values

In [157]:
sentences

array(['\ufeffHitachi and SBI, the largest state-owned commercial bank in India,\n',
       'enter into Joint Venture to accelerate Digital Payments in India\n',
       'Partnership to establish a state-of-the-art card acceptance\n',
       'and future ready digital payments platform\n',
       'MUMBAI, October 29, 2018 --- Hitachi, Ltd. (TSE: 6501, "Hitachi") today announced that Hitachi Payment Services Pvt. Ltd. ("Hitachi Payments"), a wholly-owned subsidiary based in India of Hitachi, and State Bank of India ("SBI") have signed a definitive agreement to enter into a joint venture for the establishment of a state-of-the-art card acceptance and future ready digital payments platform for India.\n',
       'It is planned that Hitachi Payments will invest [26%] to SBI Payment Services Pvt. Ltd. ("SBI Payment"), a wholly-owned subsidiary of SBI, and through this investment, SBI Payment will be a joint venture between both parties. Both parties will proceed to apply for regulatory approva

In [158]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [159]:
import transformers

In [160]:
from transformers import BertTokenizer

In [161]:
tokenizer = BertTokenizer(vocab_file = vocab_path, do_lower_case = True, 
                          do_basic_tokenize = True)

In [162]:
for sent in sentences:
    encoded_data = tokenizer.batch_encode_plus(
    sentences, add_special_tokens = True,
    return_attention_mask = True,
    max_length = max_seq_length,
    pad_to_max_length = True,
    return_tensors = 'pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [163]:
from torch import Tensor

In [164]:
labels = torch.empty(len(sentences), dtype=torch.long)

In [165]:
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']
labels = torch.tensor(labels)

  This is separate from the ipykernel package so we can avoid doing imports until


In [166]:
labels

tensor([5764607523034234880, 1152930294905439388,                  11,
                          0,                   0,                   0,
                          0,                   0,                   0,
                          0,                   0,                   0])

In [167]:
dataset = TensorDataset(input_ids, attention_masks, labels)

In [172]:
batch_size = 2

In [173]:
dataloader = DataLoader(
    dataset, 
    sampler = SequentialSampler(dataset),
    batch_size=batch_size)

In [194]:
def predict(dataloader_):
    model.eval()
    predictions = []
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                 }
        with torch.no_grad():
            outputs = model(**inputs)
        
        outputs = F.softmax(outputs,dim=1)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        
        predictions.append(outputs)
        
    predictions = np.concatenate(predictions, axis = 0)
    
    return predictions

In [195]:
pred = predict(dataloader_ = dataloader)
pred

array([[9.99121726e-01, 6.42632367e-04, 2.35585467e-04],
       [9.99090672e-01, 7.70008308e-04, 1.39302560e-04],
       [9.93162036e-01, 6.80811796e-03, 2.98820760e-05],
       [1.04347067e-02, 9.89528000e-01, 3.73049334e-05],
       [9.98422384e-01, 1.56004529e-03, 1.75645400e-05],
       [9.99921560e-01, 5.48552198e-05, 2.35760435e-05],
       [1.70250829e-07, 9.99999762e-01, 1.29963453e-07],
       [8.82389426e-01, 1.17605723e-01, 4.78871607e-06],
       [5.92404723e-01, 4.07585800e-01, 9.46625187e-06],
       [2.07257335e-06, 9.99997973e-01, 2.34200428e-08],
       [2.07182378e-01, 7.92782605e-01, 3.50150985e-05],
       [1.18270125e-02, 9.88171220e-01, 1.77317634e-06]], dtype=float32)

In [197]:
len(pred)

12

In [215]:
positives = 0
for i in range(len(pred)):
    pred_labels_i = np.argmax(pred[i]).flatten()
    if pred_labels_i[0] == 1:
        positives+=1
print(positives) 
print(positives / len(pred))

5
0.4166666666666667
