# Pipeline

In [None]:
# installing some of the required libraries
!pip install transformers -q

[K     |████████████████████████████████| 3.8 MB 29.1 MB/s 
[K     |████████████████████████████████| 67 kB 6.0 MB/s 
[K     |████████████████████████████████| 596 kB 51.3 MB/s 
[K     |████████████████████████████████| 895 kB 68.0 MB/s 
[K     |████████████████████████████████| 6.5 MB 60.6 MB/s 
[?25h

In [None]:
# importing all the required libraries
import pandas as pd
import numpy as np
import os, json
from tqdm import tqdm
import nltk
from scipy import stats
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import regex as re
nltk.download('punkt')
nltk.download('stopwords')

# The three sentiment models whose per-item labels are combined by voting in predict().
def _load_sentiment_classifier(model_name):
  """Load the tokenizer and sequence-classification model for `model_name`
  and wrap them in a "sentiment-analysis" pipeline.

  Returns a (tokenizer, model, classifier) tuple.
  """
  tok = AutoTokenizer.from_pretrained(model_name)
  mdl = AutoModelForSequenceClassification.from_pretrained(model_name)
  clf = pipeline("sentiment-analysis", model=mdl, tokenizer=tok)
  return tok, mdl, clf


tokenizer1, model1, classifier1 = _load_sentiment_classifier("ProsusAI/finbert")

tokenizer2, model2, classifier2 = _load_sentiment_classifier("ahmedrachid/FinancialBERT-Sentiment-Analysis")

tokenizer3, model3, classifier3 = _load_sentiment_classifier("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/369 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/453k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/933 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

In [None]:
def _majority_label(labels):
  # Most frequent label in `labels`. Ties go to the smallest label in sort
  # order, which is the tie-break scipy.stats.mode applied, so voting results
  # stay the same. A Counter is used because scipy.stats.mode no longer
  # accepts string input on recent SciPy releases.
  from collections import Counter
  counts = Counter(labels)
  top_count = max(counts.values())
  return min(label for label, count in counts.items() if count == top_count)


def _score_sentences(classifier, lines):
  # Run one pipeline over every sentence and summarise it.
  # Returns (mean confidence, majority label, per-sentence scores, per-sentence labels);
  # the first two are the string "empty" when the pipeline returns nothing.
  results = classifier(lines)
  scores = [r['score'] for r in results]
  labels = [r['label'] for r in results]
  if not scores:
    return "empty", "empty", scores, labels
  return np.mean(scores), _majority_label(labels), scores, labels


def _sentence_spans(big_para, lines):
  # Locate each sentence in the original text, scanning left to right so that
  # repeated sentences map to successive occurrences. Returns (start, end) pairs.
  spans = []
  cursor = 0
  for sentence in lines:
    start = big_para.find(sentence, cursor)
    if start == -1:
      # Not expected when `lines` are substrings of `big_para`; fall back to
      # the position right after the previous sentence.
      start = cursor
    end = start + len(sentence)
    spans.append((start, end))
    cursor = end
  return spans


def predict(big_para, top_k=10):
  """Score the sentiment of one filing item with three voting classifiers.

  The text is split into sentences, each of classifier1/2/3 labels every
  sentence, and each model's overall label is its most frequent sentence label.
  The final label is the majority of the three model labels.

  Args:
    big_para: text of the item.
    top_k: maximum number of positive and of negative sentences to highlight
      (default 10).

  Returns:
    dict with keys
      "confidence": mean sentence confidence of a model that voted for the
        final label (when several models agree, the last of them wins), or
        the string "empty" when the text has no sentences;
      "label": the voted label, or "empty";
      "sentences_highlight": {'positive': [...], 'negative': [...]}, each a
        list of (start, end) character offsets into `big_para`, ordered by
        classifier1 confidence, highest first.

  NOTE(review): sentences are passed to the pipelines unchanged; confirm
  whether very long sentences need truncation to the models' maximum length.
  """
  # splitting para into sentences
  lines = sent_tokenize(big_para)

  sentences_sentiment_dict = {'positive': [], 'negative': []}
  if not lines:
    # Nothing to classify: same result shape as before, without running the models.
    return {"confidence": "empty", "label": "empty", "sentences_highlight": sentences_sentiment_dict}

  # per-model confidence and label
  confidence_1, mode_label_1, confidence_scores_1, labels_1 = _score_sentences(classifier1, lines)
  confidence_2, mode_label_2, _, _ = _score_sentences(classifier2, lines)
  confidence_3, mode_label_3, _, _ = _score_sentences(classifier3, lines)

  # voting classification over the three model labels
  label_final = _majority_label([mode_label_1, mode_label_2, mode_label_3])
  # label -> confidence; later models overwrite earlier ones that share a label
  label_dict = {mode_label_1: confidence_1, mode_label_2: confidence_2, mode_label_3: confidence_3}
  confidence_final = label_dict[label_final]

  # Character span of every sentence, looked up by sentence index rather than
  # sentence text so duplicated sentences do not overwrite each other.
  spans = _sentence_spans(big_para, lines)

  # top_k positive / negative sentences, using classifier1's sentence-level output
  for sentiment in ('positive', 'negative'):
    ranked = sorted(
      (idx for idx, label in enumerate(labels_1) if label == sentiment),
      key=lambda idx: confidence_scores_1[idx],
      reverse=True,
    )
    sentences_sentiment_dict[sentiment] = [spans[idx] for idx in ranked[:top_k]]

  value_dict = {"confidence": confidence_final, "label": label_final, "sentences_highlight": sentences_sentiment_dict}
  return value_dict

In [None]:
# go to directory where all the files and code are stored
cd /content/drive/MyDrive/NLP/Sentiment Analysis

/content/drive/MyDrive/NLP/Sentiment Analysis


In [None]:
# directory where the per-filing sentiment CSVs are written
store_path = "/path/to/store_preds/"
# directory that is listed and read for the filing JSON files
files_path = "/path/to/10K_fillings/"
# keys of the filing JSON whose text is scored
items_list = [
  "item_7",
  "item_7A",
  "item_9",
  "item_9A",
  "item_9B",
]

In [None]:
# os.listdir returns entries in arbitrary order; sort them so files[0] and the
# processing order are the same on every run
files = sorted(os.listdir(files_path))
len(files)

1014

In [None]:
def predict_and_save(file_name):
  """Run predict() on every item in items_list for one filing and save the results.

  Reads the JSON file `file_name` from files_path, scores each item's text and
  writes one CSV (one row per item) to store_path, named
  "<file name without extension>_sentiment_dict.csv".

  Raises:
    KeyError: if the filing JSON lacks one of the keys in items_list.
  """
  # context manager closes the handle even if json.load raises
  with open(os.path.join(files_path, file_name), 'r') as fp:
    filing = json.load(fp)

  value_d = {}
  for item in tqdm(items_list, position=0, leave=True):
    value_d[item] = predict(filing[item])

  result_df = pd.DataFrame.from_dict(value_d, orient='index')
  # splitext drops the extension whatever its length
  base_name = os.path.splitext(file_name)[0]
  result_df.to_csv(os.path.join(store_path, base_name + '_sentiment_dict.csv'))

In [None]:
# initially this was tested for a small number of files and hence only 14 are shown in tqdm
# Best-effort run: a failing file is recorded and the loop moves on to the next one.
files_not_having_item_7_or_9 = []
# file name -> repr of the exception, so failures other than a missing item can be told apart
failure_reasons = {}
for f in tqdm(files, position=0, leave=True):
    try:
        predict_and_save(f)
    except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        files_not_having_item_7_or_9.append(f)
        failure_reasons[f] = repr(err)

100%|██████████| 5/5 [01:29<00:00, 17.90s/it]
100%|██████████| 5/5 [01:30<00:00, 18.06s/it]
100%|██████████| 5/5 [01:09<00:00, 13.92s/it]
100%|██████████| 5/5 [01:12<00:00, 14.43s/it]
100%|██████████| 5/5 [01:05<00:00, 13.12s/it]
100%|██████████| 5/5 [01:03<00:00, 12.74s/it]
100%|██████████| 5/5 [02:43<00:00, 32.68s/it]
100%|██████████| 5/5 [02:37<00:00, 31.56s/it]
100%|██████████| 5/5 [02:25<00:00, 29.09s/it]
100%|██████████| 5/5 [02:06<00:00, 25.37s/it]
100%|██████████| 5/5 [02:29<00:00, 29.83s/it]
100%|██████████| 5/5 [02:10<00:00, 26.07s/it]
100%|██████████| 5/5 [01:55<00:00, 23.05s/it]
100%|██████████| 5/5 [01:47<00:00, 21.54s/it]
100%|██████████| 14/14 [25:56<00:00, 111.16s/it]


#### Checking the text of the JSON files

In [None]:
# opening a test file to check data
# the with-block closes the file handle once the JSON has been parsed
with open(files_path+files[0],'r') as fp:
    df = json.load(fp)

In [None]:
# display the parsed filing loaded with json.load (a dict, despite the name `df`)
df

{'cik': '1023731',
 'company': '8X8 INC /DE/',
 'complete_text_filing_link': 'https://www.sec.gov/Archives/edgar/data/1023731/0001136261-18-000158.txt',
 'filename': '10K_1023731_20180331_0001136261-18-000158.htm',
 'filing_date': '2018-05-30',
 'filing_html_index': 'https://www.sec.gov/Archives/edgar/data/1023731/0001136261-18-000158-index.html',
 'filing_type': '10-K',
 'fiscal_year_end': '0331',
 'htm_filing_link': 'https://www.sec.gov/Archives/edgar/data/1023731/000113626118000158/body10k.htm',
 'item_1': 'ITEM 1. BUSINESS\nOverview\nA provider of enterprise cloud communications solutions, 8x8 helps businesses get their employees, customers and applications talking, and to make people more connected and productive worldwide. From a unified, proprietary platform, we offer unified communications, team collaboration, conferencing, contact center, analytics and other services to our business customers on a Software-as-a-Service (SaaS) model.\nSmall businesses were the first to transiti