# Install dependencies

* Three main dependencies
  * sentence transformers
  * transformers (for hugging face models)
  * sentencepiece (for testing slow tokenizers)

In [1]:
!pip install -U sentence-transformers -q
!pip install -U transformers -q
!pip install -U sentencepiece -q
!pip install -U word2number -q
!pip install -U tqdm -q
!pip install -U spacy -q

[K     |████████████████████████████████| 79 kB 2.9 MB/s 
[K     |████████████████████████████████| 3.8 MB 9.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 49.6 MB/s 
[K     |████████████████████████████████| 67 kB 3.8 MB/s 
[K     |████████████████████████████████| 596 kB 44.5 MB/s 
[K     |████████████████████████████████| 6.5 MB 46.0 MB/s 
[K     |████████████████████████████████| 895 kB 52.3 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


# Load pre-trained models

* Two main models required in the application
  * sentence similarity model
  * roberta based fine tuned question answering model

In [3]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used for the similarity checks below; `util` supplies
# the cosine-similarity helper (`util.cos_sim`) used alongside it.
sim_model = SentenceTransformer('bert-base-nli-mean-tokens')

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# Tokenizer and question-answering model, both loaded from the same
# "deepset/roberta-base-squad2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

# Test sentence similarity

In [None]:
# Embeddings of the phrases as they may appear in a filing ...
enc_1, enc_2, enc_3 = (
    sim_model.encode(phrase) for phrase in ('last year', 'this year', 'next year')
)

# ... and of the three reference phrases they are compared against.
enc1, enc2, enc3 = (
    sim_model.encode(phrase) for phrase in ('previous year', 'present year', 'future year')
)

# One entry per printed line: (label, filing-phrase embedding, reference embedding).
comparisons = [
    ("Similarity between Last Year and Previous Year: ", enc_1, enc1),
    ("Similarity between Last Year and Current Year:  ", enc_1, enc2),
    ("Similarity between Last Year and Next Year:     ", enc_1, enc3),
    ("Similarity between This Year and Previous Year: ", enc_2, enc1),
    ("Similarity between This Year and Current Year:  ", enc_2, enc2),
    ("Similarity between This Year and Next Year:     ", enc_2, enc3),
    ("Similarity between Next Year and Previous Year: ", enc_3, enc1),
    ("Similarity between Next Year and Current Year:  ", enc_3, enc2),
    ("Similarity between Next Year and Next Year:     ", enc_3, enc3),
]

for position, (label, left, right) in enumerate(comparisons):
    # Blank line between each group of three comparisons.
    if position and position % 3 == 0:
        print()
    print(label, util.cos_sim(left, right))

Similarity between Last Year and Previous Year:  tensor([[0.9076]])
Similarity between Last Year and Current Year:   tensor([[0.8051]])
Similarity between Last Year and Next Year:      tensor([[0.7196]])

Similarity between This Year and Previous Year:  tensor([[0.8332]])
Similarity between This Year and Current Year:   tensor([[0.9008]])
Similarity between This Year and Next Year:      tensor([[0.8382]])

Similarity between Next Year and Previous Year:  tensor([[0.7747]])
Similarity between Next Year and Current Year:   tensor([[0.8176]])
Similarity between Next Year and Next Year:      tensor([[0.8933]])


### Flatten the metrics list for easy matching

In [6]:
import json
from pprint import pprint

# Read the nested metrics definition. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open.
with open('./files/metrics.json', 'r') as fp:
    deepList = json.load(fp)

# Flat lookup built from the two-level structure in the JSON file:
#   lower-cased metric name -> lower-cased alternative spellings,
# with the metric name itself appended as the last alternative.
metricList = dict()

for main in deepList:
    for key in deepList[main]:
        name = key.lower()
        metricList[name] = [alt.lower() for alt in deepList[main][key]] + [name]

### Subsequence matching

Define a subsequence-matching function that checks whether the words of a metric from the metrics list appear, in order, in a sentence.

In [7]:
import re

# Token separators: a full stop followed by a space, a newline, any whitespace
# character, or a hyphen. Written as a raw string so that `\.`, `\s` and `\-`
# reach the regex engine untouched instead of being invalid string escapes.
_TOKEN_SEPARATORS = re.compile(r'\. |\n|\s|\-')


def is_subseq(s1, s2):
    """Return True if the tokens of ``s1`` occur, in order, among those of ``s2``.

    Both strings are lower-cased and split on the separators above. The tokens
    of ``s1`` do not have to be adjacent in ``s2``, but their relative order
    must be preserved and each token must match exactly.

    Args:
        s1: The metric name (the sequence to look for).
        s2: The context, e.g. a sentence (the sequence to search in).

    Returns:
        bool: True when every token of ``s1`` was found in order in ``s2``.
    """
    metric_tokens = _TOKEN_SEPARATORS.split(s1.lower())
    context_tokens = iter(_TOKEN_SEPARATORS.split(s2.lower()))

    # `token in iterator` consumes the iterator up to and including the first
    # match, so each metric token is searched for only after the previous one.
    return all(token in context_tokens for token in metric_tokens)

# Entity Recognition Module

Here we define the entity recognition module class, which is used for the named entity recognition task.

In [8]:
class EntityRecognitionModule:
    """Wrapper that groups the entities found by an NER model by their label."""

    def __init__(self, nerModel):
        """Store the NER model and give subclasses a chance to build a pipeline.

        Args:
            nerModel: Callable taking a string and returning an object whose
                ``ents`` attribute iterates over entities exposing ``label_``,
                ``text``, ``start_char`` and ``end_char``.
        """
        self.nerModel = nerModel
        self.create_pipeline()

    def __call__(self, sent):
        """Run entity recognition on a single sentence.

        Args:
            sent: The sentence to analyse.

        Returns:
            dict: Maps each entity label to a list of
            ``(text, start_char, end_char)`` tuples, in the order the model
            reported them. Labels without any entity are absent.
        """
        grouped = {}
        for entity in self.nerModel(sent).ents:
            span = (entity.text, entity.start_char, entity.end_char)
            grouped.setdefault(entity.label_, []).append(span)
        return grouped

    def create_pipeline(self):
        """Hook for models that lack a built-in pipeline; does nothing here."""
        return None


# Question Answering Module

Question answering module inherits utilities from the entity recognition module and is used to answer the questions.

In [9]:
class QuestionAnsweringModule(EntityRecognitionModule):
    """Adds extractive question answering on top of entity recognition."""

    def __init__(self, nerModel, qaModel):
        """Keep the QA model and hand the NER model to the parent class.

        Args:
            nerModel: Forwarded to ``EntityRecognitionModule``.
            qaModel: Callable invoked as ``qaModel(question=..., context=...)``.
        """
        super().__init__(nerModel=nerModel)
        self.qaModel = qaModel

    def __call__(self, qs, ctx):
        """Answer one question against one context.

        Both strings are stripped and lower-cased before the QA model is
        called; the model's result is passed through ``cleanAnswer``.

        Args:
            qs: The question.
            ctx: The context the answer is searched in.

        Returns:
            Whatever ``cleanAnswer`` returns for the QA model's result.
        """
        question = qs.strip().lower()
        context = ctx.strip().lower()

        answer = self.qaModel(question=question, context=context)
        return self.cleanAnswer(answer, context)

    def cleanAnswer(self, res, ctx):
        """Post-processing hook for the answer; currently returns it unchanged."""
        return res


# Text Extraction Module

The text extraction module inherits from the question answering module (and, through it, from the entity recognition module) and is used to extract the metrics from a single sentence.

In [10]:
class TextExtractionModule(QuestionAnsweringModule):
    """Extract metric records (metric, date, value, year) from one sentence.

    Uses the inherited entity recognition to find dates and amounts, the
    module-level ``metricList`` / ``is_subseq`` to find which metrics the
    sentence mentions, and the inherited question answering step to score
    every metric/date/amount combination.
    """

    # Records whose best question-answering score is below this are dropped.
    MIN_SCORE = 0.1

    # Metrics that are counts rather than amounts of money: their value is
    # taken from CARDINAL / QUANTITY entities instead of MONEY entities.
    NUMERIC_METRICS = (
        'total number of customers',
        'new customers',
        'number of new accounts',
        'dau',
        'wau',
        'mau',
        'employee count',
    )

    def __init__(self, nerModel, qaModel):
        super(TextExtractionModule, self).__init__(qaModel=qaModel, nerModel=nerModel)

    def __call__(self, sent, filing_year):
        """Extract the metric records contained in a single sentence.

        Args:
            sent: The sentence to analyse (it is stripped and lower-cased).
            filing_year: Year of the filing; relative dates such as
                "last year" are resolved against it.

        Returns:
            list[dict]: Records with the keys ``sentence``, ``metric``,
            ``date``, ``value``, ``year`` and ``score``, ordered by descending
            score. Records without a usable numeric value are dropped, as are
            records that share two or more of metric/value/date with a
            higher-scoring record.
        """
        sent = sent.strip().lower()
        entities = EntityRecognitionModule.__call__(self, sent)

        # Make sure both lists exist, then treat plain numbers and quantities
        # as candidate values alongside money amounts.
        entities.setdefault('MONEY', [])
        entities.setdefault('DATE', [])
        entities['MONEY'].extend(entities.get('CARDINAL', []))
        entities['MONEY'].extend(entities.get('QUANTITY', []))

        # Create context for the extractive question answering
        ctx = self.create_context(sent)

        def get_year(date):
            """Resolve a date expression to a year.

            A four-character token that parses as an integer is returned
            directly. Otherwise the expression is compared, with the sentence
            similarity model, against "previous year", "present year" and
            "future year"; the best match shifts ``filing_year`` by -1, 0 or
            +1 respectively.
            """
            for token in re.split(r'\. |\n|\s|\-', date.lower()):
                if len(token) != 4:
                    continue
                try:
                    return int(token)
                except ValueError:
                    # Four characters but not a number (e.g. "last", "year").
                    continue

            year_match = [
                ('previous year', -1),
                ('present year', 0),
                ('future year', 1)
            ]

            # The embedding of the date does not depend on the reference
            # phrase, so compute it once rather than once per phrase.
            date_enc = sim_model.encode(date)

            gsim, year = -1, filing_year
            for ysent, shift in year_match:
                sim = util.cos_sim(sim_model.encode(ysent), date_enc)[0][0]

                if gsim < sim:
                    gsim = sim
                    year = filing_year + shift

            return year

        # All (alternative spelling, metric) pairs mentioned in this sentence.
        all_metrics = [
            (alt, unit)
            for unit in metricList
            for alt in metricList[unit]
            if is_subseq(alt, sent)
        ]

        # Start matching with longest string
        all_metrics.sort(key=lambda x: len(x[0]), reverse=True)

        # The year of a date is independent of the metric, so resolve each
        # date once (and not at all when the sentence mentions no metric).
        dated = [(date, get_year(date[0])) for date in entities['DATE']] if all_metrics else []

        # Grid search over every metric / date / amount combination.
        results = []
        for alt, unit in all_metrics:
            for date, year in dated:
                for money in entities['MONEY']:

                    qs1 = 'What is value of {} on {} ?'.format(alt, date[0])
                    qs2 = 'What has value {} on {} ?'.format(money[0], date[0])
                    qs3 = 'When is value of {} {} ?'.format(alt, money[0])

                    res1 = QuestionAnsweringModule.__call__(self, qs1, ctx)
                    res2 = QuestionAnsweringModule.__call__(self, qs2, ctx)
                    res3 = QuestionAnsweringModule.__call__(self, qs3, ctx)

                    # The combination is scored by its most confident answer;
                    # the reported value always comes from the first question.
                    score = max(res1['score'], res2['score'], res3['score'])

                    if score >= self.MIN_SCORE:
                        results.append({
                            'sentence': sent,
                            'metric'  : unit,
                            'date'    : date[0],
                            'value'   : res1['answer'],
                            'year'    : year,
                            'score'   : score
                        })

        def match(r1, r2):
            """True when two records agree on at least two of metric/value/date."""
            shared = sum(r1[field] == r2[field] for field in ('metric', 'value', 'date'))
            return shared > 1

        # Highest score first, so that de-duplication keeps the best record.
        results.sort(key=lambda x: x['score'], reverse=True)
        final = []

        for result in results:
            # Reduce the answer text to the number it contains: count-like
            # metrics use CARDINAL / QUANTITY entities, all others use MONEY.
            value_entities = EntityRecognitionModule.__call__(self, result['value'])

            if result['metric'] in self.NUMERIC_METRICS:
                candidates = value_entities.get('CARDINAL', []) + value_entities.get('QUANTITY', [])
            else:
                candidates = value_entities.get('MONEY', [])

            if not candidates:
                # No usable number in the answer: drop the record.
                continue
            result['value'] = candidates[0][0]

            # Keep the record only if no better-scoring record already covers it.
            if not any(match(kept, result) for kept in final):
                final.append(result)

        return final

    def create_context(self, sent):
        """Hook for building a custom QA context; currently returns the sentence."""
        return sent


# Paragraph Extraction Module

The paragraph extraction module inherits from the text extraction module; it splits a paragraph into sentences and extracts the metrics from every sentence that mentions one.

In [11]:
import re
class ParagraphExtractionModule(TextExtractionModule):
    """Runs the sentence-level extraction over every sentence of a paragraph."""

    def __init__(self, nerModel, qaModel):
        super().__init__(nerModel=nerModel, qaModel=qaModel)

    def __call__(self, para, filing_year):
        """Extract metric records from a paragraph.

        The paragraph is split on ". " and on newlines. Sentences that mention
        none of the alternatives in ``metricList`` are skipped; the rest are
        handed to ``TextExtractionModule.__call__``.

        Args:
            para: The text to analyse.
            filing_year: Forwarded to the sentence-level extraction.

        Returns:
            list[dict]: All records found, ordered by descending ``score``.
        """
        collected = []

        for sentence in re.split(r'\. |\n', para):
            mentions_metric = any(
                is_subseq(alt, sentence)
                for unit in metricList
                for alt in metricList[unit]
            )

            # If the sentence has no relevant metrics then skip it
            if mentions_metric:
                collected.extend(TextExtractionModule.__call__(self, sentence, filing_year))

        collected.sort(key=lambda x: x['score'], reverse=True)
        return collected

Create the question answering pipeline using hugging face transformers

In [12]:
from transformers import pipeline
# Wrap the `model` and `tokenizer` objects in a "question-answering" pipeline so
# they can be called as a single object.
QuestionAnswerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

Create the named entity recognition model using the spacy library

In [13]:
# The first run of this cell may take up to 5 minutes because the model has to
# be downloaded; later runs reuse the installed model.

import spacy
import spacy.cli

SPACY_MODEL = 'en_core_web_lg'

try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed:
    # download it once, then load it.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


Create the paragraph extractor and then test it on a sample sentence

In [14]:
pe = ParagraphExtractionModule(nerModel=nlp, qaModel=QuestionAnswerer)
sent = 'Creative ARR for last year was $3.2 billion and increased upto $8.78 billion this year.'
# The second argument is the filing year passed to the extractor.
pe(sent.lower(), 2020)

[{'date': 'last year',
  'metric': 'arr',
  'score': 0.8518530130386353,
  'sentence': 'creative arr for last year was $3.2 billion and increased upto $8.78 billion this year.',
  'value': '$3.2 billion',
  'year': 2019},
 {'date': 'this year',
  'metric': 'arr',
  'score': 0.8271325826644897,
  'sentence': 'creative arr for last year was $3.2 billion and increased upto $8.78 billion this year.',
  'value': '$8.78 billion',
  'year': 2020}]

# JSON testing

In [20]:
FILINGS_PATH = 'path/to/filings-10K' # Path to the filings
OUTPUT_PATH = 'path/to/output'       # Path to the output

# EXAMPLE

# FILINGS_PATH = "sample-10K"
# OUTPUT_PATH = "sample-10K-output"


import os

# makedirs also creates missing parent directories, and exist_ok=True makes
# re-running this cell a no-op when the directory is already there.
try:
    os.makedirs(OUTPUT_PATH, exist_ok=True)
except OSError as err:
    # Report the actual cause (e.g. missing permissions) but keep the notebook running.
    print(f'ERROR: could not create {OUTPUT_PATH!r} ({err}); create the output directory manually')

Get the JSON filings from FILINGS_PATH

In [17]:
import os
# Only the first tuple yielded by os.walk is taken: the top-level directory path
# and the names of the files directly inside it (sub-directories are ignored).
# NOTE(review): `dir` shadows the built-in of the same name.
dir, _, files = next(os.walk(FILINGS_PATH))

Run the Paragraph Extraction Model on all the filings in FILINGS_PATH

In [21]:
import json

from tqdm.notebook import tqdm
import pandas as pd

# Column order of the CSV written for every filing.
RESULT_COLUMNS = ['score', 'metric', 'date', 'value', 'year', 'sentence']

for file in tqdm(files):
    # Close each filing right after parsing instead of leaking one handle per file.
    with open(f'{dir}/{file}', 'r') as fp:
        filing = json.load(fp)

    # The whole filing (every item, converted to text) is analysed as one paragraph.
    para = ''.join(str(filing[item]) for item in filing)

    # The filing year is the part of 'filing_date' before the first hyphen.
    filing_year = int(filing['filing_date'].split('-')[0])
    res = pe(para, filing_year)

    data = {column: [r[column] for r in res] for column in RESULT_COLUMNS}

    # str.strip(".json") would remove any of the characters ".", "j", "s", "o",
    # "n" from both ends of the name ("amazon.json" -> "amaz"); remove only the
    # ".json" suffix instead.
    stem = file[:-len('.json')] if file.endswith('.json') else file

    pd.DataFrame(data).to_csv(f'{OUTPUT_PATH}/{stem}.csv', index = False)

  0%|          | 0/1 [00:00<?, ?it/s]

## Convert values containing words to numbers

Example: two hundred thirty -> 230

In [22]:
import os
dir, _, files = next(os.walk(OUTPUT_PATH))

import re
import pandas as pd
from word2number import w2n
from tqdm.notebook import tqdm

# Vague quantities expressed in words are mapped to hard-coded numbers.
FIXED_VALUES = {
    'less than half': 0.5,
    'greater than half': 0.5,
    'just over half': 0.5,
    'thousands': 1000.,
    'millions': 1e6,
}

# Magnitude words applied to a number written in digits; the first word found
# in the value wins.
SCALES = (
    ('trillion', 1e12),
    ('billion', 1e9),
    ('million', 1e6),
    ('thousand', 1e3),
)

# First run of digits with an optional decimal part. Unlike `[0-9\.]+` this
# never matches a lone "." or a trailing full stop.
NUMBER_RE = re.compile(r'[0-9]+(?:\.[0-9]+)?|\.[0-9]+')


def value_to_number(val):
    """Convert an extracted value such as "$3.2 billion" or "two hundred" to a float.

    Args:
        val: The raw value; it is converted to ``str`` and commas are removed.

    Returns:
        float: The numeric value, or NaN when no number can be recognised.
    """
    val = re.sub(r',', '', str(val))

    if val in FIXED_VALUES:
        return FIXED_VALUES[val]

    digits = NUMBER_RE.search(val)
    if digits is None:
        # No digits: interpret the value as a number written in words.
        # word2number already expands magnitude words ("two million" ->
        # 2000000), so no scale factor is applied on this path.
        try:
            return float(w2n.word_to_num(val))
        except ValueError:
            # Not a number at all (e.g. an empty cell read back as "nan").
            return float('nan')

    num = float(digits.group())
    for word, factor in SCALES:
        if word in val:
            num *= factor
            break
    return num


for file in tqdm(files):
    # Only the CSV files written by the extraction step are converted.
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(f'{dir}/{file}')
    df['number'] = [value_to_number(val) for val in df['value']]
    df.to_csv(f'{dir}/{file}', index=False)

  0%|          | 0/1 [00:00<?, ?it/s]