In [174]:
import spacy
from spacy.language import Language
from spacy import displacy
import time
import re
import statistics
import os
import json
import calendar
from pathlib import Path
from datetime import datetime

In [19]:
# Abbreviated metric labels (already upper-cased) and the long-form names that
# get_metric() substitutes for them.
alias = dict(
    MAUS="MONTHLY ACTIVE USERS",
    ARR="ANNUAL RECURRING REVENUE",
)
print(alias)

{'MAUS': 'MONTHLY ACTIVE USERS', 'ARR': 'ANNUAL RECURRING REVENUE'}


In [181]:
def getOrgData(org):
    """Load the stored JSON profile of an organisation.

    Args:
        org: Organisation identifier; the profile is read from
            ``../../Summary/orgData/<org>.txt`` relative to the working directory.

    Returns:
        The object decoded from the file's JSON content, or ``None`` when no
        regular file exists at that path.
    """
    orgDataPath = "../../Summary/orgData/" + org + ".txt"
    if not Path(orgDataPath).is_file():
        return None
    with open(orgDataPath) as handle:
        return json.load(handle)

In [201]:
def getOrgAttr(orgData, attr):
    """Look up a nested value in ``orgData`` using a ``|``-separated key path.

    Args:
        orgData: Nested dictionary (e.g. as returned by ``getOrgData``); a falsy
            value yields ``None``.
        attr: Key path such as ``"ORGPROFILE|FiscalYearEnd"``; each segment is
            used as a dictionary key one level deeper.

    Returns:
        The value found at the end of the path, or ``None`` if any segment is
        missing or the path tries to descend into something that is not a dict.
    """
    if not orgData:
        return None
    node = orgData
    for key in attr.split("|"):
        # Only dictionaries can be descended by key. Without the type check a
        # path running "through" a leaf (string, number, list, None) would
        # raise TypeError instead of reporting the attribute as absent.
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node

In [211]:
# Demo: read the fiscal-year-end month of the "PINS" profile and convert the
# month name to its number.
# NOTE: getOrgData/getOrgAttr return None when the file or key is missing; in
# that case strptime below raises TypeError.
data = getOrgData("PINS")
val = getOrgAttr(data, "ORGPROFILE|FiscalYearEnd")
print(val)
# "%B" parses a full month name (e.g. "December"); only .month is used afterwards.
dobj = datetime.strptime(val, "%B")
print(dobj.month)

December
12


In [155]:
def getQtrs(org, edate, fynd):
    """Lay out the four fiscal quarters and locate ``edate`` within them.

    Args:
        org: Organisation identifier. Kept for interface compatibility; it is
            not used by the computation.
        edate: Date string in ``YYYY-MM-DD`` format.
        fynd: Month number (1-12) in which the fiscal year ends; Q1 starts in
            the following month.

    Returns:
        dict containing
          * ``"Q1"``..``"Q4"``: ``{"START", "SM", "END", "EM"}`` with the first
            and last month of the quarter as numbers and as upper-cased
            three-letter abbreviations;
          * ``"CQTR"`` / ``"CYR"``: the quarter containing ``edate`` and the
            calendar year of ``edate``;
          * ``"EQTR"`` / ``"EYR"``: the quarter immediately before ``CQTR`` and
            the calendar year in which it ended;
          * ``"PQTR"`` / ``"PYR"``: the quarter two before ``CQTR`` and the
            calendar year in which it ended.

    Raises:
        ValueError: if ``edate`` does not match ``%Y-%m-%d``.
    """
    dateObj = datetime.strptime(edate, '%Y-%m-%d')
    emonth = dateObj.month
    eyr = dateObj.year

    qtrs = dict()
    qend = fynd
    for k in range(1, 5):
        # Months are 1..12, so a modulo result of 0 stands for December.
        qstart = (qend + 1) % 12 or 12
        qend = (qend + 3) % 12 or 12

        in_plain_span = qstart <= emonth <= qend
        # A quarter such as NOV-JAN wraps around the calendar year end.
        in_wrapped_span = qstart > qend and (emonth >= qstart or emonth <= qend)
        if in_plain_span or in_wrapped_span:
            qtrs["CQTR"] = "Q" + str(k)
            qtrs["CYR"] = eyr
            # Quarter numbers are 1..4; 0 after the modulo means Q4.
            qtrs["EQTR"] = "Q" + str((k - 1) % 4 or 4)
            qtrs["PQTR"] = "Q" + str((k - 2) % 4 or 4)

        qtrs["Q" + str(k)] = {
            "START": qstart,
            "SM": calendar.month_abbr[qstart].upper(),
            "END": qend,
            "EM": calendar.month_abbr[qend].upper(),
        }

    if "EQTR" in qtrs and "CQTR" in qtrs and "PQTR" in qtrs:
        eqtrEnd = qtrs[qtrs["EQTR"]]["END"]
        pqtrEnd = qtrs[qtrs["PQTR"]]["END"]
        # Both earlier quarters ended within the 12 months before ``edate``, so
        # an end month later than the date's own month belongs to the previous
        # calendar year. Comparing against the *end month of the current
        # quarter* instead gives the wrong year when the current quarter wraps
        # the year end and ``edate`` lies in its earlier-year part
        # (e.g. fynd=1, edate in December).
        qtrs["EYR"] = eyr - 1 if eqtrEnd > emonth else eyr
        qtrs["PYR"] = eyr - 1 if pqtrEnd > emonth else eyr
    return (qtrs)

In [158]:
getQtrs("PINS", "2021-10-20", 12)

{'Q1': {'START': 1, 'SM': 'JAN', 'END': 3, 'EM': 'MAR'},
 'Q2': {'START': 4, 'SM': 'APR', 'END': 6, 'EM': 'JUN'},
 'Q3': {'START': 7, 'SM': 'JUL', 'END': 9, 'EM': 'SEP'},
 'CQTR': 'Q4',
 'CYR': 2021,
 'EQTR': 'Q3',
 'PQTR': 'Q2',
 'Q4': {'START': 10, 'SM': 'OCT', 'END': 12, 'EM': 'DEC'},
 'EYR': 2021,
 'PYR': 2021}

In [3]:
def _clause_cuts(doc, begin, stop):
    """Return the token indices in [begin, stop) at which a new clause starts.

    A cut is placed on a VERB token whose dependency is ``conj`` and whose tag
    is not ``VB``, provided at least one other verb was seen since ``begin``.
    """
    cuts = []
    verbs_seen = 0
    for idx in range(begin, stop):
        tok = doc[idx]
        is_verb = tok.pos_ == "VERB"
        if is_verb:
            verbs_seen += 1
        if is_verb and verbs_seen > 1 and tok.dep_ == "conj" and tok.tag_ != "VB":
            cuts.append(idx)
    return cuts


def _segment_texts(doc, begin, stop):
    """Return the text of every clause obtained by cutting doc[begin:stop]."""
    bounds = [begin] + _clause_cuts(doc, begin, stop) + [stop]
    return [str(doc[lo:hi]) for lo, hi in zip(bounds, bounds[1:])]


def _with_lead(piece, lead):
    """Prefix ``piece`` with ``lead`` unless ``lead`` is empty or already present."""
    if lead and lead not in piece:
        return lead + " " + piece
    return piece


def splitSentences(nlp, sent, lner):
    """Split a sentence into clause-like pieces, one per subject / coordinated verb.

    Args:
        nlp: Callable returning a parsed document for ``sent``; the document
            must offer ``noun_chunks``, token indexing, slicing and ``len``.
        sent: The sentence text.
        lner: Entity tuples ``(text, label, start, end)``; when the first one is
            labelled ``"DATE"`` its text may be used as a prefix (see below).

    Returns:
        List of strings. The token range between consecutive subject noun chunks
        (``nsubj``/``nsubjpass``) forms one segment, and each segment is cut
        again at coordinated verbs. A piece that does not contain its subject's
        text gets that text prepended. The last piece additionally gets the
        leading date prepended when that date preceded the first subject and is
        not already in the piece. If no subject chunk exists the result is
        ``[sent]``. The list is also printed.
    """
    doc = nlp(sent)
    date = lner[0][0] if (len(lner) > 0 and lner[0][1] == "DATE") else None

    ss = list()
    prefix = None
    seg_start = -1
    subject = None  # most recent subject noun chunk

    for chunk in doc.noun_chunks:
        if chunk.root.dep_ not in ("nsubj", "nsubjpass"):
            continue

        start = chunk.start
        if subject is None:
            # First subject: its segment is stretched back to the sentence
            # start; remember a leading date found in front of it.
            if start > 0:
                if date and date in str(doc[0:start]):
                    prefix = date
                start = 0
        else:
            # Emit everything that belonged to the previous subject.
            lead = subject.text if subject else None
            for piece in _segment_texts(doc, seg_start, start):
                ss.append(_with_lead(piece, lead))

        seg_start = start
        subject = chunk

    if subject is not None:
        text = subject.text
        pieces = _segment_texts(doc, seg_start, len(doc))
        for piece in pieces[:-1]:
            ss.append(_with_lead(piece, text))
        # Only the closing piece is eligible for the date prefix, and only
        # when the subject text did not have to be prepended.
        closing = pieces[-1]
        if text and text not in closing:
            ss.append(text + " " + closing)
        else:
            ss.append(_with_lead(closing, prefix))

    if len(ss) == 0:
        ss.append(sent)

    print("Sentences\n", ss)
    return ss

In [4]:
def is_attr_for_metric(metric, attr, doc):
    """Tell whether an entity may be attached to the current metric.

    Args:
        metric: Not used by the check.
        attr: Entity tuple ``(text, label, start, end)``.
        doc: Parsed document exposing ``noun_chunks``.

    Returns:
        ``False`` only for a ``"LOC"`` entity that lies inside a noun chunk
        whose root has dependency ``nsubj``; ``True`` in every other case.
    """
    if attr[1] != "LOC":
        return True

    value, start, end = attr[0], attr[2], attr[3]
    return not any(
        (value in chunk.text and start >= chunk.start and end <= chunk.end)
        and chunk.root.dep_ == "nsubj"
        for chunk in doc.noun_chunks
    )

def is_earningReportLine(sent):
    """Return True when ``sent`` looks like an earnings-release headline.

    The sentence must contain, case-insensitively and as substrings, at least
    one word of ``words1`` and at least one word of ``words2``.
    """
    words1 = ["Announces", "Reports", "Releases", "Increase", "Raises", "Announced", "Reported"]
    words2 = ["Earnings", "Results"]

    def mentions(candidates):
        return any(word.upper() in sent.upper() for word in candidates)

    if not mentions(words1):
        return False
    return mentions(words2)

def get_section(text):
    """Classify a sentence into a report section label.

    Checks are applied in this order and the first hit wins:
    ``"EARNING"`` (via ``is_earningReportLine``), ``"GUIDE"``, ``"TABLE"``,
    ``"COMPARE"``; anything else is ``"REGULAR"``. Phrase matching is a
    case-insensitive substring test.
    """
    table = ["following table", "CONSOLIDATED BALANCE SHEETS", "CONSOLIDATED STATEMENTS OF OPERATIONS", "CONDENSED CONSOLIDATED"]
    guide = ["expect", "expectation", "will grow", "will decline", "forecast", "is expected", "financial targets", "following guidance"]
    compare = ["compared with", "compared to"]

    if is_earningReportLine(text):
        return "EARNING"

    upper_text = text.upper()
    for label, phrases in (("GUIDE", guide), ("TABLE", table), ("COMPARE", compare)):
        if any(phrase.upper() in upper_text for phrase in phrases):
            return label

    return "REGULAR"

def is_found_param(v):
    """Return True when the entity tuple ``v`` carries a value-type label.

    Value-type labels are ``MONEY``, ``PCT``, ``CD`` and ``CHG`` (``v[1]``).
    """
    return v[1] in ("MONEY", "PCT", "CD", "CHG")

def is_valid_param(v):
    """Return False for entity tuples labelled ``CS``, ``PER`` or ``ROLE``; True otherwise."""
    return v[1] not in ("CS", "PER", "ROLE")

def get_metric(metric_arr, doc):
    """Combine consecutive metric mentions into one metric name.

    Args:
        metric_arr: List of ``(name, start, end)`` tuples in sentence order.
        doc: Iterable of tokens (each with ``.text``) for the same sentence.

    Returns:
        ``None`` for an empty list. Otherwise the first name, expanded through
        the module-level ``alias`` table, followed by each later name. The
        tokens from one mention's ``end`` index through the next mention's
        ``start`` index decide how two names are joined:
          * an ``and`` token joins them with ``" AND "``;
          * a ``:`` token (only when that gap is at most 2) discards what was
            built so far and restarts from the next name;
          * otherwise a single space is used.
    """
    global alias

    def expand(name):
        return alias[name] if name in alias else name

    if len(metric_arr) == 0:
        return None

    metric = expand(metric_arr[0][0])
    if len(metric_arr) == 1:
        return metric

    for i in range(len(metric_arr) - 1):
        start = metric_arr[i][2]
        following_name = metric_arr[i + 1][0]
        end = metric_arr[i + 1][1]
        if end == -1:
            continue

        sep = " "
        for index, token in enumerate(doc):
            if index < start:
                continue
            if index > end:
                break
            word = token.text.upper()
            if word == "AND":
                sep = " AND "
                break
            if word == ":" and (end - start) <= 2:
                sep = None
                metric = expand(following_name)
                break

        if sep:
            metric = metric + sep + expand(following_name)

    return metric

def insert_other_items(entity, v):
    """Record a non-metric entity in the per-label lists of ``entity``.

    Args:
        entity: Result dictionary, modified in place.
        v: Entity tuple ``(text, label, ...)``.

    Returns:
        The same ``entity`` dict. For the labels ``ORG``, ``MISC``, ``SYM``,
        ``CALENDAR`` and ``DATE`` the upper-cased text is appended to
        ``entity[label]`` (the list is created on first use). ``CALENDAR``
        texts containing ``YEAR`` are not appended. Other labels are ignored.
    """
    label = v[1]
    if label in ("ORG", "MISC", "SYM", "CALENDAR", "DATE"):
        bucket = entity.setdefault(label, list())
        text = v[0].upper()
        if label != "CALENDAR" or "YEAR" not in text:
            bucket.append(text)
    return(entity)

def is_metric_subj(v, prev_metric, entity, doc, section, e):
    """Decide whether a METRIC mention starts a new metric of its own.

    Args:
        v: Entity tuple ``(text, label, start, end)`` of the mention.
        prev_metric: Name of the metric currently being filled, or a falsy value.
        entity: Result dictionary collected so far.
        doc: Parsed document exposing ``noun_chunks``.
        section: Section label of the sentence.
        e: Attribute tuples collected so far for ``prev_metric``.

    Returns:
        ``True`` in the ``"TABLE"`` section or when there is no previous metric.
        Otherwise the first noun chunk overlapping the mention decides: ``True``
        if its root dependency is ``ROOT``, ``nsubj``, ``nsubjpass`` or
        ``conj``, else ``False``. With no such chunk, ``False`` is returned when
        ``prev_metric`` is in ``entity`` and the end index of the last tuple in
        ``e`` minus the mention's start is at most 1; otherwise ``True``.
    """
    metric, start, end = v[0], v[2], v[3]

    if section == "TABLE" or not prev_metric:
        return True

    subject_like = ("ROOT", "nsubj", "nsubjpass", "conj")
    for chunk in doc.noun_chunks:
        if ((metric in chunk.text and end <= chunk.end)
                or (chunk.text in metric and chunk.start >= start and chunk.end <= end)):
            # Only the first overlapping chunk is consulted.
            return chunk.root.dep_ in subject_like

    if prev_metric in entity:
        last_attr = e[-1]
        if (last_attr[3] - start) <= 1:
            return False

    return True


In [5]:
def getRelations(nlpWebtf, nlpModel, sentences):
    """Extract report-level facts and per-metric attribute lists from sentences.

    Args:
        nlpWebtf: Callable parser; called as ``nlpWebtf(sent)`` and passed to
            ``splitSentences``.
        nlpModel: Pipeline with a ``pipe(texts, disable=[...])`` method whose
            documents expose ``ents`` (entities with ``text``, ``label_``,
            ``start``, ``end``).
        sentences: Iterable of sentence strings, in document order.

    Returns:
        dict ``entity`` holding
          * one key per detected metric name, mapped to a list of attribute
            lists (each starting with ``LINE`` / optional ``SPLSECTION`` /
            ``SECTION`` marker tuples followed by the entity tuples);
          * lists under ``ORG``, ``MISC``, ``SYM``, ``CALENDAR``, ``DATE``;
          * scalar keys such as ``CNAME``, ``CSYM``, ``EDATE``, ``EQTR``,
            ``EYR``, ``GQTR``, ``GDATE``, ``GYR``, ``CS`` and role keys
            (``CEO``, ``CFO``, ...) when the corresponding text was found.
        Keys that cannot be derived from the input are simply absent.
    """
    entity = dict()
    tuples = list()
    prev_metric = None
    found = False
    cnt = 0
    spl_section = None
    # Attribute tuples pending for ``prev_metric``; defined up front so the
    # final flush after the loop is valid even for an empty ``sentences``.
    e = list()

    for sent in sentences:
        cnt = cnt + 1
        print(cnt, sent)
        text1 = list()
        text1.append(sent)
        # First pass over the whole sentence: its entities tell
        # splitSentences whether the sentence starts with a DATE.
        for doc in nlpModel.pipe(text1, disable=["tagger"]):
            lner = ([(ent.text, ent.label_, ent.start, ent.end) for ent in doc.ents])
        text1 = splitSentences(nlpWebtf, sent, lner)
        # Flush what the previous sentence left pending.
        if found and prev_metric and prev_metric in entity and len(e) > 0:
            entity[prev_metric].append(e)
            print(prev_metric, entity[prev_metric])

        if prev_metric and prev_metric in entity and len(entity[prev_metric]) == 0:
            del entity[prev_metric]
        section = get_section(sent)
        e = list()
        cs_added = False

        # Second pass: one document per clause produced by splitSentences.
        for doc in nlpModel.pipe(text1, disable=["tagger"]):
            l = ([(ent.text, ent.label_, ent.start, ent.end) for ent in doc.ents])
            print("NER", l)
            metric = None
            metric_arr = list()
            if found and prev_metric and prev_metric in entity and len(e) > 0:
                entity[prev_metric].append(e)
                print(prev_metric, entity[prev_metric])
            if prev_metric and prev_metric in entity and len(entity[prev_metric]) == 0:
                del entity[prev_metric]
            prev_metric = None
            tuples = list()
            found = False
            doc1 = nlpWebtf(sent)
            e = list()

            if (len(l) == 0 and "Highlights:" in sent):
                spl_section = None
            if ("Business Highlights:" in sent):
                spl_section = None

            for i,v in enumerate(l):
                if (v[1] == "CS" or v[1] == "PER"):
                    # Quote / person sentence: keep the quote text and try to
                    # pair a person with a role, then stop scanning this clause.
                    if not cs_added and v[1] == "CS":
                        if "CS" not in entity:
                            entity["CS"] = sent
                        else:
                            entity["CS"] = entity["CS"] + sent
                        cs_added = True
                    person = None
                    role = None
                    if v[1] == "PER" and len(l) < 7:
                        person = v[0].upper()
                    else:
                        continue
                    for k in range(i+1, len(l)):
                        if(l[k][1] == "PER"):
                            person = l[k][0].upper()
                        elif(l[k][1] == "ROLE"):
                            if not role:
                                role = l[k][0].upper()
                            else:
                                role = role + " AND " + l[k][0].upper()
                    if (role and ("CEO" in role or "EXECUTIVE" in role)):
                        role = "CEO"
                    if (role and ("FINANCIAL" in role or "CFO" in role)):
                        role = "CFO"
                    if (role and ("MARKETING" in role or "CMO" in role)):
                        role = "CMO"
                    if (role and ("OPERATING" in role or "COO" in role)):
                        role = "COO"
                    if(role and person):
                        print("ROLEEEEEEEEEEEEEEEE", role, "PERSONNNNNNNNNNN", person)
                        if role not in entity:
                            entity[role] = list()
                        if person not in entity[role]:
                            entity[role].append(person)
                        role = None
                        person = None
                    break
                if(section == "EARNING"):
                    if v[1] == "ORG" and "CNAME" not in entity:
                        entity["CNAME"] = v[0].upper()
                    elif v[1] == "SYM" and "CSYM" not in entity:
                        entity["CSYM"] = v[0].upper()
                    elif v[1] == "DATE" and "EDATE" not in entity:
                        entity["EDATE"] = v[0].upper()
                    elif v[1] == "CALENDAR" and "EQTR" not in entity:
                        entity["EQTR"] = v[0].upper()
                    elif v[1] == "YEAR" and "EYR" not in entity:
                        entity["EYR"] = v[0].upper()
                elif(section == "GUIDE"):
                    if v[1] == "CALENDAR" and "GQTR" not in entity:
                        entity["GQTR"] = v[0].upper()
                    elif v[1] == "DATE" and "GDATE" not in entity:
                        entity["GDATE"] = v[0].upper()
                    elif v[1] == "YEAR" and "GYR" not in entity:
                        entity["GYR"] = v[0].upper()
                if "basis" in sent and "reported:" in sent:
                    if(v[1] == "METRIC"):
                        print(v[0], " FOUND AT ", sent.find(v[0]))
                        print("BASIS FOUND AT ", sent.find("basis"))
                        print("REPORTED FOUND AT ", sent.find("reported:"))
                        if((sent.find(v[0]) < sent.find("basis")) and (sent.find(v[0]) < sent.find("reported:"))):
                            spl_section = v[0].upper()
                if(v[1] == "METRIC" and is_metric_subj(v, prev_metric, entity, doc1, section, e)):
                    if not metric:
                        metric = v[0].upper()
                        metric_arr.append((v[0].upper(), v[2], v[3]))
                    elif metric:
                        metric = metric + " " + v[0].upper()
                        metric_arr.append((v[0].upper(), v[2], v[3]))
                    if found and prev_metric and prev_metric in entity and len(e) > 0:
                        entity[prev_metric].append(e)
                        print(prev_metric, entity[prev_metric])
                    if prev_metric and prev_metric in entity and len(entity[prev_metric]) == 0:
                        del entity[prev_metric]
                    prev_metric = None
                    found = False
                    e = list()
                else:
                    if(metric):
                        # First non-metric entity after a run of metric
                        # mentions: finalise the metric name and open its
                        # attribute list.
                        entity = insert_other_items(entity, v)
                        metric = (get_metric(metric_arr, doc))
                        # Drop repeated words while keeping their order.
                        metric = ' '.join(dict.fromkeys(metric.split()))
                        if len(metric.split()) > 6:
                            metric = None
                            metric_arr = list()
                            prev_metric = metric
                            tuples = list()
                            continue
                        if metric not in entity:
                            entity[metric] = list()
                            found = False
                        prev_metric = metric
                        e.append((str(cnt), "LINE", -1, -1))
                        if spl_section is not None:
                            e.append((spl_section, "SPLSECTION", -1, -1))
                        if section is not None:
                            e.append((section, "SECTION", -1, -1))
                        if(len(tuples) > 0):
                            e = e + tuples
                            tuples = list()
                        if(is_valid_param(v)):
                            e.append(v)
                        if(is_found_param(v)):
                            found = True
                        metric = None
                        metric_arr = list()
                    else:
                        entity = insert_other_items(entity, v)
                        if not is_attr_for_metric(prev_metric, v, doc1):
                            prev_metric = None
                        if not prev_metric:
                            if(is_valid_param(v)):
                                tuples.append(v)
                            if(is_found_param(v)):
                                found = True
                        elif prev_metric in entity:
                            if(is_valid_param(v)):
                                e.append(v)
                            if(is_found_param(v)):
                                found = True

    # Inside the loop the pending list is only flushed when the *next* clause,
    # sentence or metric starts, so the attributes of the very last metric
    # must be flushed here or they are lost.
    if found and prev_metric and prev_metric in entity and len(e) > 0:
        entity[prev_metric].append(e)
        print(prev_metric, entity[prev_metric])
    if prev_metric and prev_metric in entity and len(entity[prev_metric]) == 0:
        del entity[prev_metric]

    quarter_codes = {
        "FIRST QUARTER": "Q1",
        "SECOND QUARTER": "Q2",
        "THIRD QUARTER": "Q3",
        "FOURTH QUARTER": "Q4",
    }
    next_quarter = {"Q1": "Q2", "Q2": "Q3", "Q3": "Q4", "Q4": "Q1"}

    # The most frequent ORG / CALENDAR mention overrides the headline value.
    # Both lists may be missing (no such entity in the text) and CALENDAR may
    # be empty (only "... YEAR" mentions); in those cases any value taken from
    # the headline is kept instead of raising.
    if entity.get("ORG"):
        print(entity["ORG"])
        entity["CNAME"] = statistics.mode(entity["ORG"])
    if entity.get("CALENDAR"):
        entity["EQTR"] = statistics.mode(entity["CALENDAR"])

    if "EQTR" in entity:
        entity["EQTR"] = quarter_codes.get(entity["EQTR"], entity["EQTR"])

    if "GQTR" in entity:
        entity["GQTR"] = quarter_codes.get(entity["GQTR"], entity["GQTR"])
    elif entity.get("EQTR") in next_quarter:
        # No explicit guidance quarter: assume the one after the reported quarter.
        entity["GQTR"] = next_quarter[entity["EQTR"]]

    if "EYR" not in entity and "EDATE" in entity:
        # Takes the third space-separated field of the date text as the year.
        esplit = entity["EDATE"].split(" ")
        if len(esplit) >= 3:
            entity["EYR"] = esplit[2]

    print(entity)
    return(entity)
    

In [6]:
# Each row is (condition pattern, action templates, data-action templates).
# NOTE(review): the "<LABEL>-<position>" tokens appear to correspond to the keys
# get_facts builds from a metric's attribute list - confirm against get_facts.
_RULE_TABLE = (
    (
        "ITEM-0 LINE-1 REGULAR-2 CALENDAR-3 CHG-4 PCT-5 CALENDAR-6 MONEY-7",
        ("ITEM-0 IS MONEY-7 IN *CALENDAR *YEAR", "ITEM-0 CHG-4 PCT-5 CALENDAR-6 IN *CALENDAR *YEAR"),
        ("ITEM-0 MONEY-7 PCT-5 CHG-4 CALENDAR-6",),
    ),
    (
        "ITEM-0 LINE-1 GUIDE-2 CALENDAR-3 CHG-4 PCT-5 CALENDAR-6",
        ("GUIDANCE : ITEM-0 IS EXPECTED TO CHG-4 PCT-5 CALENDAR-6 IN *GCALENDAR *YEAR",),
        ("ITEM-0+GUIDE CHG-4 PCT-5 CALENDAR-6",),
    ),
    (
        "ITEM-0 LINE-1 REGULAR-2 CHG-3 PCT-4 CALENDAR-5 CD-6",
        ("ITEM-0 IS CD-6 IN *CALENDAR *YEAR", "ITEM-0 CHG-3 PCT-4 CALENDAR-5 IN *CALENDAR *YEAR"),
        ("ITEM-0 CD-6 PCT-4 CHG-3 CALENDAR-5",),
    ),
    (
        "ITEM-0 LINE-1 REGULAR-2 DATE-3 LOC-4 CHG-5 PCT-6",
        ("ITEM-0 CHG-5 PCT-6 IN LOC-4 AS OF DATE-3",),
        (),
    ),
    (
        "ITEM-0 LINE-1 REGULAR-2 DATE-3 CHG-4 PCT-5",
        ("ITEM-0 CHG-4 PCT-5 AS OF DATE-3",),
        (),
    ),
    (
        "ITEM-0 LINE-1 REGULAR-2 CHG-3 PCT-4 CALENDAR-5",
        ("ITEM-0 CHG-3 PCT-4 CALENDAR-5 IN *CALENDAR *YEAR",),
        (),
    ),
    (
        "ITEM-0 LINE-1 REGULAR-2 MONEY-3 CALENDAR-4",
        ("ITEM-0 IS MONEY-3 IN *CALENDAR *YEAR",),
        ("ITEM-0 MONEY-3",),
    ),
    (
        "ITEM-0 LINE-1 GUIDE-2 CHG-3 CD-4",
        ("GUIDANCE: ITEM-0 IS EXPECTED TO CHG-3 CD-4 IN *GCALENDAR *YEAR",),
        ("ITEM-0+GUIDE CHG-3 CD-4",),
    ),
    (
        "ITEM-0 LINE-1 GUIDE-2 CD-3 CALENDAR-4",
        ("GUIDANCE: ITEM-0 IS EXPECTED TO BE CD-3 CALENDAR-4 IN *GCALENDAR *YEAR",),
        (),
    ),
    (
        "ITEM-0 LINE-1 GUIDE-2 CHG-3 CHG-4 CALENDAR-5",
        ("GUIDANCE: ITEM-0 CHG-3 TO CHG-4 IN *GCALENDAR *YEAR",),
        (),
    ),
    (
        "ITEM-0 LINE-1 GUIDE-2 CHG-3",
        ("GUIDANCE: ITEM-0 CHG-3 TO BE EXPECTED IN *GCALENDAR *YEAR",),
        (),
    ),
    (
        "ITEM-0 LINE-1 TABLE-2 LOC-3 MONEY-4 MONEY-5 PCT-6",
        ("ITEM-0 IN LOC-3 IS MONEY-4 *CHANGED PCT-6 IN *CALENDAR *YEAR",),
        ("ITEM-0+LOC-3 MONEY-4",),
    ),
    (
        "ITEM-0 LINE-1 TABLE-2 PCT-3 PCT-4",
        ("ITEM-0 IS *CHANGED PCT-3 IN *CALENDAR *YEAR",),
        (),
    ),
    (
        "ITEM-0 LINE-1 TABLE-2 MONEY-3 MONEY-4 PCT-5",
        ("ITEM-0 IS MONEY-3 *CHANGED PCT-5 IN *CALENDAR *YEAR",),
        ("ITEM-0 MONEY-3",),
    ),
    (
        "ITEM-0 LINE-1 GUIDE-2 CALENDAR-3 CHG-4 CALENDAR-5",
        ("GUIDANCE : ITEM-0 IS EXPECTED TO CHG-4 CALENDAR-5 IN *GCALENDAR *YEAR",),
        (),
    ),
)

# Expand the table into the list-of-dicts form; list order is the table order.
rules = [
    {
        "CONDITIONS": [condition],
        "ACTIONS": list(actions),
        "DATA-ACTIONS": list(data_actions),
    }
    for condition, actions, data_actions in _RULE_TABLE
]
print(rules)

[{'CONDITIONS': ['ITEM-0 LINE-1 REGULAR-2 CALENDAR-3 CHG-4 PCT-5 CALENDAR-6 MONEY-7'], 'ACTIONS': ['ITEM-0 IS MONEY-7 IN *CALENDAR *YEAR', 'ITEM-0 CHG-4 PCT-5 CALENDAR-6 IN *CALENDAR *YEAR'], 'DATA-ACTIONS': ['ITEM-0 MONEY-7 PCT-5 CHG-4 CALENDAR-6']}, {'CONDITIONS': ['ITEM-0 LINE-1 GUIDE-2 CALENDAR-3 CHG-4 PCT-5 CALENDAR-6'], 'ACTIONS': ['GUIDANCE : ITEM-0 IS EXPECTED TO CHG-4 PCT-5 CALENDAR-6 IN *GCALENDAR *YEAR'], 'DATA-ACTIONS': ['ITEM-0+GUIDE CHG-4 PCT-5 CALENDAR-6']}, {'CONDITIONS': ['ITEM-0 LINE-1 REGULAR-2 CHG-3 PCT-4 CALENDAR-5 CD-6'], 'ACTIONS': ['ITEM-0 IS CD-6 IN *CALENDAR *YEAR', 'ITEM-0 CHG-3 PCT-4 CALENDAR-5 IN *CALENDAR *YEAR'], 'DATA-ACTIONS': ['ITEM-0 CD-6 PCT-4 CHG-3 CALENDAR-5']}, {'CONDITIONS': ['ITEM-0 LINE-1 REGULAR-2 DATE-3 LOC-4 CHG-5 PCT-6'], 'ACTIONS': ['ITEM-0 CHG-5 PCT-6 IN LOC-4 AS OF DATE-3'], 'DATA-ACTIONS': []}, {'CONDITIONS': ['ITEM-0 LINE-1 REGULAR-2 DATE-3 CHG-4 PCT-5'], 'ACTIONS': ['ITEM-0 CHG-4 PCT-5 AS OF DATE-3'], 'DATA-ACTIONS': []}, {'CONDITIONS

In [7]:
def get_number_from_string(numstr):
    """Translate a verbal quantity into a number.

    Only ``"MID-TEENS"`` is recognised (returns ``15``); every other input,
    including empty or ``None``, returns ``None``.
    """
    if not numstr:
        return None
    return 15 if numstr == "MID-TEENS" else None

In [8]:
# Unit words that may follow an amount. They are matched as whole words so that
# text such as "ABOUT 5" or "DOUBLE DIGITS" is not read as billions merely
# because it contains the letter "B".
_UNIT_RE = re.compile(r'(?<![A-Z])(MILLIONS?|MNS?|BILLIONS?|BNS?|B)(?![A-Z])')
# First number in a string: digits with optional thousands separators/decimals.
_NUMBER_RE = re.compile(r'\d[\d,]*(?:\.\d+)?|\.\d+')
# Quarter that precedes each fiscal quarter.
_PREVIOUS_QUARTER = {"Q1": "Q4", "Q2": "Q1", "Q3": "Q2", "Q4": "Q3"}


def _first_number_text(text):
    """Return the first number found in text (thousands separators removed), or None."""
    found = _NUMBER_RE.search(text)
    if found is None:
        return None
    return found.group(0).replace(",", "")


def _scaled_number(num_text, scale):
    """Multiply the parsed number by scale, returning an int when the result is whole."""
    value = float(num_text) * scale
    return int(value) if value.is_integer() else value


def _unit_scale(text):
    """Return (suffix, multiplier) for a million/billion unit word in text, or None."""
    unit = _UNIT_RE.search(text.upper())
    if unit is None:
        return None
    if unit.group(1).startswith("B"):
        return "BN", 10**9
    return "MN", 10**6


def _record_percent(slot, text):
    """Store NUMBER-PCT in slot when text contains a number."""
    num_text = _first_number_text(text)
    if num_text is not None:
        slot["NUMBER-PCT"] = float(num_text) if "." in num_text else int(num_text)


def _record_money(slot, mvalue):
    """Fill the *-MONEY fields of slot from text such as "$485 million" or "$ (22)"."""
    if not mvalue:
        return
    # A leading digit means the text carries no currency symbol.
    currency = "" if mvalue[0].isdigit() else mvalue[0]
    # Accounting notation: a parenthesis right after the currency marks a negative.
    negative = (currency + "(") in mvalue or (currency + " (") in mvalue
    sign = "-" if negative else ""
    num_text = _first_number_text(mvalue)
    if num_text is None:
        return
    unit = _unit_scale(mvalue)
    if unit is not None:
        suffix, scale = unit
        slot["NUMBER-MONEY"] = _scaled_number(num_text, scale)
        slot["RTEXT-MONEY"] = sign + currency + num_text + suffix
    elif "NUMBER-MONEY" not in slot:
        # An unscaled amount never overrides a number recorded earlier for this key.
        slot["NUMBER-MONEY"] = float(num_text)
        slot["RTEXT-MONEY"] = sign + currency + str(slot["NUMBER-MONEY"])
    else:
        return
    slot["CURRENCY-MONEY"] = currency
    slot["VECTOR-MONEY"] = "-" if negative else "+"


def _record_count(slot, mvalue):
    """Fill the *-CD fields of slot from a cardinal such as "478 million" or "mid-teens"."""
    negative = "(" in mvalue
    sign = "-" if negative else ""
    num_text = _first_number_text(mvalue)
    unit = _unit_scale(mvalue)
    if unit is not None and num_text is not None:
        suffix, scale = unit
        slot["NUMBER-CD"] = _scaled_number(num_text, scale)
        slot["RTEXT-CD"] = sign + num_text + suffix
    elif "NUMBER-CD" not in slot:
        if num_text is not None:
            slot["NUMBER-CD"] = float(num_text)
            slot["RTEXT-CD"] = sign + num_text
        else:
            # Verbal quantity; may legitimately resolve to None.
            slot["NUMBER-CD"] = get_number_from_string(mvalue.upper())
            slot["RTEXT-CD"] = sign + mvalue.upper()
    else:
        return
    slot["VECTOR-CD"] = "-" if negative else "+"


def _lookup_fact(report, key):
    """Return report[key] when it is a dict of fact fields, otherwise an empty dict."""
    found = report.get(key) if isinstance(report, dict) else None
    return found if isinstance(found, dict) else {}


def _quarter_change(current, previous):
    """Describe the change from previous to current as "GREW 12.5%", or None if not computable."""
    if current is None or previous is None or previous == 0:
        return None
    delta = current - previous
    if delta == 0:
        chng = "FLAT"
    elif delta < 0:
        chng = "DECLINED"
    else:
        chng = "GREW"
    prct = round(abs(delta) / previous * 100, 2)
    return chng + " " + str(prct) + "%"


def _append_word(stmt, word):
    """Append word to the statement built so far; None words are skipped."""
    if word is None:
        return stmt
    return word if not stmt else stmt + " " + word


def get_facts(rules, metric, entity, orgEntities):
    """Turn the attribute lists recorded for one metric into fact statements.

    Parameters
    ----------
    rules : list of dict
        Each rule has "CONDITIONS", "ACTIONS" and "DATA-ACTIONS" lists of
        space-separated token strings such as "ITEM-0 LINE-1 TABLE-2 PCT-3".
    metric : str
        Key of ``entity`` being processed; it is bound to the token "ITEM-0".
    entity : dict
        Extracted report data. ``entity[metric]`` is a list of attribute lists;
        each attribute is a sequence of at least four items whose first two are
        the text and its label. "EQTR", "EYR" and "GQTR" are read when present.
    orgEntities : dict or None
        Previously stored reports keyed by "<quarter>-<year>" (e.g. "Q1-2021"),
        used for quarter-over-quarter, year-over-year and guidance comparisons.

    Returns
    -------
    tuple
        ``(facts, dfacts)`` where ``facts`` is a list of upper-case statements
        and ``dfacts`` maps a data key to its parsed fields (TEXT-*, TYPE-*,
        NUMBER-*, ...); ``(None, None)`` when no statement was produced.

    For every attribute list the rules are tried in order; a rule is applied
    when all tokens of its (last) condition are present and it has more tokens
    than the previously applied rule for that list.
    """
    facts = list()
    dfacts = dict()
    attrb = entity[metric]
    print(metric,attrb)

    eqtr = entity["EQTR"] if "EQTR" in entity else None
    eyr = entity["EYR"] if "EYR" in entity else None
    gqtr = entity["GQTR"] if "GQTR" in entity else None
    pqtr = _PREVIOUS_QUARTER.get(eqtr)
    pyr = None      # year of the previous quarter
    pstr = None     # key of the previous quarter's report
    yoyr = None     # previous year
    yoystr = None   # key of the same quarter one year earlier
    if eyr is not None:
        yoyr = str(int(eyr) - 1)
        # The quarter before Q1 is Q4 of the prior year.
        pyr = yoyr if eqtr == "Q1" else eyr
        if pqtr is not None:
            pstr = pqtr + "-" + pyr
        if eqtr is not None:
            yoystr = eqtr + "-" + yoyr

    if not metric:
        return None, None

    for mv in attrb:
        # Map positional tokens ("CHG-3", "REGULAR-2", ...) to the extracted text.
        metrics = dict()
        metrics["ITEM-0"] = metric
        for i, attr in enumerate(mv):
            if len(attr) < 4:
                continue
            label = attr[1] if attr[1] != "SECTION" else attr[0]
            metrics[label + "-" + str(i + 1)] = attr[0]

        last_cond_match_cnt = 0
        cond_match_cnt = 0

        for rule in rules:
            # Reset per rule so a rule without conditions cannot inherit a match.
            match = False
            for cond in rule["CONDITIONS"]:
                csplit = cond.split(" ")
                cond_match_cnt = len(csplit)
                match = all(token in metrics for token in csplit)
            if not match:
                continue
            # Keep the better match: only a longer condition may follow a shorter one.
            if cond_match_cnt <= last_cond_match_cnt:
                continue

            # DATA-ACTIONS: the ITEM token names the data key, the rest are stored fields.
            dkey = None
            for daction in rule["DATA-ACTIONS"]:
                for value in daction.split(" "):
                    if "ITEM" in value:
                        for part in value.split("+"):
                            piece = metrics[part] if part in metrics else part
                            dkey = piece if not dkey else dkey + "-" + piece
                        if dkey not in dfacts:
                            dfacts[dkey] = dict()
                        continue
                    if dkey is None or value not in metrics:
                        # Nothing to attach the value to, or the token was not extracted.
                        continue
                    mvalue = metrics[value]
                    mtype = value.split("-")[0]
                    slot = dfacts[dkey]
                    slot["TEXT-" + mtype] = mvalue.upper()
                    slot["TYPE-" + mtype] = mtype
                    if mtype == "PCT":
                        _record_percent(slot, mvalue)
                    elif mtype == "MONEY":
                        _record_money(slot, mvalue)
                    elif mtype == "CD":
                        _record_count(slot, mvalue)

            if dkey:
                current = dfacts[dkey]
                previous = orgEntities[pstr] if (orgEntities and pstr in orgEntities) else None
                prev_fact = _lookup_fact(previous, dkey)

                if all(k in prev_fact for k in ("TEXT-CHG", "TEXT-PCT", "TEXT-CALENDAR")):
                    facts.append(dkey + " " + prev_fact["TEXT-CHG"] + " " + prev_fact["TEXT-PCT"] + " " + prev_fact["TEXT-CALENDAR"] + " IN " + pqtr + " " + pyr)

                # Quarter-over-quarter: money takes precedence over a plain count.
                for kind in ("MONEY", "CD"):
                    if "TEXT-" + kind not in prev_fact:
                        continue
                    facts.append(dkey + " WAS " + prev_fact["TEXT-" + kind] + " IN " + pqtr + " " + pyr)
                    change = _quarter_change(current.get("NUMBER-" + kind), prev_fact.get("NUMBER-" + kind))
                    if change is not None:
                        facts.append(dkey + " " + change + " QUARTER OVER QUARTER IN " + eqtr + " " + eyr)
                    break

                yoy_report = orgEntities[yoystr] if (orgEntities and yoystr in orgEntities) else None
                yoy_fact = _lookup_fact(yoy_report, dkey)
                if "TEXT-MONEY" in yoy_fact:
                    facts.append(dkey + " WAS " + yoy_fact["TEXT-MONEY"] + " IN " + eqtr + " " + yoyr)

                if "GUIDE" not in dkey:
                    # Guidance for this quarter was stored with the previous quarter's report.
                    guide = _lookup_fact(previous, dkey + "-GUIDE")
                    actual = None
                    target = None
                    if all(k in guide for k in ("TEXT-CHG", "TEXT-PCT", "TEXT-CALENDAR")):
                        facts.append(dkey + " WAS EXPECTED TO " + guide["TEXT-CHG"] + " " + guide["TEXT-PCT"] + " " + guide["TEXT-CALENDAR"] + " IN " + eqtr + " " + eyr)
                        actual = current.get("NUMBER-PCT")
                        target = guide.get("NUMBER-PCT")
                    elif all(k in guide for k in ("TEXT-CHG", "TEXT-CD")):
                        facts.append(dkey + " WAS EXPECTED TO " + guide["TEXT-CHG"] + " " + guide["TEXT-CD"] + " IN " + eqtr + " " + eyr)
                        target = guide.get("NUMBER-CD")
                        if target is not None:
                            # Guided values above 100 are compared with the count, others with the percentage.
                            if "NUMBER-CD" in current and target > 100:
                                actual = current["NUMBER-CD"]
                            else:
                                actual = current.get("NUMBER-PCT")
                    if actual is not None and target is not None:
                        if (actual - target) >= 0:
                            facts.append(dkey + " BEATS OWN GUIDANCE")
                        else:
                            facts.append(dkey + " DID NOT BEAT OWN GUIDANCE")

            last_cond_match_cnt = cond_match_cnt

            # ACTIONS: render each template into an upper-case statement.
            for action in rule["ACTIONS"]:
                asplit = action.split(" ")
                stmt = None
                for pos, value in enumerate(asplit):
                    if "-" in value:
                        word = metrics.get(value, value)
                    elif "*CALENDAR" in value:
                        word = eqtr
                    elif "*YEAR" in value:
                        word = eyr
                    elif "*GCALENDAR" in value:
                        word = gqtr
                    elif "*CHANGED" in value:
                        word = None
                        if pos + 1 < len(asplit):
                            # The direction comes from the VALUE of the next token:
                            # accounting notation "(5) %" denotes a decline.
                            nxt = asplit[pos + 1]
                            declined = str(metrics.get(nxt, nxt)).strip().startswith("(")
                            if stmt and "GUIDANCE" in stmt:
                                word = "WILL DECLINE" if declined else "WILL GROW"
                            else:
                                word = "DECLINED" if declined else "GREW"
                    else:
                        word = value
                    stmt = _append_word(stmt, word)
                if stmt is not None:
                    facts.append(stmt.upper())

    if(len(facts) > 0):
        return facts, dfacts
    return None, None

In [225]:
#Build Facts
def buildFacts(entities, entity, filename):
    """Merge one parsed earnings report into the per-company ``entities`` store.

    Parameters
    ----------
    entities : dict
        Accumulator keyed by company symbol; it is updated in place.
    entity : dict
        Data extracted from one report. "CNAME", "CSYM", "EQTR" and "EYR" are
        required; "EDATE", "GQTR", "CEO", "CFO", "COO", "CS", "ORG" and "MISC"
        are copied when present. Every key not listed in ``excludes`` is
        treated as a metric and passed to ``get_facts``.
    filename : str
        Report file name without extension, expected to look like
        "SYMBOL-YYYY-MM-DD_...". The symbol part selects the organisation data
        file and the date part is the earnings date.

    Returns
    -------
    dict or None
        The updated ``entities``; None when organisation data, the fiscal year
        end, a well-formed filename or a required entity field is missing.

    Uses the module-level ``rules`` table. Progress is reported with ``print``.
    """
    excludes = ["CNAME", "CSYM", "EQTR", "EDATE", "EYR", "GQTR", "CEO", "CFO", "CMO", "COO", "CS", "ORG", "MISC", "GDATE", "GYR", "SYM", "DATE", "CALENDAR"]

    fsplit = filename.split("-")
    csym = fsplit[0]

    orgData = getOrgData(csym)
    if not orgData:
        print("Date for {} not present, bailing out".format(csym))
        return None
    val = getOrgAttr(orgData, "ORGPROFILE|FiscalYearEnd")
    if not val:
        print("Date for {} not present, bailing out".format(csym))
        return None
    if len(fsplit) < 4:
        # Without the YYYY-MM-DD parts the earnings date cannot be derived.
        print("Filename {} has no earnings date, bailing out".format(filename))
        return None
    edate = fsplit[1]+"-"+fsplit[2]+"-"+fsplit[3].split("_")[0]
    print("\nEARNING DATE", edate)
    # FiscalYearEnd is a full month name such as "December".
    dobj = datetime.strptime(val, "%B")
    fynd = (dobj.month)

    # Quarter information is only reported here; it is not stored.
    qtrs = getQtrs(csym, edate, fynd)
    print("\nQUARTER INFORMATION")
    print(qtrs)

    if "CNAME" in entity:
        print("Name:", entity["CNAME"])
        cname = entity["CNAME"]
    else:
        print("Company Name missing, bailing out")
        return None
    if "CSYM" in entity:
        print("Symbol:", entity["CSYM"])
        # From here on the symbol extracted from the report replaces the filename one.
        csym = entity["CSYM"]
    else:
        print("Company Symbol missing, bailing out")
        return None
    if "EQTR" in entity:
        print("EY Quarter:", entity["EQTR"])
    else:
        print("Earning Quarter missing, bailing out")
        return None
    if "EYR" in entity:
        print("EY Year:", entity["EYR"])
        estr = entity["EQTR"] + "-" + entity["EYR"]
    else:
        print("Earning Year missing, bailing out")
        return None

    lqtrChg = False

    if csym not in entities:
        entities[csym] = dict()
    # NOTE(review): this tests the company name as a key, so in practice NAME is
    # rewritten on every call; confirm whether '"NAME" not in' was intended.
    if cname not in entities[csym]:
        entities[csym]["NAME"] = cname
    if "LATEST-QTR" not in entities[csym]:
        entities[csym]["LATEST-QTR"] = estr
        lqtrChg = True
    else:
        if(entities[csym]["LATEST-QTR"] != estr):
            lsplit = entities[csym]["LATEST-QTR"].split("-")
            lyr = int(lsplit[1])
            lqtr = lsplit[0]
            # Advance the latest quarter only when this report is newer
            # ("Q1" < "Q2" < ... compares correctly as text).
            if(int(entity["EYR"]) > lyr):
                entities[csym]["LATEST-QTR"] = estr
                lqtrChg = True
            elif(int(entity["EYR"]) == lyr and entity["EQTR"] > lqtr):
                entities[csym]["LATEST-QTR"] = estr
                lqtrChg = True

    if estr not in entities[csym]:
        entities[csym][estr] = dict()
    report = entities[csym][estr]
    entities[csym]["LAST-QUERY-REPORT"] = estr
    # Facts for this quarter are rebuilt from scratch on every call.
    report["FACTS"] = list()

    if "EDATE" in entity:
        print("EY Quarter End Date:", entity["EDATE"])
        report["EDATE"] = entity["EDATE"]
        if(lqtrChg):
            entities[csym]["LATEST-QTR-DATE"] = entity["EDATE"]

    if "GQTR" in entity:
        print("Guidance Quarter:", entity["GQTR"])
        # NOTE(review): the guidance key reuses the earnings year; confirm this
        # is intended when the guided quarter falls in the following year.
        report["GUIDE"] = entity["GQTR"] + "-" + entity["EYR"]

    # Each officer is printed and stored under its own key.
    for role in ("CEO", "CFO", "COO"):
        if role in entity:
            print(role + ":", entity[role])
            report[role] = entity[role]
    if "CS" in entity:
        print("Management Commentary:", entity["CS"])
        report["COMMENT"] = entity["CS"]
    if "ORG" in entity:
        report["ORG"] = list(set(entity["ORG"]))
    if "MISC" in entity:
        report["MISC"] = list(set(entity["MISC"]))

    print("\n")
    for item in entity:
        if item not in excludes:
            facts, dfacts = get_facts(rules, item, entity, entities[csym])
            if(facts):
                report["FACTS"] = report["FACTS"] + facts
                print(item)
                print("\n".join(facts))
            if(dfacts):
                # Parsed data facts are stored next to FACTS under their data key.
                for key in dfacts:
                    report[key] = dfacts[key]
            print("\n")
    print(entities[csym])
    return(entities)

In [10]:
@Language.component("newsent")
def set_custom_boundaries(doc):
    """Force a sentence start on every marker token of the document.

    A token whose upper-cased text is "--", a blank line ("\\n\\n"),
    "QUARTERLY" or "STORY" is flagged as the first token of a sentence.
    The final token of the document is never examined. Returns the same doc.
    """
    boundary_markers = ("--", "\n\n", "QUARTERLY", "STORY")
    for tok in doc[:-1]:
        if tok.text.upper() not in boundary_markers:
            continue
        doc[tok.i].is_sent_start = True
    return doc

def loadModel():
    """Load the trained pipeline from disk and extend it for sentence splitting.

    The built-in sentencizer is added first, then the "newsent" component
    is appended last under the name "customsent".
    """
    pipeline = spacy.load("../../Summary/NER/RelateEntity/train/model-best")
    extra_pipes = (
        ('sentencizer', {}),
        ('newsent', {"name": "customsent", "last": True}),
    )
    for factory, options in extra_pipes:
        pipeline.add_pipe(factory, **options)
    return pipeline

def loadSpacy():
    """Load and return the stock "en_core_web_trf" spaCy pipeline."""
    return spacy.load("en_core_web_trf")

def getSentences(path, nlpModel):
    """Read a UTF-8 text file and split it into sentences with ``nlpModel``.

    ``nlpModel`` is called with the whole file content and must return an
    object exposing ``sents``. The number of sentences is printed.

    Returns a tuple ``(sentences, doc)``: the stripped sentence strings and
    the object returned by ``nlpModel``.
    """
    with open(path, mode='r', encoding="utf-8") as handle:
        raw_text = handle.read()

    parsed = nlpModel(raw_text)
    stripped = []
    for span in parsed.sents:
        stripped.append(str(span).strip())

    print(len(stripped))
    return stripped, parsed
    

In [11]:
# Load both pipelines once; the report-processing call below passes them in.
nlpModel = loadModel()
nlpWebtf = loadSpacy()

In [217]:
import os
import glob

def getReportForOrg(csym, nlpModel, nlpWebtf):
    """Build the facts store for one company from its stored report files.

    Files named "<csym>-*" under "../../Summary/unsup" are processed in
    order of modification time (oldest first), at most five of them. Each
    file is split into sentences, run through ``getRelations`` and, when
    that yields anything, merged into the store by ``buildFacts``.

    Returns the dict of accumulated entities (empty when nothing matched).
    """
    path = "../../Summary/unsup"
    files = sorted(glob.glob(path+"/"+csym+"-*"), key=os.path.getmtime)
    print(files)

    entities = dict()
    # Only the first five files (by modification time) are considered.
    for file in files[:5]:
        print(file)
        filename = os.path.splitext(os.path.basename(file))[0]
        print(filename)
        sentences, doc = getSentences(file, nlpModel)

        entity = getRelations(nlpWebtf, nlpModel, sentences)
        if not entity:
            continue
        # buildFacts updates `entities` in place; its return value is not needed.
        buildFacts(entities, entity, filename)

    return entities

In [None]:
# Process the stored PINS reports and keep the resulting facts store.
entities = getReportForOrg("PINS", nlpModel, nlpWebtf)

['../../Summary/unsup\\PINS-2021-04-21_13-32-24.231523.txt', '../../Summary/unsup\\PINS-2021-07-29_13-32-23.370618.txt']
../../Summary/unsup\PINS-2021-04-21_13-32-24.231523.txt
PINS-2021-04-21_13-32-24.231523
25
1 Pinterest, Inc. (NYSE: PINS) today announced financial results for the quarter ended March 31, 2021.
Sentences
 ['Pinterest, Inc. (NYSE: PINS) today announced financial results for the quarter ended March 31, 2021.']
NER [('Pinterest', 'ORG', 0, 1), ('NYSE', 'ORG', 4, 5), ('PINS', 'SYM', 6, 7), ('announced', 'CHG', 9, 10), ('March 31, 2021', 'DATE', 16, 20)]
2 Q1 revenue grew 78% year over year to $485 million.
Sentences
 ['Q1 revenue grew 78% year over year to $485 million.']
NER [('Q1', 'CALENDAR', 0, 1), ('revenue', 'METRIC', 1, 2), ('grew', 'CHG', 2, 3), ('78%', 'PCT', 3, 5), ('year over year', 'CALENDAR', 5, 8), ('$485 million', 'MONEY', 9, 12)]
3 Global Monthly Active Users (MAUs) grew 30% year over year to 478 million.
Sentences
 ['Global Monthly Active Users (MAUs) gr

NER [('MAUs', 'METRIC', 1, 2), ('around flat', 'CD', 4, 6), ('year-over-year', 'CALENDAR', 8, 13)]
GLOBAL MONTHLY ACTIVE USERS [[('3', 'LINE', -1, -1), ('REGULAR', 'SECTION', -1, -1), ('grew', 'CHG', 7, 8), ('30%', 'PCT', 8, 10), ('year over year', 'CALENDAR', 10, 13), ('478 million', 'CD', 14, 16)], [('16', 'LINE', -1, -1), ('GUIDE', 'SECTION', -1, -1), ('grow', 'CHG', 3, 4), ('mid-teens', 'CD', 6, 9)]]
17 Finally, we expect sequential operating expense growth to accelerate in Q2 as we continue to ramp investments in our long-term initiatives and growth drivers.
Sentences
 ['Finally, we expect', 'sequential operating expense growth to accelerate in Q2 as', 'we continue to ramp investments in our long-term initiatives and growth drivers.']
MONTHLY ACTIVE USERS [[('16', 'LINE', -1, -1), ('GUIDE', 'SECTION', -1, -1), ('around flat', 'CD', 4, 6), ('year-over-year', 'CALENDAR', 8, 13)]]
NER [('expect', 'FC', 3, 4)]
NER [('operating expense', 'METRIC', 1, 3), ('growth', 'CHG', 3, 4), ('acce

76
1 SAN FRANCISCO, July 29, 2021
Sentences
 ['SAN FRANCISCO, July 29, 2021']
NER [('SAN FRANCISCO', 'LOC', 0, 2), ('July 29, 2021', 'DATE', 3, 7)]
2 --( BUSINESS WIRE )--Pinterest, Inc. (NYSE: PINS) today announced financial results for the quarter ended June 30, 2021.
Sentences
 ['--( BUSINESS WIRE )--Pinterest, Inc. (NYSE: PINS) today announced financial results for the quarter ended June 30, 2021.']
NER [('NYSE', 'ORG', 9, 10), ('PINS', 'SYM', 11, 12), ('announced', 'CHG', 14, 15), ('June 30, 2021', 'DATE', 21, 25)]
3 Q2 revenue grew 125% year over year to $613 million.
Sentences
 ['Q2 revenue grew 125% year over year to $613 million.']
NER [('Q2', 'CALENDAR', 0, 1), ('revenue', 'METRIC', 1, 2), ('grew', 'CHG', 2, 3), ('125%', 'PCT', 3, 5), ('year over year', 'CALENDAR', 5, 8), ('$613 million', 'MONEY', 9, 12)]
4 Global Monthly Active Users (MAUs) grew 9% year over year to 454 million.
Sentences
 ['Global Monthly Active Users (MAUs) grew 9% year over year to 454 million.']
REVENUE 

REVENUE GLOBAL [[('10', 'LINE', -1, -1), ('TABLE', 'SECTION', -1, -1), ('$ 613', 'MONEY', 55, 57), ('$ 272', 'MONEY', 57, 59), ('125 %', 'PCT', 59, 61)]]
REVENUE [[('3', 'LINE', -1, -1), ('REGULAR', 'SECTION', -1, -1), ('Q2', 'CALENDAR', 0, 1), ('grew', 'CHG', 2, 3), ('125%', 'PCT', 3, 5), ('year over year', 'CALENDAR', 5, 8), ('$613 million', 'MONEY', 9, 12)], [('9', 'LINE', -1, -1), ('TABLE', 'SECTION', -1, -1), ('Q2', 'CALENDAR', 0, 1), ('2021', 'YEAR', 1, 2), ('June 30, %', 'DATE', 24, 28), ('2021', 'YEAR', 29, 30), ('2020', 'YEAR', 30, 31), ('$ 613,210', 'MONEY', 32, 34), ('$ 272,485', 'MONEY', 34, 36), ('125%', 'PCT', 36, 38)], [('10', 'LINE', -1, -1), ('TABLE', 'SECTION', -1, -1), ('United States', 'LOC', 63, 65), ('$ 480', 'MONEY', 65, 67), ('$ 232', 'MONEY', 67, 69), ('107 %', 'PCT', 69, 71)]]
REVENUE INTERNATIONAL [[('10', 'LINE', -1, -1), ('TABLE', 'SECTION', -1, -1), ('$ 133', 'MONEY', 74, 76), ('$ 41', 'MONEY', 76, 78), ('227 %', 'PCT', 78, 80)]]
GLOBAL [[('10', 'LINE', -1

Sentences
 ['Webcast and conference call information', 'A live audio webcast of our second quarter 2021 earnings release call will be available at  investor.pinterestinc.com .']
NER []
NER [('second quarter', 'CALENDAR', 6, 8), ('2021', 'YEAR', 8, 9)]
25 The call begins today at 1:30 PM (PT) / 4:30 PM (ET).
Sentences
 ['The call begins today at 1:30 PM (PT) / 4:30 PM (ET).']
NER []
26 We have also posted to our investor relations website a letter to shareholders.
Sentences
 ['We have also posted to our investor relations website a letter to shareholders.']
NER [('shareholders', 'METRIC', 12, 13)]
27 This press release, including the reconciliations of certain non-GAAP measures to their nearest comparable GAAP measures, letter to shareholders and slide presentation are also available.
Sentences
 ['This press release, including the reconciliations of certain non-GAAP measures to their nearest comparable GAAP measures, letter to shareholders and slide presentation are also available.']
NE

NER [('COVID-19 pandemic', 'MISC', 40, 42)]
NER [('global', 'METRIC', 34, 35), ('COVID-19 pandemic', 'MISC', 48, 50), ('global', 'METRIC', 56, 57), ('global', 'METRIC', 62, 63), ('growth', 'CHG', 70, 71), ('COVID-19 pandemic', 'MISC', 76, 78), ('revenue', 'METRIC', 87, 88), ('cash flow', 'METRIC', 89, 91), ('engagement', 'METRIC', 96, 97), ('engagement', 'METRIC', 106, 107), ('operating results', 'METRIC', 157, 159), ('revenue', 'METRIC', 216, 217), ('revenue', 'METRIC', 223, 224), ('growth', 'CHG', 243, 244), ('internationally', 'METRIC', 250, 251), ('revenue', 'METRIC', 269, 270)]
GLOBAL [[('10', 'LINE', -1, -1), ('TABLE', 'SECTION', -1, -1), ('9 %', 'PCT', 85, 87), ('United States', 'LOC', 89, 91)], [('32', 'LINE', -1, -1), ('GUIDE', 'SECTION', -1, -1), ('COVID-19 pandemic', 'MISC', 48, 50), ('global', 'METRIC', 56, 57), ('global', 'METRIC', 62, 63), ('growth', 'CHG', 70, 71), ('COVID-19 pandemic', 'MISC', 76, 78)]]
NER []
NER [('stock', 'METRIC', 21, 22)]
NER [('stock', 'METRIC', 4

In [343]:
# Dump the accumulated facts store for inspection.
print(entities)

{'PINS': {'NAME': 'PINTEREST', 'LATEST-QTR': 'Q2-2021', 'Q1-2021': {'FACTS': ['REVENUE IS $485 MILLION IN Q1 2021', 'REVENUE GREW 78% YEAR OVER YEAR IN Q1 2021', 'GUIDANCE : REVENUE IS EXPECTED TO GROW 105% YEAR OVER YEAR IN Q2 2021', 'GLOBAL MONTHLY ACTIVE USERS IS 478 MILLION IN Q1 2021', 'GLOBAL MONTHLY ACTIVE USERS GREW 30% YEAR OVER YEAR IN Q1 2021', 'GUIDANCE: GLOBAL MONTHLY ACTIVE USERS IS EXPECTED TO GROW MID-TEENS IN Q2 2021', 'GAAP NET LOSS IS $(22) MILLION IN Q1 2021', 'ADJUSTED EBITDA IS $84 MILLION IN Q1 2021', 'GUIDANCE: MONTHLY ACTIVE USERS IS EXPECTED TO BE AROUND FLAT YEAR-OVER-YEAR IN Q2 2021', 'GUIDANCE: OPERATING EXPENSE GROWTH TO ACCELERATE IN Q2 2021', 'GUIDANCE: LONG-TERM GROWTH TO BE EXPECTED IN Q2 2021'], 'EDATE': 'MARCH 31, 2021', 'GUIDE': 'Q2-2021', 'CEO': ['BEN SILBERMANN'], 'CFO': ['TODD MORGENFELD'], 'COMMENT': '"Whether it’s recipe ideas during the pandemic or dream vacation planning for the future, I’m proud that we now help 478 million people every mont