In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import spacy
from spacy.language import Language
from spacy import displacy
import time
import glob
import re
import math
import statistics
import os
import json
import calendar
import holidays
from pathlib import Path
from datetime import date
from datetime import datetime
import pandas as pd
import numpy as np
import collections
import hashlib
from dateutil.parser import parse
import shutil
import ast
from io import StringIO
import requests

In [2]:
import hashlib
def getHash(sent):
    hash_object = hashlib.sha1(sent.upper().encode("UTF-8"))
    hex_dig = hash_object.hexdigest()
    return(hex_dig)

In [6]:
def getRelationFromER(sym):
    entityFile = "../../Summary/entities/"+sym+"-ENTITIES.json"
    print(entityFile)
    #search = "Q\d+-[0-9][0-9][0-9][0-9]|ALL-[0-9][0-9][0-9][0-9]"
    search = "Q\d+-[0-9][0-9][0-9][0-9]"
    print(search)
    cnt = 0
    sentER = dict()
    with open(entityFile, encoding="utf-8") as f:
        entity = json.load(f)
    if(entity):
        for key in entity[sym].keys():
            if(re.search(search, key)):
                if("ALL" not in key):
                    cnt = cnt + 1
                #if(cnt == 1):
                #    continue
                if(cnt > 5):
                    break
                for ent in entity[sym][key]:
                    if("PREV" not in ent and type(entity[sym][key][ent]) == dict and "RELATION" in entity[sym][key][ent] and len(entity[sym][key][ent]["RELATION"]) > 0):
                        for r in entity[sym][key][ent]["RELATION"]:
                            newent = ent.replace("-GUIDEFULL","")
                            newent = newent.replace("-GUIDE","")
                            newent = re.sub("-"+search, "", newent)
                            sent = r.split(":::")[2]
                            sent = sent.strip()
                            hexDig = getHash(sent)
                            relation = "{METRIC:"+newent+","
                            if hexDig not in sentER:
                                sentER[hexDig] = dict()
                            else:
                                if("RELATION" in sentER[hexDig] and relation in sentER[hexDig]["RELATION"][0]):
                                    print("SAME RELATION " + relation + " ALREADY PRESENT FOR " + sent)
                                    continue
                            if("RTEXT-MONEY" in entity[sym][key][ent] and "RTEXT-RANGEM1" in entity[sym][key][ent]):
                                relation = relation + "MONEY:"+entity[sym][key][ent]["RTEXT-MONEY"]+",RANGEM1:"+entity[sym][key][ent]["RTEXT-RANGEM1"]+",RELATION:KVRANGE}"
                            elif("RTEXT-CD" in entity[sym][key][ent] and "RTEXT-RANGEM1" in entity[sym][key][ent]):
                                relation = relation + "CD:"+entity[sym][key][ent]["RTEXT-CD"]+",RANGEM1:"+entity[sym][key][ent]["RTEXT-RANGEM1"]+",RELATION:KVRANGE}"
                            elif("RTEXT-PCT" in entity[sym][key][ent] and "RTEXT-RANGEM1" in entity[sym][key][ent]):
                                relation = relation + "PCT:"+entity[sym][key][ent]["RTEXT-PCT"]+",RANGEM1:"+entity[sym][key][ent]["RTEXT-RANGEM1"]+",RELATION:KVRANGE}"
                            elif("RTEXT-MONEY" in entity[sym][key][ent]):
                                relation = relation + "MONEY:"+entity[sym][key][ent]["RTEXT-MONEY"]+",RELATION:KV}"
                            elif("RTEXT-CD" in entity[sym][key][ent]):
                                relation = relation + "CD:"+entity[sym][key][ent]["RTEXT-CD"]+",RELATION:KV}"
                            elif("RTEXT-PCT" in entity[sym][key][ent]):
                                relation = relation + "PCT:"+entity[sym][key][ent]["RTEXT-PCT"]+",RELATION:KV}"
                            #print(key, sent+"\t"+relation, hexDig)
                            sentER[hexDig]["ORIGSENT"] = sent
                            sentER[hexDig]["KEY"] = key
                            if("RELATION" not in sentER[hexDig]):
                                sentER[hexDig]["RELATION"] = list()
                            sentER[hexDig]["RELATION"].append(relation)
    print(sentER)
    return(sentER)

In [11]:
def getFileSortTimestamp(csym, src="REFINED"):
    if(src == "POSTREFINED"):
        path = "../../Summary/PostRefined"
    else:
        path = "../../Summary/Refined"
    files = glob.glob(path+"/"+csym+"/*")   
    files.sort(reverse=True, key=os.path.getmtime)
    return(files)

In [21]:
def getOrgData(org):
    orgDataPath = "../../Summary/orgData/"+org+".txt"
    file = Path(orgDataPath)
    if file.is_file():
        #print(True)
        with open(orgDataPath) as f:
            data = json.load(f)
        #print(data)
        return data
    return None

In [22]:
def getOrgAttr(orgData, attr):
    if not orgData:
        return None
    asplit = attr.split("|")
    parent = asplit[0]
    if "SOURCE" in orgData[parent]:
        src = orgData[parent]["SOURCE"]
        if src == "YH" or (parent == "ORGPROFILE" and src == "AD"):
            p = orgData
            for i in range(0, len(asplit)):
                if asplit[i] not in p:
                    return None
                p = p[asplit[i]]
            #print(p)
            return(p)
    return None

In [23]:
def getQtrs(org, edate, fynd):
    #print(edate)
    dateObj = datetime.strptime(edate, '%Y-%m-%d')
    #print(dateObj.month)
    emonth = dateObj.month
    eyr = dateObj.year
    k = 1
    qend = fynd
    qstart = 0
    year = eyr
    #print(year)
    qtrs = dict()
    while(k!=5):
        #print(k)
        qstart = (qend + 1)%12
        qend = (qend + 3)%12
        if qstart == 0:
            qstart = 12
        if qend == 0:
            qend = 12
        if((emonth >= qstart and emonth <= qend) or (qstart > qend and (emonth >= qstart or emonth <= qend))):
            #print("CQTR:", "Q"+str(k))
            qtrs["CQTR"] = "Q"+str(k)
            qtrs["GQTR"] = "Q"+str(k) # Guidance qtr is same as current qtr
            qtrs["CYR"] = str(year)
            qtrs["GYR"] = str(year) # Guidance year same as current year
            eqtr = (k-1)%4
            if(eqtr == 0):
                eqtr = 4
            nqtr = (k+1)%4
            if(nqtr == 0):
                nqtr = 4
            pqtr = (k-2)%4
            if(pqtr == 0):
                pqtr = 4
            qtrs["EQTR"] = "Q"+str(eqtr)
            qtrs["PQTR"] = "Q"+str(pqtr)
            #print("EQTR:", "Q"+str(eqtr))
            #print("PQTR:", "Q"+str(pqtr))
            #print("NQTR:", "Q"+str(nqtr))
        #print("Q"+str(k), "START", qstart)
        #print("Q"+str(k), "END", qend)
        qtr = "Q"+str(k)
        if (qtr not in qtrs):
            qtrs[qtr] = dict()
        qtrs[qtr]["START"] = qstart
        qtrs[qtr]["SM"] = calendar.month_abbr[qstart].upper()
        qtrs[qtr]["END"] = qend
        qtrs[qtr]["EM"] = calendar.month_abbr[qend].upper()
        #print("Q"+str(k), "YEAR", year)
        k = k + 1
    
    #print(eyr)   
    if("EQTR" in qtrs and "CQTR" in qtrs and "PQTR" in qtrs):
        eqtrEnd = qtrs[qtrs["EQTR"]]["END"]
        cqtrEnd = qtrs[qtrs["CQTR"]]["END"]
        pqtrEnd = qtrs[qtrs["PQTR"]]["END"]
        #print(eqtrEnd, cqtrEnd, pqtrEnd)
        qtrs["EYR"] = str(eyr)
        qtrs["PYR"] = str(eyr)
        #print(eqtrEnd, cqtrEnd, qtrs[qtrs["CQTR"]]["START"], qtrs[qtrs["CQTR"]]["END"])
        if(eqtrEnd > cqtrEnd):
            if(qtrs[qtrs["CQTR"]]["START"] > qtrs[qtrs["CQTR"]]["END"] and emonth > fynd):
                gyr = year + 1
                qtrs["GYR"] = str(gyr)
                qtrs["EYR"] = str(gyr - 1)
            else:
                qtrs["EYR"] = str(eyr - 1)
                qtrs["PYR"] = str(eyr - 1)
        if(pqtrEnd > eqtrEnd):
            qtrs["PYR"] = str(int(qtrs["EYR"]) - 1)
    if(emonth <= fynd):
        qtrs["CFYR"] = str(year)
    else:
        qtrs["CFYR"] = str(year + 1)
    qtrs["FYR"] = qtrs["CFYR"]
    if(qtrs[qtrs["EQTR"]]["END"] <= fynd and qtrs["EQTR"] > qtrs["CQTR"]):
        qtrs["FYR"] = str(int(qtrs["FYR"]) - 1)
    qtrs["EFYR"] = qtrs["FYR"]
    qtrs["PFYR"] = qtrs["FYR"]
    if(qtrs["PQTR"] > qtrs["EQTR"]):
        qtrs["PFYR"] = str(int(qtrs["FYR"]) - 1)
    qtrs["GFYR"] = qtrs["CFYR"]
    #qtrs["EYR"] = qtrs["FYR"]
    #qtrs["CYR"] = qtrs["CFYR"]
    #qtrs["PYR"] = qtrs["FYR"]
    qtrs["CYR"] = qtrs["GYR"]
    #print(qtrs)
    return (qtrs)

In [43]:
csym = "APPN"
files = getFileSortTimestamp(csym)
print(files)
pfiles = getFileSortTimestamp(csym, "POSTREFINED")
print(pfiles)

#entityFile = "../../Summary/entities/"+csym+"-ENTITIES.json"
#print(entityFile)
#with open(entityFile, encoding="utf-8") as f:
#    entity = json.load(f)

orgData = getOrgData(csym)
if orgData:
    val = getOrgAttr(orgData, "ORGPROFILE|FiscalYearEnd")

cnt = 0
cdate = "2023-06-30"
for file in files:
    cnt = cnt + 1
    if(cnt > 5):
        break
    basefile = os.path.basename(file)
    filename = os.path.splitext(basefile)[0]
    #print(filename)
    edate = filename.split("_")[1]
    rtype = filename.split("_")[0]
    estr = None
    if orgData:
        if(val):
            dobj = datetime.strptime(val, "%B")
            fynd = (dobj.month)
            qtrs = getQtrs(csym, edate, fynd)
            eqtr = qtrs["EQTR"]
            eyr = qtrs["EYR"]
            estr = eqtr+"-"+eyr
    if(estr):
        nedate = time.strptime(edate, "%Y-%m-%d")
        ncdate = time.strptime(cdate, "%Y-%m-%d")
        if(nedate > ncdate):
            nfile = "../../Summary/PostRefined/"+csym+"/"+basefile
            print(edate, rtype, estr, nfile)
        else:
            nfile = file
            print(edate, rtype, estr, nfile)
        continue

['../../Summary/Refined/APPN\\APPN_2023-08-03_EP_YH.txt', '../../Summary/Refined/APPN\\APPN_2023-05-09_EP_YH.txt', '../../Summary/Refined/APPN\\APPN_2023-02-16_EP_YH.txt', '../../Summary/Refined/APPN\\APPN_2022-11-03_EP_YH.txt', '../../Summary/Refined/APPN\\APPN_2022-08-04_EP_YH.txt', '../../Summary/Refined/APPN\\APPN_2022-05-06_EP_YH.txt']
['../../Summary/PostRefined/APPN\\APPN_2023-08-03_EP_YH.txt', '../../Summary/PostRefined/APPN\\APPN_2023-05-09_EP_YH.txt', '../../Summary/PostRefined/APPN\\APPN_2023-02-16_EP_YH.txt', '../../Summary/PostRefined/APPN\\APPN_2022-11-03_EP_YH.txt', '../../Summary/PostRefined/APPN\\APPN_2022-08-04_EP_YH.txt']
2023-08-03 APPN Q2-2023 ../../Summary/PostRefined/APPN/APPN_2023-08-03_EP_YH.txt
2023-05-09 APPN Q1-2023 ../../Summary/Refined/APPN\APPN_2023-05-09_EP_YH.txt
2023-02-16 APPN Q4-2022 ../../Summary/Refined/APPN\APPN_2023-02-16_EP_YH.txt
2022-11-03 APPN Q3-2022 ../../Summary/Refined/APPN\APPN_2022-11-03_EP_YH.txt
2022-08-04 APPN Q2-2022 ../../Summary/R

In [2]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

Downloading:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

In [3]:
inputs = tokenizer("A step by step recipe to make bolognese pasta:", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Toss the pasta with the sauce, then add the meat and toss again.']


In [4]:
inputs = tokenizer("Get the Named entity from this sentence: Total revenue is expected to be between $115.0 million and $117.0 million, representing a year-over-year increase of 24% to 27%.", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Total revenue']


In [5]:
advanced_ner = """Microsoft Corporation is a company that makes computer software and video games. Bill Gates and Paul Allen founded the company in 1975 
[Company]: Microsoft, [Founded]: 1975, [Founders]: Bill Gates, Paul Allen 
 
Amazon.com, Inc., known as Amazon , is an American online business and cloud computing company. It was founded on July 5, 1994 by Jeff Bezos 
[Company]: Amazon, [Founded]: 1994, [Founders]: Jeff Bezos 
 
Apple Inc. is a multinational company that makes personal computers, mobile devices, and software. Apple was started in 1976 by Steve Jobs and Steve Wozniak."""
inputs = tokenizer(advanced_ner, return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['[Company]: Apple, [Founded]: 1976, [Founders]:']


In [6]:
report_ner = """"Revenue is $503MN up 77% year over year
[Metric]: Revenue, [Money]: $503MN, [PCT]: 77%, [CALENDAR]: year over year

Revenue is $603MN up 87% year over year
[Metric]: Revenue, [Money]: $603MN, [PCT]: 87%, [CALENDAR]: year over year

Revenue is $63MN down 12% year over year.
"""
inputs = tokenizer(report_ner, return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['"Revenue is $503MN up 77% year over year [Metric]:']
