In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import spacy
import torch
from spacy.language import Language
from spacy import displacy
import time
import glob
import re
import math
import statistics
import os
import json
import calendar
import holidays
from pathlib import Path
from datetime import date
from datetime import datetime
import pandas as pd
import numpy as np
import collections
import hashlib
from dateutil.parser import parse
import shutil
import ast
from io import StringIO
import requests
import glob

In [2]:
import spacy
from spacy.language import Language
from spacy import displacy
import time

@Language.component("newsent")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        #print(token.text, token.text in ("’s", "'s"))
        if token.text.upper() in (";", "--", "\n\n", "\n", "QUARTERLY", "STORY", "\n\n\n\n", "\n\n\n"):
            #print("Detected:", token.text)
            doc[token.i].is_sent_start = True
    return doc

#spacy.require_gpu()
nlp = spacy.load("../../Summary/NER/RelateEntity/train/model-best-local")
nlp.add_pipe('sentencizer')
nlp.add_pipe('newsent', name="newsent", last=True)



<function __main__.set_custom_boundaries(doc)>

In [3]:
def isScanningRqd(pfile):
    with open(pfile, 'r', encoding = "utf-8") as fp:
        for l_no, line in enumerate(fp):
            if "NOPAD***" in line:
                return False
    return True

In [4]:
def getSentences(inputfile, nlp, text=None):
    if(not text):
        with open(inputfile, 'r', encoding="utf-8") as f:
            text = f.read()

    doc = nlp(text)
    sentences = [str(sent).strip() for sent in doc.sents]

    #print(len(sentences))
    return(sentences)

In [5]:
rplStr = ["PG*** ", "ED*** ", "SCHQ*** ", "SCBQ*** ", "SCBF*** ", "SCHF*** ", "SCG*** ", "GF*** ", "GQ*** ", "SC*** ", "NOPAD*** "]
stag = ["SCHQ***", "SCHF***", "SCBQ***", "SCBF***", "SCG***"]
gtag = ["GF***", "GQ***"]
ptag = "PG***"

def preProcessSent(line):
    for s in rplStr:
        line = line.replace(s, "")
    if("TBLST***" in line or "TBLET***" in line or "CS***" in line or line == "\n" or line == "\n\n" or "https://finance.yahoo.com" in line):
        return(line, False)
    line = line.replace("\n", "")
    if(line == None or line == ""):
        return(line, False)
    return(line, True)

In [6]:
def getOrgData(org):
    orgDataPath = "../../Summary/orgData/"+org+".txt"
    file = Path(orgDataPath)
    if file.is_file():
        #print(True)
        with open(orgDataPath) as f:
            data = json.load(f)
        #print(data)
        return data
    return None

In [7]:
def getOrgAttr(orgData, attr):
    if not orgData:
        return None
    asplit = attr.split("|")
    parent = asplit[0]
    if "SOURCE" in orgData[parent]:
        src = orgData[parent]["SOURCE"]
        if src == "YH" or (parent == "ORGPROFILE" and src == "AD"):
            p = orgData
            for i in range(0, len(asplit)):
                if asplit[i] not in p:
                    return None
                p = p[asplit[i]]
            #print(p)
            return(p)
    return None

In [8]:
def getQtrs(org, edate, fynd):
    #print(edate)
    dateObj = datetime.strptime(edate, '%Y-%m-%d')
    #print(dateObj.month)
    emonth = dateObj.month
    eyr = dateObj.year
    k = 1
    qend = fynd
    qstart = 0
    year = eyr
    #print(year)
    qtrs = dict()
    while(k!=5):
        #print(k)
        qstart = (qend + 1)%12
        qend = (qend + 3)%12
        if qstart == 0:
            qstart = 12
        if qend == 0:
            qend = 12
        if((emonth >= qstart and emonth <= qend) or (qstart > qend and (emonth >= qstart or emonth <= qend))):
            #print("CQTR:", "Q"+str(k))
            qtrs["CQTR"] = "Q"+str(k)
            qtrs["GQTR"] = "Q"+str(k) # Guidance qtr is same as current qtr
            qtrs["CYR"] = str(year)
            qtrs["GYR"] = str(year) # Guidance year same as current year
            eqtr = (k-1)%4
            if(eqtr == 0):
                eqtr = 4
            nqtr = (k+1)%4
            if(nqtr == 0):
                nqtr = 4
            pqtr = (k-2)%4
            if(pqtr == 0):
                pqtr = 4
            qtrs["EQTR"] = "Q"+str(eqtr)
            qtrs["PQTR"] = "Q"+str(pqtr)
            #print("EQTR:", "Q"+str(eqtr))
            #print("PQTR:", "Q"+str(pqtr))
            #print("NQTR:", "Q"+str(nqtr))
        #print("Q"+str(k), "START", qstart)
        #print("Q"+str(k), "END", qend)
        qtr = "Q"+str(k)
        if (qtr not in qtrs):
            qtrs[qtr] = dict()
        qtrs[qtr]["START"] = qstart
        qtrs[qtr]["SM"] = calendar.month_abbr[qstart].upper()
        qtrs[qtr]["END"] = qend
        qtrs[qtr]["EM"] = calendar.month_abbr[qend].upper()
        #print("Q"+str(k), "YEAR", year)
        k = k + 1
    
    #print(eyr)   
    if("EQTR" in qtrs and "CQTR" in qtrs and "PQTR" in qtrs):
        eqtrEnd = qtrs[qtrs["EQTR"]]["END"]
        cqtrEnd = qtrs[qtrs["CQTR"]]["END"]
        pqtrEnd = qtrs[qtrs["PQTR"]]["END"]
        #print(eqtrEnd, cqtrEnd, pqtrEnd)
        qtrs["EYR"] = str(eyr)
        qtrs["PYR"] = str(eyr)
        #print(eqtrEnd, cqtrEnd, qtrs[qtrs["CQTR"]]["START"], qtrs[qtrs["CQTR"]]["END"])
        if(eqtrEnd > cqtrEnd):
            if(qtrs[qtrs["CQTR"]]["START"] > qtrs[qtrs["CQTR"]]["END"] and emonth > fynd):
                gyr = year + 1
                qtrs["GYR"] = str(gyr)
                qtrs["EYR"] = str(gyr - 1)
            else:
                qtrs["EYR"] = str(eyr - 1)
                qtrs["PYR"] = str(eyr - 1)
        if(pqtrEnd > eqtrEnd):
            qtrs["PYR"] = str(int(qtrs["EYR"]) - 1)
    if(emonth <= fynd):
        qtrs["CFYR"] = str(year)
    else:
        qtrs["CFYR"] = str(year + 1)
    qtrs["FYR"] = qtrs["CFYR"]
    if(qtrs[qtrs["EQTR"]]["END"] <= fynd and qtrs["EQTR"] > qtrs["CQTR"]):
        qtrs["FYR"] = str(int(qtrs["FYR"]) - 1)
    qtrs["EFYR"] = qtrs["FYR"]
    qtrs["PFYR"] = qtrs["FYR"]
    if(qtrs["PQTR"] > qtrs["EQTR"]):
        qtrs["PFYR"] = str(int(qtrs["FYR"]) - 1)
    qtrs["GFYR"] = qtrs["CFYR"]
    #qtrs["EYR"] = qtrs["FYR"]
    #qtrs["CYR"] = qtrs["CFYR"]
    #qtrs["PYR"] = qtrs["FYR"]
    qtrs["CYR"] = qtrs["GYR"]
    #print(qtrs)
    return (qtrs)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

def query1_from_list(context):
    ans = "{\"RELATIONS\": [\"KEY:GAAP Net Income Per Share!!TYPE:OUT!!MONEY:$0.38!!LINK:KV\"]}"
    sent = ["GAAP Net Income Per Share is $0.38 in second quarter 2023 @@@"]
    
    ans1 = "{\"RELATIONS\": [\"KEY:Non-GAAP net income!!TYPE:OUT!!MONEY:$110.1 MN!!LINK:KV\"]}"
    sent1 = ["Non-GAAP net income is $110.1 million in third quarter 2022 @@@"]
    
    ans2 = "{\"RELATIONS\": [\"KEY:Non-GAAP Net Income Per Share!!TYPE:OUT!!MONEY:-$0.13!!LINK:KV\"]}"
    sent2 = ["Non-GAAP Net Income Per Share is ($0.13) in first quarter 2023 @@@"]
    
    tfewshot = f"""
    Question: What are the relations present in the following text? 
    
    Context: {" * ".join(sent)}. 
    
    Answer: {ans}.
    
    """
    tfewshot += f"""
    Question: What are the relations present in the following text? 
     
    Context: {" * ".join(sent1)}. 
    
    Answer: {ans1}.
    
    """
    tfewshot += f"""
    Question: What are the relations present in the following text?
    
    Context: {" * ".join(sent2)}. 
    
    Answer: {ans2}
    
    """
    #print(tfewshot)
    #print("\n\n")
    t5query = f"""{tfewshot}
    Question: What are the relations present in the following text? 
    
    Context:  {" * ".join(context)}.
    
    Answer:
    
    """
    print(t5query)
    inputs = tokenizer(t5query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

context = ["GAAP Gross profit for the third quarter of 2022 was $210 million"]
result = query1_from_list(context)
print(f"{result[0]}")


    Question: What are the relations present in the following text? 
    
    Context: GAAP Net Income Per Share is $0.38 in second quarter 2023 @@@. 
    
    Answer: {"RELATIONS": ["KEY:GAAP Net Income Per Share!!TYPE:OUT!!MONEY:$0.38!!LINK:KV"]}.
    
    
    Question: What are the relations present in the following text? 
     
    Context: Non-GAAP net income is $110.1 million in third quarter 2022 @@@. 
    
    Answer: {"RELATIONS": ["KEY:Non-GAAP net income!!TYPE:OUT!!MONEY:$110.1 MN!!LINK:KV"]}.
    
    
    Question: What are the relations present in the following text?
    
    Context: Non-GAAP Net Income Per Share is ($0.13) in first quarter 2023 @@@. 
    
    Answer: {"RELATIONS": ["KEY:Non-GAAP Net Income Per Share!!TYPE:OUT!!MONEY:-$0.13!!LINK:KV"]}
    
    
    Question: What are the relations present in the following text? 
    
    Context:  GAAP Gross profit for the third quarter of 2022 was $210 million.
    
    Answer:
    
    


In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)

def query1_from_list(context):
    ans = "{\"RELATIONS\": [\"KEY:GAAP Net Income Per Share!!TYPE:OUT!!MONEY:$0.38!!LINK:KV\"]}"
    sent = ["GAAP Net Income Per Share is $0.38 in second quarter 2023 @@@"]
    
    ans1 = "{\"RELATIONS\": [\"KEY:Non-GAAP net income!!TYPE:OUT!!MONEY:$110.1 MN!!LINK:KV\"]}"
    sent1 = ["Non-GAAP net income is $110.1 million in third quarter 2022 @@@"]
    
    ans2 = "{\"RELATIONS\": [\"KEY:Non-GAAP Net Income Per Share!!TYPE:OUT!!MONEY:-$0.13!!LINK:KV\"]}"
    sent2 = ["Non-GAAP Net Income Per Share is ($0.13) in first quarter 2023 @@@"]
    
    tfewshot = f"""
    Article: {" * ".join(sent)}
    
    Question: What are the relations present in the text? Display it in json format. {ans}
    
    """
    tfewshot += f"""
    Article: {" * ".join(sent1)}. 
    
    Question: What are the relations present in the text? Display it in json format. {ans1}
    
    """
    tfewshot += f"""
    Article: {" * ".join(sent2)}.
    
    Question: What are the relations present in the text? Display it in json format. {ans2}
    
    """
    #print(tfewshot)
    #print("\n\n")
    t5query = f"""{tfewshot}
    Article: {" * ".join(context)}.
    
    Question: What are the relations present in the text? Display it in json format."""
    print(t5query)
    inputs = tokenizer(t5query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

context = ["GAAP Gross profit for the third quarter of 2022 was $210 million"]
result = query1_from_list(context)
print(f"{result[0]}")

In [None]:
# Format 2, includiing entities and relations
maxFiles = 60
fileCnt = 0
source = "../../Summary/DATA/FLAN/Backup/Format-1"
files = glob.glob(source+"/*_ER.tsv")
#print(files)
for file in files:
    #print(file)
    basefile = os.path.basename(file)
    #print(basefile)
    outfile = "../../Summary/DATA/FLAN/Train/"+basefile
    of = None
    #print(outfile)
    if outfile:
        outfilePath = Path(outfile)
        if outfilePath.is_file():
            print("Output File {} already exists".format(outfile))
            continue
        
        if(maxFiles > 0):
            fileCnt = fileCnt + 1
            if(fileCnt > maxFiles):
                break
            
        print("Creating Output File {}".format(outfile))
        of = open(outfilePath, "w", encoding = "utf-8")
        
        
    with open(file, "r", encoding = "ISO-8859-1") as f:
        line = f.readline()
        #print(line)
        while line:
            if ("RELATIONS" not in line):
                of.write(line)
                #of.write("\n")
            else:
                relations = dict()
                nsplit = line.split("\t")
                relation = nsplit[2]
                relation = relation.replace("\n","")
                sent = nsplit[1]
                sent = sent.replace("\t","")
                #print(sent)
                relation = json.loads(relation)
                #print(relation)
                sentences = getSentences(None, nlp, sent)
                nerl = None
                for l in sentences:
                    text1 = list()
                    text1.append(l)
                    for doc in nlp.pipe(text1, disable=["tagger"]):
                        for ent in doc.ents:
                            if(not nerl):
                                nerl = ent.label_.replace(":","").replace(",","")+"="+ent.text.replace(":","").replace(",","")
                            else:
                                nerl = nerl + "," + ent.label_.replace(":","").replace(",","")+"="+ent.text.replace(":","").replace(",","")
                #relations["ENTITIES"] = None
                relations["ENTITIES"] = list()
                relations["ENTITIES"].append(nerl)
                #rlist = list()
                #for rstr in relation["RELATIONS"]:
                #    rstr = rstr.replace(",MONEY","!!MONEY").replace(",TYPE","!!TYPE").replace(",CD","!!CD").replace(",RELATION","!!RELATION").replace(",PCT","!!PCT").replace(",DATE","!!DATE").replace(",CALENDAR","!!CALENDAR")
                #    rlist.append(rstr)
                relations["RELATIONS"] = relation["RELATIONS"]
                #print(relations)
                nsplit[2] = json.dumps(relations)
                nsplit[1] = sent
                nline = "\t".join(nsplit)
                #nline = nline + "</s>" #Not Required
                of.write(nline)
                of.write("\n")
            line = f.readline()
    if(of):
        of.close()
    #break

In [None]:
# Format 3, combining entities, relations and earning section in single training data
maxFiles = -1
fileCnt = 0
source = "../../Summary/DATA/FLAN/Backup/Format-2"

files = glob.glob(source+"/*_ER.tsv")
#print(files)
for file in files:
    #print(file)
    basefile = os.path.basename(file)
    #print(basefile)
    outfile = "../../Summary/DATA/FLAN/Train/"+basefile
    of = None
    #print(outfile)
    if outfile:
        outfilePath = Path(outfile)
        if outfilePath.is_file():
            print("Output File {} already exists".format(outfile))
            continue
        
        if(maxFiles > 0):
            fileCnt = fileCnt + 1
            if(fileCnt > maxFiles):
                break
            
        print("Creating Output File {}".format(outfile))
        of = open(outfilePath, "w", encoding = "utf-8")
        
    
    tag = None
    section = None
    ssection = None
    header = None
    report = "REGULAR"
    
    with open(file, "r", encoding = "ISO-8859-1") as f:
        line = f.readline()
        #print(line)
        while line:
            if ("RELATIONS" not in line):
                of.write(line)
                #of.write("\n")
            else:
                relations = dict()
                nsplit = line.split("\t")
                relation = nsplit[2]
                relation = relation.replace("\n","")
                sent = nsplit[1]
                sent = sent.replace("\t","")
                tag = sent.split(" ")[0].strip()
                #print(tag)
                #print(sent)
                if(tag in stag):
                    section = sent.replace(tag, "").strip()
                    header = section
                    if("Q" in tag):
                        report = "REGULAR"
                    elif("F" in tag):
                        report = "REGULARFULL"
                    elif("G" in tag):
                        report = "GUIDE"
                    #print(sent)
                elif(tag in gtag):
                    ssection = sent.replace(tag, "").strip()
                    header = ssection
                    if("Q" in tag):
                        report = "GUIDE"
                    elif("F" in tag):
                        report = "GUIDEFULL"
                    #if(header):
                    #    if("***" in tag):
                    #        sent = sent.replace(tag, "")
                    #        sent = tag + " " + header + " " + sent
                    #    else:
                    #        sent = header + " " + sent
                    #if(section):
                    #    header = section + " " + ssection
                    #else:
                    #    header = ssection
                    #print(sent)
                else:
                    if(header):
                        if("***" in tag):
                            sent = sent.replace(tag, "")
                            sent = tag + " " + header + " " + sent
                        else:
                            sent = header + " " + sent
                    #print(sent)
                relation = json.loads(relation)
                #print(relation)
                relations["ENTITIES"] = relation["ENTITIES"]
                rltn = relation["RELATIONS"]
                if(len(rltn) != 0 and rltn[0] != ''):
                    rlist = list()
                    for rstr in rltn:
                        rstr = rstr + "!!SECTION:"+ report
                        rlist.append(rstr)
                    relations["RELATIONS"] = rlist
                else:
                    relations["RELATIONS"] = rltn 
                #print(relations)
                nsplit[2] = json.dumps(relations)
                nsplit[1] = sent
                nline = "\t".join(nsplit)
                #nline = nline + "</s>" #Not Required
                of.write(nline)
                of.write("\n")
            line = f.readline()
    if(of):
        of.close()
    #break

In [9]:
# Format 5, combining entities, relations and earning section in single training data aling with qtr and year detail
maxFiles = -1
fileCnt = 0
source = "../../Summary/DATA/FLAN/Backup/Format-4"

files = glob.glob(source+"/*_EP*.txt")
#print(files)
for file in files:
    #print(file)
    estr = None
    gqtr = None
    gfyr = None
    val = None
    basefile = os.path.basename(file)
    #print(basefile)
    outfile = "../../Summary/DATA/FLAN/Train/"+basefile
    of = None
    #print(outfile)
    if outfile:
        outfilePath = Path(outfile)
        if outfilePath.is_file():
            print("Output File {} already exists".format(outfile))
            continue
        
        if(maxFiles > 0):
            fileCnt = fileCnt + 1
            if(fileCnt > maxFiles):
                break
        
        print("Creating Output File {}".format(outfile))
        orgData = None
        csym = basefile.split("_")[0]
        edate = basefile.split("_")[1]
        estr = None
        orgData = getOrgData(csym)
        if orgData:
            val = getOrgAttr(orgData, "ORGPROFILE|FiscalYearEnd")
            if(val):
                dobj = datetime.strptime(val, "%B")
                fynd = (dobj.month)
                qtrs = getQtrs(csym, edate, fynd)
                eqtr = qtrs["EQTR"]
                efyr = qtrs["EFYR"]
                gqtr = qtrs["GQTR"]
                gfyr = qtrs["GFYR"]
                estr = eqtr+" "+efyr + " Earning Report"
                #print(csym, estr, edate)
        of = open(outfilePath, "w", encoding = "utf-8")
        
    
    tag = None
    section = None
    ssection = None
    header = None
    report = "REGULAR"
    
    with open(file, "r", encoding = "ISO-8859-1") as f:
        line = f.readline()
        #print(line)
        while line:
            if ("RELATIONS" not in line or "ENTITIES" not in line):
                of.write(line)
                #of.write("\n")
            elif ("CS***" in line or "TBLST***" in line or "TBLET***" in line):
                of.write(line)
            else:
                relations = dict()
                nsplit = line.split("\t")
                relation = nsplit[2]
                relation = relation.replace("\n","")
                sent = nsplit[1]
                sent = sent.replace("\t","")
                if(estr):
                    sent = estr + " -%%%- " + sent
                relation = json.loads(relation)
                #print(relation)
                relations["ENTITIES"] = relation["ENTITIES"]
                rltn = relation["RELATIONS"]
                if(len(rltn) != 0 and rltn[0] != ''):
                    rlist = list()
                    for rstr in rltn:
                        if("REGULARFULL" in rstr):
                            rstr = rstr + "!!QTR:ALL" + "!!YEAR:"+str(efyr) 
                        elif("GUIDEFULL" in rstr):
                            rstr = rstr + "!!QTR:ALL" + "!!YEAR:"+str(gfyr)
                        elif("GUIDE" in rstr):
                            rstr = rstr + "!!QTR:" + gqtr + "!!YEAR:"+str(gfyr)
                        else:
                            rstr = rstr + "!!QTR:" + eqtr + "!!YEAR:"+str(efyr)
                        rlist.append(rstr)
                    relations["RELATIONS"] = rlist
                else:
                    relations["RELATIONS"] = rltn 
                #print(relations)
                nsplit[2] = json.dumps(relations)
                nsplit[1] = sent
                nline = "\t".join(nsplit)
                #nline = nline + "</s>" #Not Required
                of.write(nline)
                of.write("\n")
            line = f.readline()
    if(of):
        of.close()
    #break

Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2022-08-04_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2022-11-03_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-02-16_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-05-09_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-08-03_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-11-02_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2022-08-18_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2022-11-03_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-02-02_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-05-04_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-08-17_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-11-02_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/CFLT_2022-08-03_EP_YH.txt

In [10]:
# Format 6, NO PREDICTION for mgmt comment and table header
maxFiles = -1
fileCnt = 0
source = "../../Summary/DATA/FLAN/Backup/Format-5"

files = glob.glob(source+"/*_EP*.txt")
#print(files)
for file in files:
    #print(file)
    estr = None
    gqtr = None
    gfyr = None
    val = None
    basefile = os.path.basename(file)
    #print(basefile)
    outfile = "../../Summary/DATA/FLAN/Train/"+basefile
    of = None
    #print(outfile)
    if outfile:
        outfilePath = Path(outfile)
        if outfilePath.is_file():
            print("Output File {} already exists".format(outfile))
            continue
        
        if(maxFiles > 0):
            fileCnt = fileCnt + 1
            if(fileCnt > maxFiles):
                break
        
        print("Creating Output File {}".format(outfile))
        of = open(outfilePath, "w", encoding = "utf-8")
        
    
    otag = ["CS***", "TBLST***", "TBLET***"]
    
    with open(file, "r", encoding = "ISO-8859-1") as f:
        line = f.readline()
        #print(line)
        while line:
            if ("RELATIONS" not in line or "ENTITIES" not in line):
                of.write(line)
                #of.write("\n")
            elif ("CS***" in line or "TBLST***" in line or "TBLET***" in line):
                nsplit = line.split("\t")
                nsplit[2] = "NO PREDICTION"
                nline = "\t".join(nsplit)
                of.write(nline)
                of.write("\n")
            else:
                of.write(line)
                #of.write("\n")
            line = f.readline()
    if(of):
        of.close()
    #break

Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2022-08-04_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2022-11-03_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-02-16_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-05-09_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-08-03_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/APPN_2023-11-02_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2022-08-18_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2022-11-03_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-02-02_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-05-04_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-08-17_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/BILL_2023-11-02_EP_YH.txt
Creating Output File ../../Summary/DATA/FLAN/Train/CFLT_2022-08-03_EP_YH.txt

In [8]:
devDataFile = "../../Summary/DATA/FLAN/Dev/dev.tsv"
trainDataFile = "../../Summary/DATA/FLAN/Train/train.tsv"
testDataFile = "../../Summary/DATA/FLAN/Test/test.tsv"

trainDir = "../../Summary/DATA/FLAN/Train"
devDir = "../../Summary/DATA/FLAN/Dev"
testDir = "../../Summary/DATA/FLAN/Test"

def writeTrainingData(writeFile, writeDir):
    files = glob.glob(writeDir+"/*_EP*.txt")
    print(files)
    frames = list()

    if(len(files) > 0):
        for file in files:
            print(file)
            #df = pd.read_csv(file, sep="\t", encoding = "utf-8").astype(str)
            df = pd.read_csv(file, sep="\t", encoding = "ISO-8859-1").astype(str)
            df = df.dropna()
            df = df[df['Sentence1'].notna()]
            #print(df)
            frames.append(df)
    result = pd.concat(frames)
    print(result)
    result.to_csv(writeFile, sep='\t', index=False, header=True)

In [9]:
writeTrainingData(trainDataFile, trainDir)
writeTrainingData(devDataFile, devDir)
writeTrainingData(testDataFile, testDir)

['../../Summary/DATA/FLAN/Train\\APPN_2022-11-03_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\APPN_2023-02-16_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\APPN_2023-05-09_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\APPN_2023-08-03_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\APPN_2023-11-02_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\BILL_2022-08-18_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\BILL_2023-02-02_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\BILL_2023-05-04_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\BILL_2023-08-17_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\BILL_2023-11-02_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CFLT_2022-08-03_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CFLT_2022-11-02_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CFLT_2023-05-03_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CFLT_2023-08-02_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CFLT_2023-11-01_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CRWD_2022-08-30_EP_YH.txt', '../../Summary/DATA/FLAN/Train\\CRWD_20

../../Summary/DATA/FLAN/Train\HUBS_2023-08-02_EP_YH.txt
../../Summary/DATA/FLAN/Train\HUBS_2023-11-08_EP_YH.txt
../../Summary/DATA/FLAN/Train\MDB_2022-08-31_EP_YH.txt
../../Summary/DATA/FLAN/Train\MDB_2022-12-06_EP_YH.txt
../../Summary/DATA/FLAN/Train\MDB_2023-03-08_EP_YH.txt
../../Summary/DATA/FLAN/Train\MDB_2023-08-31_EP_YH.txt
../../Summary/DATA/FLAN/Train\MDB_2023-12-05_EP_YH.txt
../../Summary/DATA/FLAN/Train\NET_2022-08-04_EP_YH.txt
../../Summary/DATA/FLAN/Train\NET_2022-11-03_EP_YH.txt
../../Summary/DATA/FLAN/Train\NET_2023-02-09_EP_YH.txt
../../Summary/DATA/FLAN/Train\NET_2023-04-27_EP_YH.txt
../../Summary/DATA/FLAN/Train\NET_2023-11-02_EP_YH.txt
../../Summary/DATA/FLAN/Train\OKTA_2022-08-31_EP_YH.txt
../../Summary/DATA/FLAN/Train\OKTA_2022-11-30_EP_YH.txt
../../Summary/DATA/FLAN/Train\OKTA_2023-03-02_EP_YH.txt
../../Summary/DATA/FLAN/Train\OKTA_2023-05-31_EP_YH.txt
../../Summary/DATA/FLAN/Train\OKTA_2023-08-30_EP_YH.txt
../../Summary/DATA/FLAN/Train\PANW_2022-11-17_EP_YH.txt
..

../../Summary/DATA/FLAN/Dev\MDB_2023-06-01_EP_YH.txt
../../Summary/DATA/FLAN/Dev\NET_2023-08-03_EP_YH.txt
../../Summary/DATA/FLAN/Dev\OKTA_2023-11-29_EP_YH.txt
../../Summary/DATA/FLAN/Dev\PANW_2022-08-22_EP_YH.txt
../../Summary/DATA/FLAN/Dev\PATH_2022-12-01_EP_YH.txt
../../Summary/DATA/FLAN/Dev\PAYC_2023-02-07_EP_YH.txt
../../Summary/DATA/FLAN/Dev\PINS_2023-04-27_EP_YH.txt
../../Summary/DATA/FLAN/Dev\PLTR_2023-08-07_EP_YH.txt
../../Summary/DATA/FLAN/Dev\RNG_2023-11-06_EP_YH.txt
../../Summary/DATA/FLAN/Dev\SNOW_2022-11-30_EP_YH.txt
../../Summary/DATA/FLAN/Dev\S_2022-08-31_EP_YH.txt
../../Summary/DATA/FLAN/Dev\TTD_2023-02-15_EP_YH.txt
../../Summary/DATA/FLAN/Dev\TWLO_2023-05-09_EP_YH.txt
../../Summary/DATA/FLAN/Dev\UPST_2023-08-08_EP_YH.txt
../../Summary/DATA/FLAN/Dev\ZM_2022-08-22_EP_YH.txt
../../Summary/DATA/FLAN/Dev\ZS_2022-12-01_EP_YH.txt
                 filename                                          Sentence1  \
0   APPN_2022-08-04_EP_YH  Q2 2022 Earning Report -%%%- GAAP Net Lo

In [10]:
train_df = pd.read_csv(trainDataFile, sep="\t", encoding = "utf-8").astype(str)
eval_df = pd.read_csv(devDataFile, sep="\t", encoding = "utf-8").astype(str)
test_df = pd.read_csv(testDataFile, sep="\t", encoding = "utf-8").astype(str)

train_df = train_df.rename(
    columns={"Sentence1": "input_text", "Sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"Sentence1": "input_text", "Sentence2": "target_text"}
)
test_df = test_df.rename(
    columns={"Sentence1": "input_text", "Sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]
test_df = test_df[["input_text", "target_text"]]

#train_df["prefix"] = "paraphrase"
#train_df = train_df[["prefix", "input_text", "target_text"]]
#train_df = train_df[["input_text", "target_text"]]

#eval_df["prefix"] = "paraphrase"
#eval_df = eval_df[["prefix", "input_text", "target_text"]]
#eval_df = eval_df[["input_text", "target_text"]]

train_df = train_df.dropna()
train_df = train_df[train_df['input_text'].notna()]

eval_df = eval_df.dropna()
eval_df = eval_df[eval_df['input_text'].notna()]

test_df = test_df.dropna()
test_df = test_df[test_df['input_text'].notna()]

#train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
#train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)
print("TRAIN DATA ..............")
print(train_df)

#eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
#eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)
print("EVAL DATA ..............")
print(eval_df)

print("TEST DATA ..............")
print(test_df)

TRAIN DATA ..............
                                             input_text  \
0     Q3 2022 Earning Report -%%%- GAAP Net Loss Per...   
1     Q3 2022 Earning Report -%%%- Cash And Cash Equ...   
2     Q3 2022 Earning Report -%%%- GAAP Gross Profit...   
3     Q3 2022 Earning Report -%%%- GAAP Gross Margin...   
4     Q3 2022 Earning Report -%%%- PG*** Appian Corp...   
...                                                 ...   
7835  Q1 2024 Earning Report -%%%- PG*** For the ful...   
7836  Q1 2024 Earning Report -%%%- PG*** For the ful...   
7837  Q1 2024 Earning Report -%%%- PG*** For the ful...   
7838  Q1 2024 Earning Report -%%%- PG*** For the ful...   
7839  Q1 2024 Earning Report -%%%- PG*** For the ful...   

                                            target_text  
0     {"ENTITIES": ["METRIC=GAAP Net Loss Per Share,...  
1     {"ENTITIES": ["METRIC=Cash,METRIC=Cash Equival...  
2     {"ENTITIES": ["METRIC=GAAP,METRIC=Gross Profit...  
3     {"ENTITIES": ["METRIC=GAAP,

In [9]:
#modelPath = "./feroutputs/checkpoint-5203/pytorch_model.bin"
modelPath = "./feroutputs/pytorch_model.bin"
modelDir = "./feroutputs/"
model_def = 'google/flan-t5-small'
if os.path.isfile(modelPath):
    model_name = "./feroutputs"
else:
    di = sorted(os.listdir(modelDir), reverse=True)
    if(len(di) > 0):
        model_name = modelDir+(di[0])
    else:
        model_name = model_def
print(model_name)

./feroutputs


In [10]:

#original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model_name = "facebook/bart-base"

#original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
#original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
#original_model.to('cuda')

In [13]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


In [None]:
inputText = (train_df["input_text"][0])
outputText = (train_df["target_text"][0])

ans = "{\"RELATIONS\": [\"KEY:GAAP Net Income Per Share!!TYPE:OUT!!MONEY:$0.38!!LINK:KV\"]}"
sent = ["GAAP Net Income Per Share is $0.38 in second quarter 2023 @@@"]

ans1 = "{\"RELATIONS\": [\"KEY:Non-GAAP net income!!TYPE:OUT!!MONEY:$110.1 MN!!LINK:KV\"]}"
sent1 = ["Non-GAAP net income is $110.1 million in third quarter 2022 @@@"]

ans2 = "{\"RELATIONS\": [\"KEY:Non-GAAP Net Income Per Share!!TYPE:OUT!!MONEY:-$0.13!!LINK:KV\"]}"
sent2 = ["Non-GAAP Net Income Per Share is ($0.13) in first quarter 2023 @@@"]

tfewshot = f"""
Article: {" * ".join(sent)}

Question: What are the relations present in the text? Display it in json format. {ans}

"""
tfewshot += f"""
Article: {" * ".join(sent1)}. 

Question: What are the relations present in the text? Display it in json format. {ans1}

"""
tfewshot += f"""
Article: {" * ".join(sent2)}.

Question: What are the relations present in the text? Display it in json format. {ans2}

"""
#print(tfewshot)
#print("\n\n")
t5query = f"""{tfewshot}
Article: {inputText}.

Question: What are the relations present in the text? Display it in json format."""
#print(t5query)
inputs = tokenizer(t5query, return_tensors="pt")
outputs = original_model.generate(**inputs, max_new_tokens=100)
output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{t5query}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{outputText}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

In [14]:
import datasets
from datasets import Dataset, DatasetDict

train = Dataset.from_pandas(train_df)
valid = Dataset.from_pandas(eval_df)
test = Dataset.from_pandas(test_df)

ds = DatasetDict()

ds['train'] = train
ds['validation'] = valid
ds['test'] = test

print(ds)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', '__index_level_0__'],
        num_rows: 7840
    })
    validation: Dataset({
        features: ['input_text', 'target_text', '__index_level_0__'],
        num_rows: 1627
    })
    test: Dataset({
        features: ['input_text', 'target_text', '__index_level_0__'],
        num_rows: 83
    })
})


In [20]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([ds["train"], ds["validation"]]).map(lambda x: tokenizer(x["input_text"], truncation=True), batched=True, remove_columns=["input_text", "target_text", "__index_level_0__"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([ds["train"], ds["validation"]]).map(lambda x: tokenizer(x["target_text"], truncation=True), batched=True, remove_columns=["input_text", "target_text", "__index_level_0__"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/9467 [00:00<?, ? examples/s]

Max source length: 199


Map:   0%|          | 0/9467 [00:00<?, ? examples/s]

Max target length: 262


In [None]:
def tokenize_function(example):
    #print(example)
    start_prompt = 'Article: '
    end_prompt = '\n\nQuestion: What are the relations present in the text? Display it in json format.'
    prompt = [start_prompt + sentence + end_prompt for sentence in example["input_text"]]
    #print(prompt)
    example['input_ids'] = tokenizer(prompt, max_length=max_source_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["target_text"], max_length=max_target_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
    #print(example)
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['input_text', 'target_text', '__index_level_0__',])

In [21]:
def newtokenize_function(example, padding="max_length"):
    #print(example)
    start_prompt = 'Article: '
    #end_prompt = '\n\nQuestion: What are the relations present in the text? Display it in json format.'
    end_prompt = '\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.'
    prompt = [start_prompt + sentence + end_prompt for sentence in example["input_text"]]
    #print(prompt)
    model_inputs = tokenizer(prompt, max_length=max_source_length, padding="max_length", truncation=True)
    labels = tokenizer(text=example["target_text"], max_length=max_target_length, padding="max_length", truncation=True)
    #print(example)
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = ds.map(newtokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['input_text', 'target_text', '__index_level_0__',])

Map:   0%|          | 0/7840 [00:00<?, ? examples/s]

Map:   0%|          | 0/1627 [00:00<?, ? examples/s]

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [17]:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [22]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (7840, 3)
Validation: (1627, 3)
Test: (83, 3)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7840
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1627
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 83
    })
})


In [23]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=original_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
#data_collator = DataCollatorForSeq2Seq(tokenizer, model=original_model)

In [25]:
output_dir = "./feroutputs"
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    predict_with_generate=True,
    overwrite_output_dir=True,
    fp16=False,
    # logging & evaluation strategies
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=original_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [26]:
trainer.train()

***** Running training *****
  Num examples = 7840
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 39200


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0112,0.036162,45.9628,39.9047,45.8976,45.9359,17.378611
2,0.0089,0.041971,45.6892,39.7984,45.6458,45.6603,17.409342
3,0.0107,0.039498,46.1239,40.2762,46.0806,46.1221,17.393977
4,0.0096,0.040363,45.8874,40.2159,45.8297,45.8534,17.415489
5,0.0186,0.035506,46.0454,40.1894,46.0023,46.0259,17.393977


***** Running Evaluation *****
  Num examples = 1627
  Batch size = 1
Saving model checkpoint to ./feroutputs\checkpoint-7840
Configuration saved in ./feroutputs\checkpoint-7840\config.json
Model weights saved in ./feroutputs\checkpoint-7840\pytorch_model.bin
tokenizer config file saved in ./feroutputs\checkpoint-7840\tokenizer_config.json
Special tokens file saved in ./feroutputs\checkpoint-7840\special_tokens_map.json
Copy vocab file to ./feroutputs\checkpoint-7840\spiece.model
Deleting older checkpoint [feroutputs\checkpoint-31360] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1627
  Batch size = 1
Saving model checkpoint to ./feroutputs\checkpoint-15680
Configuration saved in ./feroutputs\checkpoint-15680\config.json
Model weights saved in ./feroutputs\checkpoint-15680\pytorch_model.bin
tokenizer config file saved in ./feroutputs\checkpoint-15680\tokenizer_config.json
Special tokens file saved in ./feroutputs\checkpoint-15680\special_tokens_map.json
C

TrainOutput(global_step=39200, training_loss=0.010700264056118167, metrics={'train_runtime': 61519.0521, 'train_samples_per_second': 0.637, 'train_steps_per_second': 0.637, 'total_flos': 2846448353280000.0, 'train_loss': 0.010700264056118167, 'epoch': 5.0})

In [22]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1471
  Batch size = 1


{'eval_loss': 0.03282016143202782,
 'eval_rouge1': 45.8284,
 'eval_rouge2': 39.9873,
 'eval_rougeL': 45.8336,
 'eval_rougeLsum': 45.8408,
 'eval_gen_len': 17.48130523453433,
 'eval_runtime': 1200.1182,
 'eval_samples_per_second': 1.226,
 'eval_steps_per_second': 1.226,
 'epoch': 20.0}

In [27]:
trainer.save_model('./feroutputs')

Saving model checkpoint to ./feroutputs
Configuration saved in ./feroutputs\config.json
Model weights saved in ./feroutputs\pytorch_model.bin
tokenizer config file saved in ./feroutputs\tokenizer_config.json
Special tokens file saved in ./feroutputs\special_tokens_map.json


In [11]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./feroutputs")#, torch_dtype=torch.bfloat16)
#tokenizer = AutoTokenizer.from_pretrained("./feroutputs/checkpoint-2028")

In [25]:
index = 3
dialogue = ds['test'][index]['input_text']
human_baseline_summary = ds['test'][index]['target_text']
#dialogue = "Net Loss: Net loss was $118.9 million , or $1.74 per share with 65 million shares outstanding in Q2 2023"
print(dialogue)
index = 3
dialogue2 = ds['test'][index]['input_text']
#print(dialogue2)
#prompt = dialogue

ans = "{\"RELATIONS\": [\"KEY:GAAP Net Income Per Share!!TYPE:OUT!!MONEY:$0.38!!LINK:KV\"]}"
sent = ["GAAP Net Income Per Share is $0.38 in second quarter 2023 @@@"]

ans1 = "{\"RELATIONS\": [\"KEY:Non-GAAP net income!!TYPE:OUT!!MONEY:$110.1 MN!!LINK:KV\"]}"
sent1 = ["Non-GAAP net income is $110.1 million in third quarter 2022 @@@"]

ans2 = "{\"RELATIONS\": [\"KEY:Non-GAAP Net Income Per Share!!TYPE:OUT!!MONEY:-$0.13!!LINK:KV\"]}"
sent2 = ["Non-GAAP Net Income Per Share is ($0.13) in first quarter 2023 @@@"]

tfewshot = f"""
Article: {" * ".join(sent)}

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format. {ans}

"""
tfewshot += f"""
Article: {" * ".join(sent1)}. 

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format. {ans1}

"""
tfewshot += f"""
Article: {" * ".join(sent2)}.

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format. {ans2}

"""

prompt = f"""
Article: {dialogue}.

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format."""

prompt2 = f"""
Article: {dialogue2}.

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format."""


#inputs = tokenizer([prompt], padding="max_length", truncation=True, return_tensors="pt")
inputs = tokenizer(prompt, return_tensors="pt")
outputs = instruct_model.generate(**inputs, max_new_tokens=300, top_p=.9)
#outputs = instruct_model.generate(**inputs)
instruct_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

#print(prompt)
dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(prompt)
#print(prompt2)
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_output}')

Q3 2022 Earning Report -%%%- GAAP Gross Margin is 52.55% in third quarter 2022 @@@
---------------------------------------------------------------------------------------------------

Article: Q3 2022 Earning Report -%%%- GAAP Gross Margin is 52.55% in third quarter 2022 @@@.

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
{"ENTITIES": ["METRIC=GAAP,METRIC=Gross Margin,PCT=52.55%,CALENDAR=third quarter,YEAR=2022"], "RELATIONS": ["KEY:GAAP GROSS MARGIN!!TYPE:OUT!!PCT:52.55%!!LINK:KV!!SECTION:REGULAR!!QTR:Q3!!YEAR:2022"]}
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
['"ENTITIES": ["METRIC=GAAP,METRIC=Gross Margin,PCT=52.55%,CALENDAR=thi

In [12]:
def getPrediction(prompts, erModel):
    inputs = tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    outputs = erModel.generate(**inputs, max_new_tokens=300, top_p=.9)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return(outputs)

In [13]:
line = ""
pmpt = f"""
Article: [ARTICLE].

Question: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format."""

def writetofile(inputfile, to_predict, predictions, of, forTrain=False):
    if forTrain:
        return
    for to_pred, preds in zip(to_predict, predictions):
        #print(preds)
        pred = preds
        pred = pred.replace("\n","")
        pred = "{"+pred+"}"
        print(pred)
        topred = to_pred
        #print(pred, str(type(ast.literal_eval(pred))))
        outstr = ""
        try:
            if(str(type(ast.literal_eval(pred))) == "<class 'dict'>"):
                outstr = (topred + "\t" + pred)
            else:
                outstr = (topred + "\t" + "NO PREDICTION")
        except:
            outstr = (topred + "\t" + "NO PREDICTION")

        #print(outstr)
        #print("\n")
        outstr = inputfile + "\t" + outstr
        of.write(outstr)
        of.write("\n\n")

In [14]:
maxCount = 30

def createDatawithModel(origFile, csym, nlp, erModel, forTrain=True):
    basefile = os.path.basename(origFile)
    print(basefile)
    inputfile = os.path.splitext(basefile)[0]
    if(not forTrain):
        #print("NOT SUPPORTED WITHOUT TRAINING FLAG AS OF NOW")
        #return
        if not isScanningRqd(origFile):
            print("Format not supported for file {}".format(origFile))
            return False
        
        outdir = "../../Summary/PostRefinedV3"
        outfileDir = outdir+"/"+csym
        if not os.path.exists(outfileDir):
            os.makedirs(outfileDir)
        outfilePath = outfileDir+"/"+inputfile+".txt"
        print(outfilePath)
        outfile = Path(outfilePath)
        if outfile.is_file():
            print(outfilePath + " Already exists")
            return False
        
        print("Creating post refined V3 data file " + str(outfile))
        edate = basefile.split("_")[1]
        estr = None
        orgData = getOrgData(csym)
        if orgData:
            val = getOrgAttr(orgData, "ORGPROFILE|FiscalYearEnd")
            if(val):
                dobj = datetime.strptime(val, "%B")
                fynd = (dobj.month)
                qtrs = getQtrs(csym, edate, fynd)
                eqtr = qtrs["EQTR"]
                efyr = qtrs["EFYR"]
                gqtr = qtrs["GQTR"]
                gfyr = qtrs["GFYR"]
                estr = eqtr+" "+efyr + " Earning Report"
        of = None
        of = open(outfile, "w", encoding = "utf-8")
        of.write("filename\tSentence1\tSentence2\n")
        
        header = None
        
        with open(origFile, "r", encoding = "utf-8") as f:
            line = f.readline()
            line = line.strip()
            #line = line.replace("\n", "")
            #print(line)
            to_predict = list()
            original = list()
            while line:
                #print(line)
                line = line.replace("\n","")
                if("ED***" in line):
                    break
                elif(line == "PG*** " or "sec.gov" in line):
                    line = f.readline()
                    continue
                elif("CS***" in line or "TBLST***" in line or "TBLET***" in line):
                    if(len(to_predict) > 0):
                        print(to_predict)
                        print("\n\n")
                        predictions = getPrediction(to_predict, erModel)
                        print(predictions)
                        print("\n\n")
                        writetofile(inputfile, original, predictions, of, forTrain=False)
                    to_predict = list()
                    original = list()
                    line = line.replace("\n","")
                    #print(line + " #### " + "NOT PROCCESSED")
                    #print("\n")
                    sentences = getSentences(None, nlp, line)
                    for l in sentences:
                        if("CS***" in line):
                            if("CS***" in l):
                                of.write(inputfile + "\t" + l + "\t" + "NO PREDICTION")
                            else:
                                of.write(inputfile + "\t" + "CS*** "+l + "\t" + "NO PREDICTION")
                        else:
                            of.write(inputfile + "\t" + l + "\t" + "NO PREDICTION")
                        of.write("\n\n")
                    line = f.readline()
                    continue
                #elif("PG***" in line or "NOPAD***" in line or ("***" not in line)):
                else:
                    tag = line.split(" ")[0].strip()
                    #print(tag)
                    #print(sent)
                    if(tag in stag):
                        original = original + [line]
                        section = line.replace(tag, "").strip()
                        header = section
                        #print(header)
                        nl = pmpt.replace("[ARTICLE]", line)
                        nl = estr + " -%%%- " + nl
                        to_predict = to_predict + [nl]
                    elif(tag in gtag):
                        original = original + [line]
                        ssection = line.replace(tag, "").strip()
                        header = ssection
                        #print(header)
                        nl = pmpt.replace("[ARTICLE]", line)
                        nl = estr + " -%%%- " + nl
                        to_predict = to_predict + [nl]
                    elif line:
                        sentences = getSentences(None, nlp, line)
                        #to_predict = list()
                        for l in sentences:
                            #print(l)
                            pline, isProcess = preProcessSent(l)
                            if not isProcess:
                                continue
                            if(header):
                                if("***" in tag):
                                    nl = l.replace(tag, "")
                                    nl = tag + " " + header + " -%%%- " + nl
                                else:
                                    nl = header + " -%%%- " + l
                            else:
                                nl = l
                            nl = estr + " -%%%- " + nl
                            original = original + [nl]
                            nl = pmpt.replace("[ARTICLE]", nl)
                            to_predict = to_predict + [nl]
                            if(len(to_predict) >= maxCount):
                                print(to_predict)
                                print("\n\n")
                                predictions = getPrediction(to_predict, erModel)
                                print(predictions)
                                print("\n\n")
                                writetofile(inputfile, original, predictions, of, forTrain=False)
                                to_predict = list()
                                original = list()
                line = f.readline()
            
            if(len(to_predict) > 0):
                print(to_predict)
                print("\n\n")
                predictions = getPrediction(to_predict, erModel)
                print(predictions)
                print("\n\n")
                writetofile(inputfile, original, predictions, of, forTrain=False)
                to_predict = list()
                original = list()
        if(of):
            of.close()
            
    return True

In [17]:
entPath = "../../Summary/entities/"
rPath = "../../Summary/Refined/"
files = glob.glob(entPath+"/*-ENTITIES.json")   
if(len(files) > 0):
    for file in (files):
        basefile = os.path.basename(file)
        filename = os.path.splitext(basefile)[0]
        csym = filename.split("-")[0]
        cPath = rPath + "/" + csym
        #print(cPath)
        cfiles = glob.glob(cPath+"/*.txt")
        #print(cfiles)
        if(len(cfiles) > 0):
            for cf in cfiles:
                if isScanningRqd(cf):
                    print(cf)
                    created = createDatawithModel(cf, csym, nlp, instruct_model, forTrain=False)
                    if not created:
                        continue
                    break
        if created:
            break

../../Summary/Refined//APPN\APPN_2022-08-04_EP_YH.txt
APPN_2022-08-04_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2022-08-04_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2022-08-04_EP_YH.txt Already exists
../../Summary/Refined//APPN\APPN_2022-11-03_EP_YH.txt
APPN_2022-11-03_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2022-11-03_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2022-11-03_EP_YH.txt Already exists
../../Summary/Refined//APPN\APPN_2023-02-16_EP_YH.txt
APPN_2023-02-16_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2023-02-16_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2023-02-16_EP_YH.txt Already exists
../../Summary/Refined//APPN\APPN_2023-05-09_EP_YH.txt
APPN_2023-05-09_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2023-05-09_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2023-05-09_EP_YH.txt Already exists
../../Summary/Refined//APPN\APPN_2023-08-03_EP_YH.txt
APPN_2023-08-03_EP_YH.txt
../../Summary/PostRefinedV3/APPN/APPN_2023-08-03_EP_YH.txt
../../Sum

../../Summary/PostRefinedV3/FIVN/FIVN_2022-07-28_EP_YH.txt Already exists
../../Summary/Refined//FIVN\FIVN_2022-11-07_EP_YH.txt
FIVN_2022-11-07_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2022-11-07_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2022-11-07_EP_YH.txt Already exists
../../Summary/Refined//FIVN\FIVN_2023-02-22_EP_YH.txt
FIVN_2023-02-22_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2023-02-22_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2023-02-22_EP_YH.txt Already exists
../../Summary/Refined//FIVN\FIVN_2023-05-04_EP_YH.txt
FIVN_2023-05-04_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2023-05-04_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2023-05-04_EP_YH.txt Already exists
../../Summary/Refined//FIVN\FIVN_2023-08-07_EP_YH.txt
FIVN_2023-08-07_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2023-08-07_EP_YH.txt
../../Summary/PostRefinedV3/FIVN/FIVN_2023-08-07_EP_YH.txt Already exists
../../Summary/Refined//FIVN\FIVN_2023-11-02_EP_YH.txt
FIVN_2023-11-02_EP_Y

../../Summary/Refined//PAYC\PAYC_2022-08-02_EP_YH.txt
PAYC_2022-08-02_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2022-08-02_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2022-08-02_EP_YH.txt Already exists
../../Summary/Refined//PAYC\PAYC_2022-11-01_EP_YH.txt
PAYC_2022-11-01_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2022-11-01_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2022-11-01_EP_YH.txt Already exists
../../Summary/Refined//PAYC\PAYC_2023-02-07_EP_YH.txt
PAYC_2023-02-07_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2023-02-07_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2023-02-07_EP_YH.txt Already exists
../../Summary/Refined//PAYC\PAYC_2023-05-02_EP_YH.txt
PAYC_2023-05-02_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2023-05-02_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2023-05-02_EP_YH.txt Already exists
../../Summary/Refined//PAYC\PAYC_2023-08-01_EP_YH.txt
PAYC_2023-08-01_EP_YH.txt
../../Summary/PostRefinedV3/PAYC/PAYC_2023-08-01_EP_YH.txt
../../Sum

['"ENTITIES": ["METRIC=GAAP,METRIC=Net Income Per Share,MONEY=$0.29,CALENDAR=fourth quarter,YEAR=2023"], "RELATIONS": ["KEY:GAAP net income per share!!TYPE:OUT!!MONEY:$0.29!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Cash,METRIC=Cash Equivalents,MONEY=$1361936 T,CALENDAR=fourth quarter,YEAR=2023"], "RELATIONS": ["KEY:CASH AND EQUIVALENTS!!TYPE:OUT!!MONEY:$1361936 T!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=GAAP,METRIC=Gross Profit,MONEY=$873MN,CALENDAR=fourth quarter,YEAR=2023"], "RELATIONS": ["KEY:GAAP GROSS PROFIT!!TYPE:OUT!!MONEY:$873 MN!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=GAAP,METRIC=Gross Margin,PCT=81.85%,CALENDAR=fourth quarter,YEAR=2023"], "RELATIONS": ["KEY:GAAP GROSS MARGIN!!TYPE:OUT!!PCT:81.85%!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=GAAP,METRIC=Free Cash Flow,MONEY=$60MN,CALENDAR=fourth quarter,YEAR=2023"], "RELATIONS": ["KEY:GAAP FREE CASH FLOW!!TYPE:OU

['"ENTITIES": ["METRIC=Revenue,MONEY=$981262 T"], "RELATIONS": ["KEY:REVENUE!!TYPE:OUT!!MONEY:$981262 T!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Revenue,DATE=December 31 2023,MONEY=$3055071 T"], "RELATIONS": ["KEY:REVENUE!!TYPE:OUT!!MONEY:$3055071 T!!LINK:KV!!SECTION:REGULARFULL!!QTR:ALL!!YEAR:2023"]', '"ENTITIES": ["METRIC=Net Income,MONEY=$201178 T"], "RELATIONS": ["KEY:GAAP NET INCOME!!TYPE:OUT!!MONEY:$201178 T!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Net Income,MONEY=$(35610) T"], "RELATIONS": ["KEY:GAAP NET INCOME!!TYPE:OUT!!MONEY:-$35610 T!!LINK:KV!!SECTION:REGULARFULL!!QTR:ALL!!YEAR:2023"]', '"ENTITIES": ["METRIC=Net Income,METRIC=Margin,YEAR=2023,PCT=21%"], "RELATIONS": ["KEY:GAAP NET INCOME MARGIN!!TYPE:OUT!!PCT:21.0%!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Net Income,METRIC=Margin,YEAR=2023,MONEY=$(1) T"], "RELATIONS": ["KEY:GAAP NET INCOME MARGIN!!TYPE:OUT!!MONEY:$(1) T!!LINK:KV!!SECTI

['"ENTITIES": ["METRIC=Revenue,METRIC=Global,MONEY=$981 MN"], "RELATIONS": ["KEY:REVENUE-GLOBAL!!TYPE:OUT!!MONEY:$981 MN!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Revenue,METRIC=Global,YEAR=2023,MONEY=$3055 MN"], "RELATIONS": ["KEY:REVENUE-GLOBAL!!TYPE:OUT!!MONEY:$3055 MN!!LINK:KV!!SECTION:REGULARFULL!!QTR:ALL!!YEAR:2023"]', '"ENTITIES": ["METRIC=Revenue,LOC=US,LOC=Canada,MONEY=$779 MN"], "RELATIONS": ["KEY:REVENUE-US AND CANADA!!TYPE:INPUT!!MONEY:$779 MN!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Revenue,LOC=US,LOC=Canada,YEAR=2023,MONEY=$2448 MN"], "RELATIONS": ["KEY:REVENUE-US AND CANADA!!TYPE:INPUT!!MONEY:$2448 MN!!LINK:KV!!SECTION:REGULARFULL!!QTR:ALL!!YEAR:2023"]', '"ENTITIES": ["METRIC=Revenue,LOC=Europe,MONEY=$162 MN"], "RELATIONS": ["KEY:REVENUE-EUROPE!!TYPE:INPUT!!MONEY:$162 MN!!LINK:KV!!SECTION:REGULAR!!QTR:Q4!!YEAR:2023"]', '"ENTITIES": ["METRIC=Revenue,LOC=Europe,YEAR=2023,MONEY=$483 MN"], "RELATIONS": ["KEY:REVENUE-

['Q4 2023 Earning Report -%%%- \nArticle: SCG*** Guidance..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', '\nArticle: Q4 2023 Earning Report -%%%- PG*** Guidance. -%%%-  For Q1 2024, we expect revenue to be in the range of $690 million to $705 million, representing 15-17% growth year over year..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', '\nArticle: Q4 2023 Earning Report -%%%- PG*** Guidance. -%%%- We expect Q1 2024 Non-GAAP operating expenses* to be in the range of $450 million to $465 million, representing 9-13% growth year over year..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are prese

In [18]:
rPath = "../../Summary/Refined/"
csym = "NVDA"
cPath = rPath + "/" + csym
#print(cPath)
cfiles = glob.glob(cPath+"/*.txt")
#print(cfiles)
if(len(cfiles) > 0):
    for cf in cfiles:
        if isScanningRqd(cf):
            print(cf)
            created = createDatawithModel(cf, csym, nlp, instruct_model, forTrain=False)
            if not created:
                continue
            break

../../Summary/Refined//NVDA\NVDA_2022-08-24_EP_YH.txt
NVDA_2022-08-24_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2022-08-24_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2022-08-24_EP_YH.txt Already exists
../../Summary/Refined//NVDA\NVDA_2022-11-16_EP_YH.txt
NVDA_2022-11-16_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2022-11-16_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2022-11-16_EP_YH.txt Already exists
../../Summary/Refined//NVDA\NVDA_2023-02-22_EP_YH.txt
NVDA_2023-02-22_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2023-02-22_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2023-02-22_EP_YH.txt Already exists
../../Summary/Refined//NVDA\NVDA_2023-05-24_EP_YH.txt
NVDA_2023-05-24_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2023-05-24_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2023-05-24_EP_YH.txt Already exists
../../Summary/Refined//NVDA\NVDA_2023-08-23_EP_YH.txt
NVDA_2023-08-23_EP_YH.txt
../../Summary/PostRefinedV3/NVDA/NVDA_2023-08-23_EP_YH.txt
../../Sum

['\nArticle: Q3 2024 Earning Report -%%%- PG*** NVIDIA will pay its next.\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', '\nArticle: Q3 2024 Earning Report -%%%- quarterly cash dividend of $0.04 per share on December 28, 2023, to all shareholders of record on December 6, 2023..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', 'Q3 2024 Earning Report -%%%- \nArticle: SCHQ*** Q3 Fiscal 2024 Summary..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.']



['"ENTITIES": ["ORG=NVIDIA,FC=pays"], "RELATIO

['Q3 2024 Earning Report -%%%- \nArticle: SCG*** OutlookNVIDIA’s outlook for the fourth quarter of fiscal 2024 is as follows:..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', '\nArticle: Q3 2024 Earning Report -%%%- PG*** OutlookNVIDIA’s outlook for the fourth quarter of fiscal 2024 is as follows:. -%%%- .\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', '\nArticle: Q3 2024 Earning Report -%%%- PG*** OutlookNVIDIA’s outlook for the fourth quarter of fiscal 2024 is as follows:. -%%%- Story continues..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list 

['"ENTITIES": ["FC=OutlookNVIDIA,FC=outlook,CALENDAR=fourth quarter,YEAR=2024"], "RELATIONS": [""]', '"ENTITIES": ["ORG=OutlookNVIDIA\'s,CALENDAR=fourth quarter,YEAR=2024"], "RELATIONS": [""]', '"ENTITIES": ["ORG=OutlookNVIDIA\'s,CALENDAR=fourth quarter,YEAR=2024"], "RELATIONS": [""]', '"ENTITIES": ["METRIC=Revenue,MONEY=$20.00 billion,PCT=2%"], "RELATIONS": ["KEY:REVENUE!!TYPE:OUT!!MONEY:$20.00 BN!!LINK:KV!!SECTION:GUIDE!!QTR:Q4!!YEAR:2024"]', '"ENTITIES": ["METRIC=GAAP,METRIC=non-GAAP,METRIC=gross margins,PCT=74.5%,PCT=75.5%,PCT=50 basis points"], "RELATIONS": ["KEY:GAAP GROSS MARGIN!!TYPE:OUT!!PCT:74.5%!!LINK:KV!!SECTION:GUIDE!!QTR:Q4!!YEAR:2024"]', '"ENTITIES": ["METRIC=GAAP,METRIC=non-GAAP,METRIC=operating expenses,MONEY=$3.17 billion,MONEY=$2.20 billion"], "RELATIONS": ["KEY:GAAP OPERATING EXPENSE!!TYPE:OUT!!MONEY:$3.17 BN!!MONEY:$2.20 BN!!LINK:KVRANGE!!SECTION:GUIDE!!QTR:Q4!!YEAR:2024"]', '"ENTITIES": ["METRIC=GAAP,METRIC=non-GAAP,METRIC=income and expense,MONEY=$200 million,CHG

['\nArticle: Q3 2024 Earning Report -%%%- PG*** Professional Visualization. -%%%-  Announced that Mercedes-Benz is using NVIDIA Omniverse to create digital twins to help plan, design, build and operate its manufacturing and assembly facilities around the world..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', '\nArticle: Q3 2024 Earning Report -%%%- PG*** Professional Visualization. -%%%-  Announced a new line of desktop workstations with NVIDIA RTX™ 6000 Ada Generation GPUs and NVIDIA ConnectX® smart interface cards for training smaller AI models, fine-tuning models and running inference locally..\n\nQuestion: What are the entities and relations between these entities present in the text? If entities are present then first generate entities and then list relations between entities in lists of lists format.', 'Q3 2024 Ear

In [43]:
#Taking backup of previous training format data
backupDir = "../../Summary/DATA/FLAN/Backup"
devDataFile = "../../Summary/DATA/FLAN/Dev/dev.tsv"
trainDataFile = "../../Summary/DATA/FLAN/Train/train.tsv"
testDataFile = "../../Summary/DATA/FLAN/Test/test.tsv"

trainDir = "../../Summary/DATA/FLAN/Train"
devDir = "../../Summary/DATA/FLAN/Dev"
testDir = "../../Summary/DATA/FLAN/Test"

trfiles = glob.glob(trainDir+"/*.txt")
dfiles = glob.glob(devDir+"/*.txt")
tefiles = glob.glob(testDir+"/*.txt")
bdirs = glob.glob(backupDir+"/*")

print(len(trfiles), len(dfiles), len(tefiles), len(bdirs))
cnt = len(trfiles) + len(dfiles) + len(tefiles)
print("{} files need to be moved for backup".format(cnt))

if(len(trfiles) > 0):
    newBackupDir = backupDir + "/" + "Format-" + str(len(bdirs) + 1)
    print(newBackupDir)
    if not os.path.exists(newBackupDir):
        os.makedirs(newBackupDir)
        print("New Backup dir {} created".format(newBackupDir))
    for file in trfiles:
        shutil.move(file, newBackupDir)
    for file in dfiles:
        shutil.move(file, newBackupDir)
    for file in tefiles:
        shutil.move(file, newBackupDir)
    os.remove(trainDataFile)
    os.remove(testDataFile)
    os.remove(devDataFile)
    print("{} files moved for backup".format(cnt))
else:
    print("No file moved for backup")

0 0 0 6
0 files need to be moved for backup
No file moved for backup


In [3]:
srcDir = "../../Summary/PostRefinedV3"
trainDir = "../../Summary/DATA/FLAN/Train"
devDir = "../../Summary/DATA/FLAN/Dev"
testDir = "../../Summary/DATA/FLAN/Test"

srcdirs = glob.glob(srcDir+"/*")
print(len(srcdirs))
maxrng = -1
didx = -1 #File index to move to DevDir
cnt = -1

for sdir in srcdirs:
    if(os.path.isdir(sdir)):
        tfiles = glob.glob(sdir+"/*")
        if(maxrng < 0 or len(tfiles) > maxrng):
            maxrng = (len(tfiles))
        #print(maxrng)
        if(didx < maxrng - 1):
            didx = didx + 1
            if(didx > len(tfiles)):
                didx = len(tfiles)
        else:
            didx = 0
        for index, file in enumerate(tfiles):
            basefile = os.path.basename(file)
            #print(basefile)
            if(index == didx):
                if("FIVN" in sdir or "ZI" in sdir):
                    destfile = testDir + "/" + basefile
                    print("Copying file {} with index {} from dir {} to dir{}".format(file, index, sdir, testDir))
                    if not os.path.exists(destfile) and not os.path.exists(devDir + "/" + basefile) and not os.path.exists(trainDir + "/" + basefile):
                        shutil.copy(file, testDir)
                    else:
                        print("File {} already exists in destination dir".format(basefile))
                else:
                    destfile = devDir + "/" + basefile
                    print("Copying file {} with index {} from dir {} to dir{}".format(file, index, sdir, devDir))
                    if not os.path.exists(destfile) and not os.path.exists(testDir + "/" + basefile) and not os.path.exists(trainDir + "/" + basefile):
                        shutil.copy(file, devDir)
                    else:
                        print("File {} already exists in destination dir".format(basefile))
            else:
                destfile = trainDir + "/" + basefile
                #print(destfile)
                print("Copying file {} with index {} from dir {} to dir{}".format(file, index, sdir, trainDir))
                if not os.path.exists(destfile) and not os.path.exists(devDir + "/" + basefile) and not os.path.exists(testDir + "/" + basefile):
                    shutil.copy(file, trainDir)
                else:
                    print("File {} already exists in destination dir".format(basefile))
            

27
Copying file ../../Summary/PostRefinedV3\APPN\APPN_2022-08-04_EP_YH.txt with index 0 from dir ../../Summary/PostRefinedV3\APPN to dir../../Summary/DATA/FLAN/Dev
File APPN_2022-08-04_EP_YH.txt already exists in destination dir
Copying file ../../Summary/PostRefinedV3\APPN\APPN_2022-11-03_EP_YH.txt with index 1 from dir ../../Summary/PostRefinedV3\APPN to dir../../Summary/DATA/FLAN/Train
File APPN_2022-11-03_EP_YH.txt already exists in destination dir
Copying file ../../Summary/PostRefinedV3\APPN\APPN_2023-02-16_EP_YH.txt with index 2 from dir ../../Summary/PostRefinedV3\APPN to dir../../Summary/DATA/FLAN/Train
File APPN_2023-02-16_EP_YH.txt already exists in destination dir
Copying file ../../Summary/PostRefinedV3\APPN\APPN_2023-05-09_EP_YH.txt with index 3 from dir ../../Summary/PostRefinedV3\APPN to dir../../Summary/DATA/FLAN/Train
File APPN_2023-05-09_EP_YH.txt already exists in destination dir
Copying file ../../Summary/PostRefinedV3\APPN\APPN_2023-08-03_EP_YH.txt with index 4 fr