In [1]:
import os
from datetime import datetime
import logging

import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
from simpletransformers.classification import ClassificationModel, ClassificationArgs
#from simpletransformers.t5 import T5Model, T5Args
import warnings
import pandas as pd
import os
from datetime import datetime
import logging
import glob
from pathlib import Path
import csv

In [43]:
import spacy
from spacy.language import Language
from spacy import displacy
import time

@Language.component("newsent")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        #print(token.text, token.text in ("’s", "'s"))
        if token.text.upper() in (";", "--", "\n\n", "\n", "QUARTERLY", "STORY", "\n\n\n\n", "\n\n\n"):
            #print("Detected:", token.text)
            doc[token.i].is_sent_start = True
    return doc

#spacy.require_gpu()
nlp = spacy.load("../../Summary/NER/RelateEntity/train/model-best-local")
nlp.add_pipe('sentencizer')
nlp.add_pipe('newsent', name="newsent", last=True)

<function __main__.set_custom_boundaries(doc)>

In [2]:
def getSentences(inputfile, nlp):
    with open(inputfile, 'r', encoding="utf-8") as f:
        text = f.read()

    doc = nlp(text)
    sentences = [str(sent).strip() for sent in doc.sents]

    print(len(sentences))
    return(sentences)

In [3]:
def clean_unnecessary_spaces(out_string):
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)
    out_string = (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return out_string

In [4]:
import hashlib
def dedupList(flist):
    hashList = list()
    newList = list()
    for item in flist:
        #print(item.upper())
        hash_object = hashlib.sha1(item.upper().encode("UTF-8"))
        hex_dig = hash_object.hexdigest()
        if hex_dig not in hashList:
            hashList.append(hex_dig)
            #print(item, hex_dig)
            newList.append(item)
    return(newList)

In [35]:
devDir = "../../Summary/DATA/PARAPHRASE/Dev"
trainDir = "../../Summary/DATA/PARAPHRASE/Train"
testDir = "../../Summary/DATA/PARAPHRASE/Test"

spdevDir = "../../Summary/DATA/SENTPAIR/Dev"
sptrainDir = "../../Summary/DATA/SENTPAIR/Train"
sptestDir = "../../Summary/DATA/SENTPAIR/Test"

rplStr = ["PG***", "ED***", "SCHQ***", "SCBQ***", "SCBF***", "SCHF***", "SCG***", "GF***", "GQ***", "SC***", "NOPAD***"]

def preProcessSent(line):
    for s in rplStr:
        line = line.replace(s, "")
    if("TBLST***" in line or "TBLET***" in line or "CS***" in line or "@@@" in line or line == "\n" or line == "\n\n" or "https://finance.yahoo.com" in line):
        return(line, False)
    line = line.replace("\n", "")
    if(line == None or line == ""):
        return(line, False)
    line = line.replace("$(0.", "$(.")
    line = line.replace("($0.", "($.")
    line = line.replace("$0.", "$.")
    line = line.replace("$ 0.", "$.")
    return(line, True)

def createTrainingData(inputDir, ppModel=None, writetofile=False):
    files = glob.glob(inputDir+"/*_EP_YH.txt")
    #print(files)
    #print(inputDir)
    if(len(files) > 0):
        for file in (files):
            print("Input file " + file)
            basefile = os.path.basename(file)
            inputfile = os.path.splitext(basefile)[0]
            #print(inputfile)
            if not ppModel:
                outfilePath = inputfile + "_phrase.tsv"
                outfilePath = inputDir + "/" + outfilePath
                print("Phrase file " + outfilePath)
                outfile = Path(outfilePath)
                if outfile.is_file():
                    print("Phrase file " + str(outfile) + " already exists")
                    continue
                with open(outfile, "w", encoding = "utf-8") as of:
                    #cnt = 0
                    of.write("filename\tSentence1\tSentence2\n")
                    #with open(file, encoding="utf-8") as f:
                    sentences = getSentences(file, nlp)
                    #line = f.readline()
                    for line in sentences:
                        for s in rplStr:
                            line = line.replace(s, "")
                        if("TBLST***" in line or "TBLET***" in line or "CS***" in line or "@@@" in line or line == "\n" or line == "\n\n" or "https://finance.yahoo.com" in line):
                            #line = f.readline()
                            continue
                        line = line.replace("\n", "")
                        if(line == ""):
                            continue
                        #print(line)
                        #cnt = cnt + 1
                        phraseLine = line
                        of.write(inputfile+"\t"+line+"\t"+line+"\n")
                        #of.write(line + "\n")
                        #line = f.readline()
            else:
                f = None
                if(writetofile):
                    basefile = os.path.basename(file)
                    inputfile = os.path.splitext(basefile)[0]
                    print("Input file " + inputfile)
                    print(inputDir)
                    outfilePath = inputfile + "_sp.tsv"
                    outfilePath = inputDir + "/" + outfilePath
                    print("SP file " + outfilePath)
                    outfile = Path(outfilePath)
                    if outfile.is_file():
                        print("SP file " + str(outfile) + " already exists")
                        continue
                    f = open(outfile, "w", encoding = "utf-8")
                    f.write("filename\tSentence1\tSentence2\tTarget\n")
                sentences = getSentences(file, nlp)
                #line = f.readline()
                for line in sentences:
                    for s in rplStr:
                        line = line.replace(s, "")
                    if("TBLST***" in line or "TBLET***" in line or "CS***" in line or "@@@" in line or line == "\n" or line == "\n\n" or "https://finance.yahoo.com" in line):
                        #line = f.readline()
                        continue
                    line = line.replace("\n", "")
                    if(line == None or line == ""):
                        continue
                    #print(line)
                    #cnt = cnt + 1
                    #phraseLine = line
                    line = line.replace("$(0.", "$(.")
                    line = line.replace("($0.", "($.")
                    line = line.replace("$0.", "$.")
                    line = line.replace("$ 0.", "$.")
                    inp = [line]
                    #print(inp)
                    if(not writetofile):
                        print("ORIGINAL SENTENCE\n",line)
                        print("PREDICTED PARAPHRASE\n")
                        predicted = ppModel.predict(inp)
                        print(predicted)
                        print("\n")
                    else:
                        predicted = ppModel.predict(inp)
                        if(predicted):
                            predicted = dedupList(predicted[0])
                            #print("ORIGINAL SENTENCE\n",line)
                            #print("PREDICTED PARAPHRASE\n")
                            #print(predicted)
                            for pred in predicted:
                                if(f):
                                    f.write(inputfile+"\t"+line+"\t"+pred+"\t"+"1"+"\n")
                                    print(inputfile+"\t"+line+"\t"+pred+"\t"+"1")
                            #print("\n")
                if(f):
                    f.close()

In [37]:
def predictPhrase(line, ppModel, spModel):
    nline, isProcess = preProcessSent(line)
    if not isProcess:
        return(line)
    inp = [nline]
    predicted = ppModel.predict(inp)
    if(predicted):
        for pred in predicted:
            spinp = [[nline, pred]]
            print(spinp)

In [6]:
devDataFile = "../../Summary/DATA/PARAPHRASE/Dev/dev.tsv"
trainDataFile = "../../Summary/DATA/PARAPHRASE/Train/train.tsv"

devSPFile = "../../Summary/DATA/SENTPAIR/Dev/dev.tsv"
trainSPFile = "../../Summary/DATA/SENTPAIR/Train/train.tsv"

def writeTrainingData(writeFile, writeDir):
    if("PARAPHRASE" in writeDir):
        files = glob.glob(writeDir+"/*_phrase.tsv")
    else:
        files = glob.glob(writeDir+"/*_sp.tsv")
    print(files)
    frames = list()

    if(len(files) > 0):
        for file in files:
            df = pd.read_csv(file, sep="\t", encoding = "utf-8").astype(str)
            df = df.dropna()
            df = df[df['Sentence1'].notna()]
            #print(df)
            frames.append(df)
    result = pd.concat(frames)
    print(result)
    result.to_csv(writeFile, sep='\t', index=False, header=True)

In [8]:
createTrainingData(devDir)
writeTrainingData(devDataFile, devDir)

Input file ../../Summary/DATA/PARAPHRASE/Dev\APPN_2023-02-16_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Dev/APPN_2023-02-16_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Dev\APPN_2023-02-16_EP_YH_phrase.tsv already exists
Input file ../../Summary/DATA/PARAPHRASE/Dev\BILL_2023-02-02_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Dev/BILL_2023-02-02_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Dev\BILL_2023-02-02_EP_YH_phrase.tsv already exists
Input file ../../Summary/DATA/PARAPHRASE/Dev\CFLT_2022-11-02_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Dev/CFLT_2022-11-02_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Dev\CFLT_2022-11-02_EP_YH_phrase.tsv already exists
Input file ../../Summary/DATA/PARAPHRASE/Dev\CRWD_2022-11-29_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Dev/CRWD_2022-11-29_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Dev\CRWD_2022-11-29_EP_YH_phrase.tsv already exists
Input file ../../Summary

In [9]:
createTrainingData(trainDir)
writeTrainingData(trainDataFile, trainDir)

Input file ../../Summary/DATA/PARAPHRASE/Train\APPN_2022-05-06_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Train/APPN_2022-05-06_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Train\APPN_2022-05-06_EP_YH_phrase.tsv already exists
Input file ../../Summary/DATA/PARAPHRASE/Train\APPN_2022-08-04_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Train/APPN_2022-08-04_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Train\APPN_2022-08-04_EP_YH_phrase.tsv already exists
Input file ../../Summary/DATA/PARAPHRASE/Train\APPN_2022-11-03_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Train/APPN_2022-11-03_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Train\APPN_2022-11-03_EP_YH_phrase.tsv already exists
Input file ../../Summary/DATA/PARAPHRASE/Train\APPN_2023-05-09_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Train/APPN_2023-05-09_EP_YH_phrase.tsv
Phrase file ..\..\Summary\DATA\PARAPHRASE\Train\APPN_2023-05-09_EP_YH_phrase.tsv already exists


In [10]:
train_df = pd.read_csv(trainDataFile, sep="\t", encoding = "utf-8").astype(str)
eval_df = pd.read_csv(devDataFile, sep="\t", encoding = "utf-8").astype(str)

train_df = train_df.rename(
    columns={"Sentence1": "input_text", "Sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"Sentence1": "input_text", "Sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
train_df = train_df[["prefix", "input_text", "target_text"]]

eval_df["prefix"] = "paraphrase"
eval_df = eval_df[["prefix", "input_text", "target_text"]]

train_df = train_df.dropna()
train_df = train_df[train_df['input_text'].notna()]

eval_df = eval_df.dropna()
eval_df = eval_df[eval_df['input_text'].notna()]

train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)
print("TRAIN DATA ..............")
print(train_df)

eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)
print("EVAL DATA ..............")
print(eval_df)

TRAIN DATA ..............
          prefix                                         input_text  \
0     paraphrase                                Appian Corporation.   
1     paraphrase   First quarter cloud subscription revenue incr...   
2     paraphrase         MCLEAN, Va., May 05, 2022 (GLOBE NEWSWIRE)   
3     paraphrase  -- Appian (Nasdaq: APPN) today announced finan...   
4     paraphrase   “We exceeded guidance and grew cloud subscrip...   
...          ...                                                ...   
1522  paraphrase  Accordingly, we are required to add back the n...   
1523  paraphrase  Additionally, we include the anti-dilutive imp...   
1524  paraphrase  We have not reconciled our expectations to non...   
1525  paraphrase  For those reasons, we are also unable to addre...   
1526  paraphrase  Accordingly, a reconciliation for the guidance...   

                                            target_text  
0                                   Appian Corporation.  
1    

In [7]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [8]:
def count_matches(labels, preds):
    print(labels)
    print(preds)
    return sum(
        [
            1 if label == pred else 0
            for label, pred in zip(labels, preds)
        ]
    )


In [14]:
model_args = Seq2SeqArgs()
#model_args = T5Args()
model_args.do_sample = True
model_args.train_batch_size = 4
model_args.use_multiprocessing = False
model_args.num_train_epochs = 2
#model_args.learning_rate = 5e-5
#model_args.no_save = True

model_args.eval_batch_size = 4
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training_steps = 50

model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.save_steps = -1

model_args.max_length = 64
#model_args.max_seq_length = 32
model_args.num_return_sequences = 10
model_args.top_k = 50
model_args.top_p = 0.95
model_args.fp16 = False
#model_args.num_beams = None

model_args.use_early_stopping = False
#model_args.early_stopping_delta = 0.01
#model_args.early_stopping_metric = "mcc"
#model_args.early_stopping_metric_minimize = False
#model_args.early_stopping_patience = 5
#model_args.evaluate_during_training_steps = 500
#model_args.wandb_project =  "visualization-demo"

model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-base",
    args=model_args,
    use_cuda=True,
)

#model = T5Model("t5", "t5-base", args=model_args, use_cuda=True)

# Train the model
model.train_model(
    train_df, eval_data=eval_df, matches=count_matches
)
results = model.eval_model(eval_df)

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/1505 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/377 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/627 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/157 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 3.218969947972875, 'matches': 0}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model


[' Appian Corporation.', ' Cloud subscription revenue is $65.8 million in fourth quarter.', 'Cloud subscription revenue is $236.9 million for full year.', ' MCLEAN, Va., Feb. 16, 2023 (GLOBE NEWSWIRE)', '-- Appian (Nasdaq: APPN) today announced financial results for the fourth quarter and full year ended December 31, 2022.', ' Fourth Quarter 2022 Financial Highlights:.', ' Cloud subscription revenue is $65.8 million.', 'Total subscriptions revenue, is $93.2 million.', 'Professional services revenue is $32.5 million.', 'Total revenue is $125.8 million.', 'Cloud subscription revenue retention rate is 115% as of December 31, 2022.', ' GAAP operating loss is $(40.6) million.', 'Non-GAAP operating loss is $(26.8) million.', ' GAAP net loss is $(34.4) million.', 'GAAP net loss per share is $(0.47).', 'Non-GAAP net loss is $(20.6) million.', 'Non-GAAP net loss per share is $(0.28).', 'Invalid Sentence.', 'Invalid Sentence.', 'We do not forecast foreign exchange rate movements.', ' Adjusted EB

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/627 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/157 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
model_args = Seq2SeqArgs()
#model_args = T5Args()
model_args.do_sample = True
#model_args.train_batch_size = 4
model_args.train_batch_size = 8
model_args.use_multiprocessing = False
model_args.num_train_epochs = 2
#model_args.learning_rate = 5e-5
#model_args.no_save = True

#model_args.eval_batch_size = 4
model_args.eval_batch_size = 16
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.evaluate_during_training_steps = 50

model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.save_steps = -1

model_args.max_length = 64
#model_args.max_seq_length = 32
model_args.num_return_sequences = 10
model_args.top_k = 50
model_args.top_p = 0.95
model_args.fp16 = False
#model_args.num_beams = None

model_args.use_early_stopping = False
#model_args.early_stopping_delta = 0.01
#model_args.early_stopping_metric = "mcc"
#model_args.early_stopping_metric_minimize = False
#model_args.early_stopping_patience = 5
#model_args.evaluate_during_training_steps = 500
model_args.wandb_project =  "visualization-demo"

In [33]:
model = Seq2SeqModel(
    encoder_decoder_type="bart", encoder_decoder_name="outputs", args=model_args
)

print(
    model.predict(
        [
            "Revenue between $2.08 billion and $2.9 billion"
        ]
    )
)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Revenue is expected to be between $2.08 billion and $2,9 billion.', 'Revenue is expected to be between $2.08 billion and $2 to 2.9 billion', 'Revenue is expected to be between $2.08 billion and $2,9 billion.', 'Revenue is expected to be between $2.08 billion and $2 million.', 'Revenue is expected to be between $2.08 billion and $2,9 billion', 'Revenue between $2.08 billion and $2 for the year.', 'Revenue is expected to be between $2.08 billion and $29 billion.', 'Revenue is expected to be between $2.08 billion and $2 9 billion.', 'Revenue is expected to be between $2.08 billion and $2,9 billion.', 'Revenue between $2.08 billion and $2 billion.']]


In [10]:
createTrainingData(testDir, model)

Input file ../../Summary/DATA/PARAPHRASE/Test\ZS_2022-05-26_EP_YH.txt
Detected: 

Detected: 

Detected: 

Detected: 

Detected: 



Detected: 

Detected: 



Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: --
Detected: 


Detected: 

Detected: 



Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: Story
Detected: 



Detected: 


Detected: 


Detected: 


Detected: 


Detected: 



Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


Detected: 



Detected: 


Detected: 



Detected: 

Detected: 

Detected: 

Detected: 

Detected: 




Detected: 


Detected: 


Detected: 


Detected: 


Detected: 


71
ORIGINAL SENTENCE
  Zscaler, Inc.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.', ' Zscaler, Inc.']]


ORIGINAL SENTENCE
  Third Quarter Highlights.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.', ' Third Quarter Highlights.']]


ORIGINAL SENTENCE
  Revenue grows 63% year-over-year to $286.8 million.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Revenue is $286.8 million.', ' Revenue grows 63% year-over-year to $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue grows 63% year-over-year to $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.']]


ORIGINAL SENTENCE
  Calculated billings grows 54% year-over-year to $345.6 million.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.', ' Calculated billings is $345.6 million.']]


ORIGINAL SENTENCE
  Deferred revenue grows 65% year-over-year to $818.7 million.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.', ' Deferred revenue is $818.7 million.']]


ORIGINAL SENTENCE
  GAAP net loss of $101.4 million compared to GAAP net loss of $58.5 million on a year-over-year basis.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.']]


ORIGINAL SENTENCE
  Non-GAAP net income of $24.7 million compared to non-GAAP net income of $21.4 million on a year-over-year basis.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.', ' Non-GAAP net income is $24.7 million.']]


ORIGINAL SENTENCE
  SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)', ' SAN JOSE, Calif., May 26, 2022 (GLOBE NEWSWIRE)']]


ORIGINAL SENTENCE
 -- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['-- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.', '-- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.', '-- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.', '-- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.', '-- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.', '-- Zscaler, Inc. (Nasdaq: ZS), the leader in cloud security, today announced financial results for its third quarter of fiscal year 2022, ended April 30, 2022.', '-- Zscaler, Inc. (N

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' "Enterprises continue to consolidate point products in favor of our integrated Zero Trust security platform, resulting in larger, multi-year commitments to Zscaler.', ' "Enterprises continue to consolidate point products in favor of our integrated Zero Trust security platform, resulting in larger, multi-year commitments to Zscaler.', ' "Enterprises continue to consolidate point products in favor of our integrated Zero Trust security platform, resulting in larger, multi-year commitments to Zscaler.', ' "Enterprises continue to consolidate point products in favor of our integrated Zero Trust security platform, resulting in larger, multi-year commitments to Zscaler.', ' "Enterprises continue to consolidate point products in favor of our integrated Zero Trust security platform, resulting in larger, multi-year commitments to Zscaler.', ' "Enterprises continue to consolidate point products in favor of our integrated Zero Trust security platform, resulting in larger, multi-year commitment

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['We delivered 63% revenue growth and 54% billings growth, while also generating a strong free cash flow margin of 15% for the third quarter.', 'We delivered 63% revenue growth and 54% billings growth, while also generating a strong free cash flow margin of 15% for the third quarter.', 'We delivered 63% revenue growth and 54% billings growth, while also generating a strong free cash flow margin of 15% for the third quarter.', 'We delivered 63% revenue growth and 54% billings growth, while also generating a strong free cash flow margin of 15% for the third quarter.', 'We delivered 63% revenue growth and 54% billings growth, while also generating a strong free cash flow margin of 15% for the third quarter.', 'We delivered 63% revenue growth and 54% billings growth in the third quarter.', 'We delivered 63% revenue growth and 54% billings growth, while also generating a strong free cash flow margin of 15% for the third quarter.', 'We delivered 63% revenue growth and 54% billings growth, w

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['As a result of increasing demand, we are also raising our fiscal year guidance on all financial metrics,” said Jay Chaudhry, Chairman and CEO of Zscaler. “', 'As a result of increasing demand, we are also raising our fiscal year guidance on all financial metrics,” said Jay Chaudhry, Chairman and CEO of Zscaler. “', 'As a result of increasing demand, we are also raising our fiscal year guidance on all financial metrics,” said Jay Chaudhry, Chairman and CEO of Zscaler. “', 'As a result of increasing demand, we are also raising our fiscal year guidance on all financial metrics,” said Jay Chaudhry, Chairman and CEO of Zscaler. “', 'As a result of increasing demand, we are also raising our fiscal year guidance on all financial metrics,” said Jay Chaudhry, Chairman and CEO of Zscaler. “', 'As a result of increasing demand, we are also raising our fiscal year guidance on all financial metrics,” said Jay Chaudhry, Chairman and CEO of Zscaler. “', 'As a result of increasing demand, we are al

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Demanding enterprises look to Zscaler as their strategic partner of choice in their Zero Trust journey, as the world races towards network and security transformation.”.', 'Demanding enterprises look to Zscaler as their strategic partner of choice in their Zero Trust journey, as the world races towards network and security transformation.”.', 'Demanding enterprises look to Zscaler as their strategic partner of choice in their Zero Trust journey, as the world races towards network and security transformation.”.', 'Demanding enterprises look to Zscaler as their strategic partner of choice in their Zero Trust journey, as the world races towards network and security transformation.”.', 'Demanding enterprises look to Zscaler as their strategic partner of choice in their Zero Trust journey, as the world races towards network and security transformation.”.', 'Demanding enterprises look to Zscaler as their strategic partner of choice in their Zero Trust journey, as the world races towards n

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.', ' Third Quarter Fiscal 2022 Financial Highlights.']]


ORIGINAL SENTENCE
  Revenue: $286.8 million, an increase of 63% year-over-year.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.', ' Revenue is $286.8 million.']]


ORIGINAL SENTENCE
  Income (loss) from operations : GAAP loss from operations was $86.6 million, or 30% of total revenue, compared to $43.9 million, or 25% of total revenue, in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.', ' GAAP loss from operations is $86.6 million.']]


ORIGINAL SENTENCE
 Non-GAAP income from operations was $27.2 million, or 9% of total revenue, compared to $22.9 million, or 13% of total revenue, in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million and non-GAPS operating margin is 9%.', 'Non-GAAP income from operations is $27.2 million and non-GAPS operating margin is 9%.', 'Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million.', 'Non-GAAP income from operations is $27.2 million.']]


ORIGINAL SENTENCE
  Net income (loss): GAAP net loss was $101.4 million, compared to $58.5 million in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.', ' GAAP net loss is $101.4 million.']]


ORIGINAL SENTENCE
 Non-GAAP net income was $24.7 million, compared to $21.4 million in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.', 'Non-GAAP net income is $24.7 million.']]


ORIGINAL SENTENCE
  Net income (loss) per share: GAAP net loss per share was $.72, compared to $.43 in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.', ' GAAP net loss per share is $.72.']]


ORIGINAL SENTENCE
 Non-GAAP net income per share was $.17, compared to $.15 in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.', 'Non-GAAP net income per share is $.17.']]


ORIGINAL SENTENCE
  Cash flow: Cash provided by operations was $77.2 million, or 27% of revenue, compared to $73.4 million, or 42% of revenue, in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Cash flow: Cash flow is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.', ' Cash flow: Cash flow is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.', ' Cash flow: Cash flow is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.', ' Cash flow: Cash provided by operations is $77.2 million.']]


ORIGINAL SENTENCE
 Free cash flow was $43.7 million, or 15% of revenue, compared to $55.8 million, or 32% of revenue, in the third quarter of fiscal 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million and free cash flow cash flow growth is 15%.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.', 'Free cash flow is $43.7 million.']]


ORIGINAL SENTENCE
  Deferred revenue: $818.7 million as of April 30, 2022, an increase of 65% year-over-year.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.', ' Deferred revenue is $818.7 million as of April 30, 2022.']]


ORIGINAL SENTENCE
  Cash, cash equivalents and short-term investments: $1,657.9 million as of April 30, 2022, an increase of $155.3 million from July 31, 2021.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Cash, cash equivalents and short-term investments: $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments: $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments is $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments: $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments is $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments is $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments is $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments is $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments: $1,657.9 million as of April 30, 2022.', ' Cash, cash equivalents and short-term investments is $1,657.9 million as of April 30, 2022.']]


ORIGINAL SENTENCE
 Story continues.
PREDICT

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.']]


ORIGINAL SENTENCE
  Recent Business Highlights.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.', ' Recent Business Highlights.']]


ORIGINAL SENTENCE
  Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations to protect enterprises from the most sophisticated cyberattacks.', ' Launched industry-first Security Service Edge (SSE) innovations t

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Delivered as part of the Zscaler Zero Trust Exchange, these innovations establish a new standard for ZTNA to minimize the attack surface, while stopping threats with private app protection, integrated deception, and privileged remote access capabilities for business and OT systems.', 'Delivered as part of the Zscaler Zero Trust Exchange, these innovations establish a new standard for ZTNA to minimize the attack surface, while stopping threats with private app protection, integrated deception, and privileged remote access capabilities for business and OT systems.', 'Delivered as part of the Zscaler Zero Trust Exchange, these innovations establish a new standard for ZTNA to minimize the attack surface, while stopping threats with private app protection, integrated deception, and privileged remote access capabilities for business and OT systems.', 'Delivered as part of the Zscaler Zero Trust Exchange, these innovations establish a new standard for ZTNA to minimize the attack surface, w

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Released annual Zscaler ThreatLabZ research revealing up to a 400% increase in Phishing-as-a-Service as the key source of attacks across critical industries and consumers globally, underscoring the urgency for businesses and users to adopt a Zero Trust security model.', 'Released annual Zscaler ThreatLabZ research revealing up to a 400% increase in Phishing-as-a-Service as the key source of attacks across critical industries and consumers globally, underscoring the urgency for businesses and users to adopt a Zero Trust security model.', 'Released annual Zscaler ThreatLabZ research revealing up to a 400% increase in Phishing-as-a-Service as the key source of attacks across critical industries and consumers globally, underscoring the urgency for businesses and users to adopt a Zero Trust security model.', 'Released annual Zscaler ThreatLabZ research revealing up to a 400% increase in Phishing-as-a-Service as the key source of attacks across critical industries and consumers globally, 

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hypergrowth strategy with the appointment of Brendan Castle, Chief People Officer.', 'Invested in Zscaler’s hyp

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[["Bringing 30 years of experience from organizations like Google and Citibank, Castle will lead the expansion of Zscaler's global People and Culture organization, including talent acquisition, learning and development, internal business partners, diversity, equity and inclusion, and our workplace experience.", "Bringing 30 years of experience from organizations like Google and Citibank, Castle will lead the expansion of Zscaler's global People and Culture organization, including talent acquisition, learning and development, internal business partners, diversity, equity and inclusion, and our workplace experience.", "Bringing 30 years of experience from organizations like Google and Citibank, Castle will lead the expansion of Zscaler's global People and Culture organization, including talent acquisition, learning and development, internal business partners, diversity, equity and inclusion, and our workplace experience.", "Bringing 30 years of experience from organizations like Google a

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['As a founding member of the Cloud Security Alliance (CSA), Zscaler, along with alliance partners CrowdStrike and Okta, announced the formation of the Zero Trust Advancement Center to bring together existing research and education projects at CSA that will be disseminated online and through its global network', 'As a founding member of the Cloud Security Alliance (CSA), Zscaler, along with alliance partners CrowdStrike and Okta, announced the formation of the Zero Trust Advancement Center to bring together existing research and education projects at CSA that will be disseminated online and through its global network', 'As a founding member of the Cloud Security Alliance (CSA), Zscaler, along with alliance partners CrowdStrike and Okta, announced the formation of the Zero Trust Advancement Center to bring together existing research and education projects at CSA that will be disseminated online and through its global network', 'As a founding member of the Cloud Security Alliance (CSA),

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.', ' Financial Outlook.']]


ORIGINAL SENTENCE
  For the fourth quarter of fiscal 2022, we expect:.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.', ' For the fourth quarter of fiscal 2022, we expect:.']]


ORIGINAL SENTENCE
  Total revenue of $304 million to $306 million.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Total revenue is expected to be between $304 million and $306 million.', ' Total revenue is expected to be between $304 million and $306 million.', ' Total revenue is expected to be between $304 million and $306 million.', ' Total revenue is expected to be between $304 million and $306 million.', ' Total revenue is expected to be between $304 million to $306 million.', ' Total revenue is expected to be between $304 million and $306 million.', ' Total revenue is expected to be between $304 million and $306 million.', ' Total revenue is expected to be between $304 million to $306 million.', ' Total revenue is expected to be between $304 million to $306 million.', ' Total revenue is expected to be between $304 million and $306 million.']]


ORIGINAL SENTENCE
  Non-GAAP income from operations of $33 million to $34 million.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million to $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.', ' Non-GAAP income from operations is expected to be between $33 million and $34 million.']]


ORIGINAL SENTENCE
  Non-GAAP net income per share of approximately $.20 to $.21, assum

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Non-GAAP net income per share is expected to be between $.20 and $.21, assuming approximately 146 million to 147 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.20 to $.21, assuming approximately 146 million to 147 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.20 to $.21, assuming approximately 146 million to 147 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.20 and $.21, assuming approximately 146 million to 147 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.20 and $.21, assuming approximately 146 million to 147 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.20 and $.21, assuming approximately 146 million to 147 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.20 and $.21, assuming approx

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.', ' For the full year fiscal 2022, we expect:.']]


ORIGINAL SENTENCE
  Total revenue of approximately $1.078 billion.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.', ' Total revenue is expected to be $1.078 billion.']]


ORIGINAL SENTENCE
  Calculated billings of $1.425 billion to $1.430 billion.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Calculated billings is expected to be between $1.425 billion and $1$.430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1430 billion.', ' Calculated billings is expected to be between $1.425 billion to $1,430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1."430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1-1.430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1,430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1,430 billion.', ' Calculated billings is expected to be between $1.425 billion to $1,430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1,430 billion.', ' Calculated billings is expected to be between $1.425 billion and $1 billion.']]


ORIGINAL SENTENCE
  Non-GAAP income from operations of $106 million to $108 million.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Non-GAAP income from operations is expected to be between $106 million to $108 million.', ' Non-GAAP income from operations is expected to be between $106 million and $108 million.', ' Non-GAAP income from operations is expected to be between $106 million to $108 million.', ' Non-GAAP income from operations is expected to be between $106 million and $108 million.', ' Non-GAAP income from operations is expected to be between $106 million to $108 million.', ' Non-GAAP income from operations is expected to be between $106 million and $108 million.', ' Non-GAAP income from operations is expected to be between $106 million and $108 million.', ' Non-GAAP income from operations is expected to be between $106 million and $108 million.', ' Non-GAAP income from operations is expected to be between $106 million and $108 million.', ' Non-GAAP income from operations is expected to be between $106 million to $108 million.']]


ORIGINAL SENTENCE
  Non-GAAP net income per share of $.64 to $.65, as

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming approximately 147 million to 148 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming approximately 147 million to 148 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming approximately 147 million to 148 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming approximately 147 million to 148 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming approximately 147 million to 148 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming approximately 147 million to 148 million common shares outstanding.', ' Non-GAAP net income per share is expected to be between $.64 and $.65, assuming appr

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.', ' These statements are forward-looking and actual results may differ materially.']]


ORIGINAL SENTENCE
 Refer to the Forward-Looking Statements safe harbor below for information on the factors that could cause our actual results to differ materially 

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Refer to the Forward-Looking Statements safe harbor below for information on the factors that could cause our actual results to differ materially from these forward-looking statements.', 'Refer to the Forward-Looking Statements safe harbor below for information on the factors that could cause our actual results to differ materially from these forward-looking statements.', 'Refer to the Forward-Looking Statements safe harbor below for information on the factors that could cause our actual results to differ materially from these forward-looking statements.', 'Refer to the Forward-Looking Statements safe harbor below for information on the factors that could cause our actual results to differ materially from these forward-looking statements.', 'Refer to the Forward-Looking Statements safe harbor below for information on the factors that could cause our actual results to differ materially from these forward-looking statements.', 'Refer to the Forward-Looking Statements safe harbor below

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Guidance for non-GAAP income from operations excludes stock-based compensation expense and related employer payroll taxes, amortization expense of acquired intangible assets, asset impairment related to facility exit, amutization of debt discount and issuance costs and income tax effects generated by intangible assets acquired in business acquisitions.', ' Guidance for non-GAAP income from operations excludes stock-based compensation expense and related employer payroll taxes, amortization expense of acquired intangible assets, asset impairment related to facility exit, amORTization of debt discount and issuance costs and income tax effects generated by intangible assets acquired in business acquisitions.', ' Guidance for non-GAAP income from operations excludes stock-based compensation expense and related employer payroll taxes, amortization expense of acquired intangible assets, asset impairment related to facility exit, amORTization of debt discount and issuance costs and income

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Guidance for non-GAAP net income per share includes the anti-dilutive impact of the capped call transactions entered into in connection with our convertible senior notes.', 'Guidance for non-GAAP net income per share includes the anti-dilutive impact of the capped call transactions entered into in connection with our convertible senior notes.', 'Guidance for non-GAAP net income per share includes the anti-dilutive impact of the capped call transactions entered into in connection with our convertible senior notes.', 'Guidance for non-GAAP net income per share includes the anti-dilutive impact of the capped call transactions entered into in connection with our convertible senior notes.', 'Guidance for non-GAAP net income per share includes the anti-dilutive impact of the capped call transactions entered into in connection with our convertible senior notes.', 'Guidance for non-GAAP net income per share includes the anti-dilutive impact of the capped call transactions entered into in co

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['We have not reconciled our expectations to non-GAAP income from operations and non-PAAP net income per share to their most directly comparable GAAP measures because certain items are out of our control or cannot be reasonably predicted.', 'We have not reconciled our expectations to non-GAAP income from operations and non-gaAP net income per share to their most directly comparable GAAP measures because certain items are out of our control or cannot be reasonably predicted.', 'We have not reconciled our expectations to non-GAAP income from operations and non-AdjustAP net income per share to their most directly comparable GAAP measures because certain items are out of our control or cannot be reasonably predicted.', 'We have not reconciled our expectations to non-GAAP income from operations and non-DAAP net income per share to their most directly comparable GAAP measures because certain items are out of our control or cannot be reasonably predicted.', 'We have not reconciled our expect

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Accordingly, a reconciliation for the guidance for non-GAAP income from operations and non-gaAP net income per share is not available without unreasonable effort.', 'Accordingly, a reconciliation for the guidance for non-GAAP income from operations and non-gaAP net income per share is not available without unreasonable effort.', 'Accordingly, a reconciliation for the guidance for non-GAAP income from operations and non-gaAP net income per share is not available without unreasonable effort.', 'Accordingly, a reconciliation for the guidance for non-GAAP income from operations and non-gaAP net income per share is not available without unreasonable effort.', 'Accordingly, a reconciliation for the guidance for non-GAAP income from operations and non-gaAP net income per share is not available without unreasonable effort.', 'Accordingly, a reconciliation for the guidance for non-GAAP income from operations and non-gaAP net income per share is not available without unreasonable effort.', 'A

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.', ' Conference Call and Webcast Information.']]


ORIGINAL SENTENCE
  Zscaler will host a conference call for analysts and investors to discuss its third quarter fiscal 2022 earnings results and outlook for its fourth quarter of fiscal 2022 and full year fiscal 2022 today at 1:30 p.m. Pacific time (4:30 p.m. Eastern time).
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Zscaler will host a conference call for analysts and investors to discuss its third quarter fiscal 2022 earnings results and outlook for its fourth quarter of fiscal 2022 and full year fiscal 2022 today at 1:30 p.m. Pacific time (4:30 PM. Eastern time).', ' Zscaler will host a conference call for analysts and investors to discuss its third quarter fiscal 2022 earnings results and outlook for its fourth quarter of fiscal 2022 and full year fiscal 2022 today at 1:30 p.m. Pacific time (4:30 PM. Eastern time).', ' Zscaler will host a conference call for analysts and investors to discuss its third quarter fiscal 2022 earnings results and outlook for its fourth quarter of fiscal 2022 and full year fiscal 2022 today at 1:30 p.m. Pacific time (4:30 PM. Eastern time).', ' Zscaler will host a conference call for analysts and investors to discuss its third quarter fiscal 2022 earnings results and outlook for its fourth quarter of fiscal 2022 and full year fiscal 2022 today at 1:30 p.m. Pacifi

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.', 'Time in Thursday, May 26, 2022 is 1:30 p.m. PT T.']]


ORIGINAL SENTENCE
 Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com .
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.', 'Webcast in Thursday, May 26, 2022 is https://ir.zscaler.com.']]


ORIGINAL SENTENCE
 Dial-in Number in Thursday, May 26, 2022 is 918 - 922 - 3018 T .
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Dial-in Number in Thursday, May 26, 2022 is 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is expected to be between 918 - 922 - 3018 T.', 'Dial-in Number in Thursday, May 26, 2022 is 918 - 922 - 3018 T.']]


ORIGINAL SENTENCE
  Upcoming Conferences.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.', ' Upcoming Conferences.']]


ORIGINAL SENTENCE
  Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.', ' Fourth quarter of fiscal 2022 virtual investor conference participation schedule:.']]


ORIGINAL SENTENCE
  Loop Software Conference Wednesday, June 1, 2022.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.', ' Loop Software Conference Wednesday, June 1, 2022.']]


ORIGINAL SENTENCE
  Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[" Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022.", " Bank of America's 2022 Global Technology Conference Wednesday, June 8, 2022."]]


ORIGINAL SENTENCE
  4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.', ' 4th Annual Mizuho Cybersecurity Summit 2022 Monday, June 13, 2022.']]


ORIGINAL SENTENCE
  Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.
PREDICTED PARAPHRASE



Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will be available on the Investor Relations section of the Zscaler website at https://ir.zscaler.com.', ' Sessions which offer a webcast will b

In [14]:
createTrainingData(sptrainDir, model, writetofile=True)
writeTrainingData(trainSPFile, sptrainDir)

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Input file ../../Summary/DATA/SENTPAIR/Train\APPN_2022-11-03_EP_YH.txt
Input file APPN_2022-11-03_EP_YH
../../Summary/DATA/SENTPAIR/Train
SP file ../../Summary/DATA/SENTPAIR/Train/APPN_2022-11-03_EP_YH_sp.tsv
SP file ..\..\Summary\DATA\SENTPAIR\Train\APPN_2022-11-03_EP_YH_sp.tsv already exists
Input file ../../Summary/DATA/SENTPAIR/Train\MDB_2023-03-08_EP_YH.txt
Input file MDB_2023-03-08_EP_YH
../../Summary/DATA/SENTPAIR/Train
SP file ../../Summary/DATA/SENTPAIR/Train/MDB_2023-03-08_EP_YH_sp.tsv
SP file ..\..\Summary\DATA\SENTPAIR\Train\MDB_2023-03-08_EP_YH_sp.tsv already exists
Input file ../../Summary/DATA/SENTPAIR/Train\OKTA_2022-08-31_EP_YH.txt
Input file OKTA_2022-08-31_EP_YH
../../Summary/DATA/SENTPAIR/Train
SP file ../../Summary/DATA/SENTPAIR/Train/OKTA_2022-08-31_EP_YH_sp.tsv
SP file ..\..\Summary\DATA\SENTPAIR\Train\OKTA_2022-08-31_EP_YH_sp.tsv already exists
Input file ../../Summary/DATA/SENTPAIR/Train\PLTR_2022-11-07_EP_YH.txt
Input file PLTR_2022-11-07_EP_YH
../../Summary/D

In [2]:
createTrainingData(spdevDir, model, writetofile=True)
writeTrainingData(devSPFile, spdevDir)

NameError: name 'createTrainingData' is not defined

In [9]:
train_df = pd.read_csv(trainSPFile, sep="\t", encoding = "utf-8").astype(str)
eval_df = pd.read_csv(devSPFile, sep="\t", encoding = "utf-8").astype(str)

train_df = train_df.rename(
    columns={"Sentence1": "text_a", "Sentence2": "text_b", "Target": "labels"}
)
eval_df = eval_df.rename(
    columns={"Sentence1": "text_a", "Sentence2": "text_b", "Target": "labels"}
)

train_df = train_df[["text_a", "text_b", "labels"]]
eval_df = eval_df[["text_a", "text_b", "labels"]]

#train_df["prefix"] = "sentpair"
#train_df = train_df[["Sentence1", "Sentence2", "Target"]]

#eval_df["prefix"] = "sentpair"
#eval_df = eval_df[["Sentence1", "Sentence2", "Target"]]

train_df = train_df.dropna()
train_df = train_df[train_df['text_a'].notna()]

eval_df = eval_df.dropna()
eval_df = eval_df[eval_df['text_a'].notna()]

train_df["text_a"] = train_df["text_a"].apply(clean_unnecessary_spaces)
train_df["text_b"] = train_df["text_b"].apply(clean_unnecessary_spaces)
train_df["labels"] = train_df["labels"].astype(int)
print("TRAIN DATA ..............")
print(train_df)

eval_df["text_a"] = eval_df["text_a"].apply(clean_unnecessary_spaces)
eval_df["text_b"] = eval_df["text_b"].apply(clean_unnecessary_spaces)
eval_df["labels"] = eval_df["labels"].astype(int)
print("EVAL DATA ..............")
print(eval_df)
#print(train_df["labels"].value_counts())

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


TRAIN DATA ..............
                                                text_a  \
0                                  Appian Corporation.   
1     Third quarter cloud subscription revenue incr...   
2          MCLEAN, Va., Nov. 03, 2022 (GLOBE NEWSWIRE)   
3    -- Appian (Nasdaq: APPN) today announced finan...   
4            Third Quarter 2022 Financial Highlights:.   
..                                                 ...   
945  Full fiscal year non-GAAP diluted EPS is expec...   
946  Full fiscal year non-GAAP diluted EPS is expec...   
947  Full fiscal year non-GAAP diluted EPS is expec...   
948  Full fiscal year non-GAAP diluted EPS is expec...   
949  Full fiscal year non-GAAP diluted EPS is expec...   

                                                text_b  labels  
0                                  Appian Corporation.       1  
1     Third quarter cloud subscription revenue is $...       1  
2          MCLEAN, Va., Nov. 03, 2022 (GLOBE NEWSWIRE)       1  
3    -- Appian (N

In [10]:
spmodel_args = ClassificationArgs()
spmodel_args.do_sample = True
spmodel_args.train_batch_size = 4
spmodel_args.use_multiprocessing = False
spmodel_args.num_train_epochs = 5
#spmodel_args.learning_rate = 5e-5
#spmodel_args.no_save = True

spmodel_args.eval_batch_size = 4
spmodel_args.evaluate_generated_text = True
spmodel_args.evaluate_during_training = True
spmodel_args.evaluate_during_training_verbose = True
spmodel_args.evaluate_during_training_steps = 50

spmodel_args.overwrite_output_dir = True
spmodel_args.reprocess_input_data = True
spmodel_args.save_eval_checkpoints = False
spmodel_args.save_model_every_epoch = False
spmodel_args.save_steps = -1

spmodel_args.max_length = 64
#spmodel_args.max_seq_length = 32
#spmodel_args.num_return_sequences = 10
spmodel_args.top_k = 50
spmodel_args.top_p = 0.95
spmodel_args.fp16 = False
#spmodel_args.num_beams = None

spmodel_args.use_early_stopping = True
spmodel_args.early_stopping_delta = 0.01
spmodel_args.early_stopping_metric = "mcc"
spmodel_args.early_stopping_metric_minimize = False
spmodel_args.early_stopping_patience = 5
#spmodel_args.evaluate_during_training_steps = 500
#spmodel_args.wandb_project =  "visualization-demo"

spmodel_args.output_dir = "spoutputs/"

spmodel = ClassificationModel(
    "roberta", 
    "roberta-base",
    args=spmodel_args,
    use_cuda=True,
)

#model = T5Model("t5", "t5-base", args=model_args, use_cuda=True)

# Train the model
spmodel.train_model(
    train_df, eval_df=eval_df, acc=sklearn.metrics.accuracy_score
)
results = spmodel.eval_model(eval_df, acc=sklearn.metrics.accuracy_score)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_2_3


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/238 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'tp': 226, 'tn': 0, 'fp': 95, 'fn': 0, 'auroc': 0.7504890544946436, 'auprc': 0.8861886026405987, 'acc': 0.7040498442367601, 'eval_loss': 0.6404289853425673}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'tp': 226, 'tn': 0, 'fp': 95, 'fn': 0, 'auroc': 0.7312063344201211, 'auprc': 0.8768380907846781, 'acc': 0.7040498442367601, 'eval_loss': 0.5888809635315413}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 2
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.31809872121085514, 'tp': 156, 'tn': 62, 'fp': 33, 'fn': 70, 'auroc': 0.7379133674895203, 'auprc': 0.8903308873854201, 'acc': 0.6791277258566978, 'eval_loss': 0.5798589151674582}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 3
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.41573068865689744, 'tp': 136, 'tn': 81, 'fp': 14, 'fn': 90, 'auroc': 0.7512808570097811, 'auprc': 0.8921216379224703, 'acc': 0.67601246105919, 'eval_loss': 0.6268648949486239}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.4342838300312293, 'tp': 113, 'tn': 91, 'fp': 4, 'fn': 113, 'auroc': 0.8100139729855612, 'auprc': 0.9170899746601476, 'acc': 0.6355140186915887, 'eval_loss': 0.8746898634658183}


Running Epoch 1 of 5:   0%|          | 0/238 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.48208301692254146, 'tp': 143, 'tn': 85, 'fp': 10, 'fn': 83, 'auroc': 0.8129017233348859, 'auprc': 0.9143276004416999, 'acc': 0.7102803738317757, 'eval_loss': 0.6389825644063545}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.4586734067305688, 'tp': 142, 'tn': 83, 'fp': 12, 'fn': 84, 'auroc': 0.7912901723334885, 'auprc': 0.905849579128193, 'acc': 0.7009345794392523, 'eval_loss': 0.6401038119960346}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.3687909629281912, 'tp': 161, 'tn': 65, 'fp': 30, 'fn': 65, 'auroc': 0.8063809967396367, 'auprc': 0.9149941290144958, 'acc': 0.7040498442367601, 'eval_loss': 0.5280114113824603}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 2
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.3789612798878791, 'tp': 129, 'tn': 80, 'fp': 15, 'fn': 97, 'auroc': 0.7679087098276665, 'auprc': 0.8829458680299361, 'acc': 0.6510903426791277, 'eval_loss': 0.6307815719901779}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 3
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5150860545376263, 'tp': 185, 'tn': 68, 'fp': 27, 'fn': 41, 'auroc': 0.8531904983698183, 'auprc': 0.9334056421923377, 'acc': 0.7881619937694704, 'eval_loss': 0.5260066050162286}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.49681448585073573, 'tp': 204, 'tn': 53, 'fp': 42, 'fn': 22, 'auroc': 0.8536096879366558, 'auprc': 0.9295391355561133, 'acc': 0.8006230529595015, 'eval_loss': 0.4740552768847089}


Running Epoch 2 of 5:   0%|          | 0/238 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.40029287546029985, 'tp': 211, 'tn': 37, 'fp': 58, 'fn': 15, 'auroc': 0.8520260829063809, 'auprc': 0.9292783168543419, 'acc': 0.7725856697819314, 'eval_loss': 0.6575259799425157}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5420181683530305, 'tp': 185, 'tn': 71, 'fp': 24, 'fn': 41, 'auroc': 0.8470889613414065, 'auprc': 0.9234530187362491, 'acc': 0.7975077881619937, 'eval_loss': 0.4815158004998977}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5281006995208005, 'tp': 159, 'tn': 83, 'fp': 12, 'fn': 67, 'auroc': 0.848160223567769, 'auprc': 0.9190901138694909, 'acc': 0.7538940809968847, 'eval_loss': 0.7303289007596717}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5901118127612061, 'tp': 198, 'tn': 68, 'fp': 27, 'fn': 28, 'auroc': 0.8619934792734049, 'auprc': 0.9351380555256623, 'acc': 0.8286604361370716, 'eval_loss': 0.5293819735223358}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5621156925378231, 'tp': 182, 'tn': 75, 'fp': 20, 'fn': 44, 'auroc': 0.8529110386585934, 'auprc': 0.9257541312435671, 'acc': 0.8006230529595015, 'eval_loss': 0.6518810080122892}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.549297523531815, 'tp': 166, 'tn': 82, 'fp': 13, 'fn': 60, 'auroc': 0.8625523986958548, 'auprc': 0.9325588327055547, 'acc': 0.7725856697819314, 'eval_loss': 0.7336089677625784}


Running Epoch 3 of 5:   0%|          | 0/238 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5600733733557253, 'tp': 193, 'tn': 68, 'fp': 27, 'fn': 33, 'auroc': 0.8544946436888683, 'auprc': 0.9306739324423292, 'acc': 0.8130841121495327, 'eval_loss': 0.694684260922921}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 2
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5102809628007935, 'tp': 189, 'tn': 65, 'fp': 30, 'fn': 37, 'auroc': 0.8627387051700045, 'auprc': 0.9379955715928128, 'acc': 0.7912772585669782, 'eval_loss': 1.0068078761047936}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 3
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5276141697063341, 'tp': 195, 'tn': 63, 'fp': 32, 'fn': 31, 'auroc': 0.8711690731252911, 'auprc': 0.9409303102499216, 'acc': 0.8037383177570093, 'eval_loss': 0.782242550937183}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 4
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5185218961622727, 'tp': 207, 'tn': 53, 'fp': 42, 'fn': 19, 'auroc': 0.8659990684676293, 'auprc': 0.9389595813018529, 'acc': 0.8099688473520249, 'eval_loss': 0.8348083303160505}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 5
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.6188597880854524, 'tp': 197, 'tn': 72, 'fp': 23, 'fn': 29, 'auroc': 0.8981369352585002, 'auprc': 0.9545770974533931, 'acc': 0.838006230529595, 'eval_loss': 0.5725787619738207}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.6188597880854524, 'tp': 197, 'tn': 72, 'fp': 23, 'fn': 29, 'auroc': 0.8979506287843504, 'auprc': 0.9545091772042817, 'acc': 0.838006230529595, 'eval_loss': 0.5715283384420161}


Running Epoch 4 of 5:   0%|          | 0/238 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.6075393296961292, 'tp': 198, 'tn': 70, 'fp': 25, 'fn': 28, 'auroc': 0.8905449464368886, 'auprc': 0.9496699771838208, 'acc': 0.8348909657320872, 'eval_loss': 0.7328953908290714}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.6075393296961292, 'tp': 198, 'tn': 70, 'fp': 25, 'fn': 28, 'auroc': 0.8872380065207267, 'auprc': 0.9469453910926576, 'acc': 0.8348909657320872, 'eval_loss': 0.7392786660872652}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 2
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5752057778307412, 'tp': 197, 'tn': 67, 'fp': 28, 'fn': 29, 'auroc': 0.8838379133674896, 'auprc': 0.9449190733388264, 'acc': 0.822429906542056, 'eval_loss': 0.7825780355337041}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 3
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.59272017127421, 'tp': 197, 'tn': 69, 'fp': 26, 'fn': 29, 'auroc': 0.8831858407079647, 'auprc': 0.9444756876248561, 'acc': 0.8286604361370716, 'eval_loss': 0.8106535985551337}
INFO:simpletransformers.classification.classification_model: No improvement in mcc
INFO:simpletransformers.classification.classification_model: Current step: 4
INFO:simpletransformers.classification.classification_model: Early stopping patience: 5
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.5752057778307412, 'tp': 197, 'tn': 67, 'fp': 28, 'fn': 29, 'auroc': 0.8808104331625524, 'auprc': 0.9432597479561418, 'acc': 0.822429906542056, 'eval_loss': 0.8353896502643985}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to spoutputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/321 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3


Running Evaluation:   0%|          | 0/81 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.5752057778307412, 'tp': 197, 'tn': 67, 'fp': 28, 'fn': 29, 'auroc': 0.8808104331625524, 'auprc': 0.9432597479561418, 'acc': 0.822429906542056, 'eval_loss': 0.8353896502643985}


In [30]:
spmodel = ClassificationModel(
    "roberta", "spoutputs", args=spmodel_args
)

predictions, raw_outputs = spmodel.predict(
    [
        [
            "Total revenue was $260.0 million, an increase of 66% year-over-year",
            "Total revenue is $260.0 million"
        ]
    ]
)
print(predictions)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[1]


In [None]:
files = glob.glob(sptestDir+"/*_EP_YH.txt")
#print(files)
#print(inputDir)
if(len(files) > 0):
    for file in (files):
        print("Input file " + file)
        sentences = getSentences(file, nlp)
        for line in sentences:
            print(line)
            predictPhrase(line, model, spmodel)

Input file ../../Summary/DATA/SENTPAIR/Test\CRWD_2022-11-29_EP_YH.txt
63
https://finance.yahoo.com/news/crowdstrike-reports-third-quarter-fiscal-210500874.html
GAAP Net Loss Per Share is $(0.24) in third quarter 2023 @@@
Cash And Cash Equivalents is $2466551 T in third quarter 2023 @@@
GAAP Gross Profit is $422654 T in third quarter 2023 @@@
GAAP Gross Margin is 72.76% in third quarter 2023 @@@
PG*** Revenue grows 53% year-over-year to reach $581 million.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Revenue grows 53% year-over-year to reach $581 million.', [' Revenue is $581 million.', ' Revenue is $581 million.', ' Revenue is $581 million.', ' Revenue is $581 million.', ' Revenue is $581 million.', ' Revenue is $581 million.', ' Revenue grows 53% year-over-year to reach $581 million.', ' Revenue is $581 million.', ' Revenue is $581 million.', ' Revenue is $581 million.']]]
PG*** Ending ARR grows 54% year-over-year to reach $2.34 billion.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Ending ARR grows 54% year-over-year to reach $2.34 billion.', [' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.', ' Ending ARR is $2.34 billion.']]]
PG*** Achieves record cash flow from operations of $243 million and record free cash flow of $174 million.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Achieves record cash flow from operations of $243 million and record free cash flow of $174 million.', [' Achieves record cash flow from operations of $243 million and free cash flow is $174 million.', ' Achieves free cash flow of $243 million and free cash Flow is $174 million.', ' Achieves record cash flow from operations of $243 million and free cash flow of $174 million.', ' Achieves record cash flow from operations is $243 million and free cash flow is $174 million.', ' Achieves record cash flow from operations of $243 million and free cash flow is $174 million.', ' Achieves record cash flow from operations of $243 million and free cash flow is $174 million.', ' Achieves record cash flow from operations of $243 million and free cash flow is $174 million.', ' Achieves free cash flow of $243 million and free cash Flow is $174 million.', ' Achieves record cash flow from operations of $243 million and free cash flow is $174 million.', ' Achieves record cash flow from operations of

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' AUSTIN, Texas, November 29, 2022--(BUSINESS WIRE)--CrowdStrike Holdings, Inc. (Nasdaq: CRWD), a global cybersecurity leader that provides cloud-delivered protection of endpoints, cloud workloads, identity and data, today announced financial results for the third quarter fiscal year 2023, ended October 31, 2022.', [' AUSTIN, Texas, November 29, 2022--(BUSINESS WIRE)--CrowdStrike Holdings, Inc. (Nasdaq: CRWD), a global cybersecurity leader that provides cloud-delivered protection of endpoints, cloud workloads, identity and data, today announced financial results', ' AUSTIN, Texas, November 29, 2022--(BUSINESS WIRE)--CrowdStrike Holdings, Inc. (Nasdaq: CRWD), a global cybersecurity leader that provides cloud-delivered protection of endpoints, cloud workloads, identity and data, today announced financial results', ' AUSTIN, Texas, November 29, 2022--(BUSINESS WIRE)--CrowdStrike Holdings, Inc. (Nasdaq: CRWD), a global cybersecurity leader that provides cloud-delivered protection of endp

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['However, total net new ARR was below our expectations as increased macroeconomic headwinds elongated sales cycles with smaller customers and caused some larger customers to pursue multi-phase subscription start dates, which delays ARR recognition until future quarters.', ['However, total net new ARR is below our expectations as increased macroeconomic headwinds elongated sales cycles with smaller customers and caused some larger customers to pursue multi-phase subscription start dates, which delays ARR recognition until future quarters.', 'However, total net new ARR is below our expectations as increased macroeconomic headwinds elongated sales cycles with smaller customers and caused some larger customers to pursue multi-phase subscription start dates, which delays ARR recognition until future quarters.', 'However, total net new ARR is below our expectations as increased macroeconomic headwinds elongated sales cycles with smaller customers and caused some larger customers to pursue 

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['As a platform consolidator with industry leading efficacy, we differentiate ourselves from the competition and empower customers, which we believe positions us to capture enduring industry trends and generate durable long-term growth.".', ['As a platform consolidator with industry leading efficacy, we differentiate ourselves from the competition and empower customers, which we believe positions us to capture enduring industry trends and generate durable long-term growth.".', 'As a platform consolidator with industry leading efficacy, we differentiate ourselves from the competition and empower customers, which we believe positions us to capture enduring industry trends and generate durable long-term growth.".', 'As a platform consolidator with industry leading efficacy, we differentiate ourselves from the competition and empower customers, which we believe positions us to capture enduring industry trends and generate durable long-term growth.".', 'As a platform consolidator with indu

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', ['We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', 'We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', 'We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', 'We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', 'We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', 'We will continue to focus on delivering strong unit economics as we balance growth with profitability and free cash flow.".', 'We will continue to focus on delivering strong unit economics as we balance growth with profitability and f

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Third Quarter Fiscal 2023 Financial Highlights.', [' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.', ' Third Quarter Fiscal 2023 Financial Highlights.']]]
PG*** Revenue: Total revenue was $580.9 million, a 53% increase, compared to $380.1 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Revenue: Total revenue was $580.9 million, a 53% increase, compared to $380.1 million in the third quarter of fiscal 2022.', [' Total revenue is $580.9 million.', ' Total revenue is $580.9 million in the third quarter of fiscal 2022.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.', ' Total revenue is $580.9 million.']]]
Subscription revenue was $547.4 million, a 53% increase, compared to $357.0 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Subscription revenue was $547.4 million, a 53% increase, compared to $357.0 million in the third quarter of fiscal 2022.', ['Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.', 'Subscription revenue is $547.4 million.']]]
PG*** Annual Recurring Revenue (ARR) increased 54% year-over-year and grew to $2.34 billion as of October 31, 2022, of which $198.1 million was net new ARR added in the quarter.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Annual Recurring Revenue (ARR) increased 54% year-over-year and grew to $2.34 billion as of October 31, 2022, of which $198.1 million was net new ARR added in the quarter.', [' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022, of which $198.1 million is net new ARR added in the quarter.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.', ' Annual Recurring Revenue (ARR) is $2.34 billion as of October 31, 2022.']]

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', ['The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1.0 million to net new ARR in the third quarter of fiscal 2023.', 'The acquisition of Reposify contributed less than $1

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Subscription Gross Margin: GAAP subscription gross margin was 75%, compared to 76% in the third quarter of fiscal 2022.', [' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.', ' GAAP subscription gross margin is 75%.']]]
Non-GAAP subscription gross margin was 78%, compared to 79% in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP subscription gross margin was 78%, compared to 79% in the third quarter of fiscal 2022.', ['Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.', 'Non-GAAP subscription gross margin is 78%.']]]
PG*** Income/Loss from Operations: GAAP loss from operations was $56.4 million, compared to $40.3 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Income/Loss from Operations: GAAP loss from operations was $56.4 million, compared to $40.3 million in the third quarter of fiscal 2022.', [' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.', ' GAAP loss from operations is $56.4 million.']]]
Non-GAAP income from operations was $89.7 million, compared to $50.7 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP income from operations was $89.7 million, compared to $50.7 million in the third quarter of fiscal 2022.', ['Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.', 'Non-GAAP income from operations is $89.7 million.']]]
PG*** Net Income/Loss Attributable to CrowdStrike: GAAP net loss attributable to CrowdStrike was $55.0 million, compared to $50.5 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Net Income/Loss Attributable to CrowdStrike: GAAP net loss attributable to CrowdStrike was $55.0 million, compared to $50.5 million in the third quarter of fiscal 2022.', [' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.', ' GAAP net loss is $55.0 million.']]]
GAAP net loss per share attributable to CrowdStrike, basic and diluted was $0.24, compared to $0.22 in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['GAAP net loss per share attributable to CrowdStrike, basic and diluted was $.24, compared to $.22 in the third quarter of fiscal 2022.', ['GAAP net loss per share attributable to CrowdStrike, basic and diluted is $.24.', 'GAAP net loss per share attributable to CrowdStrike, basic and diluted is $.24.', 'GAAP net loss per share attributable to CrowdStrike, basic and diluted is $.24.', 'GAAP net loss per share is $.24.', 'GAAP net loss per share is $.24.', 'GAAP net loss per share attributable to CrowdStrike, basic and diluted is $.24.', 'GAAP net loss per share attributable to CrowdStrike, basic and diluted is $.24.', 'GAAP net loss per share is $.24.', 'GAAP net loss per share attributable to CrowdStrike, basic and diluted is $.24.', 'GAAP net loss per share is $.24.']]]
Non-GAAP net income attributable to CrowdStrike was $96.1 million, compared to $41.1 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP net income attributable to CrowdStrike was $96.1 million, compared to $41.1 million in the third quarter of fiscal 2022.', ['Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income is $96.1 million.', 'Non-GAAP net income attributable to CrowdStrike is $96.1 million.', 'Non-GAAP net income attributable to CrowdStrike is $96.1 million.']]]
Non-GAAP net income attributable to CrowdStrike per share, diluted, was $0.40, compared to $0.17 in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Non-GAAP net income attributable to CrowdStrike per share, diluted, was $.40, compared to $.17 in the third quarter of fiscal 2022.', ['Non-GAAP net income per share, diluted, is $.40.', 'Non-GAAP net income per share is $.40.', 'Non-GAAP net income per share, diluted, is $.40.', 'Non-GAAP net income per share, diluted, is $.40.', 'Non-GAAP net income per share, diluted, is $.40.', 'Non-GAAP net income per share is $.40.', 'Non-GAAP net income per share, diluted, is $.40.', 'Non-GAAP net income per share is $.40.', 'Non-GAAP net income per share, diluted, is $.40.', 'Non-GAAP net income per share is $.40.']]]
PG*** Cash Flow: Net cash generated from operations was $242.9 million, compared to $159.1 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Cash Flow: Net cash generated from operations was $242.9 million, compared to $159.1 million in the third quarter of fiscal 2022.', [' Net cash generated from operations is $242.9 million.', ' Net cash flow generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.', ' Net cash flow generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.', ' Net cash generated from operations is $242.9 million.']]]
Free cash flow was $174.1 million, compared to $123.5 million in the third quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Free cash flow was $174.1 million, compared to $123.5 million in the third quarter of fiscal 2022.', ['Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.', 'Free cash flow is $174.1 million.']]]
PG*** Cash and Cash Equivalents was $2.47 billion as of October 31, 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Cash and Cash Equivalents was $2.47 billion as of October 31, 2022.', [' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.', ' Cash and Cash Equivalents is $2.47 billion as of October 31, 2022.']]]
PG***
Story continues.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Story continues.', ['Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.', 'Story continues.']]]
SCBQ*** Recent Highlights.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Recent Highlights.', [' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.', ' Recent Highlights.']]]
PG*** Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022, representing 44% growth year-over-year.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022, representing 44% growth year-over-year.', ['Net new subscription customers are 21,146 as of October 31, 2022.', ' Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022.', ' Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022.', ' Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022.', ' Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022.', ' Added 1,460 net new subscription customers in the quarter for a total of 21,146 subscription customers as of October 31, 2022, and total subscription customers are 21,432.', ' Added 1,460 net new subscription customers in the quart

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' CrowdStrike’s subscription customers that have adopted five or more, six or more and seven or more modules was 60%, 36% and 21%, respectively, as of October 31, 2022.', [' CrowdStrike’s subscription customers that have adopted five or more, six or more and seven or more modules is 60%, 36% and 21%, respectively, as of October 31, 2022.', ' CrowdStrike’s subscription customers that have adopted five or more, six or more and seven or more modules is 60%, 36% and 21%. respectively, as of October 31, 2022.', ' CrowdStrike’s subscription customers that have adopted five or more, six or more and seven or more modules is 60%, 36% and 21%.', ' CrowdStrike’s subscription customers that have adopted five or more, six or more and seven or more modules is 60%, 36% and 21%, respectively, as of October 31, 2022.', ' CrowdStrike’s subscription customers that have adopted five or more, six or more and seven or more modules is 60%, 36% and 21%. respectively, as of October 31, 2022.', ' CrowdStrike’

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[" Acquired Reposify, an external attack surface management (EASM) platform, to help CrowdStrike's customers bolster visibility and reduce the risk exposure of external assets.", [" Acquired Reposify, an external attack surface management (EASM) platform, to help CrowdStrike's customers bolster visibility and reduce the risk exposure of external assets.", " Acquired Reposify, an external attack surface management (EASM) platform, to help CrowdStrike's customers bolster visibility and reduce the risk exposure of external assets.", " Acquired Reposify, an external attack surface management (EASM) platform, to help CrowdStrike's customers bolster visibility and reduce the risk exposure of external assets.", " Acquired Reposify, an external attack surface management (EASM) platform, to help CrowdStrike's customers bolster visibility and reduce the risk exposure of external assets.", " Acquired Reposify, an external attack surface management (EASM) platform to help CrowdStrike's customers 

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Highlights of the sixth annual Fal.', [' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.', ' Highlights of the sixth annual Fal.']]]
Con cybersecurity conference included Falcon Insight XDR, Falcon Discover for IoT, Falcon LogScale, Falcon Complete LogScale and new Cloud Native Application Protection Platform (CNAPP) capabilities.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Con cybersecurity conference included Falcon Insight XDR, Falcon Discover for IoT, Falcon LogScale, Falcon Complete LogScale and new Cloud Native Application Protection Platform (CNAPP) capabilities.', ['Con cybersecurity conference included Falcon Insight XDR, Falcon Discover for IoT, Falcon LogScale, Falcon Complete LogScale and new Cloud Native Application Protection Platform (CNAPP) capabilities.', 'Con cybersecurity conference included Falcon Insight XDR, Falcon Discover for IoT, Falcon LogScale, Falcon Complete LogScale and new Cloud Native Application Protection Platform (CNAPP) capabilities.', 'Con cybersecurity conference included Falcon Insight XDR, Falcon Discover for IoT, Falcon LogScale, Falcon Complete LogScale and new Cloud Native Application Protection Platform (CNAPP) capabilities.', 'Con cybersecurity conference included Falcon Insight XDR, Falcon Discover for IoT, Falcon LogScale, Falcon Complete LogScale and new Cloud Native Application Protection Platform (CNAPP

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', [' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global alliance to deliver Cloud Security and Observability services.', ' Expanded the CrowdStrike and EY global al

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', [' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', ' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', ' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', ' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', ' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', ' Achieved 99% detection coverage of adversary behavior in the MITRE Engenuity ATT&CK Evaluations for Security Service Providers.', ' Achieved 99% detection coverage of adversary behavior in the MITRE Enge

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', [' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', ' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', ' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', ' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', ' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', ' Received a AAA rating in SE Labs Enterprise Advanced Security Ransomware Test and achieved 100% Protection with zero false positives.', ' Received a AAA rating in SE L

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Chosen as a winner for the 2022 CRN Tech Innovator Awards for CrowdStrike Cloud Security, a Cloud Native Application Protection Platform (CNAPP) solution.', [' Chosen as a winner for the 2022 CRN Tech Innovator Awards for CrowdStrike Cloud Security, a Cloud Native Application Protection Platform (CNAPP) solution.', ' Chosen as a winner for the 2022 CRN Tech Innovator Awards for CrowdStrike Cloud Security, a Cloud Native Application Protection Platform (CNAPP) solution.', ' Chosen as a winner for the 2022 CRN Tech Innovator Awards for CrowdStrike Cloud Security, a Cloud Native Application Protection Platform (CNAPP) solution.', ' Chosen as a winner for the 2022 CRN Tech Innovator Awards for CrowdStrike Cloud Security, a Cloud Native Application Protection Platform (CNAPP) solution.', ' Chosen as a winner for the 2022 CRN Tech Innovator Awards for CrowdStrike Cloud Security, a Cloud Native Application Protection Platform (CNAPP) solution.', ' Chosen as a winner for the 2022 CRN Tech 

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', [' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of the Best Workplaces for Women by Great Place to Work® and Fortune Magazine 2022.', ' Named One of th

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[[' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', [' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', ' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', ' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', ' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', ' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', ' Expanded charitable giving strategy to include a partnership with Operation Motorsport Foundation and NextGen Scholarship awards.', ' Expanded charitable giving strategy to include a partners

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Total Revenue in Q4 Fy23guidance is $619.1 MN - $628.2 million  .', ['Total Revenue is expected to be between $619.1 MN - $628.2 million in Q4 Full Year.', 'Total Revenue in Q4 is expected to be between $619.1 MN - $628.2 million.', 'Total Revenue is expected to be between $619.1 MN - $628.2 million for Q4.', 'Total Revenue is expected to be between $619.1 MN - $628.2 million in Q4.', 'Total Revenue is expected to be between $619.1 MN - $628.2 million for Q4.', 'Total Revenue in Q4 Full Year is expected to be between $619.1 MN - $628.2 million.', 'Total Revenue is expected to be between $619.1 MN - $628.2 million in Q4 of Fiscal Year.', 'Total Revenue is expected to be between $619.1 MN - $628.2 million for Q4.', 'Total Revenue in Q4 Full Year Fy23guidance is expected to be between $619.1 MN - $628.2 million.', 'Total Revenue is expected to be between $619.1 MN and $628.2 million in Q4.']]]
Total Revenue in Full Year Fy23guidance is $2223.0 MN - $2232.0 million  .


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

[['Total Revenue in Full Year Fy23guidance is $2223.0 MN - $2232.0 million  .', ['Total Revenue is expected to be between $2223.0 MN - $2232.0 million for Full Year.', 'Total Revenue is expected to be between $2223.0 MN and $2232.0 million in Full Year Full Year.', 'Total Revenue is expected to be between $2223.0 MN - $2232.0 million.', 'Total Revenue in Full Year Full Year is expected to be between $2223.0 MN - $2232.0 million .', 'Total Revenue is expected to be between $2223.0 MN - $2232.0 million for Full Year.', 'Total Revenue in Full Year is expected to be between $2223.0 MN - $2232.0 million in Full year.', 'Total Revenue in Full Year Fy23guidance is expected to be between $2223.0 MN - $2232.0 million.', 'Total Revenue is expected to be between $2223.0 MN - $2232.0 million in Full Year.', 'Total Revenue is expected to be between $2223.0 MN and $2232.0 million for Full Year.', 'Total Revenue in Full Year Full Year is expected to be between $2223.0 MN - $2232.0 million.']]]
Non-GA

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

model = Seq2SeqModel(
    encoder_decoder_type="bart", encoder_decoder_name="outputs"
)


while True:
    original = input("Enter text to paraphrase: ")
    to_predict = [original]

    preds = model.predict(to_predict)

    print("---------------------------------------------------------")
    print(original)

    print()
    print("Predictions >>>")
    for pred in preds[0]:
        print(pred)

    print("---------------------------------------------------------")
    print()

In [10]:
def load_data(file_path, input_text_column, target_text_column, label_column, keep_label=1):
    df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)
    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    df = df[["input_text", "target_text"]]
    df["prefix"] = "paraphrase"

    return df

In [6]:
train_df = pd.read_csv("../../Summary/ParaPhrase/final/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("../../Summary/ParaPhrase/final/dev.tsv", sep="\t").astype(str)

train_df = train_df.loc[train_df["label"] == "1"]
eval_df = eval_df.loc[eval_df["label"] == "1"]

train_df = train_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
eval_df["prefix"] = "paraphrase"

In [7]:
train_df = train_df[["prefix", "input_text", "target_text"]]
eval_df = eval_df[["prefix", "input_text", "target_text"]]

train_df = train_df.dropna()
eval_df = eval_df.dropna()
train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)

eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)

train_df = (train_df.loc[1:10000])
eval_df = (eval_df.loc[1:1000])
print(eval_df)

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


         prefix                                         input_text  \
1    paraphrase  They were there to enjoy us and they were ther...   
2    paraphrase  After the end of the war in June 1902, Higgins...   
3    paraphrase  From the merger of the Four Rivers Council and...   
4    paraphrase  The group toured extensively and became famous...   
5    paraphrase  Kathy and her husband Pete Beale ( Peter Dean ...   
..          ...                                                ...   
990  paraphrase  After his service Lockhart lived in Texas but ...   
991  paraphrase  After medical treatment, Strozzi started takin...   
992  paraphrase  In December 1969 became the 49th Army - Divisi...   
993  paraphrase  In `` The Stand '' by Glen Pequod Bateman, Woo...   
996  paraphrase  Dora Rangelova is the current captain of the B...   

                                           target_text  
1    They were there for us to enjoy and they were ...  
2    In August, after the end of the war in J

In [8]:
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
model_args = Seq2SeqArgs()
model_args.do_sample = True
#model_args.eval_batch_size = 64
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
#model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_steps = 1000
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
#model_args.max_length = 128
model_args.max_length = 32
#model_args.max_seq_length = 128
model_args.max_seq_length = 32
model_args.num_beams = None
model_args.num_return_sequences = 3
model_args.num_train_epochs = 2
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
#model_args.train_batch_size = 8
model_args.train_batch_size = 4
model_args.use_multiprocessing = False
#model_args.wandb_project = "Paraphrasing with BART"


model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-base",
    args=model_args,
)
model.train_model(train_df, eval_data=eval_df)

to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/4491 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/1123 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/451 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 0.5714342097441355}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-1123-epoch-1
INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/451 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 0.5561367531617483}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model


Running Epoch 1 of 2:   0%|          | 0/1123 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/451 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 0.5267087161540985}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-2246-epoch-2
INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/451 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 0.52150759100914}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/
INFO:simpletransformers.seq2seq.seq2seq_model: Training of facebook/bart-base model complete. Saved to outputs/.


Generating outputs:   0%|          | 0/15 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 222.00 MiB (GPU 0; 4.00 GiB total capacity; 2.68 GiB already allocated; 0 bytes free; 3.44 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print(preds)

In [24]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

In [25]:
sequence = "Non-GAAP net loss per share is expected to be between $(0.36) and $(0.42)"
tokens = tokenizer.tokenize(sequence)

print(tokens, len(tokens))

['Non', '-', 'GA', 'AP', 'Ġnet', 'Ġloss', 'Ġper', 'Ġshare', 'Ġis', 'Ġexpected', 'Ġto', 'Ġbe', 'Ġbetween', 'Ġ$(', '0', '.', '36', ')', 'Ġand', 'Ġ$(', '0', '.', '42', ')'] 24
