In [29]:
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
import warnings
import pandas as pd
import os
from datetime import datetime
import logging
import glob
from pathlib import Path

In [41]:
devDir = "../../Summary/DATA/PARAPHRASE/Dev"
rplStr = ["PG***", "CS***", "ED***", "SCHQ***", "SCBQ***", "SCBF***", "SCHF***", "SCG***", "GF***", "GQ***"]

files = glob.glob(devDir+"/*_EP_YH.txt")
#print(files)
#print(devDir)
if(len(files) > 0):
    for file in (files):
        print("Input file " + file)
        basefile = os.path.basename(file)
        inputfile = os.path.splitext(basefile)[0]
        #print(inputfile)
        outfilePath = inputfile + "_phrase.txt"
        outfilePath = devDir + "/" + outfilePath
        print("Phrase file " + outfilePath)
        outfile = Path(outfilePath)
        if outfile.is_file():
            print("Phrase file " + str(outfile) + " already exists")
            continue
        with open(outfile, "w") as of:
            with open(file, encoding="utf-8") as f:
                line = f.readline()
                while line:
                    for s in rplStr:
                        line = line.replace(s, "")
                    if("@@@" in line or line == "\n" or "https://finance.yahoo.com" in line):
                        line = f.readline()
                        continue
                    #print(line)
                    of.write(line + "\n")
                    line = f.readline()

Input file ../../Summary/DATA/PARAPHRASE/Dev\APPN_2023-02-16_EP_YH.txt
Phrase file ../../Summary/DATA/PARAPHRASE/Dev/APPN_2023-02-16_EP_YH_phrase.txt
Phrase file ..\..\Summary\DATA\PARAPHRASE\Dev\APPN_2023-02-16_EP_YH_phrase.txt already exists


In [2]:
def load_data(file_path, input_text_column, target_text_column, label_column, keep_label=1):
    df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)
    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    df = df[["input_text", "target_text"]]
    df["prefix"] = "paraphrase"

    return df

In [3]:
def clean_unnecessary_spaces(out_string):
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)
    out_string = (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return out_string

In [4]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [5]:
train_df = pd.read_csv("../../Summary/ParaPhrase/final/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("../../Summary/ParaPhrase/final/dev.tsv", sep="\t").astype(str)

train_df = train_df.loc[train_df["label"] == "1"]
eval_df = eval_df.loc[eval_df["label"] == "1"]

train_df = train_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
eval_df["prefix"] = "paraphrase"

In [6]:
train_df = train_df[["prefix", "input_text", "target_text"]]
eval_df = eval_df[["prefix", "input_text", "target_text"]]

train_df = train_df.dropna()
eval_df = eval_df.dropna()
train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)

eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)

train_df = (train_df.loc[1:10000])
eval_df = (eval_df.loc[1:1000])
print(train_df)

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


          prefix                                         input_text  \
1     paraphrase  The NBA season of 1975 -- 76 was the 30th seas...   
3     paraphrase  When comparable rates of flow can be maintaine...   
4     paraphrase  It is the seat of Zerendi District in Akmola R...   
5     paraphrase  William Henry Henry Harman was born on 17 Febr...   
7     paraphrase  With a discrete amount of probabilities Formul...   
...          ...                                                ...   
9992  paraphrase  The 500 Hispanic settlers who had lived near L...   
9996  paraphrase  In the wake of the Herat affair, Great Britain...   
9997  paraphrase  Psalm 79 ( Greek numbering : Psalm 78 ) is the...   
9998  paraphrase  It is located in the Annapolis Valley in Kings...   
9999  paraphrase  The team kits for the 2005 -- 06 season are pr...   

                                            target_text  
1     The 1975 -- 76 season of the National Basketba...  
3     The results are high when

In [None]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
model_args = Seq2SeqArgs()
model_args.do_sample = True
#model_args.eval_batch_size = 64
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
#model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_steps = 1000
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
#model_args.max_length = 128
model_args.max_length = 32
#model_args.max_seq_length = 128
model_args.max_seq_length = 32
model_args.num_beams = None
model_args.num_return_sequences = 3
model_args.num_train_epochs = 2
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
#model_args.train_batch_size = 8
model_args.train_batch_size = 4
model_args.use_multiprocessing = False
model_args.wandb_project = "Paraphrasing with BART"


model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-base",
    args=model_args,
)
model.train_model(train_df, eval_data=eval_df)

to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/4491 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: ankan-ghosh. Use `wandb login --relogin` to force relogin


Running Epoch 0 of 2:   0%|          | 0/1123 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/451 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 0.5751085778077444}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-1123-epoch-1
INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/451 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 0.5639134407043457}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model


Running Epoch 1 of 2:   0%|          | 0/1123 [00:00<?, ?it/s]

In [8]:
print(preds)

[['paraphrase: They were there to enjoy us and they were there for pray for us.', 'paraphrase: They were there to enjoy us and they were there for pray for us.', 'paraphrase: They were there to enjoy us and they were there for pray for us.'], ["paraphrase: After the end of the war in June 1902, Higgins left Southampton in the `` SSBavarian '' and returned to Southampton", "paraphrase: After the end of the war in June 1902, Higgins left Southampton in the `` SSBavarian '' and returned to Southampton", "paraphrase: After the end of the war in June 1902, Higgins left Southampton in the `` SSBavarian '' and returned to Southampton"], ['paraphrase: From the merger of the Four Rivers Council and the Audubon Council, the Shawnee Trails Council was born.', 'paraphrase: From the merger of the Four Rivers Council and the Audubon Council, the Shawnee Trails Council was born.', 'paraphrase: From the merger of the Four Rivers Council and the Audubon Council, the Shawnee Trails Council was born.'], 

In [9]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

model = Seq2SeqModel(
    encoder_decoder_type="bart", encoder_decoder_name="outputs"
)


while True:
    original = input("Enter text to paraphrase: ")
    to_predict = [original]

    preds = model.predict(to_predict)

    print("---------------------------------------------------------")
    print(original)

    print()
    print("Predictions >>>")
    for pred in preds[0]:
        print(pred)

    print("---------------------------------------------------------")
    print()

Enter text to paraphrase: Revenue: Total revenue was $637.4 million, a 48% increase, compared to $431.0 million in the fourth quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
Revenue: Total revenue was $637.4 million, a 48% increase, compared to $431.0 million in the fourth quarter of fiscal 2022.

Predictions >>>
Revenue: Total revenue was $637.4 million, a 48% increase, compared to $431.0 million in the fourth quarter
Revenue: Total revenue was $637.4 million, a 48% increase, compared to $431.0 million in the fourth quarter
Revenue: Total revenue was $637.4 million, a 48% increase, compared to $431.0 million in the fourth quarter
---------------------------------------------------------

Enter text to paraphrase: Income/Loss from Operations: GAAP loss from operations was $61.5 million, compared to $23.5 million in the fourth quarter of fiscal 2022.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
Income/Loss from Operations: GAAP loss from operations was $61.5 million, compared to $23.5 million in the fourth quarter of fiscal 2022.

Predictions >>>
Income/Loss from Operations: GAAP loss from operations was $61.5 million, compared to $23.5m in the
Income/Loss from Operations: GAAP loss from operations was $61.5 million, compared to $23.5m in the
Income/Loss from Operations: GAAP loss from operations was $61.5 million, compared to $23.5m in the
---------------------------------------------------------



KeyboardInterrupt: Interrupted by user