In [9]:
# !python examples/paraphraser/paraphrase.py \
#     --en2fr examples/translation_moe/src/paraphraser.en-fr \
#     --fr2en examples/translation_moe/src/paraphraser.fr-en.hMoEup \
#     --files input_fairseq

import pandas as pd
from tqdm.notebook import tqdm
from ipywidgets import IntProgress
from nltk.translate.bleu_score import sentence_bleu
import os
import numpy as np
import torch
# Expose only GPU index 2 to CUDA for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"]='2'

In [37]:
#!/usr/bin/env python3 -u

import argparse
import fileinput
import logging
import os
import sys

from fairseq.models.transformer import TransformerModel


# logging.getLogger().setLevel(logging.INFO)


def paraphaser(
    text_list,
    en2fr_path='../translation_moe/src/paraphraser.en-fr',
    fr2en_path='../translation_moe/src/paraphraser.fr-en.hMoEup',
    user_dir=None,
    num_experts=10,
):
    """Paraphrase English texts by round-trip translation (en -> fr -> en).

    Each input text is translated to French once and then translated back
    to English once per expert index, so every input yields
    ``num_experts`` paraphrases.

    Args:
        text_list: Iterable of English strings to paraphrase.
        en2fr_path: Value passed as ``model_name_or_path`` when loading the
            en->fr model.
        fr2en_path: Value passed as ``model_name_or_path`` when loading the
            fr->en model (loaded with ``task="translation_moe"``).
        user_dir: Directory handed to the fr->en loader as ``user_dir``.
            When ``None`` it defaults to
            ``<parent of cwd>/translation_moe/translation_moe_src`` and that
            directory must exist.
        num_experts: Number of expert indices (``0 .. num_experts - 1``) to
            decode with for every input.

    Returns:
        A flat list with ``num_experts`` consecutive entries per input: the
        paraphrases of the i-th text sit at positions
        ``i * num_experts .. (i + 1) * num_experts - 1``, in expert order.

    Raises:
        RuntimeError: If ``user_dir`` is ``None`` and the default directory
            does not exist.
    """
    if user_dir is None:
        # abspath('examples') is <cwd>/examples, so two dirname() calls give
        # the parent of the current working directory.
        user_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath('examples'))),
            "translation_moe",
            "translation_moe_src",
        )
        if os.path.exists(user_dir):
            logging.info("found user_dir:" + user_dir)
        else:
            raise RuntimeError(
                "cannot find fairseq examples/translation_moe/src "
                "(tried looking here: {})".format(user_dir)
            )

    # Keep the path strings and the loaded models under separate names.
    logging.info("loading en2fr model from:" + en2fr_path)
    en2fr_model = TransformerModel.from_pretrained(
        model_name_or_path=en2fr_path,
        tokenizer="moses",
        bpe="sentencepiece",
    ).eval()

    logging.info("loading fr2en model from:" + fr2en_path)
    fr2en_model = TransformerModel.from_pretrained(
        model_name_or_path=fr2en_path,
        tokenizer="moses",
        bpe="sentencepiece",
        user_dir=user_dir,
        task="translation_moe",
    ).eval()

    # Use the GPU when one is visible; otherwise fall back to (slow) CPU
    # inference instead of crashing on .cuda().
    if torch.cuda.is_available():
        en2fr_model = en2fr_model.cuda()
        fr2en_model = fr2en_model.cuda()
    else:
        logging.warning("CUDA is not available; running the paraphraser on CPU")

    augmented = []
    for text in tqdm(text_list):
        # One forward translation, then one back-translation per expert.
        fr = en2fr_model.translate(text)
        augmented.extend(
            fr2en_model.translate(fr, inference_step_args={"expert": expert})
            for expert in range(num_experts)
        )
    return augmented


In [42]:
# Root folder of the datasets: get_paraphaser reads its input from
# abspath + <dataset path> and writes results under abspath + 'augmented/paraphaser/'.
abspath = '/mount/experiment/clustering_git/datasets/'

def get_highest_index(scores):
    """Return the position of the largest value in ``scores``.

    The positions are ranked in ascending score order and the last one
    of that ranking is returned.
    """
    ascending_order = np.argsort(scores)
    return ascending_order[-1]

def get_mid_index(scores):
    """Return the position of the middle-ranked value in ``scores``.

    The positions are ranked in ascending score order and the one at rank
    ``len(scores) // 2`` is returned (the upper middle for even lengths).
    """
    ascending_order = np.argsort(scores)
    middle_rank = len(scores) // 2
    return ascending_order[middle_rank]

def get_lowest_index(scores):
    """Return the position of the smallest value in ``scores``.

    The positions are ranked in ascending score order and the first one
    of that ranking is returned.
    """
    ascending_order = np.argsort(scores)
    return ascending_order[0]

def get_list_BLEU(input_text, augmented, expts = 10):
    """Pick the highest-, middle- and lowest-BLEU candidate for every input.

    ``augmented`` must hold ``expts`` consecutive candidates per input: the
    candidates of ``input_text[i]`` are ``augmented[i * expts]`` up to
    ``augmented[(i + 1) * expts - 1]``. Every candidate is scored with
    ``sentence_bleu`` against its whitespace-split source text.

    Args:
        input_text: Sequence of source strings.
        augmented: Sequence of candidate strings, ``expts`` per source.
        expts: Number of candidates per source text (at least 1).

    Returns:
        A tuple ``(highest, middle, lowest)`` of three lists, each holding
        one candidate per source text.

    Raises:
        ValueError: If ``expts`` is smaller than 1, or if ``augmented`` does
            not contain exactly ``len(input_text) * expts`` entries. A
            mismatch would otherwise pair candidates with the wrong source.
    """
    if expts < 1:
        raise ValueError("expts must be at least 1, got {}".format(expts))

    # Fail fast on misaligned inputs, before any BLEU score is computed.
    expected = len(input_text) * expts
    if len(augmented) != expected:
        raise ValueError(
            "expected {} augmented texts ({} inputs x {} candidates), got {}".format(
                expected, len(input_text), expts, len(augmented)
            )
        )

    augmented_hig_list = []
    augmented_mid_list = []
    augmented_low_list = []

    for i, inp in enumerate(input_text):
        offset = i * expts  # position of this input's first candidate
        reference = [inp.split()]

        # BLEU score of each of this input's candidates
        scores = [
            sentence_bleu(reference, augmented[j].split())
            for j in range(offset, offset + expts)
        ]

        # Translate the within-group positions back to positions in `augmented`
        high_idx = get_highest_index(scores) + offset
        mid_idx = get_mid_index(scores) + offset
        low_idx = get_lowest_index(scores) + offset

        augmented_hig_list.append(augmented[high_idx])
        augmented_mid_list.append(augmented[mid_idx])
        augmented_low_list.append(augmented[low_idx])

    return augmented_hig_list, augmented_mid_list, augmented_low_list

def get_paraphaser(path_to_dataset, output_name):
    """Paraphrase a dataset's ``text0`` column and save the BLEU-ranked picks.

    Reads the tab-separated file ``abspath + path_to_dataset`` using the
    column names ``label``, ``text0``, ``text1``, generates paraphrases of
    ``text0`` with ``paraphaser`` and selects, per row, the candidates with
    the lowest, middle and highest BLEU score via ``get_list_BLEU``.

    The result is written tab-separated (header, no index) to
    ``abspath + 'augmented/paraphaser/' + output_name`` with:
    ``text1`` = lowest-BLEU, ``text2`` = middle-BLEU, ``text3`` = highest-BLEU
    paraphrase. Any ``text1`` values read from the input are overwritten.

    Args:
        path_to_dataset: Input file path relative to ``abspath``.
        output_name: File name of the output inside the output folder.
    """
    n_candidates = 10  # candidates per input text passed on to get_list_BLEU
    output_dir = abspath + 'augmented/paraphaser/'

    # Create the output folder up front so that a missing directory is not
    # discovered only after the long paraphrasing run has finished.
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(abspath + path_to_dataset, sep = '\t', names = ['label', 'text0', 'text1'])
    text0 = df.text0.values
    augmented = paraphaser(text0)
    augmented_hig_list, augmented_mid_list, augmented_low_list = get_list_BLEU(
        text0, augmented, n_candidates
    )

    # text1 = low, text2 = median, text3 = high
    df['text1'] = augmented_low_list
    df['text2'] = augmented_mid_list
    df['text3'] = augmented_hig_list
    df.to_csv(output_dir + output_name, index=False, sep = '\t')

In [43]:
# #search_snippets
# get_paraphaser('search_snippets/search_snippets_true_text.csv', 'search_snippets')

# #stack_overflow
# get_paraphaser('stackoverflow/stackoverflow_true_text', 'stackoverflow')

# #biomedical
# get_paraphaser('biomedical/biomedical_true_text', 'biomedical')

# #agnews
# get_paraphaser('agnewsdataraw-8000', 'agnews')

# #googleS
# get_paraphaser('S', 'S')

# #googleT
# get_paraphaser('T', 'T')

# googleTS: input file 'TS', output file 'TS'
get_paraphaser(path_to_dataset='TS', output_name='TS')

# tweet: input file 'tweet_remap_label', output file 'tweet'
get_paraphaser(path_to_dataset='tweet_remap_label', output_name='tweet')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




ERROR:fairseq.dataclass.utils:Error when composing. Overrides: ['common.no_progress_bar=False', 'common.log_interval=10', "common.log_format='json'", 'common.log_file=null', "common.tensorboard_logdir='/checkpoint/myleott/tensorboard_logs/2019-03-12/paraphrase.bpe40k.fp16.maxupd80000.en_fr.transformer_vaswani_wmt_en_fr_big.shareemb.adam.beta0.9_0.98.initlr1e-07.warmup4000.lr0.0007.clip0.0.drop0.1.wd0.0.ls0.1.maxtok4608.upfreq4.seed2.ngpu32'", 'common.wandb_project=null', 'common.azureml_logging=False', 'common.seed=2', 'common.cpu=False', 'common.tpu=False', 'common.bf16=False', 'common.memory_efficient_bf16=False', 'common.fp16=True', 'common.memory_efficient_fp16=False', 'common.fp16_no_flatten_grads=False', 'common.fp16_init_scale=128', 'common.fp16_scale_window=null', 'common.fp16_scale_tolerance=0.0', 'common.on_cpu_convert_precision=False', 'common.min_loss_scale=0.0001', 'common.threshold_loss_scale=null', 'common.amp=False', 'common.amp_batch_retries=2', 'common.amp_init_scale=

KeyboardInterrupt: 

In [None]:
# Progress-bar smoke test: iterate 1000 times without doing any work.
for i in tqdm(range(1000)):
    pass

In [32]:
# Load the raw search-snippets file (tab-separated, columns named explicitly).
df = pd.read_csv(
    '/mount/experiment/clustering_git/datasets/' + 'search_snippets/search_snippets_true_text.csv',
    sep='\t',
    names=['label', 'text0', 'text1'],
)

In [44]:
# Load the search-snippets table from the augmented/paraphaser folder
# (tab-separated, column names taken from the file's header row).
df = pd.read_csv(
    '/mount/experiment/clustering_git/datasets/' + 'augmented/paraphaser/search_snippets',
    sep='\t',
)

In [45]:
# Show only the first rows instead of dumping the whole table.
df.head()

Unnamed: 0,label,text0,text1,text2,text3
0,1,turtlesoft goldenseal software reference bankt...,Turtlesoft Gold Ensign Banking Reference Softw...,"L' Turtlesoft Gold Ensign, banktran banking re...",L' Gold of Turtlesoft banktran banking referen...
1,1,zeroonezero services author bank trans service...,Zero Author Services Authorized Secure Banking...,Zero author services trans banking authorized ...,Zero author services bank service authorized s...
2,1,csharpfriends articles getarticle articleid ba...,The articles are expected to obtain article of...,Sharpfriends articles get article banking proc...,The following articles are to obtain article o...
3,1,standardchartered cib transaction standard cha...,Standard Charter Banking Standard Chartered Ba...,Standard-charted banking transaction Standard-...,Chartered bank transaction Standard charter ba...
4,1,standardchartered global news press transactio...,Consecutive Client Satisfaction Survey of Voti...,Consecutive customer satisfaction survey of vo...,Follow-up survey on customer satisfaction with...
