In [11]:
# !python examples/paraphraser/paraphrase.py \
#     --en2fr examples/translation_moe/src/paraphraser.en-fr \
#     --fr2en examples/translation_moe/src/paraphraser.fr-en.hMoEup \
#     --files input_fairseq

import pandas as pd
from tqdm.notebook import tqdm
from ipywidgets import IntProgress
from nltk.translate.bleu_score import sentence_bleu
import os
import numpy as np
import torch
# Expose only device index 4 to CUDA for this kernel (hard-coded GPU choice).
# NOTE(review): this assignment runs after `import torch`; it presumably still
# takes effect because nothing above touches CUDA yet — confirm, or move it
# ahead of the torch import.
os.environ["CUDA_VISIBLE_DEVICES"]='4'

In [12]:
#!/usr/bin/env python3 -u

import argparse
import fileinput
import logging
import os
import sys

from fairseq.models.transformer import TransformerModel


# logging.getLogger().setLevel(logging.INFO)


def paraphaser(text_list):
    """Round-trip each input through French to collect English paraphrases.

    Every item of ``text_list`` is translated once with the en->fr model, and
    the French result is translated back with the fr->en mixture-of-experts
    model once per expert (10 experts), so each input yields 10 outputs.

    Both models are loaded from paths relative to the current working
    directory and moved to CUDA on every call.

    Args:
        text_list: iterable of inputs; each item is handed unchanged to
            ``en2fr.translate``.

    Returns:
        A flat list holding the result of every ``fr2en.translate`` call,
        ordered by input and then by expert index (10 consecutive entries
        per input).

    Raises:
        RuntimeError: if ``translation_moe/translation_moe_src`` does not
            exist under the parent of the current working directory.
    """
    num_experts = 10
    en2fr_path = '../translation_moe/src/paraphraser.en-fr'
    fr2en_path = '../translation_moe/src/paraphraser.fr-en.hMoEup'

    # abspath('examples') is <cwd>/examples, so stripping two path components
    # leaves the parent of the working directory.
    parent_of_cwd = os.path.dirname(os.path.dirname(os.path.abspath('examples')))
    user_dir = os.path.join(parent_of_cwd, "translation_moe", "translation_moe_src")
    if not os.path.exists(user_dir):
        raise RuntimeError(
            "cannot find fairseq examples/translation_moe/src "
            "(tried looking here: {})".format(user_dir)
        )
    logging.info("found user_dir:" + user_dir)

    logging.info("loading en2fr model from:" + en2fr_path)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=en2fr_path,
        tokenizer="moses",
        bpe="sentencepiece",
    ).eval()

    logging.info("loading fr2en model from:" + fr2en_path)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=fr2en_path,
        tokenizer="moses",
        bpe="sentencepiece",
        user_dir=user_dir,
        task="translation_moe",
    ).eval()

    # Move to CUDA only once both models have been loaded (en2fr first).
    en2fr, fr2en = en2fr.cuda(), fr2en.cuda()

    augmented = []
    for text in tqdm(text_list):
        french = en2fr.translate(text)
        # One back-translation per expert, appended in expert order.
        augmented += [
            fr2en.translate(french, inference_step_args={"expert": expert})
            for expert in range(num_experts)
        ]
    return augmented


In [13]:
# Base directory for datasets. get_paraphaser builds both its input path and
# its output path by plain string concatenation with this prefix, so the
# trailing slash is required.
abspath = '/mount/clustering/datasets/'

def get_highest_index(scores):
    """Return the position in ``scores`` that holds the largest value.

    The position is taken from the end of ``np.argsort(scores)``; which of
    several tied maxima is picked follows argsort's ordering.
    """
    ascending = np.argsort(scores)
    return ascending[-1]

def get_mid_index(scores):
    """Return the position in ``scores`` of the middle-ranked value.

    The scores are ranked ascending and the entry at rank ``len(scores) // 2``
    is chosen, i.e. the upper of the two central ranks for even lengths.
    """
    ascending = np.argsort(scores)
    middle_rank = len(scores) // 2
    return ascending[middle_rank]

def get_lowest_index(scores):
    """Return the position in ``scores`` that holds the smallest value.

    The position is taken from the front of ``np.argsort(scores)``; which of
    several tied minima is picked follows argsort's ordering.
    """
    ascending = np.argsort(scores)
    return ascending[0]

def get_list_BLEU(input_text, augmented, expts = 10):
    """Pick the highest-, middle- and lowest-BLEU candidate for each input.

    ``augmented`` is read as consecutive groups of ``expts`` candidates, group
    ``i`` belonging to ``input_text[i]``. Each candidate is scored with
    ``sentence_bleu`` against its input, both split on whitespace.

    Args:
        input_text: sequence of reference strings.
        augmented: flat sequence of candidate strings, ``expts`` per input.
        expts: number of candidates per input.

    Returns:
        Three lists, each with one entry per input: the candidates with the
        highest, the middle-ranked and the lowest BLEU score, in that order.
    """
    best_candidates = []
    median_candidates = []
    worst_candidates = []

    for i, inp in enumerate(input_text):
        # First position of this input's candidate group inside `augmented`.
        offset = i * expts

        scores = [
            sentence_bleu([inp.split()], augmented[k].split())
            for k in range(offset, offset + expts)
        ]

        # The helpers return positions within the group; shift them back to
        # positions in the flat `augmented` list.
        best_at = get_highest_index(scores) + offset
        median_at = get_mid_index(scores) + offset
        worst_at = get_lowest_index(scores) + offset

        best_candidates.append(augmented[best_at])
        median_candidates.append(augmented[median_at])
        worst_candidates.append(augmented[worst_at])

    return best_candidates, median_candidates, worst_candidates

def get_paraphaser(path_to_dataset, output_name):
    """Paraphrase the ``text0`` column of a dataset and write the result.

    The dataset at ``abspath + path_to_dataset`` is read as tab-separated data
    with the columns named ``label``, ``text0`` and ``text1``. Every ``text0``
    entry is paraphrased with ``paraphaser`` and, per entry, three paraphrases
    are selected by BLEU score with ``get_list_BLEU``:

    * ``text1`` (overwritten) - lowest-scoring paraphrase
    * ``text2`` - middle-ranked paraphrase
    * ``text3`` - highest-scoring paraphrase

    The frame is then written, tab-separated and without the index, to
    ``abspath + 'augmented/paraphraser/' + output_name``.

    Args:
        path_to_dataset: input path, appended to ``abspath``.
        output_name: output file name inside ``augmented/paraphraser/``.
    """
    df = pd.read_csv(abspath + path_to_dataset, sep = '\t', names = ['label', 'text0', 'text1'])

    # Create the destination directory up front: without this, a missing
    # directory only surfaces in to_csv, after the whole translation pass has
    # already been paid for.
    output_path = abspath + 'augmented/paraphraser/' + output_name
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    text0 = df.text0.values
    augmented = paraphaser(text0)
    augmented_hig_list, augmented_mid_list, augmented_low_list = get_list_BLEU(text0, augmented, 10)
    #text1 = low, text2 = median, text3 = high
    df['text1'] = augmented_low_list
    df['text2'] = augmented_mid_list
    df['text3'] = augmented_hig_list
    df.to_csv(output_path, index=False, sep = '\t')

In [None]:
# Runs that are currently switched off (kept for reference):
#   search_snippets: get_paraphaser('search_snippets/search_snippets_true_text.csv', 'search_snippets')
#   stack_overflow:  get_paraphaser('stackoverflow/stackoverflow_true_text', 'stackoverflow')
#   biomedical:      get_paraphaser('biomedical/biomedical_true_text', 'biomedical')
#   agnews:          get_paraphaser('agnewsdataraw-8000', 'agnews')

# Active runs, executed in order: (input path, output file name).
for dataset_path, output_name in (
    ('S', 'S'),                      # googleS
    ('T', 'T'),                      # googleT
    ('TS', 'TS'),                    # googleTS
    ('tweet_remap_label', 'tweet'),  # tweet
):
    get_paraphaser(dataset_path, output_name)

  0%|          | 0/11108 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# No-op loop: does nothing except step a tqdm progress bar 1000 times.
for i in tqdm(range(1000)):
    pass

In [None]:
# Load the search-snippets source file as tab-separated data, supplying the
# column names here.
df = pd.read_csv(
    '/mount/experiment/clustering_git/datasets/' + 'search_snippets/search_snippets_true_text.csv',
    sep = '\t',
    names = ['label', 'text0', 'text1'],
)

In [None]:
# Load a previously written augmented search-snippets file (rebinds `df`).
# NOTE(review): the directory is spelled 'paraphaser' here, while
# get_paraphaser writes under 'augmented/paraphraser/', and the base directory
# also differs from `abspath` — confirm this path points at the intended file.
df = pd.read_csv(
    '/mount/experiment/clustering_git/datasets/' + 'augmented/paraphaser/search_snippets',
    sep = '\t',
)

In [None]:
# Show whatever DataFrame is currently bound to `df` as this cell's output.
df

In [None]:
# Same four runs as the earlier batch cell, executed in order:
# (input path, output file name).
for dataset_path, output_name in (
    ('S', 'S'),                      # googleS
    ('T', 'T'),                      # googleT
    ('TS', 'TS'),                    # googleTS
    ('tweet_remap_label', 'tweet'),  # tweet
):
    get_paraphaser(dataset_path, output_name)

In [6]:
# NOTE(review): this cell raises NameError. `augmented_hig_list` is only bound
# as a local variable inside get_list_BLEU and get_paraphaser, never at
# notebook scope; the selected paraphrases are stored in the 'text3' column of
# the frame that get_paraphaser writes. Replace or remove this cell.
augmented_hig_list

NameError: name 'augmented_hig_list' is not defined