In [None]:
import pandas as pd

# Read the tab-separated tweet file; column names are supplied explicitly
# through `names`.
data = pd.read_csv(
    '../data/test.csv',
    sep='\t',
    encoding='utf8',
    names=["ID", "Label", "Tweet"],
)


In [None]:
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import TweetTokenizer
import numpy as np

# English stop-word set built from the NLTK stop-word list.
stop_words = set(stopwords.words('english'))
# Tokenizer instance shared by token_pipeline.
tk = TweetTokenizer()

def token_pipeline(tweet):
    """Split a raw tweet into a list of tokens with the shared ``tk`` tokenizer.

    Stop-word filtering is currently disabled, so every token produced by
    the tokenizer is returned.

    Args:
        tweet: the tweet text to tokenize.

    Returns:
        A new list containing the tokens in their original order.
    """
    return list(tk.tokenize(tweet))

# Tokenize every tweet once, up front; the augmentation cells further down
# read the "Tweet_Token" column.
data["Tweet_Token"] = data["Tweet"].apply(token_pipeline)

# Repeat every row so that each tweet gets several independently sampled
# synthetic variants. Rows are repeated by position (0,0,...,1,1,...) rather
# than via np.repeat over data.values, which would collapse every column to
# object dtype.
# NOTE: this cell is not idempotent - re-running it multiplies the rows again.
COPIES_PER_TWEET = 5
data = data.iloc[np.arange(len(data)).repeat(COPIES_PER_TWEET)].reset_index(drop=True)
data

In [None]:
import random

def create_synthetic_tweet_synonyms(tweet):
    """Build a synthetic tweet by swapping tokens for random WordNet lemmas.

    Args:
        tweet: iterable of token strings.

    Returns:
        One space-joined string. Every token for which WordNet returns at
        least one lemma is replaced by a randomly chosen lemma (which may be
        the token itself); all other tokens are kept unchanged.
    """
    synthetic_tokens = []
    for token in tweet:
        # Gather the lemma names of every synset of the token. WordNet joins
        # multi-word lemmas with underscores ("ice_cream"); turn those back
        # into spaces so they do not leak into the generated text.
        candidates = {
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(token)
            for lemma in synset.lemmas()
        }
        if candidates:
            # Sort before sampling: the iteration order of a set of strings
            # can differ between interpreter runs, which would make the
            # choice irreproducible even with a fixed random seed.
            synthetic_tokens.append(random.choice(sorted(candidates)))
        else:
            synthetic_tokens.append(token)

    return " ".join(synthetic_tokens)

# One WordNet-based synthetic variant per row, built from the token list.
data["Synonyms_Synthetic"] = data["Tweet_Token"].apply(create_synthetic_tweet_synonyms)

# Show the original text, the synthetic text and the tokens side by side.
data[["Tweet", "Synonyms_Synthetic", "Tweet_Token"]]

In [None]:
from gensim.models import fasttext


import os

# Location of the pre-trained fastText binary. The FASTTEXT_MODEL_PATH
# environment variable overrides the default, so the notebook can run on
# machines where the model is not stored at this drive/path.
FASTTEXT_MODEL_PATH = os.environ.get("FASTTEXT_MODEL_PATH", "H:\\wiki.simple.bin")
fasttext_model = fasttext.load_facebook_model(path=FASTTEXT_MODEL_PATH)


In [None]:
import numpy as np


def create_synthetic_tweet_word_embeddings(tokens, model, percentage=0.2):
    """Build a synthetic tweet by swapping some tokens for embedding neighbours.

    A random subset of token positions is selected; each selected token is
    replaced by one of its nearest neighbours according to ``model``. Tokens
    that are not selected, or for which no usable neighbour exists, are kept,
    and the original word order is preserved.

    Args:
        tokens: sequence of token strings (not modified).
        model: object exposing ``wv.most_similar(word, topn=...)`` that
            returns ``(word, score)`` pairs.
        percentage: fraction of the tokens to try to replace; the resulting
            count is clamped to the range ``0 .. len(tokens)``.

    Returns:
        The space-joined synthetic tweet.
    """
    synthetic_tokens = list(tokens)  # work on a copy, keep the caller's list intact
    token_count = len(synthetic_tokens)
    # Clamp so that out-of-range percentages cannot make random.sample raise.
    num_words_to_replace = min(token_count, max(0, int(token_count * percentage)))
    words_to_replace = random.sample(range(token_count), num_words_to_replace)

    for idx in words_to_replace:
        word = synthetic_tokens[idx]
        try:
            neighbours = model.wv.most_similar(word, topn=6)
        except KeyError:
            # The model has no vector for this token: keep the original word.
            continue
        # Drop neighbours that only differ from the original in letter case.
        candidates = [w for w, _ in neighbours if w.lower() != word.lower()]
        if candidates:
            # Replace in place so position (and thus word order) is preserved.
            # random.choice keeps all sampling on one RNG, so a single
            # random.seed() call makes the whole function reproducible.
            synthetic_tokens[idx] = random.choice(candidates)

    return " ".join(synthetic_tokens)

# percentage=1 selects every token of the tweet as a replacement candidate.
data["fasttext_Synthetic"] = data["Tweet_Token"].apply(
    lambda tweet_tokens: create_synthetic_tweet_word_embeddings(
        tweet_tokens, fasttext_model, percentage=1
    )
)
data[["Tweet_Token", "fasttext_Synthetic"]]

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# The tokenizer and the language-model head are loaded from the same checkpoint.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def generate_sentence(tweet_token, seed_percent=0.5):
    """Generate a synthetic tweet by letting the model continue a real one.

    The leading ``seed_percent`` share of the tokens is joined with spaces
    and used as the prompt for sampling.

    Args:
        tweet_token: list of token strings of the original tweet.
        seed_percent: fraction of the leading tokens used as the prompt.

    Returns:
        The decoded first sequence returned by ``model.generate`` with
        special tokens removed.
    """
    # Always keep at least one token in the prompt: for very short tweets
    # int(len * seed_percent) is 0, which would hand generate() an empty prompt.
    length_of_seed_tokens = max(1, int(len(tweet_token) * seed_percent))
    seed = " ".join(tweet_token[0:length_of_seed_tokens])
    if not seed:
        # Nothing usable to seed with (empty token list): start from the
        # tokenizer's beginning-of-sequence token instead.
        seed = tokenizer.bos_token

    # Set the pad_token to be the same as the eos_token
    tokenizer.pad_token = tokenizer.eos_token
    input_text = tokenizer.encode(seed, return_tensors="pt", padding=True)
    # A single sequence is encoded per call, so every position is attended to.
    attention_mask = torch.ones_like(input_text)
    output = model.generate(
        input_text,
        max_length=70,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        attention_mask=attention_mask,
        # Pass the pad id explicitly instead of relying on generate()'s fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# One sampled continuation per row, seeded from that row's tokens.
data["GPT2_Synthetic"] = data["Tweet_Token"].apply(generate_sentence)
data[["Tweet_Token", "GPT2_Synthetic"]]

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Pivot languages for back-translation; each code is inserted into the
# Helsinki-NLP/opus-mt model names when the models are loaded.
tgt_languages = [
    "fr",
    "de",
    "jap",
]

def back_translate(text, forw_tokenizer, forw_model, backw_tokenizer, backw_model):
    """Translate ``text`` with the forward pair, then back with the backward pair.

    Args:
        text: the text to round-trip.
        forw_tokenizer: tokenizer used for the forward translation.
        forw_model: model used for the forward translation.
        backw_tokenizer: tokenizer used for the backward translation.
        backw_model: model used for the backward translation.

    Returns:
        The decoded first sequence produced by the backward model.
    """
    def _translate(source, step_tokenizer, step_model):
        # encode -> generate -> decode the first returned sequence
        encoded = step_tokenizer.encode(source, return_tensors="pt")
        generated = step_model.generate(encoded)
        return step_tokenizer.decode(generated[0], skip_special_tokens=True)

    print(f"Starting Back Translation to ")
    pivot_text = _translate(text, forw_tokenizer, forw_model)
    return _translate(pivot_text, backw_tokenizer, backw_model)

for tgt_lang in tgt_languages:
    # One model pair per pivot language: English -> pivot and pivot -> English.
    forward_model_name = f'Helsinki-NLP/opus-mt-en-{tgt_lang}'
    backward_model_name = f'Helsinki-NLP/opus-mt-{tgt_lang}-en'
    forward_tokenizer = MarianTokenizer.from_pretrained(forward_model_name)
    forward_model = MarianMTModel.from_pretrained(forward_model_name)
    backward_tokenizer = MarianTokenizer.from_pretrained(backward_model_name)
    backward_model = MarianMTModel.from_pretrained(backward_model_name)
    # The lambda is evaluated immediately by apply(), so it always sees the
    # models loaded for the current language.
    data[f"{tgt_lang} Back_Translate_Synthetic"] = data["Tweet"].apply(
        lambda tweet: back_translate(
            tweet, forward_tokenizer, forward_model, backward_tokenizer, backward_model
        )
    )

# Result columns are prefixed with the language code, so they must be selected
# by their full names; a bare "Back_Translate_Synthetic" column does not exist
# and indexing it raises KeyError. Display once, after all languages are done.
data[["Tweet"] + [f"{code} Back_Translate_Synthetic" for code in tgt_languages]]


In [None]:
import tensorflow_hub as hub
import Levenshtein
import numpy as np

# Sentence encoder loaded from TF Hub; called by semantic_similarity.
embed = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

def semantic_similarity(original_tweet, synthetic_tweet):
    """Score how close two texts are as the inner product of their embeddings.

    Each text is embedded on its own with the module-level ``embed`` callable.

    Args:
        original_tweet: the reference text.
        synthetic_tweet: the text to compare against the reference.

    Returns:
        Element ``[0][0]`` of ``np.inner`` applied to the two embeddings.
    """
    vec_original = embed([original_tweet])
    vec_synthetic = embed([synthetic_tweet])
    similarity_matrix = np.inner(vec_original, vec_synthetic)
    return similarity_matrix[0][0]

# (synthetic text column, prefix of the two score columns derived from it)
score_targets = [
    ("Synonyms_Synthetic", "Synonyms"),
    ("fasttext_Synthetic", "fasttext"),
    ("GPT2_Synthetic", "GPT2"),
]
# Back-translation score columns keep the full source column name as prefix.
score_targets += [
    (f"{lang} Back_Translate_Synthetic", f"{lang} Back_Translate_Synthetic")
    for lang in tgt_languages
]

for synthetic_col, score_prefix in score_targets:
    tweet_pairs = data[["Tweet", synthetic_col]]
    # Embedding-based similarity between the original and the synthetic text.
    data[f"{score_prefix}_Sim_Score"] = tweet_pairs.apply(
        lambda row: semantic_similarity(row["Tweet"], row[synthetic_col]), axis=1
    )
    # Character-level edit distance between the original and the synthetic text.
    data[f"{score_prefix}_Levenshtein_Score"] = tweet_pairs.apply(
        lambda row: Levenshtein.distance(row["Tweet"], row[synthetic_col]), axis=1
    )

In [None]:
import os

OUTPUT_CSV = '../output/synth-output-GP2.csv'
# Create the target directory first: to_csv does not create missing
# directories, and failing here would discard the results of every
# generation cell above.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
data.to_csv(OUTPUT_CSV)