In [None]:
import pandas as pd

INPUT_FILE = "test-2"

# The file is read as tab-separated text. Because `names` is given, the first
# line of the file is treated as data, not as a header.
data = pd.read_csv(
    f"../data/{INPUT_FILE}.csv",
    names=["ID", "Label", "Tweet"],
    sep='\t',
    encoding='utf8',
)


In [None]:
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import numpy as np
import re

# Number of copies of every input row that the row-repetition step produces.
DATA_MULTIPLIER = 3

# English stop words, held in a set for the membership test in token_pipeline.
stop_words = set(stopwords.words('english'))

# Shared NLTK helpers used by token_pipeline.
lemmatizer = WordNetLemmatizer()
tweet_tokenizer = TweetTokenizer()

def token_pipeline(tweet):
    """Clean a raw tweet and return its lemmatised, stop-word-free tokens.

    The tweet is lower-cased, then URLs, @mentions and #hashtags are deleted,
    every non-word character becomes a space and every digit is deleted. The
    cleaned text is tokenised with ``tweet_tokenizer``; tokens found in
    ``stop_words`` are dropped and the rest are passed through ``lemmatizer``.

    Args:
        tweet: The raw tweet text (a string).

    Returns:
        A list of lemmatised token strings, in their original order.
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'\@\w+', '', 0),                               # user mentions
        (r'\#\w+', '', 0),                               # hashtags
        (r'\W', ' ', 0),                                 # punctuation / special characters
        (r'\d', '', 0),                                  # digits
    )

    cleaned = tweet.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    kept = []
    for token in tweet_tokenizer.tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Tokenise every tweet once; the synonym, embedding and GPT-2 cells read this column.
data["Tweet_Token"] = data["Tweet"].apply(token_pipeline)

# Repeat every row DATA_MULTIPLIER times, keeping the copies of a row adjacent.
# Rows are selected by repeated position instead of going through `data.values`,
# which would collapse the mixed columns into a single object array and lose
# each column's dtype.
repeated_positions = np.repeat(np.arange(len(data)), DATA_MULTIPLIER)
data = data.iloc[repeated_positions].reset_index(drop=True)
data

# Create synthetic data with Synonyms

In [None]:
import random

# Fraction of each tweet's tokens that the synonym step tries to replace.
SYNONYM_PERCENTAGE = 0.2
# Name of the output column; it embeds the percentage used.
synonym_col_name = f"Synonyms_Synthetic {SYNONYM_PERCENTAGE}"
def replace_words_with_synonyms(tweet_tokens, percentage=0.2):
    """Return the tweet as text with some tokens swapped for WordNet synonyms.

    Args:
        tweet_tokens: Sequence of token strings. It is not modified.
        percentage: Fraction of the tokens to try to replace. The resulting
            count is truncated to an integer and clamped to
            ``[0, len(tweet_tokens)]``.

    Returns:
        The tokens joined with single spaces. A selected token stays unchanged
        when WordNet offers no candidate other than the token itself.
    """
    tmp_tokens = list(tweet_tokens)

    num_to_replace = int(len(tmp_tokens) * percentage)
    num_to_replace = max(0, min(num_to_replace, len(tmp_tokens)))

    # Sample distinct positions: drawing indices independently could hit the
    # same token several times and replace fewer tokens than requested.
    for rand_index in random.sample(range(len(tmp_tokens)), num_to_replace):
        word = tmp_tokens[rand_index]

        synonyms = set()
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                # Multi-word lemma names use underscores; turn them back into spaces.
                synonyms.add(lemma.name().replace('_', ' '))

        # Drop the word itself (a no-op replacement) and sort so the candidate
        # order does not depend on set iteration order.
        candidates = sorted(s for s in synonyms if s.lower() != word.lower())
        if candidates:
            tmp_tokens[rand_index] = random.choice(candidates)

    return ' '.join(tmp_tokens)

# Build the synonym-augmented text for every row; the percentage is passed as
# the second positional argument of replace_words_with_synonyms.
data[synonym_col_name] = data["Tweet_Token"].apply(
    replace_words_with_synonyms,
    args=(SYNONYM_PERCENTAGE,),
)

data[["Tweet", synonym_col_name, "Tweet_Token"]]

# Create synthetic data with Word Embeddings

In [None]:
from gensim.models import fasttext, KeyedVectors


import os

# The model files default to the original local paths. Set FASTTEXT_MODEL_PATH
# or WORD2VEC_MODEL_PATH in the environment to run the notebook on a machine
# where the files live elsewhere.
FASTTEXT_MODEL_PATH = os.environ.get("FASTTEXT_MODEL_PATH", "H:\\wiki.simple.bin")
WORD2VEC_MODEL_PATH = os.environ.get("WORD2VEC_MODEL_PATH", "H:\\word2vec-google-news-300")

# Only the `.wv` attribute of the loaded fastText model is kept.
fasttext_model = fasttext.load_facebook_model(path=FASTTEXT_MODEL_PATH).wv
word2vec_model = KeyedVectors.load(WORD2VEC_MODEL_PATH)


In [None]:
import random

# Fraction of each tweet's tokens that the embedding step tries to replace.
EMBEDDING_PERCENTAGE = 0.5
# Names of the output columns; both embed the percentage used.
fasttext_col_name = f"fasttext_Synthetic {EMBEDDING_PERCENTAGE}"
word2vec_col_name = f"word2vec_Synthetic {EMBEDDING_PERCENTAGE}"
def create_synthetic_tweet_word_embeddings(tokens, model, percentage=0.2):
    """Return the tweet as text with some tokens swapped for embedding neighbours.

    Args:
        tokens: List of token strings; a copy is edited, the input is untouched.
        model: Object exposing ``most_similar(word, topn=...)`` that returns
            ``(word, score)`` pairs and raises ``KeyError`` for unknown words.
        percentage: Fraction of the tokens to try to replace (the count is
            truncated to an integer).

    Returns:
        The tokens joined with single spaces. A selected token is left as is
        when the model does not know it or every neighbour equals it
        case-insensitively.
    """
    result = tokens.copy()
    replace_count = int(len(result) * percentage)

    # Distinct positions, chosen up front.
    for position in random.sample(range(len(result)), replace_count):
        original = result[position]
        try:
            candidates = []
            for neighbour, _score in model.most_similar(original, topn=3):
                if neighbour.lower() != original.lower():
                    candidates.append(neighbour)

            if candidates:
                result[position] = np.random.choice(candidates)
        except KeyError:
            # Word missing from the model's vocabulary: keep the original token.
            continue
    return " ".join(result)

# Both embedding columns have to exist: the display on the last line of this cell
# and the fastText similarity scoring both read `fasttext_col_name`, and raise
# KeyError when the fastText column has not been generated.
data[fasttext_col_name] = data["Tweet_Token"].apply(lambda tweet: create_synthetic_tweet_word_embeddings(tweet, fasttext_model, EMBEDDING_PERCENTAGE))
data[word2vec_col_name] = data["Tweet_Token"].apply(lambda tweet: create_synthetic_tweet_word_embeddings(tweet, word2vec_model, EMBEDDING_PERCENTAGE))
data[["Tweet_Token", fasttext_col_name, word2vec_col_name]]

# Create synthetic data with GPT2

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Fraction of each tweet's tokens (taken from the start) used as the seed text.
SEED_PERCENTAGE = 0.5

# Checkpoint name passed to both `from_pretrained` calls below.
model_name = "gpt2"
# Name of the output column; it embeds the seed percentage.
model_col_name = f"GPT2_Synthetic {SEED_PERCENTAGE}"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

def generate_sentence(tweet_token, seed_percent=0.5):
    """Generate text from the leading tokens of a tweet with the global model.

    Args:
        tweet_token: List of token strings for one tweet.
        seed_percent: Fraction of the tokens, taken from the start, that forms
            the seed text. At least one token is used for a non-empty tweet.

    Returns:
        The decoded first returned sequence with every run of whitespace
        (including newlines) collapsed to a single space, or ``""`` when
        ``tweet_token`` is empty.
    """
    if not tweet_token:
        # No tokens means no seed and a max_length of 0; there is nothing to generate.
        return ""

    # Short tweets would otherwise truncate to a seed of zero tokens.
    length_of_seed_tokens = max(1, int(len(tweet_token) * seed_percent))
    seed = " ".join(tweet_token[:length_of_seed_tokens])

    tokenizer.pad_token = tokenizer.eos_token
    input_text = tokenizer.encode(seed, return_tensors="pt", padding=True)

    # The length budget is derived from the word count, but the encoded seed can
    # be longer than that; always leave room for at least one generated token.
    max_length = max(len(tweet_token) * 3, input_text.shape[-1] + 1)

    output = model.generate(
        input_text,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        # Passed explicitly to match the pad token configured above.
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Deleting newlines outright would glue the neighbouring words together.
    return " ".join(generated_text.split())

# Generate one synthetic text per row; the seed fraction is passed as the second
# positional argument of generate_sentence.
data[model_col_name] = data["Tweet_Token"].apply(
    generate_sentence,
    args=(SEED_PERCENTAGE,),
)
data[["Tweet_Token", model_col_name]]

# Create synthetic data with Back Translation

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import itertools

# Languages to translate through; the loading loop below fetches en->lang and lang->en models for each.
tgt_languages = ["fr", "de", "es"]
# Every unordered pair of those languages, used for the two-language translation chain.
language_combinations = list(itertools.combinations(tgt_languages, 2))


def back_translate(text, forw_tokenizer, forw_model, backw_tokenizer, backw_model):
    """Translate ``text`` with the forward model, then back with the backward model.

    Each hop encodes the current text with that hop's tokenizer, lets the hop's
    model generate, and decodes the first generated sequence without special
    tokens.

    Args:
        text: The text to translate.
        forw_tokenizer, forw_model: Tokenizer and model for the first hop.
        backw_tokenizer, backw_model: Tokenizer and model for the second hop.

    Returns:
        The text produced by the second hop.
    """
    hops = (
        (forw_tokenizer, forw_model),
        (backw_tokenizer, backw_model),
    )

    current_text = text
    for hop_tokenizer, hop_model in hops:
        token_ids = hop_tokenizer.encode(current_text, return_tensors="pt")
        generated_ids = hop_model.generate(token_ids)
        current_text = hop_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return current_text

def multiple_back_translate(text, first_forw_tokenizer, first_forw_model, second_forw_tokenizer, second_forw_model, second_backw_tokenizer, second_backw_model, first_backw_tokenizer, first_backw_model):
    """Send ``text`` through four translation hops and return the final text.

    The hops run in this order: first forward, second forward, second backward,
    first backward. Each hop encodes the current text with its tokenizer, lets
    its model generate, and decodes the first generated sequence without
    special tokens.

    Args:
        text: The text to translate.
        first_forw_tokenizer, first_forw_model: Tokenizer and model of hop 1.
        second_forw_tokenizer, second_forw_model: Tokenizer and model of hop 2.
        second_backw_tokenizer, second_backw_model: Tokenizer and model of hop 3.
        first_backw_tokenizer, first_backw_model: Tokenizer and model of hop 4.

    Returns:
        The text produced by the last hop.
    """
    hops = (
        (first_forw_tokenizer, first_forw_model),
        (second_forw_tokenizer, second_forw_model),
        (second_backw_tokenizer, second_backw_model),
        (first_backw_tokenizer, first_backw_model),
    )

    current_text = text
    for hop_tokenizer, hop_model in hops:
        token_ids = hop_tokenizer.encode(current_text, return_tensors="pt")
        generated_ids = hop_model.generate(token_ids)
        current_text = hop_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return current_text

forward_models, forward_tokenizers = {}, {}
backward_models, backward_tokenizers = {}, {}


def _load_marian(checkpoint):
    """Load the tokenizer, then the model, stored under ``checkpoint``."""
    return MarianTokenizer.from_pretrained(checkpoint), MarianMTModel.from_pretrained(checkpoint)


# English <-> target language pairs. All four dicts are keyed by (source, destination).
for tgt_lang in tgt_languages:
    out_key, home_key = ("en", tgt_lang), (tgt_lang, "en")
    forward_tokenizers[out_key], forward_models[out_key] = _load_marian(f'Helsinki-NLP/opus-mt-en-{tgt_lang}')
    backward_tokenizers[home_key], backward_models[home_key] = _load_marian(f'Helsinki-NLP/opus-mt-{tgt_lang}-en')

# Target language <-> target language pairs for the two-language chain.
for tgt_lang_1, tgt_lang_2 in language_combinations:
    out_key, home_key = (tgt_lang_1, tgt_lang_2), (tgt_lang_2, tgt_lang_1)
    forward_tokenizers[out_key], forward_models[out_key] = _load_marian(f'Helsinki-NLP/opus-mt-{tgt_lang_1}-{tgt_lang_2}')
    backward_tokenizers[home_key], backward_models[home_key] = _load_marian(f'Helsinki-NLP/opus-mt-{tgt_lang_2}-{tgt_lang_1}')


# Finished translations, keyed by (tweet, tgt_lang) or (tweet, tgt_lang, second_tgt_lang).
translation_cache = {}

def translate_tweet(tweet, tgt_lang, second_tgt_lang=None):
    """Back-translate ``tweet`` through one or two languages, with caching.

    Results are stored in ``translation_cache`` so that a repeated
    (tweet, language) request is answered without running the models again.

    Args:
        tweet: The text to translate.
        tgt_lang: Language of the first hop away from English.
        second_tgt_lang: Optional second language. When given (and truthy) the
            four-hop chain of ``multiple_back_translate`` is used; otherwise
            the two-hop ``back_translate``.

    Returns:
        The back-translated text.

    Raises:
        KeyError: If a required tokenizer or model was not loaded for the
            requested language pair.
    """
    cache_key = (tweet, tgt_lang, second_tgt_lang) if second_tgt_lang else (tweet, tgt_lang)

    # Guard clause: answer from the cache whenever possible.
    if cache_key in translation_cache:
        return translation_cache[cache_key]

    leave_english = ("en", tgt_lang)
    return_to_english = (tgt_lang, "en")

    if second_tgt_lang:
        onward = (tgt_lang, second_tgt_lang)
        inward = (second_tgt_lang, tgt_lang)
        result = multiple_back_translate(
            tweet,
            forward_tokenizers[leave_english],
            forward_models[leave_english],
            forward_tokenizers[onward],
            forward_models[onward],
            backward_tokenizers[inward],
            backward_models[inward],
            backward_tokenizers[return_to_english],
            backward_models[return_to_english],
        )
    else:
        result = back_translate(
            tweet,
            forward_tokenizers[leave_english],
            forward_models[leave_english],
            backward_tokenizers[return_to_english],
            backward_models[return_to_english],
        )

    translation_cache[cache_key] = result
    return result

# One column per single-language round trip.
for tgt_lang in tgt_languages:
    data[f"{tgt_lang} Back_Translate_Synthetic"] = data["Tweet"].apply(
        translate_tweet,
        args=(tgt_lang,),
    )

# One column per two-language chain.
for tgt_lang_1, tgt_lang_2 in language_combinations:
    data[f"{tgt_lang_1} - {tgt_lang_2} Back_Translate_Synthetic"] = data["Tweet"].apply(
        translate_tweet,
        args=(tgt_lang_1, tgt_lang_2),
    )

# Calculate similarity scores

In [None]:
import tensorflow_hub as hub
import Levenshtein
import numpy as np

# Sentence encoder loaded from TF Hub; semantic_similarity calls it with a one-element list of text.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def semantic_similarity(original_tweet, synthetic_tweet):
    """Return the inner product of the two texts' sentence embeddings.

    Each text is embedded on its own by the global ``embed`` callable.

    Args:
        original_tweet: The reference text.
        synthetic_tweet: The text to compare against the reference.

    Returns:
        The single entry of the 1x1 inner-product matrix.
    """
    embeddings = [embed([text]) for text in (original_tweet, synthetic_tweet)]
    similarity_matrix = np.inner(embeddings[0], embeddings[1])
    return similarity_matrix[0][0]

def compute_scores(data, col1, col2, prefix):
    """Add a semantic-similarity and a Levenshtein-distance column to ``data``.

    Both scores compare ``col1`` with ``col2`` row by row. The frame is
    modified in place and nothing is returned.

    Args:
        data: DataFrame holding both text columns.
        col1: Name of the first text column.
        col2: Name of the second text column.
        prefix: Prefix of the two new columns, ``{prefix}_Sim_Score`` and
            ``{prefix}_Levenshtein_Score``.
    """
    text_pairs = data[[col1, col2]]

    def _similarity(row):
        return semantic_similarity(row[col1], row[col2])

    def _edit_distance(row):
        return Levenshtein.distance(row[col1], row[col2])

    data[f"{prefix}_Sim_Score"] = text_pairs.apply(_similarity, axis=1)
    data[f"{prefix}_Levenshtein_Score"] = text_pairs.apply(_edit_distance, axis=1)

# (synthetic column, score-column prefix) for every method scored against the original tweet.
score_targets = [
    (synonym_col_name, "Synonyms"),
    (fasttext_col_name, "fasttext"),
    (word2vec_col_name, "word2vec"),
    (model_col_name, "GPT2"),
]
for lang in tgt_languages:
    score_targets.append((f"{lang} Back_Translate_Synthetic", f"{lang} Back_Translate"))

# NOTE(review): the two-language back-translation columns are not scored here;
# confirm whether that is intentional.
for synthetic_col, score_prefix in score_targets:
    compute_scores(data, "Tweet", synthetic_col, score_prefix)

In [None]:
import os

# NOTE(review): `tgt_languages` is interpolated as a list, so the file name
# contains its repr (brackets, quotes, spaces). Left unchanged so the output
# path stays the same; consider joining the language codes instead.
output_file = f"synth-output-{INPUT_FILE}-syn-{SYNONYM_PERCENTAGE}-emb-{EMBEDDING_PERCENTAGE}-gen-{SEED_PERCENTAGE}-lang-{tgt_languages}.csv"

# `to_csv` does not create missing directories, so make sure the target exists
# before writing.
os.makedirs("../output", exist_ok=True)
data.to_csv(f"../output/{output_file}", index=False)