In [None]:
import pandas as pd

# Load the tab-separated tweet file; the three column labels are supplied
# explicitly through `names`.
data = pd.read_csv(
    '../data/test.csv',
    sep='\t',
    encoding='utf8',
    names=["ID", "Label", "Tweet"],
)


In [None]:
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import TweetTokenizer
import numpy as np

# English stop-word list from NLTK, held as a set for membership tests.
stop_words = {*stopwords.words('english')}

# Single tokenizer instance shared by every call to the tokenisation helper.
tk = TweetTokenizer()

def token_pipeline(tweet, remove_stopwords=False):
    """Tokenize one tweet with the module-level ``TweetTokenizer`` instance ``tk``.

    Args:
        tweet: Raw tweet text.
        remove_stopwords: When true, drop every token whose lower-cased form
            is in the module-level ``stop_words`` set. Defaults to ``False``,
            which keeps all tokens (the behaviour this function had while its
            stop-word filter was commented out).

    Returns:
        list: The tweet's tokens in their original order.
    """
    tokens = list(tk.tokenize(tweet))
    if remove_stopwords:
        # Compare lower-cased so capitalised stop words ("The") are dropped too.
        tokens = [token for token in tokens if token.lower() not in stop_words]
    return tokens

# Number of copies made of every tweet row before the synthetic variants are
# generated.
COPIES_PER_TWEET = 5

data["Tweet_Token"] = data["Tweet"].apply(token_pipeline)

# Repeat each row COPIES_PER_TWEET times, keeping the copies adjacent.
# Selecting rows by repeated position preserves every column's dtype, whereas
# round-tripping through `data.values` turns all columns into `object`.
# NOTE: this cell reassigns `data`, so re-running it multiplies the rows again.
data = data.iloc[np.repeat(np.arange(len(data)), COPIES_PER_TWEET)].reset_index(drop=True)
data

In [None]:
import random

def create_synthetic_tweet_synonyms(tweet):
    """Build a synthetic tweet by swapping tokens for random WordNet lemmas.

    For every token, the lemma names of all its WordNet synsets are collected
    and one of the distinct names is picked uniformly at random. Tokens for
    which WordNet returns no synsets are kept unchanged.

    Args:
        tweet: Iterable of token strings.

    Returns:
        str: The chosen words joined by single spaces.
    """
    synthetic_words = []
    for token in tweet:
        lemma_names = {
            lemma.name()
            for synset in wordnet.synsets(token)
            for lemma in synset.lemmas()
        }
        if not lemma_names:
            synthetic_words.append(token)
            continue
        # Sort before sampling: the iteration order of a set of strings can
        # differ between interpreter runs, which would make the pick
        # irreproducible even when `random` is seeded.
        picked = random.choice(sorted(lemma_names))
        # WordNet joins multi-word lemmas with underscores (e.g. "ice_cream");
        # turn them back into ordinary spaces for the output text.
        synthetic_words.append(picked.replace("_", " "))

    return " ".join(synthetic_words)

# Derive one synonym-based variant per row from that row's token list.
data["Synonyms_Synthetic"] = data["Tweet_Token"].apply(create_synthetic_tweet_synonyms)

# Show original text, synthetic variant and tokens side by side.
data[["Tweet", "Synonyms_Synthetic", "Tweet_Token"]]

In [None]:
from gensim.models import fasttext


import os

# Location of the pretrained fastText binary. Set the FASTTEXT_MODEL_PATH
# environment variable to point at a local copy; the previously hard-coded
# path remains the default so existing setups keep working.
FASTTEXT_MODEL_PATH = os.environ.get("FASTTEXT_MODEL_PATH", "H:\\wiki.simple.bin")

fasttext_model = fasttext.load_facebook_model(path=FASTTEXT_MODEL_PATH)


In [None]:
import numpy as np


def create_synthetic_tweet_word_embeddings(tokens, model, percentage=0.2, topn=6):
    """Build a synthetic tweet by swapping some tokens for embedding neighbours.

    A random subset of token positions is selected and each selected token is
    replaced by one of its nearest neighbours according to
    ``model.wv.most_similar``. All other tokens, and the original word order,
    are preserved.

    Args:
        tokens: Sequence of token strings.
        model: Object exposing ``wv.most_similar(word, topn=...)`` that
            returns ``(word, score)`` pairs.
        percentage: Fraction of the tokens to try to replace. The resulting
            count is clamped to the range ``0 .. len(tokens)``.
        topn: Number of neighbours requested per token.

    Returns:
        str: The tokens, with replacements applied, joined by single spaces.
    """
    synth_tokens = list(tokens)
    num_words_to_replace = int(len(synth_tokens) * percentage)
    # Clamp so an out-of-range percentage cannot make random.sample raise.
    num_words_to_replace = max(0, min(len(synth_tokens), num_words_to_replace))

    for idx in random.sample(range(len(synth_tokens)), num_words_to_replace):
        word = synth_tokens[idx]
        try:
            neighbours = model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Presumably raised when the model has no vector for the token;
            # keep the original token in that case.
            continue
        # Skip neighbours that differ from the token only by letter case.
        candidates = [w for w, _ in neighbours if w.lower() != word.lower()]
        if candidates:
            # Write the replacement back at the token's own position so that
            # untouched tokens and the word order survive.
            synth_tokens[idx] = random.choice(candidates)

    return " ".join(synth_tokens)

# percentage=1 selects every token position for replacement; the extra
# keyword arguments are forwarded by `apply` to the helper.
data["fasttext_Synthetic"] = data["Tweet_Token"].apply(
    create_synthetic_tweet_word_embeddings,
    model=fasttext_model,
    percentage=1,
)
data[["Tweet_Token", "fasttext_Synthetic"]]

In [None]:
import tensorflow_hub as hub
import Levenshtein

# TF-Hub handle of the sentence-encoder module used for similarity scoring.
USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODULE_URL)

def semantic_similarity(original_tweet, synthetic_tweet):
    """Return the inner product of the two texts' ``embed`` vectors.

    Each text is passed to the module-level ``embed`` callable as a one-item
    batch. The inner product of the two results is a 1x1 matrix, and its
    single entry is returned.

    Args:
        original_tweet: Text of the original tweet.
        synthetic_tweet: Text of the synthetic variant.

    Returns:
        The scalar at position ``[0][0]`` of the inner-product matrix.
    """
    vec_original, vec_synthetic = (
        embed([text]) for text in (original_tweet, synthetic_tweet)
    )
    score_matrix = np.inner(vec_original, vec_synthetic)
    return score_matrix[0][0]

# For each synthetic column, score it against the original tweet twice:
# semantic similarity of the embeddings and Levenshtein edit distance.
# Each triple is (synthetic column, similarity column, distance column).
for synthetic_col, sim_col, lev_col in (
    ("Synonyms_Synthetic", "Synonyms_Sim_Score", "Synonyms_Levenshtein_Score"),
    ("fasttext_Synthetic", "fasttext_Sim_Score", "fasttext_Levenshtein_Score"),
):
    tweet_pairs = data[["Tweet", synthetic_col]]
    data[sim_col] = tweet_pairs.apply(
        lambda row: semantic_similarity(row["Tweet"], row[synthetic_col]),
        axis=1,
    )
    data[lev_col] = tweet_pairs.apply(
        lambda row: Levenshtein.distance(row["Tweet"], row[synthetic_col]),
        axis=1,
    )

In [None]:
# Write the final frame to disk as CSV.
data.to_csv(path_or_buf='../output/synth-output_new.csv')