In [5]:
import pandas as pd

data =pd.read_csv(
    filepath_or_buffer='../data/test.csv',
    sep='\t',
    encoding='utf8',
    names=["ID", "Label", "Tweet"]
    )

data

Unnamed: 0,ID,Label,Tweet
0,619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,619971047195045888,negative,If these runway renovations at the airport pre...
2,619994586182619136,positive,"Excited to read ""Go Set a Watchman"" on Tuesday..."
3,620015649151021056,neutral,Trump said June 30th that he'd be at Miss USA ...
4,620067692066828288,negative,Bad Blood may have the absolute worst lyricism...
5,620063502682599425,positive,No. It's okay. You can go attend Miss USA! Tru...


In [6]:
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import TweetTokenizer

tk = TweetTokenizer()
stop_words = set(stopwords.words('english'))

def token_pipeline(tweet):
    tweet_tokens = tk.tokenize(tweet)
    tweet_token_without_stopwords = list(filter(lambda token: token not in stop_words, tweet_tokens))
    return tweet_token_without_stopwords

data["Tweet_Token"] = data["Tweet"].apply(token_pipeline)

data

Unnamed: 0,ID,Label,Tweet,Tweet_Token
0,619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...","[Picturehouse's, ,, Pink, Floyd's, ,, ', Roger..."
1,619971047195045888,negative,If these runway renovations at the airport pre...,"[If, runway, renovations, airport, prevent, se..."
2,619994586182619136,positive,"Excited to read ""Go Set a Watchman"" on Tuesday...","[Excited, read, "", Go, Set, Watchman, "", Tuesd..."
3,620015649151021056,neutral,Trump said June 30th that he'd be at Miss USA ...,"[Trump, said, June, 30th, he'd, Miss, USA, pag..."
4,620067692066828288,negative,Bad Blood may have the absolute worst lyricism...,"[Bad, Blood, may, absolute, worst, lyricism, I..."
5,620063502682599425,positive,No. It's okay. You can go attend Miss USA! Tru...,"[No, ., It's, okay, ., You, go, attend, Miss, ..."


In [7]:
import random

def find_synonyms(tweet):
    tweet_synonyms = []
    for token in tweet:
        word_synonyms=[]
        for word_synonym in wordnet.synsets(token):
            for word_lemma in word_synonym.lemmas():
                word_synonyms.append(word_lemma.name())
        if word_synonyms:
            tweet_synonyms.append(random.choice(list(set(word_synonyms))))
        else:
            tweet_synonyms.append(token)

    return tweet_synonyms

data["synthetic"] = data["Tweet_Token"].apply(find_synonyms)
data["synthetic"] = data["synthetic"].apply(" ".join)
data[["synthetic", "Tweet"]]

Unnamed: 0,synthetic,Tweet
0,"Picturehouse's , ping Floyd's , ' Roger Waters...","Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,If track refurbishment aerodrome keep visualiz...,If these runway renovations at the airport pre...
2,"commove translate "" exit circle watchman "" Tue...","Excited to read ""Go Set a Watchman"" on Tuesday..."
3,scoop read June 30th he'd young_woman America ...,Trump said June 30th that he'd be at Miss USA ...
4,regretful blood_line Crataegus_laevigata right...,Bad Blood may have the absolute worst lyricism...
5,no_more . It's approve . You snuff_it look ove...,No. It's okay. You can go attend Miss USA! Tru...
