# Data augmentation of the original data

In [1]:
import pandas as pd

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

import emoji

In [2]:
pd.set_option('display.max_colwidth', None)

## Read data

In [3]:
train_data = pd.read_csv('../dataset/train.tsv', sep='\t')
train_data.head(3)

Unnamed: 0,tweet_no,tweet_text,q1_label,q2_label,q3_label,q4_label,q5_label,q6_label,q7_label,language,tweet_link,tweet_link_count,preprocessed_tweet_text,emojis,translated_emojis,tweet_link_domain,tweet_link_path
0,1,For the average American the best way to tell if you have covid-19 is to cough in a rich person’s face and wait for their test results,no,,,,,no,no,en,[],0,For the average American the best way to tell if you have covid-19 is to cough in a rich person’s face and wait for their test results,,,,
1,2,this is fucking bullshit,no,,,,,no,no,en,[],0,this is fucking bullshit,,,,
2,3,Can y’all please just follow the government’s instructions so we can knock this COVID-19 out and be done?! I feel like a kindergartner that keeps losing more recess time because one or two kids can’t follow directions.,no,,,,,no,no,en,[],0,Can y’all please just follow the government’s instructions so we can knock this COVID-19 out and be done?! I feel like a kindergartner that keeps losing more recess time because one or two kids can’t follow directions.,,,,


In [4]:
text = '👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉'
print(text)

👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉


### Character augmenter

#### Keyboard Augmenter
Substitute character by keyboard distance

In [5]:
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun faft: its tradition for europeans to wpread a pptehtiallj fatal disease to every othDr coujhry not fulIy inhabited by wyite people & amp; not tak3 acc(hHtQbility for the suNseWu#nt devastation & amp; lives lost 👉


#### Swap character randomly

In [6]:
aug = nac.RandomCharAug(action="swap")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun fcat: its tradition for europaesn to psread a potentially aftal disease to every other country not fully inhbaietd by whiet peopel & amp; not atke accountability for the sbuseuqetn devastation & amp; lives lots 👉


#### Apply on train, validation and test dataframe

In [7]:
def character_augment_data(row, train_augmented_rows):
    keyboard_aug = nac.KeyboardAug()
    swap_char_aug = nac.RandomCharAug(action="swap")
    keyboard_augmented_text = keyboard_aug.augment(row['tweet_text'])
    swap_char_augmented_text = swap_char_aug.augment(row['tweet_text'])
    new_row_1 = [row['tweet_no'], keyboard_augmented_text, row['q1_label'], row['q2_label'], row['q3_label'], row['q4_label'], 
                row['language'], row['tweet_link'], row['tweet_link_count'], row['tweet_link_domain'], row['tweet_link_path']]
    new_row_2 = [row['tweet_no'], swap_char_augmented_text, row['q1_label'], row['q2_label'], row['q3_label'], row['q4_label'], 
                row['language'], row['tweet_link'], row['tweet_link_count'], row['tweet_link_domain'], row['tweet_link_path']]
    train_augmented_rows.append(new_row_1)
    train_augmented_rows.append(new_row_2)

In [8]:
columns = ['tweet_no', 'tweet_text', 'q1_label', 'q2_label', 'q3_label', 'q4_label', 
                                          'language', 'tweet_link', 'tweet_link_count', 'tweet_link_domain', 'tweet_link_path']
train_augmented_rows = []

In [9]:
train_data.apply(lambda row: character_augment_data(row, train_augmented_rows), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
6400    None
6401    None
6402    None
6403    None
6404    None
Length: 6405, dtype: object

In [10]:
train_augmented = pd.DataFrame(train_augmented_rows, columns = columns)

In [11]:
train_augmented.head(6)

Unnamed: 0,tweet_no,tweet_text,q1_label,q2_label,q3_label,q4_label,language,tweet_link,tweet_link_count,tweet_link_domain,tweet_link_path
0,1,For the average ZmeriSan the bes^ way to tell if you hXve coviS - 19 is to cougN in a ricM person ’ s facW and waiy for their test r2zults,no,,,,en,[],0,,
1,1,For the average American the bets way to tell if you haev cvoid - 19 is to cough in a rcih person ’ s afce and wati for hteir tset rueslts,no,,,,en,[],0,,
2,2,fhis is fucking bullshit,no,,,,en,[],0,,
3,2,this is fcuikng bullshit,no,,,,en,[],0,,
4,3,Can y ’ all please just follow the government ’ s instr^cfi)ns so we can knock tMis COVID - 19 out and be dobe? ! I fefl like a kindergartner tNat kee(s loaing morD recess time N2cause one or two kics can ’ t follow directions.,no,,,,en,[],0,,
5,3,Can y ’ all lpease juts flolow the governemnt ’ s instructions so we can konck this COVID - 19 out and be done? ! I feel like a kniedgrartner htat keeps olsing more recess time because one or two kids can ’ t ofllow directions.,no,,,,en,[],0,,


### Word Augmenter

#### Word Embeddings Augmenter
Substitute word by word2vec similarity

In [12]:
# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='../utils/GoogleNews-vectors-negative300.txt',
    action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun furthermore: its forebearers for man_utd to spread a extraordinarily fatal virus to every other country not optimally inhabited Josh_Sharlow white people & amp; not take accountability tostart the subsequent devastation & amp; Counsell_Geyer defeated 👉


#### Contextual Word Embeddings Augmenter
Substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)

In [13]:
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
3rd see fact : australian tradition for europeans to spread a potentially fatal disease behind every other people not fully inhabited starring human people & americans ; not take accountability for a subsequent devastation & sickness ; lives lost [UNK]


In [14]:
aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
see 花 [UNK] : its tradition guided europeans to explain a potentially fatal disease to every japanese country while continuously inhabited by white people & amp ; not take accountability back the subsequent death & amp ; lives lost [UNK]


In [15]:
aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base', action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun fact: its tradition for euro to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost �


#### Synonym Augmenter
Substitute word by WordNet's synonym

In [16]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abuinoschi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun fact: its tradition for european to spread a potentially fatal disease to every former land non fully inhabited by white hot people & amp; not take accountability for the subsequent devastation & amp; lives lost 👉


In [18]:
aug = naw.SynonymAug(aug_src='ppdb', model_path='../utils/ppdb-2.0-s-all/ppdb-2.0-s-all')
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 funny fact: its tradition for pro europeans to spread a potentially mortal illness to every other country not fully habitat by white peoples & amp; not take duties for the subsequent desolation & amp; leaves sold 👉


#### Apply on train, validation and test dataframe

In [19]:
w2v_word_aug = naw.WordEmbsAug(model_type='word2vec', model_path='../utils/GoogleNews-vectors-negative300.txt', action="substitute")
context_bert_word_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
context_distilbert_word_aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")
context_roberta_word_aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
wordnet_synonym_aug = naw.SynonymAug(aug_src='wordnet')
ppdb_synonym_aug = naw.SynonymAug(aug_src='ppdb', model_path='../utils/ppdb-2.0-s-all/ppdb-2.0-s-all')

def word_augment_data(row, train_augmented_rows):
    w2v_word_augmented_text = w2v_word_aug.augment(row['tweet_text'])
    context_bert_word_augmented_text = context_bert_word_aug.augment(row['tweet_text'])
    context_distilbert_word_augmented_text = context_distilbert_word_aug.augment(row['tweet_text'])
    context_roberta_word_augmented_text = context_roberta_word_aug.augment(row['tweet_text'])
    wordnet_synonym_augmented_text = wordnet_synonym_aug.augment(row['tweet_text'])
    ppdb_synonym_augmented_text = ppdb_synonym_aug.augment(row['tweet_text'])
    common_columns = [row['q1_label'], row['q2_label'], row['q3_label'], row['q4_label'], 
                row['language'], row['tweet_link'], row['tweet_link_count'], row['tweet_link_domain'], row['tweet_link_path']]
    row1 = [row['tweet_no'], w2v_word_augmented_text] + common_columns
    row2 = [row['tweet_no'], context_bert_word_augmented_text] + common_columns
    row3 = [row['tweet_no'], context_distilbert_word_augmented_text] + common_columns
    row4 = [row['tweet_no'], context_roberta_word_augmented_text] + common_columns
    row5 = [row['tweet_no'], wordnet_synonym_augmented_text] + common_columns
    row6 = [row['tweet_no'], ppdb_synonym_augmented_text] + common_columns
    rows = [row1, row2, row3, row4, row5, row6]
    if row['tweet_no'] % 100 == 0:
        print(row['tweet_no'])
    train_augmented_rows += rows

In [20]:
train_data.apply(lambda row: word_augment_data(row, train_augmented_rows), axis=1)

100
200
300
400
600
700
800
900
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
100
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500


0       None
1       None
2       None
3       None
4       None
        ... 
6400    None
6401    None
6402    None
6403    None
6404    None
Length: 6405, dtype: object

In [21]:
train_augmented = pd.DataFrame(train_augmented_rows, columns = columns)
train_augmented[-20:]

Unnamed: 0,tweet_no,tweet_text,q1_label,q2_label,q3_label,q4_label,language,tweet_link,tweet_link_count,tweet_link_domain,tweet_link_path
51220,2553,"Chinese President 11 Jinping said Monday that since COVID - nineteen is still an epidemic and vaccines play a major role in mankind & # 39; s victory over it, China is ready to strengthen cooperation with Morocco in the development and production of the COVID - nineteen vaccinum. Xi made the remarks in a telephone conversation with King Mahomet sestet https: / / t. co / IjVPCo7crJ",yes,no,yes,no,ar,['https://twitter.com/PointCoom/status/1300557257546104840/photo/1'],1,twitter,Point Coom status photo
51221,2553,"Chinese President Xi Jinping exactly Monday that since COVID - 19 indicates still an epidemic and immunizations displaying a important role in mankind & # 39; s victory over it, China recommendations willing to strengthen cooperation with Morocco in the development and production of the COVID - 19 vaccination. Xi maketh the remarks in a telephone conversation with King Mohammed VI https: / / t. contractors / IjVPCo7crJ",yes,no,yes,no,ar,['https://twitter.com/PointCoom/status/1300557257546104840/photo/1'],1,twitter,Point Coom status photo
51222,2554,"Trump: The Food and Nonprescription Administration has Authorizing, in emergency cases, the use of recovered ###Hz_LCD to neural_muscular_skeletal bronchial_thermoplasty INDIANS_PROBABLE_STARTER the Corona virus - we theyre liaising_closely to remove else that may delay the development of a vaccine for## the Corona virus",yes,no,yes,no,ar,[],0,,
51223,2554,"trump : australian military and drug administration has authorized, using emergency cases, the use of recovered plasma to treat cancers with the corona virus - ministers are working to eliminate everything that may delay the arising of a tumor for primary corona online",yes,no,yes,no,ar,[],0,,
51224,2554,"trump : mainstream food and drug administration has authorized, in chronic condition, positive benefits of recovered medicine to treat patients with the corona virus - we are working to remove vaccines that may delay the validation of invasive vaccine for the medicare virus",yes,no,yes,no,ar,[],0,,
51225,2554,"Trump: The Food and Drug Administration has authorized, in emergency cases, the use of recovered plasma to treat patients with the Corona virus - we are working to remove everything that may delay the development of a vaccine for the Corona virus",yes,no,yes,no,ar,[],0,,
51226,2554,"Trump: The Food and Drug Presidential term have pass, in parking brake cases, the use of recovered plasma to treat patients with the Corona virus - we are working to remove everything that may delay the growing of a vaccine for the Corona virus",yes,no,yes,no,ar,[],0,,
51227,2554,"Trump: The Food and Drug Administration saves authority, in emergency cases, the used of recovered plasma to handled payments with the Corona virus - we available years to remove anything that may impeding the development of a vaccination for the Corona virus",yes,no,yes,no,ar,[],0,,
51228,2555,"Consequently World Health Organization: - Announces the formation of a team to monitor “ mutations ” of the Bakersfield H#N#_influenza. . - Confirms that there is a wecan for much research on the mutation of time Corona virus. . - It hopes that the Corona epidemic will end in less than seventeen years - A vaccine is expected to be reached at the end of the annual 2020, aided that the vaccine `_ll be available to everyone in 2021..",yes,no,yes,no,ar,[],0,,
51229,2555,"the world health organization : - announces the formation of a team ) monitor “ existing ” of the corona virus.. - show who there is a need for more research on the mutation of the corona virus.. - it hopes that the growing epidemic will end in less than two years - a vaccine is expected to be made wherever the faster than the end 2020, provided that the vaccine will be available to everyone in 2021..",yes,no,yes,no,ar,[],0,,


In [24]:
train_augmented.to_csv('../dataset/train_augmented.tsv', sep='\t', index=False)

### Sentence Augmentation
#### Contextual Word Embeddings for Sentence Augmenter
Insert sentence by contextual word embeddings (GPT2 or XLNet)

In [25]:
# model_path: xlnet-base-cased or gpt2
aug = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased', min_length=25, max_length=250)
augmented_texts = aug.augment(text)
print("Original:")
print(text)
print("Augmented Texts:")
print(augmented_texts)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Texts:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉 fun fact: europeans are very sick. This is the main reason why europeans do not want to be known.  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact:  fun fact


In [26]:
aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', min_length=25, max_length=250)
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉

I'll have the video, just make sure to follow @dontblaze

https://twitter.com/#!/DontBlaze


In [27]:
aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2', min_length=25, max_length=250)
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉
Augmented Text:
👉 fun fact: its tradition for europeans to spread a potentially fatal disease to every other country not fully inhabited by white people &amp; not take accountability for the subsequent devastation &amp; lives lost 👉

Advertisements


## Extract emoji from this augmented data

In [3]:
augmented_train_data = pd.read_csv('../dataset/train_augmented.tsv', sep='\t')
augmented_train_data.head(2)

Unnamed: 0,tweet_no,tweet_text,q1_label,q2_label,q3_label,q4_label,language,tweet_link,tweet_link_count,tweet_link_domain,tweet_link_path
0,1,For the average ZmeriSan the bes^ way to tell if you hXve coviS - 19 is to cougN in a ricM person ’ s facW and waiy for their test r2zults,no,,,,en,[],0,,
1,1,For the average American the bets way to tell if you haev cvoid - 19 is to cough in a rcih person ’ s afce and wati for hteir tset rueslts,no,,,,en,[],0,,


In [4]:
def preprocess_tweet_text_with_emojis(dataframe):
    dataframe['preprocessed_tweet_text'] = dataframe.apply(lambda row: ''.join([(' ' + emoji.demojize(c) + ' ' if c in emoji.UNICODE_EMOJI['en'] else c) for c in row['tweet_text']]), axis=1)
    dataframe['emojis'] = dataframe.apply(lambda row: ''.join(c for c in row['tweet_text'] if c in emoji.UNICODE_EMOJI['en']), axis=1)
    dataframe['translated_emojis'] = dataframe.apply(lambda row: ''.join(' ' + emoji.demojize(c) + ' ' for c in row['tweet_text'] if c in emoji.UNICODE_EMOJI['en']), axis=1)
    return dataframe

In [5]:
augmented_train_data = preprocess_tweet_text_with_emojis(augmented_train_data)

In [6]:
augmented_train_data.to_csv('../dataset/train_augmented.tsv', sep='\t', index=False)