In [2]:
!pip install textaugment &> /dev/null
!pip install torch &> /dev/null
!pip install nlpaug &> /dev/null
!pip install wget &> /dev/null
!pip install torchtext &> /dev/null
!pip install --no-cache-dir transformers sentencepiece &> /dev/null


In [3]:
import nltk

# NLTK resources fetched up front so later cells do not fail with LookupError.
# 'punkt_tab' is included next to 'punkt' because recent NLTK releases load
# the sentence tokenizer (used via nltk.sent_tokenize below) from that
# resource instead of the older pickled 'punkt' models.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab',
                      'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eishmaheshwari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eishmaheshwari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eishmaheshwari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/eishmaheshwari/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
from textaugment import EDA, Translate
import nlpaug.augmenter.word as naw
import random

# SENTENCE-WIDE AUGMENTATION
class DataAug:
    """Collection of text-augmentation strategies that map a text to a new text.

    The EDA wrappers delegate to one shared ``EDA`` instance. The remaining
    methods split the input into sentences with ``nltk.sent_tokenize``,
    transform the sentences, and re-join them with single spaces.
    """

    def __init__(self):
        self.eda = EDA()
        # Created on the first contextual_embedding() call and reused after
        # that, so the augmenter is not rebuilt for every augmented text.
        self._contextual_aug = None

    @staticmethod
    def _as_text(augmented):
        """Return an augmenter result as a single string.

        Some augmenters hand back a list of strings rather than a plain
        string (newer nlpaug releases do). Calling ``str()`` on such a list
        would embed its brackets and quotes in the augmented text, so list
        results are joined with spaces instead.
        """
        if isinstance(augmented, (list, tuple)):
            return ' '.join(str(piece) for piece in augmented)
        return str(augmented)

    def random_oversampling(self, text):
        """Baseline: return ``text`` unchanged (plain duplication)."""
        return text

    def synonym_replacement(self, text):
        """Apply EDA synonym replacement to ``text``."""
        return self.eda.synonym_replacement(text)

    def random_deletion(self, text, p=0.2):
        """Apply EDA random deletion to ``text``; ``p`` is passed through to EDA."""
        return self.eda.random_deletion(text, p)

    def random_swap(self, text):
        """Apply EDA random swap to ``text``."""
        return self.eda.random_swap(text)

    def random_insertion(self, text):
        """Apply EDA random insertion to ``text``."""
        return self.eda.random_insertion(text)

    def eda_combine(self, text):
        """Apply one randomly chosen EDA operation to each sentence of ``text``.

        Each sentence independently gets synonym replacement, random
        deletion, random swap or random insertion with equal probability.
        """
        operations = (
            self.synonym_replacement,
            self.random_deletion,
            self.random_swap,
            self.random_insertion,
        )
        sentences = nltk.sent_tokenize(text)
        new_sentences = [
            self._as_text(random.choice(operations)(str(s))) for s in sentences
        ]
        return ' '.join(new_sentences)

    def shuffle_sentences(self, text):
        """Return ``text`` with its sentences in random order."""
        sentences = nltk.sent_tokenize(text)
        random.shuffle(sentences)
        return ' '.join(sentences)

    def contextual_embedding(self, text):
        """Augment each sentence with BERT-based contextual word insertion."""
        if self._contextual_aug is None:
            self._contextual_aug = naw.ContextualWordEmbsAug(
                model_path='bert-base-uncased', action="insert")
        sentences = nltk.sent_tokenize(text)
        new_sentences = [
            self._as_text(self._contextual_aug.augment(str(s))) for s in sentences
        ]
        return ' '.join(new_sentences)

    def back_translation(self, text, source_lang='en', target_lang='es'):
        """Augment each sentence with ``Translate(src=source_lang, to=target_lang)``.

        Errors raised by the translator are not caught here; they propagate
        to the caller.
        """
        sentences = nltk.sent_tokenize(text)
        t = Translate(src=source_lang, to=target_lang)
        new_sentences = [self._as_text(t.augment(str(s))) for s in sentences]
        return ' '.join(new_sentences)

In [5]:
import pandas as pd

# Load the scouting reports and keep only the text and label columns.
d = pd.read_csv('data/twtc.csv')
scouting_data = d[['text', 'label']]

# Report how many rows carry each label value (-1 is printed as "No data").
# NOTE(review): "Pos" is printed for label 0 here, while augment() further down
# names the label-1 rows `pos` -- confirm which label is the positive class.
label_column = scouting_data["label"]
for caption, label_value in (("Pos", 0), ("Neg", 1), ("No data", -1)):
    print(caption, scouting_data[label_column == label_value].shape[0])

Pos 5664
Neg 2114
No data 1397


# Dataset Creation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Fixed seed for the train/test partition. write_augment_csv() rewrites
# data/augment/test.csv on every call, so without a fixed seed a training file
# produced in an earlier session could contain rows of the current test set.
SPLIT_SEED = 42

labeled = pd.read_csv('data/labeled_scouting.csv')
train, test = train_test_split(labeled, test_size=0.2, random_state=SPLIT_SEED)

In [9]:
# def augment(training_data, augment_function, verbose):
#     augmented_train = training_data.copy()
#     pos = augmented_train[augmented_train["label"] == 1]
#     neg = augmented_train[augmented_train["label"] == 0]
#     for i in range(len(neg) - len(pos)):
#         row = pos.sample().copy(deep=True)       
#         row['text'] = row['text'].map(augment_function)  
#         augmented_train = augmented_train.append(row, ignore_index=True)    
#         if verbose and i % 10 == 0:
#             print(f"{i*100/(len(neg) - len(pos))} % done")
#     return shuffle(augmented_train)

def augment(training_data, augment_function, verbose, max_consecutive_failures=100):
    """Oversample the label-1 rows with augmented copies until classes balance.

    Rows with ``label == 1`` are sampled at random (with repetition across
    iterations), their ``text`` is passed through ``augment_function``, and
    the results are added until there are as many label-1 rows as label-0
    rows. ``training_data`` itself is not modified.

    Args:
        training_data: DataFrame with at least ``text`` and ``label`` columns.
        augment_function: callable mapping one text to its augmented text.
        verbose: when truthy, print progress every 10 generated rows.
        max_consecutive_failures: number of back-to-back ``AttributeError``
            failures of ``augment_function`` tolerated before giving up.

    Returns:
        A shuffled DataFrame holding the original rows plus the new ones.

    Raises:
        ValueError: if rows are missing but there is no label-1 row to sample.
        RuntimeError: if ``augment_function`` raises ``AttributeError``
            ``max_consecutive_failures`` times in a row.
    """
    pos = training_data[training_data["label"] == 1]
    neg = training_data[training_data["label"] == 0]
    deficit = len(neg) - len(pos)

    if deficit > 0 and pos.empty:
        raise ValueError("cannot oversample: no rows with label == 1 to sample from")

    # Collect the new rows and concatenate once at the end. DataFrame.append
    # no longer exists in pandas 2.x, and appending row by row is quadratic.
    new_rows = []
    consecutive_failures = 0
    while len(new_rows) < deficit:
        row = pos.sample().copy(deep=True)
        try:
            row['text'] = row['text'].map(augment_function)
        except AttributeError as err:
            # Best-effort retry with a different sample, but bounded so that
            # an augmenter that always fails cannot spin forever.
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                raise RuntimeError(
                    f"augment_function failed {consecutive_failures} times in a row"
                ) from err
            continue
        consecutive_failures = 0
        if verbose and len(new_rows) % 10 == 0:
            print(f"{len(new_rows)*100/deficit} % done")
        new_rows.append(row)

    if not new_rows:
        # Nothing to add: keep the original index, as a plain copy would.
        return shuffle(training_data.copy())

    augmented_train = pd.concat([training_data, *new_rows], ignore_index=True)
    return shuffle(augmented_train)
    
def write_augment_csv(aug_function_name, aug_function, verbose=False):
    """Augment the global ``train`` split and write train/test CSV files.

    Writes ``data/augment/train_<aug_function_name>.csv`` with the output of
    ``augment(train, aug_function, verbose)`` and ``data/augment/test.csv``
    with the global ``test`` split, both without the index column.

    Args:
        aug_function_name: suffix used in the training file name.
        aug_function: callable mapping one text to its augmented text.
        verbose: forwarded to ``augment`` to enable progress output.
    """
    from pathlib import Path

    output_dir = Path('data/augment')
    # Create the folder before the (potentially very slow) augmentation run:
    # to_csv does not create missing directories, so a missing folder would
    # otherwise only surface after all the work has been done.
    output_dir.mkdir(parents=True, exist_ok=True)

    augmented_train = augment(train, aug_function, verbose)
    augmented_train.to_csv(output_dir / f'train_{aug_function_name}.csv', index=False)
    test.to_csv(output_dir / 'test.csv', index=False)
    
# Build the augmenter and generate one oversampled training set.
# Only back-translation is enabled; the other strategies are kept here,
# disabled, so they can be switched on one at a time:
# write_augment_csv("eda", aug.eda_combine)
# write_augment_csv("shuffle_sentences", aug.shuffle_sentences)
# write_augment_csv("contextual_embedding", aug.contextual_embedding, verbose=True)
aug = DataAug()
write_augment_csv(
    aug_function_name="back_translation",
    aug_function=aug.back_translation,
    verbose=True,
)

0.0 % done


KeyboardInterrupt: 