In [None]:
# Mount Google Drive so the dataset files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install numpy requests nlpaug
!pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 5.3 MB/s 
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
import nlpaug.augmenter.word as naw
from nlpaug.util import Action

import random

In [None]:
from tqdm import tqdm
import pandas as pd
import torch

import nltk
import re

# WordNet corpus: queried through `wn.synsets` in get_synonyms.
nltk.download('wordnet')
# Universal tagset mapping and tagger model: presumably needed for
# nltk.pos_tag(..., tagset='universal'), which produces the (token, tag)
# pairs that synonym_replacement / get_synonyms inspect.
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn

import random

import nltk
# Stop-word lists: used to build the module-level `stop_words` set.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Punkt tokenizer models: used by word_tokenize.
nltk.download('punkt')
from nltk.tokenize import word_tokenize


def get_word_count(text):
  """Count the tokens of ``text`` that contain an ASCII letter or digit.

  The text is split with ``word_tokenize``; tokens without any letter or
  digit (e.g. bare punctuation) are not counted.
  """
  pieces = word_tokenize(text)
  has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit

  word_total = 0
  for piece in pieces:
    if has_alnum.match(piece):
      word_total += 1
  return word_total

# English stop-word list from NLTK, stored as a set. synonym_replacement takes
# it as a parameter but does not currently consult it.
stop_words = set(stopwords.words('english'))

def synonym_replacement(words, n, stop_words):
    """Replace up to ``n`` content words of a POS-tagged sentence with synonyms.

    Args:
        words: sequence of ``(token, tag)`` pairs. Only pairs whose tag is
            "NOUN", "ADJ", "ADV" or "VERB" are candidates for replacement.
        n: maximum number of distinct words to replace. Nothing is replaced
            when ``n`` is zero or negative.
        stop_words: unused; kept so existing callers keep working.

    Returns:
        The tokens joined with single spaces, where each selected word (every
        occurrence of it) is swapped for a random entry of ``get_synonyms``.
    """
    # Work on plain token strings from the start so the final join cannot
    # fail, even when no word ends up being replaced.
    new_words = [word[0] for word in words]
    if n <= 0:
        return ' '.join(new_words)

    # Candidate (token, tag) pairs, de-duplicated in first-seen order so a
    # repeated word is never counted as two replacements.
    candidates = []
    for word in words:
        if word[1] in ("NOUN", "ADJ", "ADV", "VERB") and word not in candidates:
            candidates.append(word)
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue

        synonym = random.choice(list(synonyms))
        # Patch the accumulated output in place instead of rebuilding it from
        # `words`, which would discard the replacements made so far.
        for index, word in enumerate(words):
            if word == candidate:
                new_words[index] = synonym
        num_replaced += 1

        if num_replaced >= n:  # only replace up to n words
            break

    return ' '.join(new_words)


def get_synonyms(word):
    """
    Get synonyms of a word

    Args:
        word: ``(token, tag)`` pair. ``tag`` selects the WordNet part of
            speech and must be "NOUN", "VERB", "ADV" or "ADJ".

    Returns:
        List (in arbitrary order) of distinct synonyms, lower-cased and
        reduced to the letters a-z and single spaces, excluding the token
        itself. An empty list is returned for any other tag.
    """
    token, tag = word[0], word[1]
    if tag not in ("NOUN", "VERB", "ADV", "ADJ"):
        ## do something TODO
        return []

    # Resolved only after the guard so unsupported tags never touch WordNet.
    pos = {"NOUN": wn.NOUN, "VERB": wn.VERB, "ADV": wn.ADV, "ADJ": wn.ADJ}[tag]

    def _normalise(text):
        # Same clean-up for lemma names and for the token itself, so the
        # self-removal below compares like with like.
        spaced = text.replace("_", " ").replace("-", " ").lower()
        letters = "".join(char for char in spaced if char in ' qwertyuiopasdfghjklzxcvbnm')
        # Dropping characters can leave leading/trailing/double spaces.
        return " ".join(letters.split())

    synonyms = set()
    for syn in wn.synsets(token, pos=pos):
        for lemma in syn.lemmas():
            synonym = _normalise(lemma.name())
            if synonym:  # lemmas made only of digits/symbols collapse to ""
                synonyms.add(synonym)

    # Case- and hyphen-insensitive removal: "Dog" must not keep "dog".
    synonyms.discard(_normalise(token))
    return list(synonyms)




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def augment_dataset(data_frame, output_filepath, percentage=0.2):
  """Create swap/delete/crop/spelling augmentations per article and save them.

  Args:
    data_frame: frame whose rows expose ``text``, ``text_perturb`` and
      ``label`` attributes through ``itertuples``.
    output_filepath: path of the JSON-lines file to write.
    percentage: currently unused (the in-function synonym-replacement pass that
      would consume it is disabled); kept for backward compatibility.

  Returns:
    DataFrame with the columns original, synonym_replacement, random_swap,
    random_delete, random_crop, random_spelling and label. Rows are skipped
    when the crop augmenter raises ValueError or when an augmenter returns an
    empty result.
  """
  # The augmenters do not depend on the row, so build them once instead of
  # four times per article.
  swap_aug = naw.RandomWordAug(action="swap", aug_p=0.3)
  delete_aug = naw.RandomWordAug(action="delete", aug_max=None, aug_p=0.3)
  crop_aug = naw.RandomWordAug(action="crop", aug_max=None, aug_p=0.3)
  spelling_aug = naw.SpellingAug(aug_max=None, aug_p=0.3)

  augmented_dataset = []

  # `total` lets tqdm show a real progress bar; itertuples() has no length.
  for article in tqdm(data_frame.itertuples(), total=len(data_frame)):

    body_texts = article.text

    ## --- temporary ---
    # Synonym replacement is not recomputed here; the column is copied from
    # the `text_perturb` field of the input row.
    sr_article = article.text_perturb
    ## --- temporary ---

    ## Random Swap of aug_p% or fraction of words
    random_swap = swap_aug.augment(body_texts)

    ## Random Deletion of aug_p% or fraction of words
    random_del = delete_aug.augment(body_texts)

    ## Random Crop of aug_p fraction length span
    try:
      random_crop = crop_aug.augment(body_texts)
    except ValueError as e:
      print(e)
      continue

    ## Random spelling mistakes
    random_spelling = spelling_aug.augment(body_texts)

    # Each augmenter result is indexed with [0]; an empty result raises
    # IndexError and the row is reported and skipped.
    try:
      augmented_dataset.append([article.text, sr_article, random_swap[0], random_del[0], random_crop[0],
                                random_spelling[0], article.label])
    except IndexError as e:
      print(e)
      print("original")
      print(body_texts)
      print(random_swap, random_del, random_crop, random_spelling)
      continue

  augmented_frame = pd.DataFrame(augmented_dataset, columns=["original","synonym_replacement", "random_swap",
                                                             "random_delete", "random_crop", "random_spelling", "label"])

  jsonl_data = augmented_frame.to_json(orient='records', lines=True)

  with open(output_filepath, "w") as text_file:
    text_file.write(jsonl_data)

  return augmented_frame

In [None]:
path = "/content/drive/Shareddrives/DARPA/Data Perturbations/Synonym_Replacement/"


def _read_split(filename):
    """Read one JSON-lines file located under ``path`` into a DataFrame."""
    return pd.read_json(path + filename, lines=True, orient="records")


# Real articles: train / test / holdout splits.
real_train = _read_split("neural_news_real.train.jsonl")
real_test = _read_split("neural_news_real.test.jsonl")
real_holdout = _read_split("neural_news_real.holdout.jsonl")

# Fake articles: train / test / holdout splits.
fake_train = _read_split("neural_news_fake.train.jsonl")
fake_test = _read_split("neural_news_fake.test.jsonl")
fake_holdout = _read_split("neural_news_fake.holdout.jsonl")

In [None]:
# Augment the real/train split; the JSONL output is written to the same Drive
# folder as the inputs and the returned frame is the cell's result.
augment_dataset(real_train, output_filepath=path+"five_augs.neural_news_real.train.jsonl")

In [None]:
# Augment the real/test split and all three fake splits; each call writes its
# own "five_augs.*" JSONL file under `path`.
augment_dataset(real_test, output_filepath=path+"five_augs.neural_news_real.test.jsonl")

augment_dataset(fake_train, output_filepath=path+"five_augs.neural_news_fake.train.jsonl")
augment_dataset(fake_test, output_filepath=path+"five_augs.neural_news_fake.test.jsonl")
augment_dataset(fake_holdout, output_filepath=path+"five_augs.neural_news_fake.holdout.jsonl")

In [None]:
# Augment the real/holdout split and write it under `path`.
augment_dataset(real_holdout, output_filepath=path+"five_augs.neural_news_real.holdout.jsonl")