<a href="https://colab.research.google.com/github/aflah02/Easy-Data-Augmentation-Implementation/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [2]:
def eda_SR(originalSentence, n):
  """
  Synonym Replacement (SR) operation of Easy Data Augmentation.

  Paper Methodology -> Randomly choose n words from the sentence that are not stop words.
                       Replace each of these words with one of its synonyms chosen at random.
  originalSentence -> The sentence on which EDA is to be applied
  n -> The number of words to be chosen for random synonym replacement

  Returns -> The augmented sentence, words joined by single spaces.
  Raises -> Exception when n exceeds the number of non stop words in the sentence.

  Words for which WordNet offers no synonym are skipped and another candidate
  is tried instead, so fewer than n words are replaced when the sentence does
  not contain n words that have synonyms.
  """
  stops = set(stopwords.words('english'))
  splitSentence = originalSentence.split(" ")
  ls_nonStopWordIndexes = [
      i for i, word in enumerate(splitSentence) if word.lower() not in stops
  ]
  if (n > len(ls_nonStopWordIndexes)):
    raise Exception("The number of replacements exceeds the number of non stop word words")
  # Visiting the candidates in shuffled order is a draw without replacement
  # that can simply move on when a word turns out to have no synonym.
  random.shuffle(ls_nonStopWordIndexes)
  replacementsDone = 0
  for indexChosen in ls_nonStopWordIndexes:
    if replacementsDone >= n:
      break
    originalWord = splitSentence[indexChosen]
    synonyms = []
    for synset in wordnet.synsets(originalWord):
      for lemma in synset.lemmas():
        # WordNet joins multi word lemmas with underscores
        candidate = lemma.name().replace('_', ' ')
        # Case-insensitive so that "Love" is never "replaced" by "love"
        if candidate.lower() != originalWord.lower() and candidate not in synonyms:
          synonyms.append(candidate)
    if not synonyms:
      # random.choice would raise IndexError on an empty list
      continue
    splitSentence[indexChosen] = random.choice(synonyms)
    replacementsDone += 1
  return " ".join(splitSentence)

In [3]:
# Demo: replace 2 non stop words with random synonyms (no random seed is set, so the output differs between runs).
print(eda_SR("I love to play football", 2))

I have sex to romp football


In [4]:
def eda_RI(originalSentence, n):
  """
  Random Insertion (RI) operation of Easy Data Augmentation.

  Paper Methodology -> Find a random synonym of a random word in the sentence that is not a stop word.
                       Insert that synonym into a random position in the sentence. Do this n times
  originalSentence -> The sentence on which EDA is to be applied
  n -> The number of times the process has to be repeated

  Returns -> The augmented sentence, words joined by single spaces.
  Raises -> Exception when n exceeds the number of non stop words in the sentence
            (each non stop word is used as a synonym source at most once).

  Words for which WordNet offers no synonym are skipped and another candidate
  is tried instead, so fewer than n insertions happen when the sentence does
  not contain n words that have synonyms.
  """
  stops = set(stopwords.words('english'))
  splitSentence = originalSentence.split(" ")
  # Insertions shift the positions inside splitSentence, so the source words
  # are always read from this untouched snapshot.
  splitSentenceCopy = splitSentence.copy()
  ls_nonStopWordIndexes = [
      i for i, word in enumerate(splitSentenceCopy) if word.lower() not in stops
  ]
  if (n > len(ls_nonStopWordIndexes)):
    raise Exception("The number of insertions exceeds the number of non stop word words")
  # Visiting the candidates in shuffled order is a draw without replacement
  # that can simply move on when a word turns out to have no synonym.
  random.shuffle(ls_nonStopWordIndexes)
  insertionsDone = 0
  for indexChosen in ls_nonStopWordIndexes:
    if insertionsDone >= n:
      break
    originalWord = splitSentenceCopy[indexChosen]
    synonyms = []
    for synset in wordnet.synsets(originalWord):
      for lemma in synset.lemmas():
        # WordNet joins multi word lemmas with underscores
        candidate = lemma.name().replace('_', ' ')
        # Case-insensitive so that the word itself is never inserted as its own synonym
        if candidate.lower() != originalWord.lower() and candidate not in synonyms:
          synonyms.append(candidate)
    if not synonyms:
      # random.choice would raise IndexError on an empty list
      continue
    # randint includes both bounds; len(splitSentence) is the "append" slot and
    # is re-read every time so the range grows together with the sentence.
    insertAt = random.randint(0, len(splitSentence))
    splitSentence.insert(insertAt, random.choice(synonyms))
    insertionsDone += 1
  return " ".join(splitSentence)

In [5]:
# Demo: insert synonyms of 2 non stop words at random positions (no random seed is set, so the output differs between runs).
print(eda_RI("I love to play football", 2))

I love to making love play football game football


In [6]:
def eda_RS(originalSentence, n):
  """
  Random Swap (RS) operation of Easy Data Augmentation.

  Paper Methodology -> Randomly choose two words in the sentence and swap their positions.
                       Do this n times
  originalSentence -> The sentence on which EDA is to be applied
  n -> The number of swaps to perform

  Returns -> The sentence with its words rejoined by single spaces after the swaps.
  Raises -> Exception when the sentence splits into a single word.
  """
  words = originalSentence.split(" ")
  lastPosition = len(words) - 1
  if lastPosition == 0:
    raise Exception("No Swaps Possible in One Word Sentences")
  for _ in range(n):
    left = random.randint(0, lastPosition)
    # Keep drawing until the second position differs from the first
    while True:
      right = random.randint(0, lastPosition)
      if right != left:
        break
    leftWord, rightWord = words[left], words[right]
    words[left] = rightWord
    words[right] = leftWord
  return " ".join(words)

In [7]:
# Demo: perform 2 random word swaps (no random seed is set, so the output differs between runs).
print(eda_RS("I love to play football", 2))

play I to love football


In [8]:
def eda_RD(originalSentence, p):
  """
  Random Deletion (RD) operation of Easy Data Augmentation.

  Paper Methodology -> Randomly remove each word in the sentence with probability p.
  originalSentence -> The sentence on which EDA is to be applied
  p -> Probability of a Word Being Removed, 0 <= p < 1

  Returns -> The surviving words joined by single spaces. The result is never
             empty: if the draws remove every word, one randomly chosen word
             of the sentence is returned instead.
  Raises -> Exception when p == 1, when p is outside [0, 1] or when the
            sentence splits into a single word.
  """
  if (p == 1):
    raise Exception("Always an Empty String Will Be Returned")
  if (p > 1 or p < 0):
    raise Exception("Improper Probability Value")
  splitSentence = originalSentence.split(" ")
  if (len(splitSentence) == 1):
    raise Exception("No Deletions Possible in One Word Sentences")
  # random.random() lies in [0, 1): keeping a word when the draw is >= p removes
  # it with probability exactly p and never removes anything when p == 0.
  lsRetainingWords = [word for word in splitSentence if random.random() >= p]
  if not lsRetainingWords:
    # Every word was dropped; an empty sentence is useless as augmented data
    return random.choice(splitSentence)
  return " ".join(lsRetainingWords)

In [9]:
# Demo: drop each word with probability 0.3 (no random seed is set, so the output differs between runs).
print(eda_RD("I love to play football", 0.3))

football


### Building Dataset

In [10]:
# Install the nightly TensorFlow Datasets build quietly (-q); tensorflow_datasets is imported further down.
# NOTE(review): the version is not pinned, so every run may pick up a different build.
!pip install -q tfds-nightly

[K     |████████████████████████████████| 4.2 MB 3.4 MB/s 
[K     |████████████████████████████████| 76 kB 2.3 MB/s 
[?25h

In [31]:
# Download the SST2 train / test / dev TSV files into the working directory (-q silences wget).
!wget -q https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv
!wget -q https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv
!wget -q https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/dev.tsv

In [21]:
import pandas as pd

# Each TSV holds one "sentence<TAB>label" pair per line; column names are
# supplied here and every row is tagged with the split it belongs to.
df_train = pd.read_csv('train.tsv', sep='\t', names=['Sentence', 'Label']).assign(Split='Train')
# Since The Original Dataset Does Not Use an Explicit Dev Set It's Considered To Be A Part of Test Set
df_dev = pd.read_csv('dev.tsv', sep='\t', names=['Sentence', 'Label']).assign(Split='Test')
df_test = pd.read_csv('test.tsv', sep='\t', names=['Sentence', 'Label']).assign(Split='Test')

# Stack the three frames under one fresh 0..N-1 index and order the columns
# as Label, Split, Sentence.
df = pd.concat([df_train, df_dev, df_test], ignore_index=True)[["Label", "Split", "Sentence"]]
print(df)
# The index is written too, so each CSV row reads index,Label,Split,Sentence
df.to_csv('dataset.csv')

      Label  Split                                           Sentence
0         1  Train  a stirring , funny and finally transporting re...
1         0  Train  apparently reassembled from the cutting room f...
2         0  Train  they presume their audience wo n't sit still f...
3         1  Train  this is a visually stunning rumination on love...
4         1  Train  jonathan parker 's bartleby should have been t...
...     ...    ...                                                ...
9608      0   Test  an often deadly boring , strange reading of a ...
9609      0   Test  the problem with concept films is that if the ...
9610      0   Test  safe conduct , however ambitious and well inte...
9611      0   Test  a film made with as little wit , interest , an...
9612      0   Test  but here 's the real damn it is n't funny , ei...

[9613 rows x 3 columns]


In [13]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import pickle

In [27]:
def filter_train(line):
    """Tell whether a raw CSV line belongs to the training split.

    line -> one text line, expected layout index,Label,Split,Sentence
    Returns -> the result of comparing the Split field with "Train"
    """
    # maxsplit=3 leaves commas inside the sentence in the last field
    split_line = tf.strings.split(line, ",", maxsplit=3)
    dataset_belonging = split_line[2]
    # The comparison already is the boolean answer
    return dataset_belonging == "Train"


def filter_test(line):
    """Tell whether a raw CSV line belongs to the test split.

    line -> one text line, expected layout index,Label,Split,Sentence
    Returns -> the result of comparing the Split field with "Test"
    """
    # maxsplit=3 leaves commas inside the sentence in the last field
    split_line = tf.strings.split(line, ",", maxsplit=3)
    dataset_belonging = split_line[2]
    # The comparison already is the boolean answer
    return dataset_belonging == "Test"

In [28]:
# Both splits are read line by line from the same CSV and separated through their Split field.
# The CSV header line has "Split" in that field, so it matches neither predicate.
ds_train = tf.data.TextLineDataset('dataset.csv').filter(filter_train)
ds_test = tf.data.TextLineDataset('dataset.csv').filter(filter_test)

In [30]:
# Peek at two raw lines of the test split.
# NOTE(review): the header line is presumably already removed by filter_test, so skip(1)
# would skip the first test row rather than a header — confirm.
for line in ds_test.skip(1).take(2):
  print(line)

tf.Tensor(b'6921,0,Test,"if you \'ve ever entertained the notion of doing what the title of this film implies , what sex with strangers actually shows may put you off the idea forever"', shape=(), dtype=string)
tf.Tensor(b'6922,0,Test,k 19 exploits our substantial collective fear of nuclear holocaust to generate cheap hollywood tension', shape=(), dtype=string)


### Creating Vocabulary

In [35]:
# Tokenizer instance shared by build_vocabulary and the TokenTextEncoder created later in the notebook.
tokenizer = tfds.deprecated.text.Tokenizer()

In [36]:
def build_vocabulary(ds_train, threshold=200):
    """ Build a vocabulary of the frequent tokens of the training data.

    ds_train -> iterable of string tensors, one CSV row each
                (expected layout index,Label,Split,Sentence)
    threshold -> number of occurrences a token needs to enter the vocabulary

    Returns -> a set holding "sostoken", "eostoken" and every token that was
               seen at least `threshold` times.
    """
    frequencies = {}
    vocabulary = {"sostoken", "eostoken"}

    # Every element is consumed: the rows are expected to arrive without a
    # header line, so skipping the first element would lose a real example.
    for line in ds_train:
        split_line = tf.strings.split(line, ",", maxsplit=3)
        review = split_line[-1]
        tokenized_text = tokenizer.tokenize(review.numpy().lower())

        for word in tokenized_text:
            frequencies[word] = frequencies.get(word, 0) + 1

            # Only the word that just reached the threshold is admitted, not
            # the other (possibly rare) words that share its line.
            if frequencies[word] == threshold:
                vocabulary.add(word)

    return vocabulary
  
vocabulary = build_vocabulary(ds_train)
# Persist the vocabulary for later sessions; the with-block closes the file
# as soon as the dump is complete.
with open("vocabulary.obj", "wb") as vocab_file:
    pickle.dump(vocabulary, vocab_file)

In [None]:
# vocab_file = open("vocabulary.obj", "rb")
# vocabulary = pickle.load(vocab_file)

### Tokenizing

In [39]:
# vocabulary is a set, whose iteration order can change from one Python
# process to the next; sorting gives the encoder the same token order on every run.
encoder = tfds.deprecated.text.TokenTextEncoder(
    sorted(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer=tokenizer,
)

In [41]:
def my_encoder(text_tensor, label):
    """Encode the text held by text_tensor with the notebook's encoder.

    text_tensor -> object whose .numpy() yields the text to encode
    label -> passed through untouched
    Returns -> (encoded text, label)
    """
    raw_text = text_tensor.numpy()
    token_ids = encoder.encode(raw_text)
    return token_ids, label


def encode_map_fn(line):
    """Turn one raw CSV line into an (encoded_text, label) pair.

    line -> one text line, expected layout index,Label,Split,Sentence
    Returns -> (encoded_text, label) as produced by my_encoder through
               tf.py_function, with shapes [None] and [] set on them.
    """
    split_line = tf.strings.split(line, ",", maxsplit=3)
    label_str = split_line[1]  # Label field; compared with "1" below
    # Wrap the sentence in explicit start / end marker tokens
    review = "sostoken " + split_line[3] + " eostoken"
    label = 1 if label_str == "1" else 0

    # my_encoder calls .numpy() on its input, so it is invoked through tf.py_function
    (encoded_text, label) = tf.py_function(
        my_encoder, inp=[review, label], Tout=(tf.int64, tf.int32),
    )

    # Declare the shapes explicitly: a vector of unknown length and a scalar
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

In [42]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Encode the training lines (parallelism chosen by AUTOTUNE) and cache the encoded examples.
# NOTE(review): ds_train / ds_test are reassigned in place, so running this cell twice would
# map already encoded data — re-create the datasets first.
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(25000)
# Batches of 32; sequences are padded along their unknown-length axis, labels are scalars
ds_train = ds_train.padded_batch(32, padded_shapes=([None], ()))

# The test split gets the same encoding and batching, without cache or shuffle
ds_test = ds_test.map(encode_map_fn)
ds_test = ds_test.padded_batch(32, padded_shapes=([None], ()))

## RNN Model

https://www.tensorflow.org/text/tutorials/text_classification_rnn

Architecture described in the paper (quoted):
The architecture used in this paper is as follows: input layer, bi-directional hidden layer with 64 LSTM cells, dropout layer with p=0.5, bi-directional layer of 32 LSTM cells, dropout layer with p=0.5, dense layer of 20 hidden units with ReLU activation, softmax output layer. We initialize this network with random normal weights and train against the categorical crossentropy loss function with the adam optimizer. We use early stopping with a patience of 3 epochs.

In [14]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()

In [15]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])