<a href="https://colab.research.google.com/github/aflah02/Easy-Data-Augmentation-Implementation/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [2]:
def eda_SR(originalSentence, n):
  """
  Paper Methodology -> Randomly choose n words from the sentence that are not stop words. 
                       Replace each of these words with one of its synonyms chosen at random.
  originalSentence -> The sentence on which EDA is to be applied
  n -> The number of words to be chosen for random synonym replacement
  """
  stops = set(stopwords.words('english'))
  splitSentence = list(originalSentence.split(" "))
  splitSentenceCopy = splitSentence.copy()
  # Since We Make Changes to The Original Sentence List The Indexes Change and Hence an initial copy proves useful to get values
  ls_nonStopWordIndexes = []
  for i in range(len(splitSentence)):
    if splitSentence[i].lower() not in stops:
      ls_nonStopWordIndexes.append(i)
  if (n > len(ls_nonStopWordIndexes)):
    raise Exception("The number of replacements exceeds the number of non stop word words")
  for i in range(n):
    indexChosen = random.choice(ls_nonStopWordIndexes)
    ls_nonStopWordIndexes.remove(indexChosen)
    synonyms = []
    originalWord = splitSentenceCopy[indexChosen]
    for synset in wordnet.synsets(originalWord):
      for lemma in synset.lemmas():
        if lemma.name() != originalWord:
          synonyms.append(lemma.name())
    splitSentence[indexChosen] = random.choice(synonyms).replace('_', ' ')
  return " ".join(splitSentence)

In [3]:
print(eda_SR("I love to play football", 2))

I dear to play football game


In [4]:
def eda_RI(originalSentence, n):
  """
  Paper Methodology -> Find a random synonym of a random word in the sentence that is not a stop word. 
                       Insert that synonym into a random position in the sentence. Do this n times
  originalSentence -> The sentence on which EDA is to be applied
  n -> The number of times the process has to be repeated
  """
  stops = set(stopwords.words('english'))
  splitSentence = list(originalSentence.split(" "))
  splitSentenceCopy = splitSentence.copy() 
  # Since We Make Changes to The Original Sentence List The Indexes Change and Hence an initial copy proves useful to get values
  ls_nonStopWordIndexes = []
  for i in range(len(splitSentence)):
    if splitSentence[i].lower() not in stops:
      ls_nonStopWordIndexes.append(i)
  if (n > len(ls_nonStopWordIndexes)):
    raise Exception("The number of replacements exceeds the number of non stop word words")
  WordCount = len(splitSentence)
  for i in range(n):
    indexChosen = random.choice(ls_nonStopWordIndexes)
    ls_nonStopWordIndexes.remove(indexChosen)
    synonyms = []
    originalWord = splitSentenceCopy[indexChosen]
    for synset in wordnet.synsets(originalWord):
      for lemma in synset.lemmas():
        if lemma.name() != originalWord:
          synonyms.append(lemma.name())
    splitSentence.insert(random.randint(0,WordCount-1), random.choice(synonyms).replace('_', ' '))
  return " ".join(splitSentence)

In [5]:
print(eda_RI("I love to play football", 2))

I dear recreate love to play football


In [6]:
def eda_RS(originalSentence, n):
  """
  Paper Methodology -> Find a random synonym of a random word in the sentence that is not a stop word. 
                       Insert that synonym into a random position in the sentence. Do this n times
  originalSentence -> The sentence on which EDA is to be applied
  n -> The number of times the process has to be repeated
  """
  splitSentence = list(originalSentence.split(" "))
  WordCount = len(splitSentence)
  if (WordCount == 1):
    raise Exception("No Swaps Possible in One Word Sentences")
  for i in range(n):
    firstIndex = random.randint(0,WordCount-1)
    secondIndex = random.randint(0,WordCount-1)
    while (secondIndex == firstIndex):
      secondIndex = random.randint(0,WordCount-1)
    splitSentence[firstIndex], splitSentence[secondIndex] = splitSentence[secondIndex], splitSentence[firstIndex]
  return " ".join(splitSentence)

In [7]:
print(eda_RS("I love to play football", 2))

I to love football play


In [8]:
def eda_RD(originalSentence, p):
  """
  Paper Methodology -> Randomly remove each word in the sentence with probability p.
  originalSentence -> The sentence on which EDA is to be applied
  p -> Probability of a Word Being Removed
  """
  if (p == 1):
      raise Exception("Always an Empty String Will Be Returned") 
  if (p > 1 or p < 0):
    raise Exception("Improper Probability Value")
  splitSentence = list(originalSentence.split(" "))
  lsIndexesRemoved = []
  WordCount = len(splitSentence)
  if (WordCount == 1):
    raise Exception("No Swaps Possible in One Word Sentences")
  for i in range(WordCount):
    randomDraw = random.random()
    if randomDraw <= p:
      lsIndexesRemoved.append(i)
  lsRetainingWords = []
  for i in range(len(splitSentence)):
    if i not in lsIndexesRemoved:
      lsRetainingWords.append(splitSentence[i])
  return " ".join(lsRetainingWords)

In [9]:
print(eda_RD("I love to play football", 0.3))

I love to play football


### Building Dataset

In [10]:
!wget -q https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv
!wget -q https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv
!wget -q https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/dev.tsv

In [11]:
!mkdir train

In [12]:
import pandas as pd
df_train = pd.read_csv('train.tsv', delimiter = '\t', names = ['Sentence', 'Label'])
# df_train['Split'] = 'Train'
df_dev = pd.read_csv('dev.tsv', delimiter = '\t', names = ['Sentence', 'Label'])
# df_dev['Split'] = 'Train' # Since The Original Dataset Does Not Use an Explicit Dev Set It's Considered To Be A Part of Test Set
df_test = pd.read_csv('test.tsv', delimiter = '\t', names = ['Sentence', 'Label'])
# df_test['Split'] = 'Test'
df_train = pd.concat([df_train, df_dev])
df_train = df_train.reset_index(drop=True)
df_train = df_train.loc[:, ["Label", "Sentence"]]
df_train.to_csv('datasettrain.csv', index=False)
df_test = df_test.loc[:, ["Label", "Sentence"]]
df_test.to_csv('datasettest.csv', index=False)

In [13]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import pickle

In [45]:
column_names = ['Label','Sentence']
batchsize = 1
label = column_names[0]
train_dataset = tf.data.experimental.make_csv_dataset(
    'datasettrain.csv',
    batchsize,
    column_names = column_names,
    label_name = label,
    num_epochs=1
)

In [18]:
column_names = ['Label','Sentence']
batchsize = 32
label = column_names[0]
test_dataset = tf.data.experimental.make_csv_dataset(
    'datasettest.csv',
    batchsize,
    column_names = column_names,
    label_name = label,
    num_epochs=1
)

In [67]:
print(type(train_dataset))

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


In [65]:
ls = []
for example, label in train_dataset:
  ls.append([example['Sentence'][0],label[0]])

In [75]:
ls

[[<tf.Tensor: shape=(), dtype=string, numpy=b'an often watchable , though goofy and lurid , blast of a costume drama set in the late 15th century'>,
  <tf.Tensor: shape=(), dtype=int32, numpy=1>],
 [<tf.Tensor: shape=(), dtype=string, numpy=b'a little melodramatic , but with enough hope to keep you engaged'>,
  <tf.Tensor: shape=(), dtype=int32, numpy=1>],
 [<tf.Tensor: shape=(), dtype=string, numpy=b"director tom dey demonstrated a knack for mixing action and idiosyncratic humor in his charming 2000 debut shanghai noon , but showtime 's uninspired send up of tv cop show cliches mostly leaves him shooting blanks">,
  <tf.Tensor: shape=(), dtype=int32, numpy=1>],
 [<tf.Tensor: shape=(), dtype=string, numpy=b'while the film is not entirely successful , it still manages to string together enough charming moments to work'>,
  <tf.Tensor: shape=(), dtype=int32, numpy=1>],
 [<tf.Tensor: shape=(), dtype=string, numpy=b"once ice t sticks his mug in the window of the couple 's bmw and begins ha

## RNN Model

https://www.tensorflow.org/text/tutorials/text_classification_rnn

Paper Uses - 
The architecture used in this paper is as follows: input layer, bi-directional hidden layer with 64 LSTM cells, dropout layer with p=0.5, bi-directional layer of 32 LSTM cells, dropout layer with p=0.5, dense layer of 20 hidden units with ReLU activation, softmax output layer. We initialize this network with random normal weights and train against the categorical crossentropy loss function with the adam optimizer. We use early stopping with a patience of 3 epochs.

In [19]:
model = keras.models.Sequential()
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Dropout(0.5))
model.add(layers.Bidirectional(layers.LSTM(32, return_sequences=False)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dense(2, kernel_initializer='normal', activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [74]:
callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

model.fit(	
      ls,
      epochs=3, 
      callbacks=callbacks,
      batch_size=1024, 
)

IndexError: ignored