### Imports

In [1]:
import re
import csv
import random
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import CompositeTransformation
from textattack.transformations import WordSwapChangeLocation
from textattack.transformations import WordSwapChangeName
from textattack.transformations import WordSwapChangeNumber
from textattack.transformations import WordSwapContract
from textattack.transformations import WordSwapWordNet
from textattack.transformations import WordSwapRandomCharacterSubstitution
from textattack.constraints.pre_transformation import RepeatModification
from textattack.constraints.pre_transformation import StopwordModification
from textattack.augmentation import Augmenter

### Data Preprocessing

In [None]:
# Load the raw TSV, shuffle it and write a train / validation / test split
# (70 % / 20 % / remainder) as three CSV files.
# The same 'latin' codec is used for reading and for writing.
df = pd.read_csv('1/train.tsv', sep='\t', quoting=csv.QUOTE_NONE, encoding='latin')
# NOTE(review): read_csv already consumes the header line by default, so this
# also drops the first data row -- confirm that this is intended.
df = df.iloc[1:]
df = df.drop(df.columns[0], axis=1)  # discard the first column

# A fixed seed keeps the three files stable across re-runs; without it every
# run reshuffles and rows move between the train, validation and test files.
SPLIT_SEED = 42
df = shuffle(df, random_state=SPLIT_SEED)

train_size = int(0.7 * len(df))
val_size = int(0.2 * len(df))
test_size = len(df) - train_size - val_size

# Explicit copies, so renaming the columns below never touches `df` itself.
train_data = df.iloc[:train_size].copy()
val_data = df.iloc[train_size:train_size + val_size].copy()
test_data = df.iloc[train_size + val_size:].copy()

# The three slices must partition the shuffled frame exactly.
assert len(test_data) == test_size
assert len(train_data) + len(val_data) + len(test_data) == len(df)

for split in (train_data, val_data, test_data):
    split.columns = ['text', 'label']

train_data.to_csv('1/train.csv', index=False, encoding='latin')
val_data.to_csv('1/validate.csv', index=False, encoding='latin')
test_data.to_csv('1/test.csv', index=False, encoding='latin')

In [None]:
# Load the training split and normalise the tweet text.
tweet_file = "1/train.csv"
lowercase = False  # set to True to lower-case every tweet
tweet_df = pd.read_csv(tweet_file, delimiter=',', index_col=False, encoding='latin1')

# Keep only the two columns used downstream, label first.
df = tweet_df[["label", "text"]].copy()

# Vectorised clean-up. The previous index loop stopped at len(df) - 1 and so
# never processed the last row, and it wrote through chained indexing
# (df["text"][i] = ...), which pandas does not guarantee to update `df`.
df["text"] = df["text"].str.replace("&amp;", "&", regex=False)
if lowercase:
    df["text"] = df["text"].str.lower()

In [8]:
# Report how many rows of the training frame carry each label.
positive_rows = df[df["label"] == 1]
negative_rows = df[df["label"] == 0]
print("There are ", len(positive_rows), "positive examples in this dataset.")
print("There are ", len(negative_rows), "negative examples in this dataset.")

There are  951 positive examples in this dataset.
There are  4648 negative examples in this dataset.


### Data Augmentation

* Use the following transformations:
  * Random character substitution
  * Character swap by adjacent QWERTY keyboard characters
  * Perform contractions (for example: "I am" -> "I'm")
  * Swap words with WordNet synonyms

In [10]:
# Augment every positive tweet and append the generated texts (label 1) to the
# training data, producing X (texts) and y (labels).
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
constraints = [RepeatModification(), StopwordModification()]
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.1, transformations_per_example=10)

# Start from the cleaned training frame so the cell runs on a fresh kernel and
# gives the same result when re-run. Previously X and y had to exist already.
# NOTE(review): the recorded class counts (4648 / 951 + 10 * 353) are consistent
# with this starting point -- confirm it matches the original intent.
X = df["text"].reset_index(drop=True)
y = df["label"].reset_index(drop=True)

augmented_texts = []
i = 0    # tweets that were augmented
neg = 0  # tweets for which the augmenter raised IndexError
for index, row in df.iterrows():
    if row["label"] != 1:
        continue
    # Replace each @handle with a bare "@" and strip apostrophes before
    # augmenting; every "@" in the results is expanded to "@USER____" below.
    masked = re.sub(r'@\w+', '@', row["text"])
    masked = masked.replace("'", "")
    try:
        variants = augmenter.augment(masked)
    except IndexError:
        # NOTE(review): the recorded run skipped 598 of 951 tweets this way;
        # the root cause inside the augmenter is worth investigating.
        neg = neg + 1
        continue
    augmented_texts.extend(variant.replace("@", "@USER____") for variant in variants)
    i = i + 1

# Concatenate once after the loop. The label series is sized from the texts
# actually produced, so X and y stay aligned even if the augmenter returns a
# different number of variants than requested.
X = pd.concat([X, pd.Series(augmented_texts, dtype=object)], ignore_index=True)
y = pd.concat([y, pd.Series([1] * len(augmented_texts), dtype="int64")], ignore_index=True)

print(i, "tweets augmented.")
print("Could not augment ", neg, "tweets.")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


353 tweets augmented.
Could not augment  598 tweets.


In [11]:
# Tally how many entries of y carry each label value.
Counter(y)

Counter({0: 4648, 1: 4481})

In [12]:
# Write the texts in X and the labels in y to a tab-separated file with a
# header row.
# newline='' is what the csv module requires for file objects passed to
# csv.writer; without it the writer's "\r\n" row terminator is translated a
# second time on Windows. The later export cells already open files this way.
with open("1/aug.tsv", 'wt', encoding='latin', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    tsv_writer.writerows([X[row], y[row]] for row in range(len(X)))

In [13]:
# Read the augmented file back and report its class balance.
# pandas and csv are already imported in the notebook's import cell, so they
# are not re-imported here.
augument3 = "1/aug.tsv"
df3 = pd.read_csv(augument3, sep='\t', quoting=csv.QUOTE_NONE)
print("There are ", len(df3[df3["label"]==1]) , "positive examples in this dataset.")
print("There are ", len(df3[df3["label"]==0]), "negative examples in this dataset.")

There are  4481 positive examples in this dataset.
There are  4648 negative examples in this dataset.


In [17]:
# Pull the two columns of the training frame out as separate Series.
text = df["text"]
label = df["label"]

#### Random Oversampling

In [19]:
# Randomly oversample so both classes end up equally frequent
# (sampling_strategy=1). The fixed random_state makes the duplicated rows the
# same on every run; without it each execution draws a different sample.
oversampler = RandomOverSampler(sampling_strategy=1, random_state=42)
# The sampler is given the texts as a single-column 2-D array.
X_oversampled, y_oversampled = oversampler.fit_resample(text.to_numpy().reshape(-1,1), label)

print(len(X_oversampled))
Counter(y_oversampled)

9296


Counter({0: 4648, 1: 4648})

In [20]:
# Write the oversampled texts and labels to a tab-separated file with a header
# row. Each row of X_oversampled holds a single text, hence the [0].
# newline='' is what the csv module requires for file objects passed to
# csv.writer; without it the writer's "\r\n" row terminator is translated a
# second time on Windows. The later export cells already open files this way.
with open("1/balanced_train.tsv", 'wt', encoding='latin', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    tsv_writer.writerows(
        [X_oversampled[row][0], y_oversampled[row]] for row in range(len(X_oversampled))
    )

In [21]:
# Read the balanced file back and report how many rows each class has.
augument4 = "1/balanced_train.tsv"
df4 = pd.read_csv(augument4, quoting=csv.QUOTE_NONE, sep='\t')
is_positive = df4["label"] == 1
is_negative = df4["label"] == 0
print("There are ", len(df4[is_positive]), "positive examples in this dataset.")
print("There are ", len(df4[is_negative]), "negative examples in this dataset.")

There are  4648 positive examples in this dataset.
There are  4648 negative examples in this dataset.


In [22]:
# Texts of the balanced set; the bare name on the last line displays them.
X = df4["text"]
X

0       @emckaymd @MedExpress He’s heading to the ER i...
1       Self testing day, which is always fun 😫  #Covi...
2       @WeissSandor @woobackbaker @Turtwigpo Oh, I'm ...
3       Ordered to attend a maskless MI GOP event, he ...
4       back working on the covid unit, last time I wa...
                              ...                        
9291    @crbarnes001 I tested positive on the 12th Jan...
9292    Prayers please. Currently in the ER waiting to...
9293    I’d like to give a big ‘ol 🖕🏻 to anyone who ev...
9294    I had to cancel getting my vaccine today becau...
9295    Just got home from donating blood towards COVI...
Name: text, Length: 9296, dtype: object

In [23]:
# Labels of the balanced set; the bare name on the last line displays them.
y = df4["label"]
y

0       0
1       0
2       0
3       0
4       1
       ..
9291    1
9292    1
9293    1
9294    1
9295    1
Name: label, Length: 9296, dtype: int64

In [24]:
# Augment every tweet of the training frame (both classes) and append the
# generated texts, with matching labels, to the existing X and y.
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
constraints = [RepeatModification(), StopwordModification()]
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.1, transformations_per_example=10)

# NOTE(review): df was read with encoding='latin1', while X / y come from df4,
# which was read with pandas' default encoding -- confirm both represent
# non-ASCII characters the same way before mixing their rows.
# NOTE(review): this cell extends X and y in place, so running it twice appends
# the augmentations twice; re-run the cells that define X and y first.
augmented_texts = []
augmented_labels = []
i = 0    # tweets that were augmented
neg = 0  # tweets for which the augmenter raised IndexError
for index, row in df.iterrows():
    # Label 1 stays 1; every other label is written as 0.
    target = 1 if row["label"] == 1 else 0
    # Replace each @handle with a bare "@" and strip apostrophes before
    # augmenting; every "@" in the results is expanded to "@USER____" below.
    masked = re.sub(r'@\w+', '@', row["text"])
    masked = masked.replace("'", "")
    try:
        variants = augmenter.augment(masked)
    except IndexError:
        # NOTE(review): the recorded run skipped 3813 of 5599 tweets this way;
        # the root cause inside the augmenter is worth investigating.
        neg = neg + 1
        continue
    augmented_texts.extend(variant.replace("@", "@USER____") for variant in variants)
    # One label per generated text, so X and y stay aligned even if the
    # augmenter returns a different number of variants than requested.
    augmented_labels.extend([target] * len(variants))
    i = i + 1

# Concatenate once after the loop instead of growing the Series per tweet.
X = pd.concat([X, pd.Series(augmented_texts, dtype=object)], ignore_index=True)
y = pd.concat([y, pd.Series(augmented_labels, dtype="int64")], ignore_index=True)

print(i, "tweets augmented.")
print("Could not augment ", neg, "tweets.")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


1786 tweets augmented.
Could not augment  3813 tweets.


In [25]:
# Tally how many entries of y carry each label value.
Counter(y)

Counter({0: 18978, 1: 8178})

In [29]:
# Write the texts in X and the labels in y to a UTF-8 tab-separated file with
# a header row.
# newline='' is what the csv module requires for file objects passed to
# csv.writer; without it the writer's "\r\n" row terminator is translated a
# second time on Windows. The later export cells already open files this way.
with open("1/aug2.tsv", 'wt', encoding='utf-8', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    tsv_writer.writerows([X[row], y[row]] for row in range(len(X)))

### Resample so that each class has the same number of samples

In [34]:
# Read the second augmented file back and report its class balance.
# pandas and csv are already imported in the notebook's import cell, so they
# are not re-imported here.
augument5 = "1/aug2.tsv"
df5 = pd.read_csv(augument5, sep='\t', quoting=csv.QUOTE_NONE)
print("There are ", len(df5[df5["label"]==1]) , "positive examples in this dataset.")
print("There are ", len(df5[df5["label"]==0]), "negative examples in this dataset.")

There are  8178 positive examples in this dataset.
There are  18978 negative examples in this dataset.


In [37]:
# Split the reloaded frame into its text and label columns.
X = df5["text"]
y = df5["label"]

#### Random Undersampling

In [38]:
# Randomly undersample so both classes end up equally frequent
# (sampling_strategy=1). The sampler is now named for what it is (it was bound
# to `oversampler` before), and the fixed random_state makes the retained rows
# the same on every run.
undersampler = RandomUnderSampler(sampling_strategy=1, random_state=42)

# The sampler is given the texts as a single-column 2-D array.
X_undersampled, y_undersampled = undersampler.fit_resample(X.to_numpy().reshape(-1,1), y)

print(len(X_undersampled))
Counter(y_undersampled)

16356


Counter({0: 8178, 1: 8178})

In [39]:
# Export the undersampled texts and labels as a UTF-8 tab-separated file with
# a header row. Each row of X_undersampled holds a single text, hence the [0].
with open("1/finalaug.tsv", 'wt', encoding='utf-8', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    row_count = len(X_undersampled)
    tsv_writer.writerows(
        [X_undersampled[row][0], y_undersampled[row]] for row in range(row_count)
    )

In [43]:
# Pair every undersampled text row with its label, shuffle the pairs and
# export them as a UTF-8 tab-separated file with a header row.
combined_data = list(zip(X_undersampled, y_undersampled))
# A dedicated, seeded generator gives the same row order on every run and
# leaves the global `random` state untouched.
random.Random(42).shuffle(combined_data)
shuffled_X, shuffled_y = zip(*combined_data)

with open("1/finalaugshuffle.tsv", 'wt', encoding='utf-8', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    # Each element of shuffled_X is a one-element row; [0] extracts the text.
    tsv_writer.writerows(
        [text_row[0], label_value] for text_row, label_value in zip(shuffled_X, shuffled_y)
    )