### Imports

In [1]:
import re
import csv
import random
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import CompositeTransformation
from textattack.transformations import WordSwapChangeLocation
from textattack.transformations import WordSwapChangeName
from textattack.transformations import WordSwapChangeNumber
from textattack.transformations import WordSwapContract
from textattack.transformations import WordSwapWordNet
from textattack.transformations import WordSwapRandomCharacterSubstitution
from textattack.constraints.pre_transformation import RepeatModification
from textattack.constraints.pre_transformation import StopwordModification
from textattack.augmentation import Augmenter

### Data Preprocessing

* Read training dataset files
* Join datasets in a single DataFrame
* Lowercase text (optional)
* Replacing "\&amp;" for "&"
* Augment dataset
* Save augmented dataset

In [None]:
df = pd.read_csv('4/t.csv',encoding='latin')
df = df.iloc[1:]
df = shuffle(df)

train_size = int(0.7 * len(df))
val_size = int(0.2 * len(df))
test_size = len(df) - train_size - val_size
train_data = df[:train_size]
val_data = df[train_size:train_size + val_size]
test_data = df[train_size + val_size:]
train_data.columns = ['text', 'label']
val_data.columns = ['text', 'label']
test_data.columns = ['text', 'label']

train_data.to_csv('4/train6.csv', index=False ,encoding='latin')
val_data.to_csv('4/validate6.csv', index=False,encoding='latin')
test_data.to_csv('4/test6.csv', index=False,encoding='latin')

In [4]:
tweet_file = "4/train.csv"
lowercase = False

tweet_df = pd.read_csv(tweet_file, delimiter=',', index_col=False, encoding='latin1')
d = {"label":tweet_df['label'], "text":tweet_df['text']}

df = pd.DataFrame(data = d)
for i in range(0, len(df)-1):
  if "&amp;" in df["text"][i]:
    df["text"][i] = df["text"][i].replace("&amp;", "&")
  if lowercase:
    df["text"][i] = df["text"][i].lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"][i] = df["text"][i].replace("&amp;", "&")


In [5]:
tweet_df['text']

0       I have never seen a therapist at any point in ...
1       Seriously my man, you got to understand that s...
2       Hey, I saw your post a couple of weeks ago on ...
3       Maybe if your feeling like that again in a sim...
4       I'm 20 and I would definitely be interested. I...
                              ...                        
3144    I am going to be a freshman in college this fa...
3145    Meditation, CBT therapy, calming oils, CBD. Oc...
3146    I do the same shit dude. I¡¯m in pretty consis...
3147    i¡¯ve had medication prescribed to me because ...
3148    I¡¯m 22 and wonder if it¡¯ll get better for me...
Name: text, Length: 3149, dtype: object

In [6]:
X = df.text
X

0       I have never seen a therapist at any point in ...
1       Seriously my man, you got to understand that s...
2       Hey, I saw your post a couple of weeks ago on ...
3       Maybe if your feeling like that again in a sim...
4       I'm 20 and I would definitely be interested. I...
                              ...                        
3144    I am going to be a freshman in college this fa...
3145    Meditation, CBT therapy, calming oils, CBD. Oc...
3146    I do the same shit dude. I¡¯m in pretty consis...
3147    i¡¯ve had medication prescribed to me because ...
3148    I¡¯m 22 and wonder if it¡¯ll get better for me...
Name: text, Length: 3149, dtype: object

In [7]:
y = df.label
y

0       0
1       0
2       0
3       1
4       1
       ..
3144    1
3145    1
3146    1
3147    1
3148    0
Name: label, Length: 3149, dtype: int64

In [8]:
print("There are ", len(df[df["label"]==1]) , "positive examples in this dataset.")
print("There are ", len(df[df["label"]==0]), "negative examples in this dataset.")

There are  1205 positive examples in this dataset.
There are  1944 negative examples in this dataset.


### Augmentation

* Use following transformations:
  * Random character swap
  * Character swap by adjacent QWERTY keyboard characters
  * Perform contractions (For example: "I am"->"I'm")
  * Swap words by Word Net synonyms 

In [10]:
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
constraints = [RepeatModification(), StopwordModification()]
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.1, transformations_per_example=2)
print_var = 1
print_count = 0

i = 0
neg = 0
for index, row in df.iterrows():
  if(row["label"]==1):
    try:
      text = re.sub('@\w+', '@', row["text"])
      text = text.replace("'", "")
      l = augmenter.augment(text)
      for n in range(len(l)):
        l[n] = l[n].replace("@", "@USER____")
      new_serie = pd.Series(l)
      X = pd.concat([X, new_serie], ignore_index=True)
      ade_serie = pd.Series([1]*2)
      y = pd.concat([y, ade_serie], ignore_index=True)
      i = i + 1
    except IndexError:
      neg = neg + 1
    
print(i, "tweets augmented.")
print("Could not augment ", neg, "tweets.")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


705 tweets augmented.
Could not augment  500 tweets.


In [11]:
Counter(y)

Counter({0: 1944, 1: 2615})

In [12]:
with open("4/aug1.tsv", 'wt', encoding='latin') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(X)):
      tsv_writer.writerow([X[i], y[i]])

In [14]:
import pandas as pd
import csv
augument3 = "4/aug1.tsv"
df3 = pd.read_csv(augument3, sep='\t', quoting=csv.QUOTE_NONE, encoding='latin')
print("There are ", len(df3[df3["label"]==1]) , "positive examples in this dataset.")
print("There are ", len(df3[df3["label"]==0]), "negative examples in this dataset.")

There are  2615 positive examples in this dataset.
There are  1944 negative examples in this dataset.


In [15]:
X3 = df3.text.values
y3 = df3.label.values

### Resample to make each class same number of samples

In [16]:
oversampler = RandomOverSampler(sampling_strategy=1)
X_oversampled, y_oversampled = oversampler.fit_resample(X.to_numpy().reshape(-1,1), y)

print(len(X_oversampled))
Counter(y_oversampled)

5230


Counter({0: 2615, 1: 2615})

In [17]:
with open("4/aug2.tsv", 'wt', encoding='latin') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(X_oversampled)):
      tsv_writer.writerow([X_oversampled[i][0], y_oversampled[i]])

In [20]:
df4 = pd.read_csv('4/aug2.tsv', sep='\t', quoting=csv.QUOTE_NONE, encoding='latin')
print("There are ", len(df4[df4["label"]==1]) , "positive examples in this dataset.")
print("There are ", len(df4[df4["label"]==0]), "negative examples in this dataset.")

There are  2615 positive examples in this dataset.
There are  2615 negative examples in this dataset.


In [24]:
X = df4.text
y = df4.label

In [26]:
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
constraints = [RepeatModification(), StopwordModification()]
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.1, transformations_per_example=1)
print_var = 1
print_count = 0

i = 0
neg = 0
for index, row in df4.iterrows():
  if(row["label"]==1):
    try:
      text = re.sub('@\w+', '@', row["text"])
      text = text.replace("'", "")
      l = augmenter.augment(text)
      for n in range(len(l)):
        l[n] = l[n].replace("@", "@USER____")
      new_serie = pd.Series(l)
      X = pd.concat([X, new_serie], ignore_index=True)
      ade_serie = pd.Series([1])
      y = pd.concat([y, ade_serie], ignore_index=True)
      i = i + 1
    except IndexError:
      neg = neg + 1
  else:
    try:
      text = re.sub('@\w+', '@', row["text"])
      text = text.replace("'", "")
      l = augmenter.augment(text)
      for n in range(len(l)):
        l[n] = l[n].replace("@", "@USER____")
      new_serie = pd.Series(l)
      X = pd.concat([X, new_serie], ignore_index=True)
      ade_serie = pd.Series([0])
      y = pd.concat([y, ade_serie], ignore_index=True)
      i = i + 1
    except IndexError:
      neg = neg + 1
        
print(i, "tweets augmented.")
print("Could not augment ", neg, "tweets.")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


4052 tweets augmented.
Could not augment  1178 tweets.


In [27]:
Counter(y)

Counter({0: 4633, 1: 4649})

In [38]:
combined_data1 = list(zip(X, y))
random.shuffle(combined_data1)
shuffled_X1, shuffled_y1 = zip(*combined_data1)

with open("4/finalaugshuffle1.tsv", 'wt', encoding='latin', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(shuffled_X1)):
      tsv_writer.writerow([shuffled_X1[i], shuffled_y1[i]])

In [39]:
combined_data1 = list(zip(X, y))
random.shuffle(combined_data1)
shuffled_X1, shuffled_y1 = zip(*combined_data1)

with open("4/finalaugshuffle1utf.tsv", 'wt', encoding='utf-8', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(shuffled_X1)):
      tsv_writer.writerow([shuffled_X1[i], shuffled_y1[i]])

In [28]:
oversampler = RandomUnderSampler(sampling_strategy=1)
X_oversampled, y_oversampled = oversampler.fit_resample(X.to_numpy().reshape(-1,1), y)

print(len(X_oversampled))
Counter(y_oversampled)

9266


Counter({0: 4633, 1: 4633})

#### Save various version becasue some encoding may has invalid characters

In [30]:
combined_data = list(zip(X_oversampled, y_oversampled))
random.shuffle(combined_data)
shuffled_X, shuffled_y = zip(*combined_data)

with open("4/finalaugshuffle2.tsv", 'wt', encoding='latin', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(shuffled_X)):
        tsv_writer.writerow([shuffled_X[i][0], shuffled_y[i]])

In [41]:
combined_data = list(zip(X_oversampled, y_oversampled))
random.shuffle(combined_data)
shuffled_X, shuffled_y = zip(*combined_data)

with open("4/finalaugshuffle2uef.tsv", 'wt', encoding='utf-8', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(shuffled_X)):
        tsv_writer.writerow([shuffled_X[i][0], shuffled_y[i]])

In [36]:
# Save oversampled dataset
with open("augmented_oversampled_training4.tsv", 'wt', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(X_oversampled)):
      tsv_writer.writerow([X_oversampled[i][0], y_oversampled[i]])

In [37]:
print(len(X))
Counter(y)

5531


Counter({0: 2798, 1: 2733})

In [None]:
# Save dataset
with open("merged_training_dataset.tsv", 'wt', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label', 'start', 'end', 'span', 'med_id'])
    for i in range(len(X)):
      tsv_writer.writerow([X[i], y[i]])