### Imports

In [1]:
import re
import csv
import random
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import CompositeTransformation
from textattack.transformations import WordSwapChangeLocation
from textattack.transformations import WordSwapChangeName
from textattack.transformations import WordSwapChangeNumber
from textattack.transformations import WordSwapContract
from textattack.transformations import WordSwapWordNet
from textattack.transformations import WordSwapRandomCharacterSubstitution
from textattack.constraints.pre_transformation import RepeatModification
from textattack.constraints.pre_transformation import StopwordModification
from textattack.augmentation import Augmenter

### Data Preprocssing

In [None]:
df = pd.read_csv('2/train.csv',encoding='latin')
df = df.iloc[1:]
df = shuffle(df)

train_size = int(0.8 * len(df))
val_size = int(0.1 * len(df))
test_size = len(df) - train_size - val_size

train_data = df[:train_size]
val_data = df[train_size:train_size + val_size]
test_data = df[train_size + val_size:]

train_data.columns = ['text', 'label']
val_data.columns = ['text', 'label']
test_data.columns = ['text', 'label']

train_data.to_csv('2/train2.csv', index=False ,encoding='latin')
val_data.to_csv('2/validate2.csv', index=False,encoding='latin')
test_data.to_csv('2/test2.csv', index=False,encoding='latin')

In [None]:
tweet_file = "2/train2.csv"
lowercase = False
tweet_df = pd.read_csv(tweet_file, delimiter=',', index_col=False, encoding='latin')
d = {"label": tweet_df['label'], "text": tweet_df['text']}
df = pd.DataFrame(data=d)
df["text"] = df["text"].fillna("")  # 用空字符串替换缺失值


for i in range(len(df)):
    if "&amp;" in df["text"][i]:
        df["text"][i] = df["text"][i].replace("&amp;", "&")
    if lowercase:
        df["text"][i] = df["text"][i].lower()

In [7]:
tweet_df['text']

0       i took 18 codeine, 3 xanax &amp; 2 diazepam. h...
1       @IckyNocops iâve had chronic pain in the sam...
2       @mscisnt A big problem is that many chronic pa...
3       @WrestleCringe I take Adderall and wear button...
4       @StructuredSucc Aderall, vyvanse and Ritalin p...
                              ...                        
2997    @RawBeautyKristi My doc prescribed me Escitalo...
2998    I ate all of Deedeea??s food that weekend. But...
2999    cw/sex (TMI)\n.\nits so wild how much a orgasm...
3000    Dona??t blame the doctor for a??not understand...
3001    @Rosemcat1 @ibdgirl76 Exactly. This is whats c...
Name: text, Length: 3002, dtype: object

In [10]:
print("There are ", len(df[df["label"]=='positive']) , "positive examples in this dataset.")
print("There are ", len(df[df["label"]=='neutral']), "neutral examples in this dataset.")
print("There are ", len(df[df["label"]=='negative']), "negative examples in this dataset.")

There are  566 positive examples in this dataset.
There are  2099 neutral examples in this dataset.
There are  312 negative examples in this dataset.


In [11]:
df.loc[ df["label"] == 'positive', "label"] = 2 # Positive classification
df.loc[ df["label"] == 'neutral', "label"] = 1 # Neutral classification
df.loc[ df["label"] == 'negative', "label"] = 0 # Negative classification

In [14]:
valid_labels = [0, 1, 2]
df = df.dropna()
df = df[df['label'].isin(valid_labels)]

In [15]:
X = df.text
X

0       i took 18 codeine, 3 xanax & 2 diazepam. had a...
1       @IckyNocops iâve had chronic pain in the sam...
2       @mscisnt A big problem is that many chronic pa...
3       @WrestleCringe I take Adderall and wear button...
4       @StructuredSucc Aderall, vyvanse and Ritalin p...
                              ...                        
2997    @RawBeautyKristi My doc prescribed me Escitalo...
2998    I ate all of Deedeea??s food that weekend. But...
2999    cw/sex (TMI)\n.\nits so wild how much a orgasm...
3000    Dona??t blame the doctor for a??not understand...
3001    @Rosemcat1 @ibdgirl76 Exactly. This is whats c...
Name: text, Length: 2977, dtype: object

In [16]:
y = df.label
y

0       1
1       1
2       1
3       1
4       2
       ..
2997    2
2998    1
2999    1
3000    1
3001    1
Name: label, Length: 2977, dtype: object

### Augmentation

* Use following transformations:
  * Random character swap
  * Character swap by adjacent QWERTY keyboard characters
  * Perform contractions (For example: "I am"->"I'm")
  * Swap words by Word Net synonyms 

In [17]:
transformation = CompositeTransformation([WordSwapRandomCharacterSubstitution(), WordSwapQWERTY(), WordSwapWordNet(), WordSwapContract()])
constraints = [RepeatModification(), StopwordModification()]
augmenter = Augmenter(transformation=transformation, constraints=constraints, pct_words_to_swap=0.1, transformations_per_example=10)
print_var = 1
print_count = 0

i = 0
neg = 0
for index, row in df.iterrows():
  if(row["label"]==2):
    try:
      text = re.sub('@\w+', '@', row["text"])
      text = text.replace("'", "")
      l = augmenter.augment(text)
      for n in range(len(l)):
        l[n] = l[n].replace("@", "@USER____")
      new_serie = pd.Series(l)
      X = pd.concat([X, new_serie], ignore_index=True)
      ade_serie = pd.Series([2]*10)
      y = pd.concat([y, ade_serie], ignore_index=True)
      i = i + 1
    except IndexError:
      neg = neg + 1
      
  elif(row["label"]==0):
    try:
      text = re.sub('@\w+', '@', row["text"])
      text = text.replace("'", "")
      l = augmenter.augment(text)
      for n in range(len(l)):
        l[n] = l[n].replace("@", "@USER____")
      new_serie = pd.Series(l)
      X = pd.concat([X, new_serie], ignore_index=True)
      ade_serie = pd.Series([0]*10)
      y = pd.concat([y, ade_serie], ignore_index=True)
      i = i + 1
    except IndexError:
      neg = neg + 1
        
print(i, "tweets augmented.")
print("Could not augment ", neg, "tweets.")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


539 tweets augmented.
Could not augment  339 tweets.


In [18]:
Counter(y)

Counter({1: 2099, 2: 3866, 0: 2402})

In [41]:
combined_data = list(zip(X, y))
random.shuffle(combined_data)
shuffled_X, shuffled_y = zip(*combined_data)

with open("2/aug1.tsv", 'wt', encoding='latin', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(X)):
      tsv_writer.writerow([X[i], y[i]])

### Resampling to make each class same number of samples

In [47]:
import pandas as pd
import csv
augument3 = "2/aug1.tsv"
df3 = pd.read_csv(augument3, sep='\t', quoting=csv.QUOTE_NONE, encoding='latin')
print("There are ", len(df3[df3["label"]==1]) , "neural examples in this dataset.")
print("There are ", len(df3[df3["label"]==0]), "negative examples in this dataset.")
print("There are ", len(df3[df3["label"]==2]), "positive examples in this dataset.")

There are  2099 neural examples in this dataset.
There are  2402 negative examples in this dataset.
There are  3866 positive examples in this dataset.


In [50]:
df3 = df3.dropna(subset=['text', 'label'])

In [51]:
df3['label'] = df3['label'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['label'] = df3['label'].astype(int)


In [52]:
X3 = df3.text
y3 = df3.label

In [53]:
# check none value and delete them
nan_mask = pd.isna(X3) | pd.isna(y3)
X_no_nan = X3[~nan_mask]
y_no_nan = y3[~nan_mask]

In [55]:
y3.value_counts()

2    3866
0    2402
1    2099
Name: label, dtype: int64

In [56]:
# Random Oversampler
sampling_strategy = {
    0: 2000,  
    1: 2000, 
    2: 2000  
}
oversampler = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_oversampled, y_oversampled = oversampler.fit_resample(X_no_nan.to_numpy().reshape(-1,1), y_no_nan)

print(len(X_oversampled))
Counter(y_oversampled)

6000


Counter({0: 2000, 1: 2000, 2: 2000})

In [57]:
combined_data = list(zip(X_oversampled, y_oversampled))
random.shuffle(combined_data)
shuffled_X, shuffled_y = zip(*combined_data)

with open("2/finalaugshuffle2.tsv", 'wt', encoding='latin', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(shuffled_X)):
        tsv_writer.writerow([shuffled_X[i][0], shuffled_y[i]])

In [38]:
combined_data = list(zip(X_oversampled, y_oversampled))
random.shuffle(combined_data)
shuffled_X, shuffled_y = zip(*combined_data)

with open("2/finalaugshuffle2utf.tsv", 'wt', encoding='utf-8', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['text', 'label'])
    for i in range(len(shuffled_X)):
        tsv_writer.writerow([shuffled_X[i][0], shuffled_y[i]])