# Split into stratified train, val, test sets

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the permutations and shuffles in the split below are repeatable.
np.random.seed(12345678)

In [2]:
# Load the labelled dataset and report how many rows it holds.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Partition the rows by class label (0 -> negatives, 1 -> positives).
negatives, positives = (df[df["label"] == cls] for cls in (0, 1))
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 5410
Negatives: 1714, Positives: 3696


In [3]:
"""
5410 data points
Train, Val, Test = 80%, 10%, 10% = 4328, 541, 541
"""

# Shuffle each class's row labels independently so every split keeps the class ratio.
negative_indices = np.random.permutation(negatives.index.values)
positive_indices = np.random.permutation(positives.index.values)

# Cut each shuffled class at 80% and 90% of its size -> train / val / test pieces.
neg_train, neg_val, neg_test = np.split(
    negative_indices,
    [round(neg_size * 0.8), round(neg_size * 0.9)],
)
pos_train, pos_val, pos_test = np.split(
    positive_indices,
    [round(pos_size * 0.8), round(pos_size * 0.9)],
)

# Merge the per-class pieces of each split.
train, val, test = (
    np.concatenate(pieces)
    for pieces in ((neg_train, pos_train), (neg_val, pos_val), (neg_test, pos_test))
)

# Shuffle in place so the classes are interleaved rather than stored back to back.
np.random.shuffle(train)
np.random.shuffle(val)
np.random.shuffle(test)

In [4]:
# `train`, `val` and `test` hold index *labels* (taken from `.index.values`), so select
# with label-based `.loc`. Positional `.iloc` only gives the same rows while df keeps
# a default 0..n-1 index.
trainset = df.loc[train]
valset = df.loc[val]
testset = df.loc[test]

# The three subsets must partition the input exactly.
assert len(trainset) + len(valset) + len(testset) == len(df)

print(f"Length: (Train: {len(trainset)}, Val: {len(valset)}, Test: {len(testset)})")
print(f"Negatives: (Train: {trainset['label'].eq(0).sum()}, Val: {valset['label'].eq(0).sum()}, Test: {testset['label'].eq(0).sum()})")
print(f"Positives: (Train: {trainset['label'].eq(1).sum()}, Val: {valset['label'].eq(1).sum()}, Test: {testset['label'].eq(1).sum()})")

Length: (Train: 4328, Val: 541, Test: 541)
Negatives: (Train: 1371, Val: 172, Test: 171)
Positives: (Train: 2957, Val: 369, Test: 370)


In [5]:
# Persist the three splits without the index column.
# NOTE(review): "data/train.csv" is also the file this notebook read its input from, so this
# overwrites the full dataset with the 80% train subset. Re-running the notebook would
# then split the already-reduced file again — keep a copy of the original before running.
trainset.to_csv("data/train.csv", index=False)
valset.to_csv("data/val.csv", index=False)
testset.to_csv("data/test.csv", index=False)

# Augment Train Data to 50% neg 50% pos

In [1]:
import pandas as pd
import numpy as np
from BERTweet.TweetNormalizer import normalizeTweet
from eda_nlp.code.eda import eda

# Seed NumPy's global RNG (used by np.random.shuffle below).
# NOTE(review): whether eda() draws from this RNG is not visible here — confirm the
# augmented text is reproducible across runs.
np.random.seed(12345678)

In [2]:
# Load the training split and report its size.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the two classes by label value.
negatives = df.loc[df["label"].eq(0)]
positives = df.loc[df["label"].eq(1)]
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [3]:
# Per-negative row budget: every negative gets `base_copies` rows and the first
# `extra_copies` entries get one more, so the budgets sum to pos_size.
base_copies, extra_copies = divmod(pos_size, neg_size)
num_aug = np.full(neg_size, base_copies)
num_aug[:extra_copies] += 1

# Spread the "+1" entries randomly over the negatives.
np.random.shuffle(num_aug)
num_aug

array([2, 2, 2, ..., 2, 2, 2])

In [4]:
# Strengths of the four EDA operations passed to eda(); swap and deletion are disabled here.
alpha_sr = 0.1 # synonym replacement
alpha_ri = 0.1 # random insertion
alpha_rs = 0.0 # random swap
alpha_rd = 0.0 # random deletion

# Collect labels and texts in parallel lists and build the frame once at the end.
aug_labels = []
aug_tweets = []

for i, (label, tweet) in enumerate(zip(negatives["label"], negatives["tweet"])):
    sentence = normalizeTweet(tweet)
    # num_aug[i] is the row budget for this tweet; eda() is asked for one fewer
    # (presumably because it also returns the original sentence — confirm).
    aug_sentences = eda(
        sentence,
        alpha_sr=alpha_sr,
        alpha_ri=alpha_ri,
        alpha_rs=alpha_rs,
        p_rd=alpha_rd,
        num_aug=num_aug[i]-1,
    )
    for aug_sentence in aug_sentences:
        aug_labels.append(label)
        aug_tweets.append(aug_sentence)

augmented_negatives = pd.DataFrame({"label": aug_labels, "tweet": aug_tweets})

# NOTE(review): negatives pass through normalizeTweet()/eda() while positives are appended
# as read from disk, so the two classes may differ in surface form — confirm that
# downstream preprocessing removes the difference.
augmented_df = pd.concat([augmented_negatives, positives[["label", "tweet"]]])

In [5]:
# Save the balanced training set.
# NOTE(review): the index is written as an extra unnamed column here (the split notebook
# passes index=False); after pd.concat it also contains repeated values — confirm
# consumers of this file expect that column.
augmented_df.to_csv("data/train_augment_5050.csv")

In [6]:
# Read the file back and check the resulting class balance.
df = pd.read_csv("data/train_augment_5050.csv")
print(f"Length: {len(df)}")

negatives, positives = (df[df["label"] == cls] for cls in (0, 1))
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 5914
Negatives: 2957, Positives: 2957


# Augment Train Data to 75% neg 25% pos

In [68]:
import pandas as pd
import numpy as np
from BERTweet.TweetNormalizer import normalizeTweet
from eda_nlp.code.eda import eda

# Seed NumPy's global RNG (used by np.random.shuffle below).
# NOTE(review): whether eda() draws from this RNG is not visible here — confirm the
# augmented text is reproducible across runs.
np.random.seed(12345678)

In [69]:
# Load the training split and report its size.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the two classes by label value.
negatives = df.loc[df["label"].eq(0)]
positives = df.loc[df["label"].eq(1)]
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [70]:
# Target three negatives per positive (75% / 25%): distribute 3 * pos_size rows over
# the negatives, giving the first `extra_copies` entries one additional row.
base_copies, extra_copies = divmod(3 * pos_size, neg_size)
num_aug = np.full(neg_size, base_copies)
num_aug[:extra_copies] += 1

# Spread the "+1" entries randomly over the negatives.
np.random.shuffle(num_aug)
num_aug

array([7, 7, 7, ..., 6, 6, 7])

In [71]:
# Strengths of the four EDA operations passed to eda(); all four are enabled here.
alpha_sr = 0.1 # synonym replacement
alpha_ri = 0.1 # random insertion
alpha_rs = 0.1 # random swap
alpha_rd = 0.1 # random deletion

# Collect labels and texts in parallel lists and build the frame once at the end.
aug_labels = []
aug_tweets = []

for i, (label, tweet) in enumerate(zip(negatives["label"], negatives["tweet"])):
    sentence = normalizeTweet(tweet)
    # num_aug[i] is the row budget for this tweet; eda() is asked for one fewer
    # (presumably because it also returns the original sentence — confirm).
    aug_sentences = eda(
        sentence,
        alpha_sr=alpha_sr,
        alpha_ri=alpha_ri,
        alpha_rs=alpha_rs,
        p_rd=alpha_rd,
        num_aug=num_aug[i]-1,
    )
    for aug_sentence in aug_sentences:
        aug_labels.append(label)
        aug_tweets.append(aug_sentence)

augmented_negatives = pd.DataFrame({"label": aug_labels, "tweet": aug_tweets})

# NOTE(review): negatives pass through normalizeTweet()/eda() while positives are appended
# as read from disk, so the two classes may differ in surface form — confirm that
# downstream preprocessing removes the difference.
augmented_df = pd.concat([augmented_negatives, positives[["label", "tweet"]]])

In [72]:
# Save the 75/25 training set.
# NOTE(review): the index is written as an extra unnamed column here (the split notebook
# passes index=False); after pd.concat it also contains repeated values — confirm
# consumers of this file expect that column.
augmented_df.to_csv("data/train_augment_7525.csv")

In [74]:
# Read the file back and check the resulting class balance.
df = pd.read_csv("data/train_augment_7525.csv")
print(f"Length: {len(df)}")

negatives, positives = (df[df["label"] == cls] for cls in (0, 1))
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 11828
Negatives: 8871, Positives: 2957


## Adversarial Augment

In [128]:
import pandas as pd
import numpy as np
import pickle
from BERTweet.TweetNormalizer import normalizeTweet
from eda_nlp.code.eda import eda

# Seed NumPy's global RNG; no NumPy random call is visible in this section, so this
# only matters if an imported helper draws from it.
np.random.seed(12345678)

In [50]:
# Load the training split and report its size.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the two classes by label value.
negatives = df.loc[df["label"].eq(0)]
positives = df.loc[df["label"].eq(1)]
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [51]:
from textattack.attack_results.successful_attack_result import SuccessfulAttackResult

# NOTE(review): pickle.load can execute arbitrary code from the file — only load a
# results file that was produced locally.
with open("textattackresults.pkl", "rb") as f:
    attacks = pickle.load(f)

# Keep only successful attacks, recording each original text next to its perturbed version.
original = []
perturbed = []
for result in attacks:
    if not isinstance(result, SuccessfulAttackResult):
        continue
    original.append(result.original_text())
    perturbed.append(result.perturbed_text())

atk = pd.DataFrame({"original_text": original, "perturbed_text": perturbed})

# Keep only the first half of the successful attacks.
atk = atk.iloc[: len(atk) // 2]
len(atk)

1000

In [52]:
# Disabled alternative loader: reads the attack log from attacks.csv, keeps rows whose
# result_type is "Successful", strips the [[ ]] markers from both text columns, and
# keeps the first half of the rows. Currently inactive — every line is commented out.
# atk = pd.read_csv("attacks.csv")
# atk = atk[atk['result_type'] == "Successful"]
# atk["original_text"]  = [i.replace('[[', '').replace(']]', '') for i in atk["original_text"]]
# atk["perturbed_text"] = [i.replace('[[', '').replace(']]', '') for i in atk["perturbed_text"]]

# atk = atk[:len(atk)//2]
# len(atk)

In [53]:
# Lookup table: original text -> perturbed text (a later duplicate key overwrites an earlier one).
atk_dict = dict(zip(atk["original_text"], atk["perturbed_text"]))

In [54]:
# Replace every training tweet that has a successful adversarial version with that
# version; all other tweets are kept in their normalised form.
count = 0  # tweets replaced (was initialised but never incremented before)
new_tweets = []

for tweet in df["tweet"]:
    sentence = normalizeTweet(tweet)

    # atk_dict is keyed by the attack's original text, so the lookup only hits when
    # the normalised tweet matches that text exactly.
    if sentence in atk_dict:
        new_tweets.append(atk_dict[sentence])
        count += 1
    else:
        new_tweets.append(sentence)

In [55]:
# Overwrite the tweet column in place with the adversarial/normalised text and save it.
# NOTE(review): to_csv is called without index=False, so the row index is written as an
# extra unnamed column — confirm consumers of this file expect that.
df["tweet"] = new_tweets
df.to_csv("data/train_ta.csv")

In [56]:
# Read the file back and confirm the class counts are unchanged.
df = pd.read_csv("data/train_ta.csv")
print(f"Length: {len(df)}")

negatives, positives = (df[df["label"] == cls] for cls in (0, 1))
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


## Random Sentiment140

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG. DataFrame.sample() below is called without random_state, so
# it presumably draws from this global state (see the pandas docs) — confirm.
np.random.seed(12345678)

In [2]:
# Load the training split and report its size.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the two classes by label value.
negatives = df.loc[df["label"].eq(0)]
positives = df.loc[df["label"].eq(1)]
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [3]:
# Target share of negatives in the final training set.
ratio = 0.9

# Negatives needed so that negatives / (negatives + positives) == ratio.
# round() rather than int(): the float expression can land just below the exact integer
# (e.g. ratio=0.7 gives 6.999... for 3 positives), and int() would then drop one sample.
num_samples = round(pos_size * ratio / (1 - ratio))

# Extra negatives to draw on top of the ones already in the training split.
num_new = num_samples - neg_size
print(num_new, num_samples/(num_samples + pos_size))

25242 0.9


In [4]:
# sentiment140.csv is read with explicit column names (no header row is taken from the file).
new_df = pd.read_csv(
    "data/sentiment140.csv",
    names=["target", "id", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
)

# Draw num_new random tweets and relabel all of them as class 0, keeping only the text.
new_samples = new_df.sample(n=num_new)
sampled_text = new_samples['text'].values
new_samples = pd.DataFrame({"label": np.zeros(len(sampled_text), dtype=int), "tweet": sampled_text})

In [5]:
# Display the sampled negatives (the notebook renders a truncated preview).
new_samples

Unnamed: 0,label,tweet
0,0,Aww. Well that was quick. Poor Maria. At least...
1,0,@AubreyODay Right? Especially the part about h...
2,0,@azandiaMJBB Oh I do agree - vegging out occas...
3,0,"If you've emailed or messaged me this week, I'..."
4,0,@spazziness stress all day long... need to fig...
...,...,...
25237,0,@ChreeesDunn Good luck with that dude. Should ...
25238,0,@IamAdamFierce adammmmmm!! I kept coming to se...
25239,0,@peterfacinelli Just make sure its not an inte...
25240,0,@tweet_homes gotta watch out for those telemar...


In [8]:
# Sanity check: list sampled "negative" tweets that mention the target topic.
# The match is done on lower-cased text so capitalised mentions (e.g. "Alzheimer's")
# are found too, matching the equivalent check in the parental-sampling section.
words = "alzheimer dementia".split()
for w in words:
    print(new_samples[new_samples['tweet'].str.lower().str.find(w)!=-1]['tweet'].values)

64
22
255
111


In [201]:
# Stack the sampled negatives on top of the original training rows (label and tweet columns only).
original_rows = df[["label", "tweet"]]
augmented_df = pd.concat([new_samples, original_rows])

In [202]:
# Save the Sentiment140-augmented training set.
# NOTE(review): the index is written as an extra unnamed column and, after pd.concat,
# contains repeated values — confirm consumers of this file expect that column.
augmented_df.to_csv("data/train_sentiment140.csv")

In [203]:
# Read the file back and check the resulting class balance.
df = pd.read_csv("data/train_sentiment140.csv")
print(f"Length: {len(df)}")

negatives, positives = (df[df["label"] == cls] for cls in (0, 1))
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 29570
Negatives: 26613, Positives: 2957


## Parental Sentiment140

In [118]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG. DataFrame.sample() below is called without random_state, so
# it presumably draws from this global state (see the pandas docs) — confirm.
np.random.seed(12345678)

In [119]:
# Load the training split and report its size.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the two classes by label value.
negatives = df.loc[df["label"].eq(0)]
positives = df.loc[df["label"].eq(1)]
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [120]:
# Target share of negatives in the final set, and share of the *new* negatives that
# should be parent-related tweets.
ratio = 0.9
parent_ratio = 0.5

# round() rather than int(): the float expression can land just below the exact integer,
# and int() would then drop one sample.
num_samples = round(pos_size * ratio / (1 - ratio))
num_new = num_samples - neg_size

# Use parent_ratio (previously declared but ignored in favour of a hard-coded // 2);
# the remainder of the budget is filled with non-parental tweets.
num_parent = int(num_new * parent_ratio)
num_random = num_new - num_parent

num_total = num_samples + pos_size

print(num_new, num_samples/num_total, (num_parent/num_total, num_random/num_total, neg_size/num_total))

25242 0.9 (0.42681772066283397, 0.42681772066283397, 0.04636455867433209)


In [121]:
# sentiment140.csv is read with explicit column names (no header row is taken from the file).
new_df = pd.read_csv(
    "data/sentiment140.csv",
    names=["target", "id", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
)

# Keep only the tweet text and label every row as class 0.
all_text = new_df['text'].values
new_df = pd.DataFrame({"label": np.zeros(len(new_df), dtype=int), "tweet": all_text})

In [122]:
# Display the relabelled Sentiment140 frame (the notebook renders a truncated preview).
new_df

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,0,Just woke up. Having no school is the best fee...
1599996,0,TheWDB.com - Very cool to hear old Walt interv...
1599997,0,Are you ready for your MoJo Makeover? Ask me f...
1599998,0,Happy 38th Birthday to my boo of alll time!!! ...


In [123]:
words = ["mother", "mama", "mum", "mom", "mommy", "father", "dad", "daddy", "papa", "parent"]

# One boolean per tweet: True when the lower-cased text contains any keyword.
# NOTE(review): this is plain substring containment, so e.g. "mom" also matches "moment".
is_parental = [
    any(keyword in lowered for keyword in words)
    for lowered in (tweet.lower() for tweet in new_df['tweet'].values)
]

In [124]:
# Display the tweets flagged as parent-related (the notebook renders a truncated preview).
new_df[is_parental]

Unnamed: 0,label,tweet
86,0,@msdrama hey missed ya at the meeting sup mama
106,0,Emily will be glad when Mommy is done training...
147,0,My mom might have breast cancer won't find out...
233,0,@labelsnotlove my home town. My mammy called...
379,0,Bad news was Dad has cancer and is dying Goo...
...,...,...
1599890,0,"going to south streeeet with kate, hopefully m..."
1599912,0,@ShannonGilliam good luck!! what an exciting ...
1599929,0,@mom2jwo Woo hoo!!!! Keep working hard my dear!!
1599939,0,OMG!!!!!!!!!! My dad will be having surgery to...


In [125]:
# Boolean mask over new_df rows; its complement selects the non-parental tweets.
parental_mask = np.asarray(is_parental)

# Draw the parental share first, then the non-parental share, and stack them.
new_parent = new_df[parental_mask].sample(n=num_parent)
new_random = new_df[~parental_mask].sample(n=num_random)
new_samples = pd.concat([new_parent, new_random])

In [126]:
# Sanity check: list sampled "negative" tweets whose lower-cased text mentions the target topic.
words = "alzheimer dement".split()
lowered_tweets = new_samples['tweet'].str.lower()
for w in words:
    mentions = lowered_tweets.str.find(w) != -1
    print(new_samples[mentions]['tweet'].values)

["i wonder if there is a life after Alzheimer's caring. beyond despair. poverty sux but commerce is futile. there is no description. mommy "
 "Hi all haven't been on for 48 hours! Been to visit mum her alzheimers has got so bad was very sad "
 'Just contacted AppleCare cuz my Mac apparently has Alzheimers and the memory is bad. But they were awesome...no Mac for 2 - 3 days '
 "Waiting on news about my Grandfather, mom called telling me he didn't have much time left  struggled with Alzheimer's for years now "
 'How do I tell my grandmother (with alzheimers) that her sister (also with alzheimers) just died. Having such a shitty day... ']
['@Gianuario dementia and cancer  i feel so bad for him and my grandmother &lt;3'
 '@PandaDementia But I heard you dripping '
 "@PandaDementia  boo...I didn't get a chance to try!"
 "@PandaDementia lol silly girl, I would've if I'd known ;) I do feel badly for you though  But I know the cobbler will be amazing! Mmmm.."
 "@velvetdementia Yep, $75/day + ta

In [127]:
# Stack the sampled negatives on top of the original training rows (label and tweet columns only).
original_rows = df[["label", "tweet"]]
augmented_df = pd.concat([new_samples, original_rows])

In [128]:
# Save the parental-Sentiment140-augmented training set.
# NOTE(review): the index is written as an extra unnamed column and, after pd.concat,
# contains repeated values — confirm consumers of this file expect that column.
augmented_df.to_csv("data/train_parental.csv")

In [129]:
# Read the file back and check the resulting class balance.
df = pd.read_csv("data/train_parental.csv")
print(f"Length: {len(df)}")

negatives, positives = (df[df["label"] == cls] for cls in (0, 1))
neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 29570
Negatives: 26613, Positives: 2957
