# Split into stratified train, val, test sets

In [1]:
import pandas as pd
import numpy as np

# Pin NumPy's global RNG so the permutation and shuffle calls further down
# produce the same split every time the notebook is run from a fresh kernel.
SPLIT_SEED = 12345678
np.random.seed(SPLIT_SEED)

In [2]:
# Read the complete tweet table and report how many rows it has.
df = pd.read_csv("data/all_tweets.csv")
print(f"Length: {len(df)}")

# Separate the rows by class label so each class can be split on its own.
label_column = df["label"]
negatives = df.loc[label_column == 0]
positives = df.loc[label_column == 1]

neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 5410
Negatives: 1714, Positives: 3696


In [3]:
"""
5410 data points
Train, Val, Test = 80%, 10%, 10% = 4328, 541, 541
"""

# Shuffle the row labels of each class separately. Negatives are permuted
# before positives so the global RNG is consumed in a fixed order.
negative_indices = np.random.permutation(negatives.index.values)
positive_indices = np.random.permutation(positives.index.values)

# Cut each shuffled class at the 80% and 90% marks -> train / val / test.
neg_cuts = [round(0.8 * neg_size), round(0.9 * neg_size)]
pos_cuts = [round(0.8 * pos_size), round(0.9 * pos_size)]
neg_train, neg_val, neg_test = np.split(negative_indices, neg_cuts)
pos_train, pos_val, pos_test = np.split(positive_indices, pos_cuts)

# Recombine per split: negatives first, positives second.
train = np.concatenate([neg_train, pos_train])
val = np.concatenate([neg_val, pos_val])
test = np.concatenate([neg_test, pos_test])

# Interleave the two classes inside every split (in place; train, val, test).
for split_indices in (train, val, test):
    np.random.shuffle(split_indices)

In [4]:
# `train`, `val` and `test` contain index *labels* (they were drawn from
# `negatives.index.values` / `positives.index.values`), so rows must be
# selected by label with `.loc`. Positional `.iloc` returns the same rows only
# while `df` carries the default 0..n-1 index that `read_csv` produces; any
# earlier filtering or re-indexing of `df` would silently pick wrong rows.
trainset = df.loc[train]
valset = df.loc[val]
testset = df.loc[test]

# Sanity report: split sizes and the per-class counts inside each split.
print(f"Length: (Train: {len(trainset)}, Val: {len(valset)}, Test: {len(testset)})")
print(f"Negatives: (Train: {(trainset['label'] == 0).sum()}, Val: {(valset['label'] == 0).sum()}, Test: {(testset['label'] == 0).sum()})")
print(f"Positives: (Train: {(trainset['label'] == 1).sum()}, Val: {(valset['label'] == 1).sum()}, Test: {(testset['label'] == 1).sum()})")

Length: (Train: 4328, Val: 541, Test: 541)
Negatives: (Train: 1371, Val: 172, Test: 171)
Positives: (Train: 2957, Val: 369, Test: 370)


In [5]:
# Persist the three splits; index=False keeps the shuffled row labels out of
# the CSV files.
for split_frame, csv_path in (
    (trainset, "data/train.csv"),
    (valset, "data/val.csv"),
    (testset, "data/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)

# Augment Train Data to 50% neg 50% pos

In [1]:
import pandas as pd
import numpy as np
from BERTweet.TweetNormalizer import normalizeTweet
from eda_nlp.code.eda import eda

# Pin NumPy's global RNG; the np.random.shuffle(num_aug) call below depends on it.
# NOTE(review): this seeds NumPy only. If `eda` draws from Python's `random`
# module, the augmented text itself is not governed by this seed -- confirm.
AUGMENT_SEED = 12345678
np.random.seed(AUGMENT_SEED)

In [2]:
# Read the training split and report how many rows it has.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the rows by class label; only the negatives get augmented below.
label_column = df["label"]
negatives = df.loc[label_column == 0]
positives = df.loc[label_column == 1]

neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [3]:
# Decide how many output rows each negative tweet should yield so that the
# total equals pos_size: everyone gets the floor share, and the remainder is
# handed out one extra row at a time.
rows_per_tweet, rows_left_over = divmod(pos_size, neg_size)
num_aug = np.full(neg_size, rows_per_tweet)
num_aug[:rows_left_over] += 1
# Shuffle so the tweets that receive the extra row are picked at random.
np.random.shuffle(num_aug)
num_aug

array([2, 2, 2, ..., 2, 2, 2])

In [4]:
# EDA operation strengths, passed straight through to `eda`.
alpha_sr = 0.1 # synonym replacement
alpha_ri = 0.1 # random insertion
alpha_rs = 0.0 # random swap
alpha_rd = 0.0 # random deletion

# One entry of num_aug is consumed per negative tweet; fail loudly if the two
# have drifted apart (e.g. cells run out of order).
assert len(num_aug) == len(negatives), "num_aug must have one entry per negative tweet"

augmented_negatives = {"label": [], "tweet": []}

# Walk the two columns directly. Iterating the `.iloc` indexer object only
# works through Python's implicit __getitem__/IndexError fallback and builds a
# full row Series per step.
for label, tweet, target_rows in zip(negatives["label"], negatives["tweet"], num_aug):
    sentence = normalizeTweet(tweet)
    # Request one fewer variant than the target row count.
    # NOTE(review): the recorded class counts (negatives == positives after
    # augmentation) suggest `eda` returns one sentence more than `num_aug`,
    # presumably the input sentence -- confirm against eda's implementation.
    aug_sentences = eda(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=target_rows - 1)
    for aug_sentence in aug_sentences:
        augmented_negatives["label"].append(label)
        augmented_negatives["tweet"].append(aug_sentence)

augmented_negatives = pd.DataFrame(augmented_negatives)
# NOTE(review): positives are appended as read from disk, while every negative
# row has passed through normalizeTweet and eda -- confirm the downstream
# pipeline normalises both classes the same way.
augmented_df = pd.concat([augmented_negatives, positives[["label", "tweet"]]])

In [5]:
# Write without the index, the same way data/train.csv is written. After the
# concat each input frame still carries its own row labels, so the index is
# not a meaningful key and would only come back as a stray "Unnamed: 0" column
# when the file is read again. Consumers that select columns by name are
# unaffected.
augmented_df.to_csv("data/train_augment_5050.csv", index=False)

In [6]:
# Read the augmented file back and confirm the resulting class balance.
df = pd.read_csv("data/train_augment_5050.csv")
print(f"Length: {len(df)}")

label_column = df["label"]
negatives = df.loc[label_column == 0]
positives = df.loc[label_column == 1]

neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 5914
Negatives: 2957, Positives: 2957


# Augment Train Data to 75% neg 25% pos

In [68]:
import pandas as pd
import numpy as np
from BERTweet.TweetNormalizer import normalizeTweet
from eda_nlp.code.eda import eda

# Pin NumPy's global RNG; the np.random.shuffle(num_aug) call below depends on it.
# NOTE(review): this seeds NumPy only. If `eda` draws from Python's `random`
# module, the augmented text itself is not governed by this seed -- confirm.
AUGMENT_SEED = 12345678
np.random.seed(AUGMENT_SEED)

In [69]:
# Read the training split and report how many rows it has.
df = pd.read_csv("data/train.csv")
print(f"Length: {len(df)}")

# Separate the rows by class label; only the negatives get augmented below.
label_column = df["label"]
negatives = df.loc[label_column == 0]
positives = df.loc[label_column == 1]

neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 4328
Negatives: 1371, Positives: 2957


In [70]:
# Target three negative rows per positive row (75% / 25%): spread
# 3 * pos_size output rows over the negative tweets as evenly as possible,
# giving the remainder out one extra row at a time.
rows_per_tweet, rows_left_over = divmod(3 * pos_size, neg_size)
num_aug = np.full(neg_size, rows_per_tweet)
num_aug[:rows_left_over] += 1
# Shuffle so the tweets that receive the extra row are picked at random.
np.random.shuffle(num_aug)
num_aug

array([7, 7, 7, ..., 6, 6, 7])

In [71]:
# EDA operation strengths, passed straight through to `eda`.
alpha_sr = 0.1 # synonym replacement
alpha_ri = 0.1 # random insertion
alpha_rs = 0.1 # random swap
alpha_rd = 0.1 # random deletion

# One entry of num_aug is consumed per negative tweet; fail loudly if the two
# have drifted apart (e.g. cells run out of order).
assert len(num_aug) == len(negatives), "num_aug must have one entry per negative tweet"

augmented_negatives = {"label": [], "tweet": []}

# Walk the two columns directly. Iterating the `.iloc` indexer object only
# works through Python's implicit __getitem__/IndexError fallback and builds a
# full row Series per step.
for label, tweet, target_rows in zip(negatives["label"], negatives["tweet"], num_aug):
    sentence = normalizeTweet(tweet)
    # Request one fewer variant than the target row count.
    # NOTE(review): the recorded class counts (negatives == 3 * positives after
    # augmentation) suggest `eda` returns one sentence more than `num_aug`,
    # presumably the input sentence -- confirm against eda's implementation.
    aug_sentences = eda(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=target_rows - 1)
    for aug_sentence in aug_sentences:
        augmented_negatives["label"].append(label)
        augmented_negatives["tweet"].append(aug_sentence)

augmented_negatives = pd.DataFrame(augmented_negatives)
# NOTE(review): positives are appended as read from disk, while every negative
# row has passed through normalizeTweet and eda -- confirm the downstream
# pipeline normalises both classes the same way.
augmented_df = pd.concat([augmented_negatives, positives[["label", "tweet"]]])

In [72]:
# Write without the index, the same way data/train.csv is written. After the
# concat each input frame still carries its own row labels, so the index is
# not a meaningful key and would only come back as a stray "Unnamed: 0" column
# when the file is read again. Consumers that select columns by name are
# unaffected.
augmented_df.to_csv("data/train_augment_7525.csv", index=False)

In [74]:
# Read the augmented file back and confirm the resulting class balance.
df = pd.read_csv("data/train_augment_7525.csv")
print(f"Length: {len(df)}")

label_column = df["label"]
negatives = df.loc[label_column == 0]
positives = df.loc[label_column == 1]

neg_size, pos_size = len(negatives), len(positives)

print(f"Negatives: {neg_size}, Positives: {pos_size}")

Length: 11828
Negatives: 8871, Positives: 2957
