In [7]:
# RUN SETUP.SH BEFORE RUNNING THIS IPYNB
# REQUIREMENTS FOR SETUP.SH:
# python 3.11.8
# pip 23.3.1

import pandas as pd
import pickle
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import ADASYN
from scipy.sparse import hstack, csr_matrix, save_npz, load_npz
import spacy
import re
import numpy as np
from spacytextblob.spacytextblob import SpacyTextBlob
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectPercentile, f_classif, mutual_info_classif
import nltk
import gensim.downloader as api
import gensim 
from readability import Readability
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import csv
import random 
from collections import defaultdict


In [8]:
SEED = 42 # seed random state for comparison, testing
random.seed(SEED) # seeds Python's built-in `random` module only; NumPy's global RNG is not seeded here
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = use the full dataset; False = use a random sample of PARTITION_SIZE rows)

# Pretrained models load
spacy_model = spacy.load("en_core_web_lg") # spaCy pipeline, used below for part-of-speech tagging
gensim_model = api.load("word2vec-google-news-300") # word vectors, used below for most_similar() synonym lookup

KeyboardInterrupt: 

In [None]:

# Load the raw training data. header=None makes pandas treat the first line
# as data, so the columns are addressed by position further down.
df = pd.read_csv(
    '../raw_data/fulltrain.csv',
    header=None,
    index_col=False,
)

In [None]:
# For quick experiments, shrink the frame to a reproducible random subset
if not enable_all_data:
    df = df.sample(n=PARTITION_SIZE, random_state=SEED)

# Column 0 holds the label, column 1 holds the text
y_train = df.iloc[:, 0]
X_train = df.iloc[:, 1]

# Class balance of the training labels
y_train.value_counts()

0
1    3500
2    3500
3    3500
4    3500
Name: count, dtype: int64

In [None]:
def _swap_in_synonyms(sentence, similarity_threshold, num_word_changes):
    '''
    Build one synthetic variant of `sentence` by swapping some of its verbs/adverbs for synonyms.

    sentence: Text to rewrite (punctuation already stripped by the caller)
    similarity_threshold: Minimum similarity score for a neighbour to count as a synonym
    num_word_changes: Maximum number of verbs/adverbs to sample for replacement

    Returns the rewritten sentence, or None when none of the sampled words is in the
    `gensim_model` vocabulary (the caller should then try another sentence).
    Sampled words that are in the vocabulary but have no neighbour above the threshold
    are left unchanged.
    '''
    # Get verbs and adverbs in sentence to change them
    document = spacy_model(sentence)
    verbs_adverbs = [token.text for token in document if token.pos_ in ("VERB", "ADV")]
    num_words = min(len(verbs_adverbs), num_word_changes, len(sentence.split()))
    words_to_change = random.sample(verbs_adverbs, k=num_words)  # draws from the global `random` state

    # Get matching synonyms based on similarity_threshold
    synonym_mapping = {}
    for word in words_to_change:
        try:
            neighbours = gensim_model.most_similar(word)
        except KeyError:
            # Word is not in the embedding vocabulary: skip it. Anything else
            # (including KeyboardInterrupt) must propagate instead of being swallowed.
            continue
        synonym_mapping[word] = [
            # Underscores become spaces so multi-word neighbours stay separate words
            # (presumably the embedding joins phrases with "_" -- confirm against the model).
            candidate.replace("_", " ")
            for candidate, similarity in neighbours
            if similarity > similarity_threshold
        ]

    if not synonym_mapping:
        return None

    # Pick one replacement per word; a word without usable synonyms maps to itself
    replacements = {
        word: (random.choice(synonyms) if synonyms else word)
        for word, synonyms in synonym_mapping.items()
    }

    # Replace whole words only, in a single pass:
    #  - \b stops "fly" from being rewritten inside "butterfly"
    #  - re.escape keeps the word from being interpreted as a regex
    #  - the function replacement keeps backslashes in a synonym literal
    #  - one pass means an inserted synonym is never itself replaced again
    pattern = re.compile(r"\b(?:" + "|".join(re.escape(word) for word in replacements) + r")\b")
    return pattern.sub(lambda match: replacements[match.group(0)], sentence)


def synonym_augmentation(X, y, percentage_majority=1.1, similarity_threshold=0.65, num_word_changes=10,
                         max_consecutive_failures=1000):
    '''
    Oversample every class with synthetic sentences made by replacing verbs/adverbs with synonyms.

    X: Input sentences (pandas Series of str), paired with y by position
    y: Labels for sentences (pandas Series)
    percentage_majority: Number of training examples should each label have relative to the label with largest number of examples
    similarity_threshold: How similar words should be to current word to be considered as synonyms
    num_word_changes: Number of words that should be converted to synonyms
    max_consecutive_failures: Give up on a label (with a RuntimeWarning) after this many
        consecutive sentences for which no synonym lookup succeeded, instead of looping forever

    Returns (X, y) with a fresh 0..n-1 index: the original sentences WITH PUNCTUATION REMOVED,
    followed by the synthetic sentences, and the matching labels.

    Raises ValueError if X and y have different lengths.

    Relies on the module-level `spacy_model` (POS tagging) and `gensim_model` (synonym lookup),
    and on the global `random` state for reproducibility.
    '''
    import warnings

    if X.shape[0] != y.shape[0]:
        raise ValueError(f"Wrong dimension of X to y: {X.shape[0]} sentences vs {y.shape[0]} labels")

    # Remove punctuation
    X = X.apply(lambda sentence: re.sub(r"[^\w\s]", "", sentence))

    # Nothing to balance (value_counts().max() would be NaN on empty input)
    if y.empty:
        return X.reset_index(drop=True), y.reset_index(drop=True)

    # Number of training examples should each label have relative to the label with largest number of examples
    aug_num_labels_per_class = int(y.value_counts().max() * percentage_majority)
    synthetic_sentences = []
    synthetic_labels = []

    for label in y.unique():
        # Get all training examples with a certain label (positional mask: X[i] belongs to y[i])
        data_with_label = X[(y == label).to_numpy()]
        if len(data_with_label) == 0:
            # e.g. a NaN label, which never compares equal to itself
            continue

        counter = len(data_with_label)
        consecutive_failures = 0
        # Generate new synthetic examples until the class reaches the target size
        while counter < aug_num_labels_per_class:
            if consecutive_failures >= max_consecutive_failures:
                warnings.warn(
                    f"synonym_augmentation: stopped augmenting label {label!r} at {counter} of "
                    f"{aug_num_labels_per_class} examples after {consecutive_failures} consecutive "
                    "sentences without any usable synonym",
                    RuntimeWarning,
                )
                break

            # Get a random sentence
            data_index = random.randint(0, len(data_with_label) - 1)
            synthetic = _swap_in_synonyms(
                data_with_label.iloc[data_index], similarity_threshold, num_word_changes
            )
            if synthetic is None:
                consecutive_failures += 1
                continue

            consecutive_failures = 0
            synthetic_sentences.append(synthetic)
            synthetic_labels.append(label)
            counter += 1

    # Add synthetic examples to X and y; explicit dtypes keep the result dtype
    # stable even when no synthetic example was generated
    X_synthetic = pd.Series(synthetic_sentences, dtype=X.dtype)
    y_synthetic = pd.Series(synthetic_labels, dtype=y.dtype)

    X = pd.concat([X, X_synthetic], ignore_index=True)
    y = pd.concat([y, y_synthetic], ignore_index=True)
    return X, y

# Augment the training split with synonym-substituted sentences (default settings)
X_train_syn, y_train_syn = synonym_augmentation(X=X_train, y=y_train)


In [None]:
# Wrap the augmented labels and sentences as single-column frames ...
df1, df2 = pd.DataFrame({'label': y_train_syn}), pd.DataFrame({'sentence': X_train_syn})

# ... place them side by side (label first, then sentence) and write them out without the index
df_syn = pd.concat((df1, df2), axis='columns')
df_syn.to_csv('synonym_augmented_train.csv', index=False)