In [116]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


After studying NLP these days, I got a huge improvement now!

1. Feature Engineering in NLP (already have done but with improvement today) [link](https://github.com/y1u2a3n4g5/aap/blob/main/week3/aap_datacleaning.ipynb)
- tokenization
- stemmization
- lowercase
- label encoder (going to do)
- embeddings
  - count vectorizer
  - TF-IDF
  - word2vec

2. Data Augmentation in NLP (I'm going to do today)
- Synonym Replacement Using WordNet
- Random Word Insertion
- Random Word Swap
- Noise Injection

3. Dimensionality Reduction Techniques in NLP (already have done but with improvement today) [link](https://github.com/y1u2a3n4g5/aap/blob/main/week4/nlpaap_embeddings.ipynb)
- I have used word2vec to cut down my data dimensions

In [117]:
# import the packages
import pandas as pd
import numpy as np
import os
import re
import zipfile
import matplotlib.pyplot as plt
import string
import nltk
import random
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [118]:
train_csv = '/content/drive/My Drive/twitter_training.csv'
test_csv = '/content/drive/My Drive/twitter_validation.csv'
train_data = pd.read_csv(train_csv, sep = ",", header=None)
test_data = pd.read_csv(test_csv, sep = ",", header=None)

In [119]:
train_data.columns = ['id', 'company', 'sentiment', 'text']
test_data.columns = ['id', 'company', 'sentiment', 'text']

In [120]:
train_data.head()

Unnamed: 0,id,company,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


in this background, we only need the sentiment and test data

In [121]:
train_df = train_data.iloc[:, [2, 3]]
test_df = test_data.iloc[:, [2, 3]]

In [122]:
train_df

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [123]:
test_df.head()

Unnamed: 0,sentiment,text
0,Irrelevant,I mentioned on Facebook that I was struggling ...
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Negative,@Microsoft Why do I pay for WORD when it funct...
3,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Neutral,Now the President is slapping Americans in the...


# 1. Feature Engineering

## tokenizing, stemming/lemmatization

In [124]:
# check is null or is duplicate
train_nulls = train_df.isnull().sum()
print("Missing values in train_df:")
print(train_nulls)

# Check for missing values in the test data
test_nulls = test_df.isnull().sum()
print("Missing values in test_df:")
print(test_nulls)

# Check for duplicate rows in the training data
train_duplicates = train_df["text"].duplicated().sum()
print(f"Number of duplicate rows in train_df: {train_duplicates}")

# Check for duplicate rows in the test data
test_duplicates = test_df["text"].duplicated().sum()
print(f"Number of duplicate rows in test_df: {test_duplicates}")

Missing values in train_df:
sentiment      0
text         686
dtype: int64
Missing values in test_df:
sentiment    0
text         0
dtype: int64
Number of duplicate rows in train_df: 5190
Number of duplicate rows in test_df: 1


In [125]:
# Drop rows with missing values
train_df_cleaned = train_df.dropna()

# Drop duplicate rows based on the 'text' column
train_df_cleaned = train_df_cleaned.drop_duplicates(subset="text")

# Check if all missing values and duplicates are removed
train_nulls = train_df_cleaned.isnull().sum()
print("Missing values after cleaning train_df:")
print(train_nulls)

train_duplicates = train_df_cleaned["text"].duplicated().sum()
print(f"Number of duplicate rows after cleaning train_df: {train_duplicates}")

train_df = train_df_cleaned

Missing values after cleaning train_df:
sentiment    0
text         0
dtype: int64
Number of duplicate rows after cleaning train_df: 0


now we have data:
- train_df
- test_df
with 2 columns sentiment and text
I'm going to write a function for feature engineering

In [126]:
train_df

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [127]:
def preprocess_df(df):
    def process_string(x):
        # tokenize
        words = nltk.word_tokenize(' '.join([char for char in x.strip().split(" ") if char not in string.punctuation]).lower())

        # stemmization and remove stopwords
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english') and word.isalpha()]

        # regular expression
        words = list(map(lambda x: re.sub("[^a-zA-Z]", " ", x), words))
        words = ' '.join(words)

        # lowercase
        words = words.lower()
        return words

    df['text'] = df['text'].apply(process_string)

    return df

In [128]:
train_df = preprocess_df(train_df)
test_df = preprocess_df(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(process_string)


In [129]:
train_df.head()

Unnamed: 0,sentiment,text
0,Positive,im getting borderland murder
1,Positive,coming border kill
2,Positive,im getting borderland kill
3,Positive,im coming borderland murder
4,Positive,im getting borderland murder


## embeddings

In [130]:
test_df.head()

Unnamed: 0,sentiment,text
0,Irrelevant,mentioned facebook struggling motivation go ru...
1,Neutral,bbc news amazon bos jeff bezos reject claim co...
2,Negative,microsoft pay word function poorly samsungus c...
3,Negative,csgo matchmaking full closet hacking truly awf...
4,Neutral,president slapping american face really commit...


In [131]:
train = train_df["text"]
target = train_df["sentiment"]
test = test_df["text"]
test_label = test_df["sentiment"]

## label encoder

In [132]:
label_encoder = LabelEncoder()

target = label_encoder.fit_transform(target)
test_label = label_encoder.transform(test_label)

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label mapping:", label_mapping)

Label mapping: {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}


In [133]:
def count_vectorizer(train, target,test=None, test_size=0.2, random_state=42):
    # Split the train data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=test_size, random_state=random_state)

    # Initialize CountVectorizer and fit on the training data
    cv = CountVectorizer()
    X_train_cv = cv.fit_transform(X_train)
    X_val_cv = cv.transform(X_val)

    # If test data is provided, transform it using the trained CountVectorizer
    if test is not None:
        X_test_cv = cv.transform(test)
        return X_train_cv, X_val_cv,  y_train, y_val, X_test_cv, cv
    else:
        return X_train_cv, X_val_cv, cv


In [134]:
def tfidf_vectorizer(train, target, test=None, test_size=0.2, random_state=42):
    # Split the train data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=test_size, random_state=random_state)

    # Initialize TfidfVectorizer and fit on the training data
    tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), analyzer="word",
                            max_df=0.5, binary=False, token_pattern=r'\w+', sublinear_tf=False)

    # Fit the vectorizer on the training data and transform both training and validation sets
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_val_tfidf = tfidf.transform(X_val)

    # If test data is provided, transform it using the trained TfidfVectorizer
    if test is not None:
        X_test_tfidf = tfidf.transform(test)
        return X_train_tfidf, X_val_tfidf, y_train, y_val, X_test_tfidf, tfidf
    else:
        return X_train_tfidf, X_val_tfidf, y_train, y_val , tfidf


In [137]:
def word2vec(train, target, test=None, test_size=0.2, random_state=42, vector_size=100, window=5, min_count=1):
    # Step 1: Split the train data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=test_size, random_state=random_state)

    # Step 2: Train Word2Vec model on the training data
    word2vec_model = Word2Vec(sentences=[i.split() for i in X_train], vector_size=vector_size, window=window, min_count=min_count, sg=0)

    # Step 3: Function to generate sentence vectors by averaging word vectors
    def get_sentence_vector(sentence, model, vector_size):
        sentence_vector = np.zeros(vector_size)  # Initialize an empty vector
        count = 0
        for word in sentence.split():
            if word in model.wv:
                sentence_vector += model.wv[word]
                count += 1
        if count != 0:
            sentence_vector /= count  # Average the vectors
        return sentence_vector

    # Step 4: Convert train and validation datasets to sentence vectors
    train_vectors = np.array([get_sentence_vector(sentence, word2vec_model, vector_size) for sentence in X_train])
    val_vectors = np.array([get_sentence_vector(sentence, word2vec_model, vector_size) for sentence in X_val])

    # Step 5: If test data is provided, apply the same transformation
    if test is not None:
        test_vectors = np.array([get_sentence_vector(sentence, word2vec_model, vector_size) for sentence in test])
        return train_vectors, val_vectors, y_train, y_val, test_vectors, word2vec_model
    else:
        return train_vectors, val_vectors, y_train, y_val, word2vec_model


In [138]:
X_train_cv, X_val_cv, y_train_cv, y_val_cv, X_test_cv, cv = count_vectorizer(train, target, test)
X_train_tfidf, X_val_tfidf, y_train_tfidf, y_val_tfidf, X_test_tfidf, tfidf = tfidf_vectorizer(train, target, test)
X_train_w2v, X_val_w2v, y_train_w2v, y_val_w2v, X_test_w2v, w2v = word2vec(train, target, test)

In [142]:
X_train_cv.shape

(55592, 22126)

this is a very sparse matrix! I need to find a way to deal with this problem!
maybe use stemmization other than lemmatization?
we should use word2vec instead!

In [143]:
X_train_w2v.shape

(55592, 100)

# data augmentation

## 1. synonym process

In [3]:
def synonym_replacement(sentence, n=1):
    """
    Replace 'n' random words in the sentence with their synonyms.
    """
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)

    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        synonym_words = list(set([syn.lemmas()[0].name() for syn in synonyms]))
        if len(synonym_words) > 0:
            synonym = random.choice(synonym_words)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

def random_preprocess_df(df, synonym_replacements=1):
    def process_string(x):
        # Step 1: Synonym replacement
        x = synonym_replacement(x, n=synonym_replacements)

        # Step 2: Remove punctuation and lowercase the string
        x = ''.join([char for char in x if char not in string.punctuation]).lower()

        # Step 3: Tokenize the string
        words = nltk.word_tokenize(x)

        # Step 4: Lemmatization and remove stopwords
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english') and word.isalpha()]

        # Step 5: Join words back into a single string
        words = ' '.join(words)

        return words

    # Apply the process_string function to the 'text' column in the DataFrame
    df['text'] = df['text'].apply(process_string)

    return df


In [4]:
train_synonym = random_preprocess_df(train_df, synonym_replacements=1)
test_synonym = random_preprocess_df(test_df, synonym_replacements=1)

## 2. random deletion

In [None]:
# Ensure necessary downloads for nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def random_deletion(sentence, p=0.2):
    """
    Randomly deletes words from the sentence with probability p.
    """
    words = sentence.split()
    if len(words) == 1:
        return sentence  # If there's only one word, return the original sentence

    # Randomly delete words
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # Ensure at least one word remains
    if len(new_words) == 0:
        return random.choice(words)

    return ' '.join(new_words)

def deletion_preprocess_df(df, deletion_prob=0.2):
    def process_string(x):
        # Step 1: Random deletion
        x = random_deletion(x, p=deletion_prob)

        # Step 2: Remove punctuation and lowercase the string
        x = ''.join([char for char in x if char not in string.punctuation]).lower()

        # Step 3: Tokenize the string
        words = nltk.word_tokenize(x)

        # Step 4: Lemmatization and remove stopwords
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english') and word.isalpha()]

        # Step 5: Join words back into a single string
        words = ' '.join(words)

        return words

    # Apply the process_string function to the 'text' column in the DataFrame
    df['text'] = df['text'].apply(process_string)

    return df


In [5]:
train_deletion = deletion_preprocess_df(train_df, synonym_replacements=1)
test_deletion = deletion_preprocess_df(test_df, synonym_replacements=1)

## 3. random swap

In [None]:
def random_swap(sentence, n=1):
    """
    Randomly swaps two words in the sentence n times.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence  # No swapping possible if less than 2 words

    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]  # Swap words at idx1 and idx2

    return ' '.join(words)

def swap_preprocess_df(df, swap_count=1):
    def process_string(x):
        # Step 1: Random word swap
        x = random_swap(x, n=swap_count)

        # Step 2: Remove punctuation and lowercase the string
        x = ''.join([char for char in x if char not in string.punctuation]).lower()

        # Step 3: Tokenize the string
        words = nltk.word_tokenize(x)

        # Step 4: Lemmatization and remove stopwords
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english') and word.isalpha()]

        # Step 5: Join words back into a single string
        words = ' '.join(words)

        return words

    # Apply the process_string function to the 'text' column in the DataFrame
    df['text'] = df['text'].apply(process_string)

    return df


In [None]:
train_swap = swap_preprocess_df(train_df, synonym_replacements=1)
test_swap = swap_preprocess_df(test_df, synonym_replacements=1)

maybe I will use those dataset later...