<h1><center>GoEmotions</center></h1>

GoEmotions is a corpus of 58k carefully curated comments extracted from Reddit, with human annotations to 27 emotion categories or Neutral.


*   Number of examples: 58,009.
*   Number of labels: 27 + Neutral.
*   Maximum sequence length in training and evaluation datasets: 30.



The emotion categories are: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise.

This dataset includes the following columns:

*   id: The unique id of the comment.
*   text: The text of the comment (with masked tokens, as described in the paper).
*   author: The Reddit username of the comment's author.
*   example_very_unclear: Whether the annotator marked the example as being very unclear or difficult to label (in this case they did not choose any emotion labels).
*   28 other columns, one per label: the 27 emotions (admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise) plus neutral.





In [None]:
#some important imports
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import skipgrams


In [None]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [None]:
#reading the goEmotions dataset
df = pd.read_csv('/content/go_emotions_dataset.csv')
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
dfCopy = df.copy()

In [None]:
#the shape of the dataset
df.shape

(81496, 31)

In [None]:
#data types of the columns
df.dtypes

id                       object
text                     object
example_very_unclear     object
admiration              float64
amusement               float64
anger                   float64
annoyance               float64
approval                float64
caring                  float64
confusion               float64
curiosity               float64
desire                  float64
disappointment          float64
disapproval             float64
disgust                 float64
embarrassment           float64
excitement              float64
fear                    float64
gratitude               float64
grief                   float64
joy                     float64
love                    float64
nervousness             float64
optimism                float64
pride                   float64
realization             float64
relief                  float64
remorse                 float64
sadness                 float64
surprise                float64
neutral                 float64
dtype: o

In [None]:
#checking for null values
df.isna().sum()

id                      0
text                    0
example_very_unclear    1
admiration              1
amusement               1
anger                   1
annoyance               1
approval                1
caring                  1
confusion               1
curiosity               1
desire                  1
disappointment          1
disapproval             1
disgust                 1
embarrassment           1
excitement              1
fear                    1
gratitude               1
grief                   1
joy                     1
love                    1
nervousness             1
optimism                1
pride                   1
realization             1
relief                  1
remorse                 1
sadness                 1
surprise                1
neutral                 1
dtype: int64

# <center>Preprocessing</center>

## Lowercase

In [None]:
#changing the text column to lowercase
df["text"] = df["text"].str.lower()
df.text

0                                          that game hurt.
1         >sexuality shouldn’t be a grouping category i...
2           you do right, if you don't care then fuck 'em!
3                                       man i love reddit.
4        [name] was nowhere near them, he was by the fa...
                               ...                        
81491    weird how they shoehorned s character from an ...
81492    define woman please if you're not going to use...
81493          it was a good sub before the porn took over
81494    wait, i see the problem, you are changing the ...
81495      that’s crazy how much [name] dunks compared to 
Name: text, Length: 81496, dtype: object

## Punctuation removal

In [None]:
#All the punctuation characters defined in the string module
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
#Takes a text string as input and removes all punctuation marks from it using the translate method and a string of punctuation characters.
# Translation table built once instead of on every call: it deletes all ASCII
# punctuation plus the curly quotes (’ ‘ “ ”) that string.punctuation lacks.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + "’‘“”")


def Punc_remove(text):
    """Remove punctuation from a string.

    Args:
     text: the string to clean

    Returns:
     text with every ASCII punctuation mark and curly quote deleted
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
#This code applies the Punc_remove() function to the 'text' column to remove any text punctuations
df['text'] = df['text'].apply(lambda x: Punc_remove(x))

In [None]:
df['text']

0                                           that game hurt
1         sexuality shouldnt be a grouping category it ...
2               you do right if you dont care then fuck em
3                                        man i love reddit
4         name was nowhere near them he was by the falcon 
                               ...                        
81491    weird how they shoehorned s character from an ...
81492    define woman please if youre not going to use ...
81493          it was a good sub before the porn took over
81494    wait i see the problem you are changing the wo...
81495         thats crazy how much name dunks compared to 
Name: text, Length: 81496, dtype: object

## Stop words removal

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#English stopwords separated by commas using the NLTK
" , ".join(stopwords.words('english'))

"i , me , my , myself , we , our , ours , ourselves , you , you're , you've , you'll , you'd , your , yours , yourself , yourselves , he , him , his , himself , she , she's , her , hers , herself , it , it's , its , itself , they , them , their , theirs , themselves , what , which , who , whom , this , that , that'll , these , those , am , is , are , was , were , be , been , being , have , has , had , having , do , does , did , doing , a , an , the , and , but , if , or , because , as , until , while , of , at , by , for , with , about , against , between , into , through , during , before , after , above , below , to , from , up , down , in , out , on , off , over , under , again , further , then , once , here , there , when , where , why , how , all , any , both , each , few , more , most , other , some , such , no , nor , not , only , own , same , so , than , too , very , s , t , can , will , just , don , don't , should , should've , now , d , ll , m , o , re , ve , y , ain , aren ,

In [None]:
#This function removes the stop words from a given text.
stop_words = set(stopwords.words('english'))
def StopWords_removal(text):
    """Drop every whitespace-separated word of the text that is in ``stop_words``.

    Args:
     text: the string to filter

    Returns:
     the remaining words joined with single spaces
    """
    # NOTE(review): words are compared verbatim, so forms whose apostrophe was
    # already stripped (e.g. "dont") presumably do not match entries such as
    # "don't" -- confirm whether that is intended.
    kept_words = []
    for candidate in text.split():
        if candidate in stop_words:
            continue
        kept_words.append(candidate)
    return ' '.join(kept_words)

In [None]:
#This code applies the StopWords_removal() function to the 'text' column to remove any text stop words
df['text'] = df['text'].apply(lambda x: StopWords_removal(x))

In [None]:
df['text']

0                                                game hurt
1        sexuality shouldnt grouping category makes dif...
2                                  right dont care fuck em
3                                          man love reddit
4                                 name nowhere near falcon
                               ...                        
81491    weird shoehorned character ongoing tv show cel...
81492    define woman please youre going use accepted d...
81493                                   good sub porn took
81494    wait see problem changing word observation ass...
81495                 thats crazy much name dunks compared
Name: text, Length: 81496, dtype: object

## Frequent words

In [None]:
#Counts the frequency of each word in the "text" column. The frequent words were not removed because they carry emotions that could help in the classification.
from collections import Counter
cnt = Counter()
for text in df["text"].values:
    for word in text.split():
        cnt[word] += 1

cnt.most_common(10)

[('name', 14714),
 ('like', 6350),
 ('im', 5880),
 ('dont', 4561),
 ('get', 3519),
 ('thats', 3438),
 ('one', 3394),
 ('would', 3268),
 ('people', 3262),
 ('love', 3205)]

## Spelling Correction

In [None]:
from textblob import TextBlob
from tqdm import tqdm

In [None]:
#word = df.text

#result = df['text'].spellcheck()

In [None]:
import re
# Remove url in the input text
def remove_url(input_text: str) -> str:
    """Strip URL-like runs from the text.

    Every occurrence of ``www`` or ``http`` followed by one or more
    non-whitespace characters is deleted; the match extends up to the next
    whitespace, which itself is kept.

    Args:
     input_text: the text to clean

    Returns:
     input_text with the URL-like runs removed
    """
    # Raw string: "\S" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'(www|http)\S+', '', input_text)

In [None]:
# NOTE(review): the Series returned by apply() is only displayed here; it is
# not assigned back to df['text'], so the DataFrame itself is left unchanged.
df['text'].apply(remove_url)

0                                                game hurt
1        sexuality shouldnt grouping category makes dif...
2                                  right dont care fuck em
3                                          man love reddit
4                                 name nowhere near falcon
                               ...                        
81491    weird shoehorned character ongoing tv show cel...
81492    define woman please youre going use accepted d...
81493                                   good sub porn took
81494    wait see problem changing word observation ass...
81495                 thats crazy much name dunks compared
Name: text, Length: 81496, dtype: object

In [None]:
 # Remove email in the text
def remove_email(input_text: str) -> str:
    """Strip e-mail addresses from the text.

    The pattern only contains lowercase letter classes, so only lowercase
    addresses are matched.

    Args:
     input_text: the text to clean

    Returns:
     input_text with every matched address deleted
    """
    # Raw string: "\." inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    regex_pattern = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}'
    return re.sub(regex_pattern, '', input_text)

In [None]:
# NOTE(review): the Series returned by apply() is only displayed here; it is
# not assigned back to df['text'], so the DataFrame itself is left unchanged.
df['text'].apply(remove_email)

0                                                game hurt
1        sexuality shouldnt grouping category makes dif...
2                                  right dont care fuck em
3                                          man love reddit
4                                 name nowhere near falcon
                               ...                        
81491    weird shoehorned character ongoing tv show cel...
81492    define woman please youre going use accepted d...
81493                                   good sub porn took
81494    wait see problem changing word observation ass...
81495                 thats crazy much name dunks compared
Name: text, Length: 81496, dtype: object

In [None]:
from typing import List, Optional, Union, Callable
import os
import posixpath as path
import ntpath as path
_IGNORE_SPELLCHECK_WORD_FILE_PATH = os.path.join('/content/ignore_spellcheck_words.txt')
from pathlib import Path



In [None]:
!pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.1


In [None]:
from spellchecker import SpellChecker


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


In [None]:
#def check_spelling(input_text_or_list: Union[str, List[str]], lang='en',
#                   ignore_word_file_path: Union[str, Path] = _IGNORE_SPELLCHECK_WORD_FILE_PATH) -> str:
#    """ Check and correct spellings of the text list """
#    if input_text_or_list is None or len(input_text_or_list) == 0:
#        return ''
#    spelling_checker = SpellChecker(language=lang, distance=1)
#    # TODO: add acronyms into spell checker to ignore auto correction specified by _IGNORE_SPELLCHECK_WORD_FILE_PATH
#    spelling_checker.word_frequency.load_text_file(ignore_word_file_path)
#    if isinstance(input_text_or_list, str):
#        if not input_text_or_list.islower():
#            input_text_or_list = input_text_or_list.lower()
#        tokens = word_tokenize(input_text_or_list)
#    else:
#        tokens = [token.lower() for token in input_text_or_list if token is not None and len(token) > 0]
#    misspelled = spelling_checker.unknown(tokens)
#    for word in misspelled:
#        tokens[tokens.index(word)] = spelling_checker.correction(word)
#    return ' '.join(tokens).strip()
#    df['text'].apply(check_spelling)
#with tqdm(total=len(df)) as pbar:
#    for i, row in df.iterrows():
#        df.at[i, 'text'] = check_spelling(row['text'])
#        pbar.update(1)

In [None]:
#df['text'].apply(check_spelling)

The code below is currently commented out. It aims to correct the spelling of the text data in the 'text' column of the pandas DataFrame. It does this by iterating over each row of the DataFrame, calling the correct_sentence_spelling function on the 'text' value of the row, and replacing the original text with the corrected version.

The correct_sentence_spelling function takes a sentence as input, creates a TextBlob object from the sentence, applies spell checking and correction on the sentence using the correct() method of the TextBlob object, and returns the corrected sentence.

The iteration over the DataFrame is done using the iterrows() method, which iterates over the rows of the DataFrame as (index, Series) pairs. The at method of the DataFrame is used to update the 'text' column of the current row with the corrected version of the text. Finally, the 'text' column of the DataFrame is updated by applying the correct_sentence_spelling function on each element using the apply() method.

In [None]:
#def correct_sentence_spelling(sentence):
#    sentence = TextBlob(sentence)
#    result = sentence.correct()
#    return result
#with tqdm(total=len(df)) as pbar:
#    for i, row in df.iterrows():
#        df.at[i, 'text'] = correct_sentence_spelling(row['text'])
#        pbar.update(1)

#df['text'] = df['text'].apply(lambda x: correct_sentence_spelling(x))
#df['text']

############################################

## Lemmatization

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of the text.

    Args:
     text: the string to process

    Returns:
     the lemmatized words joined with single spaces
    """
    lemmas = []
    for word in text.split():
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

In [None]:
df["text"] = df["text"].apply(lambda text: lemmatize_words(text))
df.text

0                                                game hurt
1        sexuality shouldnt grouping category make diff...
2                                  right dont care fuck em
3                                          man love reddit
4                                 name nowhere near falcon
                               ...                        
81491    weird shoehorned character ongoing tv show cel...
81492    define woman please youre going use accepted d...
81493                                   good sub porn took
81494    wait see problem changing word observation ass...
81495                  thats crazy much name dunk compared
Name: text, Length: 81496, dtype: object

In [None]:
#the lemmatizer, called without a POS tag, does not reduce "-ing" words (e.g. "running" stays "running")
#lemmatize_words("running")

## Tokenization

In [None]:
df['text']

0                                                game hurt
1        sexuality shouldnt grouping category make diff...
2                                  right dont care fuck em
3                                          man love reddit
4                                 name nowhere near falcon
                               ...                        
81491    weird shoehorned character ongoing tv show cel...
81492    define woman please youre going use accepted d...
81493                                   good sub porn took
81494    wait see problem changing word observation ass...
81495                  thats crazy much name dunk compared
Name: text, Length: 81496, dtype: object

In [None]:
nltk.download('punkt');

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def tokenize(column):
    """Tokenize a string and keep only the purely alphabetic tokens.

    Args:
     column: the text to tokenize

    Returns:
     list of the tokens for which str.isalpha() is true
    """
    alphabetic_tokens = []
    for token in nltk.word_tokenize(column):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [None]:
df['text'] = df['text'].apply(lambda x: tokenize(x))

In [None]:
df['text'].head()

0                                         [game, hurt]
1    [sexuality, shouldnt, grouping, category, make...
2                        [right, dont, care, fuck, em]
3                                  [man, love, reddit]
4                        [name, nowhere, near, falcon]
Name: text, dtype: object

## Stemming

In [None]:
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Stem every token with the module-level stemmer.

    Args:
     tokens: iterable of word tokens

    Returns:
     new list with the stem of each token, in the original order
    """
    return list(map(stemmer.stem, tokens))

df['text'] = df['text'].apply(lambda x: stem_tokens(x))

In [None]:
df['text']

0                                             [game, hurt]
1        [sexual, shouldnt, group, categori, make, diff...
2                            [right, dont, care, fuck, em]
3                                      [man, love, reddit]
4                             [name, nowher, near, falcon]
                               ...                        
81491    [weird, shoehorn, charact, ongo, tv, show, cel...
81492    [defin, woman, pleas, your, go, use, accept, d...
81493                              [good, sub, porn, took]
81494    [wait, see, problem, chang, word, observ, assu...
81495              [that, crazi, much, name, dunk, compar]
Name: text, Length: 81496, dtype: object

# <center> word embeddings</center>

## CBOW

In [None]:
!pip install expects


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting expects
  Downloading expects-0.9.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: expects
  Building wheel for expects (setup.py) ... [?25l[?25hdone
  Created wheel for expects: filename=expects-0.9.0-py3-none-any.whl size=18599 sha256=fa8c0e912c04df62b9d2d4530bfa70e1ace4d4d7be07e8924645db88451caadc
  Stored in directory: /root/.cache/pip/wheels/67/e4/dc/7d0873f8e4d68377443095ebf0de4ab6551ffe54fbaa15d580
Successfully built expects
Installing collected packages: expects
Successfully installed expects-0.9.0


In [None]:
#Imports
# python
import math

# from pypi
from expects import  be_true
from expects import equal
from expects import expect
import numpy

The code is defining a function window_generator which is a generator function that yields windows of words for the continuous bag-of-words (CBOW) model.

The function takes two arguments:

words: a list of cleaned tokens
half_window: an integer that represents the number of words in the half-window. The half-window is the number of words before and after the center word in the window.
The function yields a tuple consisting of two parts:

context_words: a list of words that are in the context of the center_word. The context words are the half_window words immediately before the center word followed by the half_window words immediately after it.
center_word: the word in the center of the window.
The generator function iterates over the words list, starting from the half_window index, and ending at len(words) - half_window index. For each index center_index in this range, the function retrieves the center_word from words at index center_index, and creates the context_words list by taking the words from words that appear before and after the center_word. It then yields the tuple consisting of context_words and center_word.

Finally, when the loop is exhausted, the bare return statement ends the generator, which tells the caller that there are no more windows to generate.

In [None]:
def window_generator(words: list, half_window: int):
    """Slide a window over the tokens, one center word at a time

    Only positions that have a full half-window on both sides are used as
    centers.

    Args:
     words: cleaned tokens
     half_window: number of words on each side of the center word

    Yields:
     tuple of (context words, center word) for the next window
    """
    center_positions = range(half_window, len(words) - half_window)
    for position in center_positions:
        left_context = words[position - half_window:position]
        right_context = words[position + 1:position + 1 + half_window]
        yield left_context + right_context, words[position]





This code defines a function called index_word_maps that takes in a list of data as input and returns two dictionaries, word_to_index and index_to_word. The purpose of these dictionaries is to create a mapping between the unique words in the data and their corresponding indices.

The function first creates a sorted list of unique words in the data using sorted(list(set(data))). Then, it creates two dictionaries - word_to_index and index_to_word.

The word_to_index dictionary maps each unique word to its corresponding index in the sorted list of words. It uses the enumerate() function to loop through the sorted list and assigns an index to each word.

The index_to_word dictionary maps each index to its corresponding word in the sorted list of words. It also uses the enumerate() function to loop through the sorted list, but instead of assigning an index to each word, it assigns a word to each index.

Finally, the function returns both dictionaries as a tuple.

In [None]:
def index_word_maps(data: list) -> tuple:
    """Build the word <-> index lookup tables

    Indices follow the sorted order of the unique tokens in the data.

    Args:
       data: the tokens to build the vocabulary from

    Returns:
       word_to_index: dictionary mapping each word to its index
       index_to_word: dictionary mapping each index to its word
    """
    vocabulary = sorted(set(data))

    index_to_word = dict(enumerate(vocabulary))
    word_to_index = {word: index for index, word in index_to_word.items()}
    return word_to_index, index_to_word

This function takes a word, a dictionary that maps words to indices, and the size of the vocabulary as inputs. It creates a one-hot-encoded vector for the given word using numpy.

First, it creates a numpy array of zeros with a length equal to the vocabulary size. Then it sets the element at the index corresponding to the given word's index in the dictionary to 1, indicating the presence of the word in the vocabulary.

Finally, it returns the one-hot-encoded vector.

In [None]:
def word_to_one_hot_vector(word: str, word_to_index: dict, vocabulary_size: int) -> numpy.ndarray:
    """Encode a word as a one-hot vector

    Args:
     word: the word from the corpus to encode
     word_to_index: dict mapping each word to its index
     vocabulary_size: length of the returned vector

    Returns:
     vector of zeros with a single 1 at the index of the word
    """
    encoded = numpy.zeros(vocabulary_size)
    hot_position = word_to_index[word]
    encoded[hot_position] = 1
    return encoded

This code defines a function context_words_to_vector() which takes two arguments - context_words and word_to_index. context_words is a list of words (strings) that represent the context words around a target word in a sentence, and word_to_index is a dictionary that maps each unique word in the corpus to an integer index.

The function converts each word in context_words to a one-hot-encoded vector using the word_to_one_hot_vector() function. It then calculates the mean of all the one-hot vectors along the ROWS axis (which is defined as 0). This results in a vector that represents the average of all the one-hot vectors, which can be thought of as a vector representation of the context words.

The function returns this context vector.

In [None]:
ROWS = 0
def context_words_to_vector(context_words: list,
                            word_to_index: dict) -> numpy.ndarray:
    """Average the one-hot vectors of the context words

    Args:
     context_words: words to convert to one-hot vectors
     word_to_index: dict mapping word to index

    Returns:
     element-wise mean of the one-hot vectors, taken along the ROWS axis
    """
    vocabulary_size = len(word_to_index)
    one_hot_rows = []
    for word in context_words:
        one_hot_rows.append(
            word_to_one_hot_vector(word, word_to_index, vocabulary_size))
    return numpy.mean(one_hot_rows, axis=ROWS)

The function training_example_generator is a generator that generates training examples for a continuous bag-of-words (CBOW) model. The function takes in three arguments: words, which is a list of tokens (words) in the corpus; half_window, which is an integer that determines the size of the context window (the number of words to the left and right of the target word to consider as context); and word_to_index, which is a dictionary mapping each word in the vocabulary to a unique index.

The function uses the window_generator function to generate context words and center words for each window in the corpus. For each center word, the function generates a training example consisting of a feature vector and a target vector. The feature vector is the mean of the one-hot-encoded vectors of the context words, and the target vector is a one-hot-encoded vector of the center word. The function generates the feature and target vectors using the context_words_to_vector and word_to_one_hot_vector functions, respectively.

The function yields each training example as a tuple of two numpy arrays: the feature vector and the target vector. The function continues generating training examples until all windows in the corpus have been processed.

In [None]:
def training_example_generator(words: list, half_window: int, word_to_index: dict):
    """Generate (features, target) training pairs for CBOW

    Args:
     words: source of words
     half_window: half the window size
     word_to_index: dict with word to index mapping

    Yields:
     tuple of (mean one-hot vector of the context words,
               one-hot vector of the center word)
    """
    vocabulary_size = len(word_to_index)
    for context_words, center_word in window_generator(words, half_window):
        features = context_words_to_vector(context_words, word_to_index)
        target = word_to_one_hot_vector(center_word, word_to_index, vocabulary_size)
        yield features, target

Activation Functions

The code contains two functions relu() and softmax() which are used in the CBOW implementation.

relu() takes an input array z and returns the ReLU (Rectified Linear Unit) of that array. ReLU is an activation function that is commonly used in neural networks. It applies the function f(x) = max(0,x) to each element of the input array, meaning that it sets negative values to zero and leaves positive values unchanged. The function first creates a copy of the input array, and then replaces all negative values with zero.

softmax() takes an input array z and returns an array of probabilities. Softmax is another activation function that is commonly used in neural networks, especially for classification problems. It maps the input array to a probability distribution, meaning that it ensures that the sum of the output probabilities is equal to 1. The function first calculates the exponential of each element in the input array, then calculates the sum of all exponential values, and finally divides each exponential value by the sum to obtain the corresponding probability.

In [None]:
def relu(z: numpy.ndarray) -> numpy.ndarray:
    """Apply the rectified linear unit element-wise

    The input array is left untouched; the work is done on a copy.

    Args:
     z: an array of numbers

    Returns:
     copy of z with every negative entry replaced by 0
    """
    rectified = z.copy()
    negative_entries = rectified < 0
    rectified[negative_entries] = 0
    return rectified
def softmax(z: numpy.ndarray) -> numpy.ndarray:
    """Calculate Softmax for the input

    The maximum of z is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps exp() from overflowing (and the
    output from becoming NaN) when z holds large values.

    Args:
     z: array of values

    Returns:
     array of probabilities with the same shape as z; the normalisation is
     done over all entries of z together
    """
    z = numpy.asarray(z)
    if z.size == 0:
        # max() is undefined for an empty array; return an empty result
        return numpy.exp(z)
    e_z = numpy.exp(z - numpy.max(z))
    return e_z / numpy.sum(e_z)

Word Embeddings: Training the CBOW model


Neural Network Initialization


In [None]:
# Define the size of the word embedding vectors and save it in the variable 'N'
N = 3

# Define V. Remember this was the size of the vocabulary in the previous lecture notebooks
V = 5

Initialization of the weights and biases

In [None]:

#Define the first matrix of weights
W1 = numpy.array([
    [ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
    [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
    [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])
#Define the second matrix of weights
W2 = numpy.array([[-0.22182064, -0.43008631,  0.13310965],
                  [ 0.08476603,  0.08123194,  0.1772054 ],
                  [ 0.1871551 , -0.06107263, -0.1790735 ],
                  [ 0.07055222, -0.02015138,  0.36107434],
                  [ 0.33480474, -0.39423389, -0.43959196]])
#Define the first vector of biases
b1 = numpy.array([[ 0.09688219],
                  [ 0.29239497],
                  [-0.27364426]])
#Define the second vector of biases
b2 = numpy.array([[ 0.0352008 ],
                  [-0.36393384],
                  [-0.12775555],
                  [-0.34802326],
                  [-0.07017815]])
#Check that the dimensions of these matrices are correct.

print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')

print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

expect(W1.shape).to(equal((N, V)))
expect(b1.shape).to(equal((N, 1)))
expect(W2.shape).to(equal((V, N)))
expect(b2.shape).to(equal((V, 1)))

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of W1: (3, 5) (NxV)
size of b1: (3, 1) (Nx1)
size of W2: (5, 3) (VxN)
size of b2: (5, 1) (Vx1)


In [None]:
#Define the tokenized version of the corpus


words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


#Get the 'word_to_index' and 'index_to_word' dictionaries for the tokenized corpus
word_to_index, index_to_word = index_word_maps(words)

#The First Training Example
training_examples = training_example_generator(words, 2, word_to_index)
x_array, y_array = next(training_examples)

#In this notebook next is used because you will only be performing one iteration of training. In this week's assignment with the full training over several iterations you'll use regular for loops with the iterator that supplies the training examples.

#The vector representing the context words, which will be fed into the neural network, is:

print(x_array)

[0.25 0.25 0.   0.5  0.  ]


In [None]:
#The one-hot vector representing the center word to be predicted is:

print(y_array)

[0. 0. 1. 0. 0.]


In [None]:
 # Copy vector
x = x_array.copy()

 # Reshape it
x.shape = (V, 1)

 # Print it
print(f'x:\n{x}\n')

 # Copy vector
y = y_array.copy()

 # Reshape it
y.shape = (V, 1)

 # Print it
print(f'y:\n{y}')

x:
[[0.25]
 [0.25]
 [0.  ]
 [0.5 ]
 [0.  ]]

y:
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


The Hidden Layer

In [None]:
z1 = numpy.dot(W1, x) + b1
print(z1)

[[ 0.36483875]
 [ 0.63710329]
 [-0.3236647 ]]


In [None]:
h = relu(z1)
print(h)

[[0.36483875]
 [0.63710329]
 [0.        ]]


In [None]:
# The output layer: project the hidden activation h through W2 and add the bias b2
# to get one raw score per vocabulary word.
z2 = numpy.dot(W2, h) + b2
print(z2)
# Hard-coded reference scores; the check below raises if z2 drifts from them.
expected = numpy.array([
    [-0.31973737],
    [-0.28125477],
    [-0.09838369],
    [-0.33512159],
    [-0.19919612]])
expect(numpy.allclose(z2, expected)).to(be_true)

[[-0.31973737]
 [-0.28125477]
 [-0.09838369]
 [-0.33512159]
 [-0.19919612]]


In [None]:
# Turn the raw scores z2 into the prediction vector y_hat with softmax.
y_hat = softmax(z2)
print(y_hat)
# Hard-coded reference values; the check below raises if y_hat drifts from them.
expected = numpy.array([
    [0.18519074],
    [0.19245626],
    [0.23107446],
    [0.18236353],
    [0.20891502]])
expect(numpy.allclose(expected, y_hat)).to(be_true)

[[0.18519074]
 [0.19245626]
 [0.23107446]
 [0.18236353]
 [0.20891502]]


In [None]:
# Index of the largest entry of y_hat, mapped back to its word via index_to_word.
prediction = numpy.argmax(y_hat)
print(f"The predicted word at index {prediction} is '{index_to_word[prediction]}'.")

The predicted word at index 2 is 'happy'.


In [None]:
# Cross-entropy loss: first display the prediction vector that the loss will be computed from.
print(y_hat)

[[0.18519074]
 [0.19245626]
 [0.23107446]
 [0.18236353]
 [0.20891502]]


In [None]:
# Display the target column vector y that the prediction is compared against.
print(y)


[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


The cross_entropy_loss function computes the cross-entropy loss between the predicted output and actual output. The function takes in two parameters:

y_predicted: a numpy array representing the predicted output of the model.
y_actual: a numpy array representing the actual output (labels) of the training data.
The function returns the calculated cross-entropy loss.

The function first multiplies the y_actual and the natural log of y_predicted element-wise, then computes the negative sum of the resulting array. This is the standard formula for cross-entropy loss.

In [None]:
def cross_entropy_loss(y_predicted: numpy.ndarray,
                       y_actual: numpy.ndarray,
                       epsilon: float = 1e-12) -> float:
    """Calculate cross-entropy loss for the prediction

    Computes ``-sum(y_actual * log(y_predicted))`` over all elements.

    Args:
     y_predicted: what our model predicted (probabilities)
     y_actual: the known labels, same shape as y_predicted
     epsilon: lower bound applied to y_predicted before taking the log

    Returns:
     cross-entropy loss for y_predicted, as a scalar
    """
    # Guard against log(0): a probability that underflowed to exactly 0 would
    # otherwise yield -inf, and 0 * -inf would turn the whole sum into NaN.
    # Values above epsilon pass through unchanged.
    safe_predicted = numpy.maximum(y_predicted, epsilon)
    loss = -numpy.sum(y_actual * numpy.log(safe_predicted))
    return loss

In [None]:
# Loss for the single training example, printed to three decimals.
loss = cross_entropy_loss(y_hat, y)
print(f"{loss:0.3f}")
# Hard-coded reference loss; the check below raises if the value drifts from it.
expected = 1.4650152923611106
expect(math.isclose(loss, expected)).to(be_true)

1.465


Backpropagation

This code computes the gradient of the loss function with respect to the bias term b2 of the output layer. The gradient is computed using the predicted output y_hat and the actual target output y.

The grad_b2 variable stores the calculated gradient, which is a 2D numpy array. The expected value of grad_b2 is compared to the computed value using the numpy.allclose function. If the expected value and the computed value are almost equal (i.e., within a small tolerance), then the test passes.

The hard-coded expected values act as a regression check for this CBOW walkthrough: each cell fails if the computed gradient drifts from the reference values.

In [None]:
# Gradient of the loss w.r.t. the output bias b2: prediction minus target.
grad_b2 = y_hat - y
print(grad_b2)
# Hard-coded reference gradient; the check below raises on a mismatch.
expected = numpy.array([
    [ 0.18519074],
    [ 0.19245626],
    [-0.76892554],
    [ 0.18236353],
    [ 0.20891502]])
expect(numpy.allclose(grad_b2, expected)).to(be_true)

[[ 0.18519074]
 [ 0.19245626]
 [-0.76892554]
 [ 0.18236353]
 [ 0.20891502]]


In [None]:
# Gradient of the loss w.r.t. W2: the (V, 1) output error times the (1, N)
# transposed hidden activation, giving a (V, N) array like W2 itself.
grad_W2 = numpy.dot(y_hat - y, h.T)
print(grad_W2)
# Hard-coded reference gradient; the check below raises on a mismatch.
expected = numpy.array([
    [0.06756476,  0.11798563,  0.        ],
    [ 0.0702155 ,  0.12261452,  0.        ],
    [-0.28053384, -0.48988499,  0.        ],
    [ 0.06653328,  0.1161844 ,  0.        ],
    [ 0.07622029,  0.13310045,  0.        ]])

expect(numpy.allclose(grad_W2, expected)).to(be_true)

[[ 0.06756476  0.11798563  0.        ]
 [ 0.0702155   0.12261452  0.        ]
 [-0.28053384 -0.48988499  0.        ]
 [ 0.06653328  0.1161844   0.        ]
 [ 0.07622029  0.13310045  0.        ]]


In [None]:
# Gradient w.r.t. the hidden bias b1, as computed in this walkthrough:
# relu applied to the output error propagated back through W2.
# NOTE(review): because h = relu(z1), the chain rule would mask the
# back-propagated signal by (z1 > 0) instead of passing it through relu().
# As written, the only non-zero entry is row 2, which is exactly the unit whose
# activation h[2] is 0. The hard-coded expected values here and in the later
# grad_W1 / b1_new cells encode the current formula, so any correction must
# update all of them together — confirm the intended formula.
grad_b1 = relu(numpy.dot(W2.T, y_hat - y))
print(grad_b1)
expected = numpy.array([
    [0.        ],
    [0.        ],
    [0.17045858]])
expect(numpy.allclose(grad_b1, expected)).to(be_true)

[[0.        ]
 [0.        ]
 [0.17045858]]


In [None]:
# Gradient w.r.t. W1: the same relu(W2.T @ (y_hat - y)) term used for grad_b1,
# multiplied by the transposed input x.T to give an (N, V) array like W1.
# NOTE(review): this inherits the relu-on-the-gradient formula from grad_b1;
# the chain rule would mask by (z1 > 0) instead. The expected values below
# encode the current formula — confirm before changing either.
grad_W1 = numpy.dot(relu(numpy.dot(W2.T, y_hat - y)), x.T)
print(grad_W1)
expected = numpy.array([
    [0.        , 0.        , 0.        , 0.        , 0.        ],
    [0.        , 0.        , 0.        , 0.        , 0.        ],
    [0.04261464, 0.04261464, 0.        , 0.08522929, 0.        ]])

expect(numpy.allclose(grad_W1, expected)).to(be_true)

[[0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.04261464 0.04261464 0.         0.08522929 0.        ]]


In [None]:
# Every gradient must have the same shape as the parameter it updates:
# print the shapes, then assert them.
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of grad_W1: {grad_W1.shape} (NxV)')
print(f'size of grad_b1: {grad_b1.shape} (Nx1)')
print(f'size of grad_W2: {grad_W2.shape} (VxN)')
print(f'size of grad_b2: {grad_b2.shape} (Vx1)')

# `expect(...).to(equal(...))` raises if a shape differs from the expected tuple.
expect(grad_W1.shape).to(equal((N, V)))
expect(grad_b1.shape).to(equal((N, 1)))
expect(grad_W2.shape).to(equal((V, N)))
expect(grad_b2.shape).to(equal((V, 1)))

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of grad_W1: (3, 5) (NxV)
size of grad_b1: (3, 1) (Nx1)
size of grad_W2: (5, 3) (VxN)
size of grad_b2: (5, 1) (Vx1)


Gradient descent


In [None]:

# Learning rate for the single gradient-descent step shown here.
alpha = 0.03

# One descent step on W1; the result goes to a new name so W1 itself is kept
# for the side-by-side comparison printed below.
W1_new = W1 - alpha * grad_W1

print('old value of W1:', W1, '', 'new value of W1:', W1_new, sep='\n')

old value of W1:
[[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.26637602 -0.23846886 -0.37770863 -0.11399446  0.34008124]]

new value of W1:
[[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.26509758 -0.2397473  -0.37770863 -0.11655134  0.34008124]]


In [None]:
# Apply the same gradient-descent step (learning rate alpha) to the remaining
# parameters, writing each result to a *_new name so the originals are kept.
W2_new = W2 - alpha * grad_W2


b1_new = b1 - alpha * grad_b1


b2_new = b2 - alpha * grad_b2
print('W2_new')
print(W2_new)
print()
print('b1_new')
print(b1_new)
print()
print('b2_new')
print(b2_new)

# Hard-coded reference values for the updated parameters.
w2_expected = numpy.array(
   [[-0.22384758, -0.43362588,  0.13310965],
    [ 0.08265956,  0.0775535 ,  0.1772054 ],
    [ 0.19557112, -0.04637608, -0.1790735 ],
    [ 0.06855622, -0.02363691,  0.36107434],
    [ 0.33251813, -0.3982269 , -0.43959196]])

b1_expected = numpy.array(
   [[ 0.09688219],
    [ 0.29239497],
    [-0.27875802]])

b2_expected = numpy.array(
   [[ 0.02964508],
    [-0.36970753],
    [-0.10468778],
    [-0.35349417],
    [-0.0764456 ]]
)

# Compare each updated parameter with its reference; raises on the first mismatch.
for actual, expected in zip((W2_new, b1_new, b2_new), (w2_expected, b1_expected, b2_expected)):
    expect(numpy.allclose(actual, expected)).to(be_true)

W2_new
[[-0.22384758 -0.43362588  0.13310965]
 [ 0.08265956  0.0775535   0.1772054 ]
 [ 0.19557112 -0.04637608 -0.1790735 ]
 [ 0.06855622 -0.02363691  0.36107434]
 [ 0.33251813 -0.3982269  -0.43959196]]

b1_new
[[ 0.09688219]
 [ 0.29239497]
 [-0.27875802]]

b2_new
[[ 0.02964508]
 [-0.36970753]
 [-0.10468778]
 [-0.35349417]
 [-0.0764456 ]]


In [None]:
#import matplotlib.pyplot as plt
#import seaborn as sns
# Class Balance visualization on GoEmotions
#plt.figure(figsize=(20,15))
#sns.barplot(x='Percentage', y='Emotion', data=balance_GE, orient='h', hue='Dataset', palette="Blues_d")
#plt.title("GoEmotions : Percentage of samples per emotion in the train, validation and test datasets", fontweight='bold')
#plt.ylabel("Emotions", fontweight='bold')
#plt.xlabel("Percentage of all samples", fontweight='bold')
#plt.show()



## Skip gram

This section defines three hyperparameters: WINDOW_SIZE, EMBEDDING_SIZE, and LEARNING_RATE. These values can be modified to adjust the behavior of the skip-gram model.

In [None]:
# Define constants
# WINDOW_SIZE: how many words on each side of the centre word count as context.
# EMBEDDING_SIZE: second dimension of W1 / first dimension of W2.
# LEARNING_RATE: multiplier applied to each weight update in train().
WINDOW_SIZE = 2
EMBEDDING_SIZE = 50
LEARNING_RATE = 0.1

This line creates a list of sentences from a DataFrame column called 'text'.

In [None]:
# Define corpus: one list entry per row of the 'text' column.
# NOTE(review): the code below iterates each entry element by element, so it
# presumably expects 'text' to hold token lists rather than raw strings — confirm.
corpus = df['text'].tolist()

This code block creates a dictionary that maps each unique word in the corpus to an index. The defaultdict is a subclass of the built-in dict class that returns a default value when an unknown key is accessed. In this case, the default value is the length of the word_to_idx dictionary, which ensures that each word is assigned a unique index.

In [None]:
# Create word to index mapping
# The default factory returns the current dictionary size, so the first lookup
# of an unseen word assigns it the next free index (0, 1, 2, ...).
word_to_idx = defaultdict(lambda: len(word_to_idx))
for sentence in corpus:
    for word in sentence:
        # Bare lookup on purpose: it triggers the default factory for new words.
        word_to_idx[word]
# NOTE(review): if corpus entries are raw strings rather than token lists, the
# inner loop yields single characters — confirm against the preprocessing above.
# Because this stays a defaultdict, any later lookup of an unknown word silently
# grows the vocabulary instead of raising KeyError.

This line calculates the total number of unique words in the corpus, which is equal to the size of the vocabulary.

In [None]:
# Define vocabulary size
vocab_size = len(word_to_idx)
# Initialize weight matrices
# W1 is (vocab_size, EMBEDDING_SIZE): one embedding row per word.
# W2 is (EMBEDDING_SIZE, vocab_size): maps an embedding back to vocabulary scores.
# NOTE: this rebinds the names W1 and W2 used by the CBOW walkthrough above,
# so those cells cannot be re-run after this one without re-creating them.
# NOTE(review): no random seed is set in this cell — confirm whether one is set
# elsewhere if reproducible embeddings are needed.
W1 = np.random.randn(vocab_size, EMBEDDING_SIZE)
W2 = np.random.randn(EMBEDDING_SIZE, vocab_size)


These lines initialize the weight matrices W1 and W2 with random values drawn from a standard normal distribution. W1 is a matrix that maps each one-hot encoded input word to a dense embedding vector, while W2 is a matrix that maps each embedding vector back to a probability distribution over the entire vocabulary.

In [None]:
# Define softmax function
def softmax(x):
    """Return the softmax of x, normalised along axis 0.

    The global maximum of x is subtracted before exponentiating so that
    large scores do not overflow; the shift cancels out in the ratio.
    """
    shifted = x - np.max(x)
    exp_scores = np.exp(shifted)
    normaliser = exp_scores.sum(axis=0)
    return exp_scores / normaliser


This function implements the softmax function, which is used to convert the output of W2 into a probability distribution over the vocabulary. The np.max(x) term is subtracted from x to ensure numerical stability.

In [None]:
# Define one-hot encoding function
def one_hot_encoding(word_idx, vocab_size):
    """Build a one-hot vector of length vocab_size for word_idx.

    Every entry is 0.0 except position word_idx, which is 1.0.
    """
    encoded = np.zeros(vocab_size, dtype=float)
    encoded[word_idx] = 1.0
    return encoded


This function creates a one-hot encoded vector for a given word index. The vector has a length equal to the size of the vocabulary, and all elements are set to zero except for the element corresponding to the word index, which is set to one.

In [None]:
# Define skip-gram training function
def train(corpus, word_to_idx, W1, W2, EMBEDDING_SIZE, WINDOW_SIZE, LEARNING_RATE):
    """Train full-softmax skip-gram embeddings, one SGD step per (center, context) pair.

    Args:
        corpus: iterable of sentences; each sentence is a sliceable sequence of
            tokens (slices are concatenated with ``+``).
        word_to_idx: mapping from token to its row index in W1.
        W1: (vocab_size, embedding_size) input embeddings; updated in place.
        W2: (embedding_size, vocab_size) output weights; updated in place.
        EMBEDDING_SIZE: unused; kept so existing calls keep working.
        WINDOW_SIZE: number of tokens taken on each side of the center word.
        LEARNING_RATE: step size applied to every update.

    Returns:
        W1, the same array object that was passed in.

    The vocabulary size comes from the shapes of W1/W2 rather than from a
    notebook-global variable. One-hot vectors are never built: multiplying by a
    one-hot input is a row lookup in W1, and its gradient touches only that row,
    so the results match the dense formulation without a V x E array per pair.
    """
    for sentence in corpus:
        for i, center_word in enumerate(sentence):
            context_words = sentence[max(0, i - WINDOW_SIZE):i] + \
                            sentence[i + 1:min(len(sentence), i + WINDOW_SIZE + 1)]
            for context_word in context_words:
                center_word_idx = word_to_idx[center_word]
                context_word_idx = word_to_idx[context_word]

                # Forward pass: W1.T @ one_hot(center) is simply row `center` of W1.
                hidden = W1[center_word_idx]
                output = softmax(np.dot(W2.T, hidden))

                # error = one_hot(context) - output, built without the one-hot vector.
                error = -output
                error[context_word_idx] += 1.0

                # Both gradients are computed before either matrix is modified.
                dW2 = np.outer(hidden, error)
                d_hidden = np.dot(W2, error)

                W2 += LEARNING_RATE * dW2
                # Only the center word's row has a non-zero gradient.
                W1[center_word_idx] += LEARNING_RATE * d_hidden
    return W1

Training with this implementation did not finish even after more than 5 hours, so the run was interrupted manually (hence the KeyboardInterrupt below).


In [None]:
# Train skip-gram model
# train() updates W1 and W2 in place and returns W1, so this re-assignment
# binds W1 to the same array object.
W1 = train(corpus, word_to_idx, W1, W2, EMBEDDING_SIZE, WINDOW_SIZE, LEARNING_RATE)

# Extract word embeddings: row k of W1 is the embedding of the word with index k.
word_embeddings = W1

KeyboardInterrupt: ignored

## Skip-gram with Negative Sampling

Because the full-softmax version was too slow, we next tried Skip-gram with Negative Sampling so that training could run faster.

The code imports necessary packages and sets hyperparameters for the Skip-gram with Negative Sampling model, such as the window size, number of negative samples, learning rate, number of epochs, and the embedding size.


In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer

# set hyperparameters
# NOTE: WINDOW_SIZE, LEARNING_RATE and EMBEDDING_SIZE rebind the constants of the
# same name defined in the Skip gram section above; cells run after this point
# see these values.
WINDOW_SIZE = 5          # co-occurrence window used when building cooc_matrix
NUM_NEG_SAMPLES = 5      # random negative words drawn per training pair
LEARNING_RATE = 0.05     # step size for the embedding updates
NUM_EPOCHS = 5           # full passes over the co-occurrence pairs
EMBEDDING_SIZE = 100     # number of columns of embedding_matrix


This code creates a vocabulary for the corpus by using the CountVectorizer from scikit-learn to tokenize the text data and extract the vocabulary. X is the sparse matrix representation of the corpus with the count of each vocabulary item in each document, and vocab is a dictionary with the vocabulary items as keys and their corresponding index as values.

In [None]:
# create vocabulary
# The identity tokenizer hands each document to CountVectorizer unchanged, and
# lowercase=False stops it from calling .lower() on the document.
# NOTE(review): this presumably relies on df['text'] already holding token
# lists; with raw strings the identity tokenizer would yield single characters — confirm.
vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(df['text'])   # sparse document-term count matrix
vocab = vectorizer.vocabulary_             # token -> column index




This code creates a co-occurrence matrix for the vocabulary items in the corpus. The co-occurrence matrix is a dictionary where the keys are tuples representing the indices of the co-occurring vocabulary items and the values are the frequency of co-occurrence. The frequency is calculated based on the distance between the two vocabulary items in a document, where closer words have a higher frequency of co-occurrence.

In [None]:
# create co-occurrence matrix
# Keys are (word_index, neighbour_index) pairs; each value accumulates
# 1 / distance over every occurrence of the pair within WINDOW_SIZE tokens.
#
# The documents themselves are walked instead of the rows of X: a bag-of-words
# row only lists which vocabulary ids occur (in sorted id order), so positions
# within it say nothing about how far apart two words are in the text.
# This also avoids densifying the whole document-term matrix with X.toarray().
cooc_matrix = defaultdict(float)
for tokens in df['text']:
    # Vocabulary ids in document order, so |i - j| is the real token distance.
    indices = [vocab[token] for token in tokens]
    for i in range(len(indices)):
        # Symmetric window: WINDOW_SIZE tokens to the left and to the right.
        for j in range(max(0, i - WINDOW_SIZE), min(len(indices), i + WINDOW_SIZE + 1)):
            if i != j:
                cooc_matrix[(indices[i], indices[j])] += 1.0 / abs(i - j)

This code initializes the word embeddings matrix with random values in the range [-0.5/EMBEDDING_SIZE, 0.5/EMBEDDING_SIZE]. The size of the matrix is (vocab_size, EMBEDDING_SIZE), where vocab_size is the number of unique vocabulary items in the corpus and EMBEDDING_SIZE is the dimensionality of the embeddings.

In [None]:
# initialize word embeddings
# One row per vocabulary entry. np.random.rand draws from [0, 1), so every
# entry starts in [-0.5 / EMBEDDING_SIZE, 0.5 / EMBEDDING_SIZE).
embedding_matrix = (np.random.rand(len(vocab), EMBEDDING_SIZE) - 0.5) / EMBEDDING_SIZE

In [None]:
# train word embeddings using Skip-gram with Negative Sampling
#
# For every co-occurring pair (i, j) the model is pushed to score the observed
# context word j as a positive (label 1) and NUM_NEG_SAMPLES randomly drawn
# words as negatives (label 0), using the logistic loss
#     -log(sigmoid(u_i . v_j)) - sum_k log(1 - sigmoid(u_i . v_k)).
# The observed index j must take part in the update, otherwise only random
# noise pairs are trained. Each distinct pair is visited once per epoch; the
# accumulated co-occurrence weight is not used to scale the update.
# A single embedding matrix serves as both the word and the context table.
vocab_len = len(vocab)
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    for (i, j) in cooc_matrix:
        # generate negative samples; the positive context word goes first
        neg_word_idxs = np.random.choice(vocab_len, NUM_NEG_SAMPLES, replace=False)
        sample_idxs = np.concatenate(([j], neg_word_idxs))
        labels = np.zeros(len(sample_idxs))
        labels[0] = 1.0

        # calculate output and error
        u = embedding_matrix[i]                 # (EMBEDDING_SIZE,) centre word
        v = embedding_matrix[sample_idxs]       # (1 + K, EMBEDDING_SIZE) copy
        z = 1 / (1 + np.exp(-np.dot(v, u)))     # predicted P(label = 1)
        dL_dz = z - labels

        # Binary cross-entropy; clip only for the log so it stays finite.
        z_safe = np.clip(z, 1e-10, 1 - 1e-10)
        total_loss -= (labels * np.log(z_safe) + (1 - labels) * np.log(1 - z_safe)).sum()

        # update word embeddings (both gradients are computed before any write)
        dL_du = np.dot(dL_dz, v)
        dL_dv = np.outer(dL_dz, u)
        embedding_matrix[i] -= LEARNING_RATE * dL_du
        # ufunc.at accumulates correctly even if an index appears more than once
        # in sample_idxs (e.g. j drawn again as a negative).
        np.subtract.at(embedding_matrix, sample_idxs, LEARNING_RATE * dL_dv)

    print('Epoch:', epoch+1, 'Loss:', total_loss)


Epoch: 1 Loss: -50163.04428151421
Epoch: 2 Loss: -50162.991252269814
Epoch: 3 Loss: -50162.892524819654
Epoch: 4 Loss: -50162.65577377647
Epoch: 5 Loss: -50162.11019805966
