In [1]:
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import re

##### 1. Data

a. Use nltk.corpus.gutenberg.raw to load the three plays listed above into a single variable and lower the case.

In [2]:
# Load and combine the three plays into a single text
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
julius_caesar = gutenberg.raw('shakespeare-caesar.txt')

In [3]:
# Lowercasing the text to normalize it
combined_text = (hamlet + macbeth + julius_caesar).lower()

print(combined_text[:500])

[the tragedie of hamlet by william shakespeare 1599]


actus primus. scoena prima.

enter barnardo and francisco two centinels.

  barnardo. who's there?
  fran. nay answer me: stand & vnfold
your selfe

   bar. long liue the king

   fran. barnardo?
  bar. he

   fran. you come most carefully vpon your houre

   bar. 'tis now strook twelue, get thee to bed francisco

   fran. for this releefe much thankes: 'tis bitter cold,
and i am sicke at heart

   barn. haue you had quiet guard?
  fran. not


b. Perform the following steps in an order of your choosing:

- Tokenize the text into sentences, and then each sentence into words.
- Use Speller from the autocorrect library to correct spelling mistakes. 
- Create a list of stopwords (using publicly available lists and/or adding your own) and remove these.
- Use PorterStemmer or WordNetLemmatizer from nltk.stem on the text.
- Use regular expressions (the re library) to do any additional cleanup of the text you wish to do.

In [4]:
# Tokenize the text into sentences and words
sentences = sent_tokenize(combined_text)
words = [word_tokenize(sentence) for sentence in sentences]

In [5]:
# Correct spelling mistakes using Speller from the autocorrect library
spell = Speller()
words_corrected = [[spell(word) for word in sentence] for sentence in words]

In [6]:
# Remove stopwords 
stop_words = set(stopwords.words('english'))
custom_stopwords = {'thou', 'thee', 'thy', 'hath'}  
stop_words.update(custom_stopwords)

words_no_stopwords = [[word for word in sentence if word not in stop_words] for sentence in words_corrected]

In [7]:
# Lemmatize the words using WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words_lemmatized = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in words_no_stopwords]

In [8]:
# Use regular expressions to clean up non-alphabetic tokens
words_cleaned = [[re.sub(r'\W+', '', word) for word in sentence if word.isalpha()] for sentence in words_lemmatized]

c. Print out the words in the first five sentences of the processed text data.

In [9]:
# Print the first five processed sentences for review
print(words_cleaned[:5])

[['tragedy', 'hamlet', 'william', 'shakespeare', 'act', 'prime'], ['scene', 'prima'], ['enter', 'bernard', 'francisco', 'two', 'sentinel'], ['bernard'], []]


##### 2. Modeling

a. Create a CBOW word2vec model from gensim.model. Make choices of vector_size, epochs, window, min_count, and possibly other hyperparameters. Train it on the cleaned Shakespeare text data. Use gensim.model.wv.key_to_index  and gensim.model.wv.get_vecattr to print out a list of the 20 most frequent words in the vocabulary along with the word count. Consider improving the text cleaning steps above based on this information. 

In [10]:
# Define the hyperparameters for the CBOW model
vector_size = 100  # Dimensionality of word vectors
window = 5  # Context window size
min_count = 2  # Ignores words with total frequency lower than this
epochs = 10  # Number of iterations over the corpus

In [11]:
# Train the CBOW Word2Vec model (sg=0 indicates CBOW model)
cbow_model = Word2Vec(sentences=words_cleaned, vector_size=vector_size, window=window, min_count=min_count, sg=0, epochs=epochs)

In [12]:
# Get the 20 most frequent words from the vocabulary along with their counts
most_frequent_words = list(cbow_model.wv.key_to_index.items())[:20]

# Print the most frequent words and their word counts
print("Top 20 most frequent words and their counts:")
for word, index in most_frequent_words:
    word_count = cbow_model.wv.get_vecattr(word, "count")
    print(f"{word}: {word_count}")

Top 20 most frequent words and their counts:
ham: 337
lord: 306
shall: 300
come: 284
king: 248
enter: 230
good: 221
let: 220
mac: 205
like: 200
cesar: 193
one: 188
make: 185
know: 184
v: 175
self: 165
would: 162
aboutus: 162
von: 160
go: 159


b. Create a skipgram word2vec model from gensim.model. Make choices of vector_size, epochs, window, min_count, and possibly other hyperparameters. Train it on the cleaned Shakespeare text data.

In [13]:
# Train the Skipgram Word2Vec model (sg=1 for Skipgram)
skipgram_model = Word2Vec(sentences=words_cleaned, vector_size=vector_size, window=window, min_count=min_count, sg=1, epochs=epochs)

In [14]:
skipgram_model

<gensim.models.word2vec.Word2Vec at 0x1e1bf477ac0>

c. Load the pretrained GloVe model from gensim.models.keyedvectors for comparison with the models trained on Shakespeare text. Use markdown to make note of the data that GloVe has been trained on.

In [15]:
# Path to the GloVe file 
glove_file = 'glove.6B.100d.txt'

# Load the GloVe model
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Print a summary of the GloVe model
glove_model

<gensim.models.keyedvectors.KeyedVectors at 0x1e1bdc27340>

##### 3. Discussion

a. Compare the three models by finding the 5 most similar terms to each of the following terms: 'hamlet', 'cauldron', 'nature', 'spirit', 'general', and 'prythee'. Comment on how well each model captured the meaning of the word, and if there are multiple meanings, which meaning was given.

In [16]:
# List of target words for comparison
target_words = ['hamlet', 'cauldron', 'nature', 'spirit', 'general', 'prythee']

# Function to get most similar words from a model
def get_most_similar(model, word):
    try:
        return [f"{sim_word}: {sim_score:.2f}" for sim_word, sim_score in model.most_similar(word, topn=5)]
    except KeyError:
        return ["Not in vocabulary"] * 5  # Handle missing vocabulary gracefully

# Initialize a dictionary to hold the results
results = {"Word": target_words}

# Get results for each model and store them in the dictionary
cbow_similarities = [get_most_similar(cbow_model.wv, word) for word in target_words]
skipgram_similarities = [get_most_similar(skipgram_model.wv, word) for word in target_words]
glove_similarities = [get_most_similar(glove_model, word) for word in target_words]

# Convert the lists into columns for the table
results['CBOW'] = [' | '.join(similar) for similar in cbow_similarities]
results['Skipgram'] = [' | '.join(similar) for similar in skipgram_similarities]
results['GloVe'] = [' | '.join(similar) for similar in glove_similarities]

# Convert the results dictionary into a DataFrame
df = pd.DataFrame(results)

# Display the table
print(df.to_string(index=False))

    Word                                                                CBOW                                                                  Skipgram                                                                                             GloVe
  hamlet     king: 1.00 | good: 1.00 | till: 1.00 | give: 1.00 | faire: 1.00          queen: 0.99 | king: 0.99 | colony: 0.98 | lord: 0.98 | ham: 0.98                village: 0.70 | town: 0.66 | situated: 0.59 | located: 0.57 | unincorporated: 0.56
cauldron   one: 1.00 | thought: 1.00 | stroke: 1.00 | man: 1.00 | poor: 1.00       memory: 1.00 | cool: 1.00 | bubble: 1.00 | slow: 1.00 | share: 1.00                              caldron: 0.76 | flame: 0.69 | lit: 0.59 | torch: 0.56 | candle: 0.55
  nature whose: 1.00 | thought: 1.00 | thing: 1.00 | may: 1.00 | world: 1.00          yes: 0.99 | seems: 0.99 | hold: 0.99 | state: 0.99 | whose: 0.99                              natural: 0.72 | true: 0.71 | aspects: 0.71 | life: 0.70 | view: 0.70
  sp

b. Compare the three models by finding the cosine similarity between the following pairs of terms: ('brutus', 'murder'), ('lady macbeth', 'queen gertrude'), ('fortinbras', 'norway'), ('rome', 'norway'), ('ghost', 'spirit'), ('macbeth', 'hamlet'). Comment on how well each model captured the similarity between these terms, especially considering the data that each was trained on.

In [17]:
# List of word pairs to compare
word_pairs = [
    ('brutus', 'murder'),
    ('lady macbeth', 'queen gertrude'),
    ('fortinbras', 'norway'),
    ('rome', 'norway'),
    ('ghost', 'spirit'),
    ('macbeth', 'hamlet')
]

# Function to compute cosine similarity between two words for a given model
def get_cosine_similarity(model, word1, word2):
    try:
        return model.similarity(word1, word2)
    except KeyError:
        return "N/A"  # Return "N/A" if one or both words are not in the model's vocabulary

# Dictionary to hold results for the DataFrame
results = {'Word Pair': [f"{word1} - {word2}" for word1, word2 in word_pairs]}

# Get cosine similarities for each model
results['CBOW'] = [get_cosine_similarity(cbow_model.wv, word1, word2) for word1, word2 in word_pairs]
results['Skipgram'] = [get_cosine_similarity(skipgram_model.wv, word1, word2) for word1, word2 in word_pairs]
results['GloVe'] = [get_cosine_similarity(glove_model, word1, word2) for word1, word2 in word_pairs]

# Convert the results dictionary into a DataFrame
df = pd.DataFrame(results)

# Display the table
print(df.to_string(index=False))

                    Word Pair      CBOW  Skipgram     GloVe
              brutus - murder       N/A       N/A  0.073644
lady macbeth - queen gertrude       N/A       N/A       N/A
          fortinbras - norway  0.999197  0.996554 -0.028962
                rome - norway  0.999187  0.983509  0.285837
               ghost - spirit  0.998928  0.984075  0.428209
             macbeth - hamlet  0.998807  0.887623  0.429359


c. Compare the three models by finding the 5 most similar terms to each of the following word vectors obtained via linear combination: 'denmark' + 'queen', 'scotland' + 'army' + 'general', 'father' - 'man' + 'woman', 'mother' - 'woman' + 'man'. Comment on how well each model described the ideas behind these word vectors.

In [18]:
# List of linear word combinations for comparison
vector_combinations = [
    (['denmark', 'queen'], []),  # 'denmark' + 'queen'
    (['scotland', 'army', 'general'], []),  # 'scotland' + 'army' + 'general'
    (['father', 'woman'], ['man']),  # 'father' - 'man' + 'woman'
    (['mother', 'man'], ['woman'])   # 'mother' - 'woman' + 'man'
]

# Function to get the most similar terms to a vector combination, with vector averaging for multiple words
def get_similar_terms(model, positives, negatives):
    try:
        # If multiple positive words, average their vectors
        if len(positives) > 1:
            vector = sum([model[word] for word in positives]) / len(positives)
            # Adding negative vector handling
            if negatives:
                vector -= sum([model[word] for word in negatives]) / len(negatives)
            return model.similar_by_vector(vector, topn=5)
        else:
            return model.most_similar(positive=positives, negative=negatives, topn=5)
    except KeyError:
        return "One or more words not in vocabulary"

# Dictionary to hold results for the DataFrame
results = {'Combination': ["'denmark' + 'queen'", "'scotland' + 'army' + 'general'", "'father' - 'man' + 'woman'", "'mother' - 'woman' + 'man'"]}

# Get most similar terms for each model and each word vector combination
results['CBOW'] = [get_similar_terms(cbow_model.wv, positives, negatives) for positives, negatives in vector_combinations]
results['Skipgram'] = [get_similar_terms(skipgram_model.wv, positives, negatives) for positives, negatives in vector_combinations]
results['GloVe'] = [get_similar_terms(glove_model, positives, negatives) for positives, negatives in vector_combinations]

# Format results for table display (convert list of tuples to string)
def format_similar_terms(similar_terms):
    if isinstance(similar_terms, str):
        return similar_terms  # In case of KeyError (not found in vocabulary)
    return ' | '.join([f"{term}: {round(similarity, 2)}" for term, similarity in similar_terms])

# Apply formatting to the DataFrame
results['CBOW'] = [format_similar_terms(sim) for sim in results['CBOW']]
results['Skipgram'] = [format_similar_terms(sim) for sim in results['Skipgram']]
results['GloVe'] = [format_similar_terms(sim) for sim in results['GloVe']]

# Convert the results dictionary into a DataFrame
df = pd.DataFrame(results)

# Display the table
print(df.to_string(index=False))

                    Combination                                                                CBOW                                                                Skipgram                                                                                               GloVe
            'denmark' + 'queen'       queen: 1.0 | denmark: 1.0 | take: 1.0 | king: 1.0 | till: 1.0 ophelia: 1.0 | deer: 1.0 | sweet: 0.99 | alert: 0.99 | donalbaine: 0.99                          queen: 0.86 | denmark: 0.84 | sweden: 0.74 | norway: 0.69 | princess: 0.69
'scotland' + 'army' + 'general'          general: 1.0 | put: 1.0 | hee: 1.0 | hand: 1.0 | name: 1.0        ne: 1.0 | pindarus: 1.0 | rule: 1.0 | legion: 1.0 | silence: 1.0                            army: 0.83 | general: 0.81 | force: 0.76 | military: 0.75 | forces: 0.74
     'father' - 'man' + 'woman' cl: 0.1 | descends: 0.07 | donald: -0.02 | hec: -0.09 | stra: -0.09    macbeth: 0.18 | father: 0.17 | enter: 0.15 | cl: 0.14 | hamlet: 0.13 daughter-in-

d. Give overall comments on how each model performs. Describe what data you would use to train a better word embedding model to captures the meaning of Shakespearean English.