In [None]:
import nltk
from nltk.corpus import shakespeare
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import pandas as pd
import re
import math

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Bag of Words

In [None]:
sentences = [
    "The cat jumped over the fence",
    "She loves to play with her pet cat",
    "A tall fence surrounds the garden",
    "They enjoy watching the cat chase butterflies"
]

lower_sentences = [sentence.lower() for sentence in sentences]

cleaned_sentences = []
stop_words = set(stopwords.words('english'))
for sentence in lower_sentences:
    words = word_tokenize(sentence)
    filtered_words = [word for word in words if word not in stop_words]
    cleaned_sentences.append(filtered_words)

vocabulary = set()
for sentence_words in cleaned_sentences:
    vocabulary.update(sentence_words)

df = pd.DataFrame(0, index=range(len(sentences)), columns=list(vocabulary))
for i, sentence_words in enumerate(cleaned_sentences):
    for word in sentence_words:
        df.loc[i, word] = 1

print("Bag of Words:")
print(df)

Bag of Words:
   .  pet  butterflies  fence  surrounds  enjoy  loves  tall  chase  play  \
0  1    0            0      1          0      0      0     0      0     0   
1  1    1            0      0          0      0      1     0      0     1   
2  1    0            0      1          1      0      0     1      0     0   
3  1    0            1      0          0      1      0     0      1     0   

   garden  jumped  cat  watching  
0       0       1    1         0  
1       0       0    1         0  
2       1       0    0         0  
3       0       0    1         1  


TF - IDF

In [None]:
def compute_tf(doc):
    tokens = word_tokenize(doc.lower())
    fdist = FreqDist(tokens)
    tf_dict = {}
    total_words = len(tokens)
    for word, freq in fdist.items():
        tf_dict[word] = freq / total_words
    return tf_dict

In [None]:
compute_tf("They enjoy watching the cat chase butterflies.")

{'they': 0.125,
 'enjoy': 0.125,
 'watching': 0.125,
 'the': 0.125,
 'cat': 0.125,
 'chase': 0.125,
 'butterflies': 0.125,
 '.': 0.125}

In [None]:
def compute_idf(corpus, term):
    num_documents_with_term = sum([1 for doc in corpus if term in doc])
    if num_documents_with_term > 0:
        return math.log(len(corpus) / num_documents_with_term)
    else:
        return 0

In [None]:
compute_idf(sentences, "fence")

0.6931471805599453

In [None]:
def compute_tfidf(corpus):
    tfidf_dict = {}
    for doc in corpus:
        tf_dict = compute_tf(doc)
        for word, tf in tf_dict.items():
            idf = compute_idf(corpus, word)
            tfidf = tf * idf
            if word in tfidf_dict:
                tfidf_dict[word].append(tfidf)
            else:
                tfidf_dict[word] = [tfidf]
    return tfidf_dict

In [None]:
compute_tfidf(sentences)

{'the': [0.08219487784336595, 0.04109743892168297, 0.035960259056472606],
 'cat': [0.04109743892168297, 0.03196467471686454, 0.035960259056472606],
 'jumped': [0.19804205158855578],
 'over': [0.19804205158855578],
 'fence': [0.09902102579427789, 0.09902102579427789],
 '.': [0.0, 0.0, 0.0, 0.0],
 'she': [0.0],
 'loves': [0.15403270679109896],
 'to': [0.15403270679109896],
 'play': [0.15403270679109896],
 'with': [0.15403270679109896],
 'her': [0.15403270679109896],
 'pet': [0.15403270679109896],
 'a': [0.0],
 'tall': [0.19804205158855578],
 'surrounds': [0.19804205158855578],
 'garden': [0.19804205158855578],
 'they': [0.0],
 'enjoy': [0.17328679513998632],
 'watching': [0.17328679513998632],
 'chase': [0.17328679513998632],
 'butterflies': [0.17328679513998632]}

In [None]:
nltk.download('shakespeare')

[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.


True

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
shakespeare.fileids()

['a_and_c.xml',
 'dream.xml',
 'hamlet.xml',
 'j_caesar.xml',
 'macbeth.xml',
 'merchant.xml',
 'othello.xml',
 'r_and_j.xml']

In [None]:
play = shakespeare.xml('dream.xml')
for p in play:
    print('%s: %s' % (p.tag, list(p.itertext())))

TITLE: ["A Midsummer Night's Dream"]
PERSONAE: ['\n', 'Dramatis Personae', '\n\n', 'THESEUS, Duke of Athens.', '\n', 'EGEUS, father to Hermia.', '\n\n', '\n', 'LYSANDER', '\n', 'DEMETRIUS', '\n', 'in love with Hermia.', '\n', '\n\n', 'PHILOSTRATE, master of the revels to Theseus.', '\n', 'QUINCE, a carpenter.', '\n', 'SNUG, a joiner.', '\n', 'BOTTOM, a weaver.', '\n', 'FLUTE, a bellows-mender.', '\n', 'SNOUT, a tinker.', '\n', 'STARVELING, a tailor.', '\n', 'HIPPOLYTA, queen of the Amazons, betrothed to Theseus.', '\n', 'HERMIA, daughter to Egeus, in love with Lysander.', '\n', 'HELENA, in love with Demetrius.', '\n', 'OBERON, king of the fairies.', '\n', 'TITANIA, queen of the fairies.', '\n', 'PUCK, or Robin Goodfellow.', '\n\n', '\n', 'PEASEBLOSSOM', '\n', 'COBWEB', '\n', 'MOTH', '\n', 'MUSTARDSEED', '\n', 'fairies.', '\n', '\n\n', 'Other fairies attending their King and Queen.', '\n', 'Attendants on Theseus and Hippolyta.', '\n']
SCNDESCR: ['SCENE  Athens, and a wood near it.']
PLA

In [None]:
acts = []
for p in play:
  if p.tag == "ACT":
    acts.append(list(p.itertext()))

acts2 = []
for act in acts:
  acts2.append("".join(act))

def word_tokenization(text):
  word_text = nltk.word_tokenize(text)
  return word_text

import re
import string
def removePunc(text):
  ll = [x for x in text if not re.fullmatch('[' + string.punctuation + ']+', x)]
  return ''.join(ll)

acts3 = []
for act in acts2:
  acts3.append(removePunc(act))

def removeStopWords(text):
  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenization(text)
  filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
  return ' '.join(filtered_sentence)

acts4 = []
for act in acts3:
  acts4.append(removeStopWords(act).lower())

acts4

['act scene athens palace theseus enter theseus hippolyta philostrate attendants theseus fair hippolyta nuptial hour draws apace four happy days bring another moon methinks slow old moon wanes lingers desires like stepdame dowager long withering young man revenue hippolyta four days quickly steep night four nights quickly dream away time moon like silver bow newbent heaven shall behold night solemnities theseus go philostrate stir athenian youth merriments awake pert nimble spirit mirth turn melancholy forth funerals pale companion pomp exit philostrate hippolyta wood thee sword thy love thee injuries wed thee another key pomp triumph revelling enter egeus hermia lysander demetrius egeus happy theseus renowned duke theseus thanks good egeus whats news thee egeus full vexation come complaint child daughter hermia stand forth demetrius noble lord man hath consent marry stand forth lysander gracious duke man hath bewitchd bosom child thou thou lysander thou hast given rhymes interchanged 

In [None]:
corpus = acts4

tfidf_results = compute_tfidf(corpus)
words_to_display = ['glory', 'choice', 'queen', 'sword', 'egypt']
for word in words_to_display:
  if word in tfidf_results:
    print(f"TF-IDF scores for '{word}': {tfidf_results[word]}")
  else:
    print(f"'{word}' not found in the corpus.")

TF-IDF scores for 'glory': [0.000927629920711297]
TF-IDF scores for 'choice': [0.0013787466228501774, 0.0004500666288687143, 0.00029442399064322233]
TF-IDF scores for 'queen': [0.00015056919791782034, 0.0004983663904281625, 0.00015394518890252484, 0.0009830112392696466]
TF-IDF scores for 'sword': [0.00015056919791782034, 0.00012459159760704064, 0.00015394518890252484, 0.0002572259957512505]
TF-IDF scores for 'egypt': [0.000927629920711297]
