<a href="https://colab.research.google.com/github/alexandrelombard/ai54-notebooks/blob/master/Tutorial_1_AI54.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Normalization of the text

In [48]:
file = open("romeo_and_juliet.txt")
content = file.read()
file.close()

In [49]:
# Normalize the text

# 1) Remove uppercase letters
content = content.lower()

# 2) Remove punctuation symbols
punctuation = ['.', ',', ';', '!', '?']
for symbol in punctuation:
  content = content.replace(symbol, '')

# 3) Remove accents
accents = {'é': 'e', 'è': 'e', 'ê': 'e', 'à': 'a'}
for accent in accents:
  content = content.replace(accent, accents[accent])

# Tokenization

In [50]:
# Tokenize the text (split along spaces, including new lines)
words = content.split()
print(words)



# Stopwords removal

In [51]:
# Remove stopwords
with open("english_stopwords.txt") as file:
  stopwords = file.read().split()

words_without_stopwords = []
for word in words:
  if not word in stopwords:
    words_without_stopwords.append(word)

print(words_without_stopwords)



# Introduction to NLTK

In [52]:
!pip install nltk



In [53]:
# NLTK tokenization and stemming
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt_tab')

nltk_words = word_tokenize(content)
print(nltk_words)

stemmer = PorterStemmer()
stemmed_words = []
for w in nltk_words:
  stemmed_words.append(stemmer.stem(w))

print(stemmed_words)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['1595', 'the', 'tragedi', 'of', 'romeo', 'and', 'juliet', 'by', 'william', 'shakespear', 'dramati', 'persona', 'choru', 'escalu', 'princ', 'of', 'verona', 'pari', 'a', 'young', 'count', 'kinsman', 'to', 'the', 'princ', 'montagu', 'head', 'of', 'two', 'hous', 'at', 'varianc', 'with', 'each', 'other', 'capulet', 'head', 'of', 'two', 'hous', 'at', 'varianc', 'with', 'each', 'other', 'an', 'old', 'man', 'of', 'the', 'capulet', 'famili', 'romeo', 'son', 'to', 'montagu', 'tybalt', 'nephew', 'to', 'ladi', 'capulet', 'mercutio', 'kinsman', 'to', 'the', 'princ', 'and', 'friend', 'to', 'romeo', 'benvolio', 'nephew', 'to', 'montagu', 'and', 'friend', 'to', 'romeo', 'tybalt', 'nephew', 'to', 'ladi', 'capulet', 'friar', 'laurenc', 'franciscan', 'friar', 'john', 'franciscan', 'balthasar', 'servant', 'to', 'romeo', 'abram', 'servant', 'to', 'montagu', 'sampson', 'servant', 'to', 'capulet', 'gregori', 'servant', 'to', 'capulet', 'peter', 'servant', 'to', 'juliet', "'s", 'nurs', 'an', 'apothecari', 't

In [54]:
# NLTK Lemmatization
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

lemmatized_words = []
for w in nltk_words:
  lemmatized_words.append(lemmatizer.lemmatize(w))

print(lemmatizer.lemmatize("wolves"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


wolf


# Generation of text with the unigram model

In [55]:
vocabulary = {}
for word in words:
  if word in vocabulary:
    vocabulary[word] += 1
  else:
    vocabulary[word] = 1

print(vocabulary)



In [56]:
print(vocabulary['romeo']/len(words))
cumulative_probability = []

probability = 0.0

for w in vocabulary:
  cumulative_probability.append((probability, w))
  probability += vocabulary[w] / len(words)

cumulative_probability

0.005118461359494358


[(0.0, '1595'),
 (3.87762224204118e-05, 'the'),
 (0.026290278801039204, 'tragedy'),
 (0.026329055023459617, 'of'),
 (0.04187832021404475, 'romeo'),
 (0.046996781573539106, 'and'),
 (0.07441157082477025, 'juliet'),
 (0.07662181550273373, 'by'),
 (0.0808096475241382, 'william'),
 (0.08084842374655861, 'shakespeare'),
 (0.08088719996897901, 'dramatis'),
 (0.08092597619139942, 'personae'),
 (0.08096475241381983, 'chorus'),
 (0.08108108108108106, 'escalus'),
 (0.08115863352592188, 'prince'),
 (0.08255457753305671, 'verona'),
 (0.0831362208693629, 'paris'),
 (0.08426073131955485, 'a'),
 (0.10209779363294427, 'young'),
 (0.10302842297103415, 'count'),
 (0.10322230408313621, 'kinsman'),
 (0.10372639497460157, 'to'),
 (0.12458800263678312, 'montague'),
 (0.12575128930939547, 'heads'),
 (0.12602272286633837, 'two'),
 (0.12699212842684865, 'houses'),
 (0.12726356198379155, 'at'),
 (0.12997789755322037, 'variance'),
 (0.13005544999806118, 'with'),
 (0.13978828182558453, 'each'),
 (0.13994338671526

In [58]:
import random

for i in range(50):
  p = random.random()
  it = 0
  while p > cumulative_probability[it][0]:
    it += 1

  print(cumulative_probability[it])

(0.5529101554926525, 'flourishes')
(0.1547946799022839, 'mercutio')
(0.45573694210710003, 'fought')
(0.9490868199620384, 'unseemly')
(0.44166117336849053, 'all')
(0.7596649734382938, 'sups')
(0.07441157082477025, 'juliet')
(0.6618325642715915, 'day')
(0.10209779363294427, 'young')
(0.18988716119275653, 'dignity')
(0.44774904028849516, 'show')
(0.7189887161192795, 'hide')
(0.32319981387413227, 'word')
(0.2733335918414827, 'here')
(0.36798635076970787, 'neck')
(0.4862538291519641, 'how')
(0.9167474504634044, 'loathsome')
(0.6199154678351254, 'many')
(0.816588467951463, 'smelling')
(0.026290278801039204, 'tragedy')
(0.8958082903563781, 'her)')
(0.9492031486292997, "temper'd")
(0.8730854240180117, '[nurse]')
(0.6155725309240391, 'wood')
(0.266392648028229, 'patient')
(0.509131800380007, 'gregory]')
(0.16138663771375392, 'abram')
(0.527550506029703, 'upon')
(0.9686688122843535, 'stark')
(0.4706657877389585, 'am')
(0.04187832021404475, 'romeo')
(0.5529101554926525, 'flourishes')
(0.421226104

# Generation of text with the bi-gram model

In [82]:
pair_of_words = []
for i in range(len(words) - 1):
  pair_of_words.append((words[i], words[i + 1]))

print(pair_of_words)



In [85]:
vocabulary_pair = {}
for pair_of_word in pair_of_words:
  if pair_of_word in vocabulary_pair:
    vocabulary_pair[pair_of_word] += 1
  else:
    vocabulary_pair[pair_of_word] = 1

print(vocabulary_pair)



In [87]:
sentence = []

# At first we can select a random pair
current_element = random.choices(list(vocabulary_pair.keys()), list(vocabulary_pair.values()))[0]
sentence.append(current_element[0])
sentence.append(current_element[1])

# Then the probability of each element depends on the last element of the sentence
for i in range(50):
  # We keep only the pairs starting with sentence[-1]
  relevant_pairs = {k: v for k, v in vocabulary_pair.items() if k[0] == sentence[-1]}

  # We choose one of them with their probabilities
  current_element = random.choices(list(relevant_pairs.keys()), list(relevant_pairs.values()))[0]
  sentence.append(current_element[1])

print(' '.join(sentence))

able to bed away to the weaker vessels are happy by having makes huge waste for this is it for the fairer than he hath residence and lips are you for there be much unfurnish'd for juliet's nurse peter he slew him in this face nor arm beats down their pride for


# With the tri-gram model

In [84]:
triplet_of_words = []
for i in range(len(words) - 2):
  triplet_of_words.append((words[i], words[i + 1], words[i + 2]))

print(triplet_of_words)



In [86]:
vocabulary_triplets = {}
for triplet_of_word in triplet_of_words:
  if triplet_of_word in vocabulary_triplets:
    vocabulary_triplets[triplet_of_word] += 1
  else:
    vocabulary_triplets[triplet_of_word] = 1

print(vocabulary_triplets)



In [88]:
sentence = []

# At first we can select a random pair
current_element = random.choices(list(vocabulary_triplets.keys()), list(vocabulary_triplets.values()))[0]
sentence.append(current_element[0])
sentence.append(current_element[1])
sentence.append(current_element[2])

# Then the probability of each element depends on the last element of the sentence
for i in range(50):
  # We keep only the pairs starting with sentence[-1]
  relevant_triplets = {k: v for k, v in vocabulary_triplets.items() if k[0] == sentence[-2] and k[1] == sentence[-1]}

  # We choose one of them with their probabilities
  current_element = random.choices(list(relevant_triplets.keys()), list(relevant_triplets.values()))[0]
  sentence.append(current_element[2])

print(' '.join(sentence))

behind your back than to your face par poor soul thy face is mine shall never do thee good trust to't bethink you i'll not endure him cap he shall soon keep tybalt company and then tybalt fled but by-and-by comes back to gaze on us mer men's eyes were made to look
