In [1]:
import nltk

### Task 1

Write a function which picks rhymes for a word using CMU Pronouncing Dictionary (nltk.corpus.cmudict). Two words usually rhyme if their pronunciation from the stressed syllable till the end of the word is the same.

In [2]:
from nltk.corpus import*
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [51]:
def rhyme_grime(input_word='rhyme'):
  '''
  Returns set of rhymes for input word.
  '''

  if input_word not in cmudict.dict().keys():
    raise ValueError('There is no such word in cmudict corpora.')

  for pronoun in cmudict.dict()[input_word]:
    pronunciation = pronoun
    stressed_syl_idx = [idx for idx, syl in enumerate(pronunciation) if syl.endswith('1')][0]
    rhyme_end = pronunciation[stressed_syl_idx:]
    rhymes = []
    for word, pronouns in cmudict.dict().items():
      for pronoun in pronouns:
        if pronoun[len(pronoun)-len(rhyme_end):] == rhyme_end:
          rhymes.append(word)
  rhymes.remove(input_word)

  return set(rhymes)

Evaluation:

In [152]:
word = input("Enter word - ")
print('There are {} rhymes to the word {}:\n\n{}'.format(len(rhyme_grime(word)), word, ',\n'.join(rhyme_grime(word))))

Enter word - cloud
There are 35 rhymes to the word cloud:

houde,
avowed,
disavowed,
plowed,
how'd,
loud,
disallowed,
o'dowd,
crowd,
odowd,
vowed,
wowed,
stroud,
browed,
sowed,
dowd,
bowed,
ploughed,
doud,
macleod,
mcloud,
cowed,
endowed,
aloud,
abboud,
goude,
allowed,
shroud,
enshroud,
mcleod,
unbowed,
overcrowd,
daoud,
proud,
mccloud


### Task 2
Improve our text generator using trigrams (nltk.trigram) instead of bigrams. The idea is to select the next word based on two previous words, not just one. It is acceptable if you have to start the generation from two initial words instead of one. Apply the generator to texts from different corpora.

In [33]:
nltk.download('webtext')
nltk.download('inaugural')
nltk.download('movie_reviews')
nltk.download('brown')
nltk.download('nps_chat')

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


True

In [50]:
import random

def generate(corpora, word1=None, word2=None, sentences=5):
  '''
  Returns the generated text from trigrams built from input corpora.

          Parameters:
                  corpora: text from where trigrams will generate
                  word1 (str): first conditional word from which text will be generated (default = None)
                  word2 (str): second conditional word from which text will be generated (default = None)
                  sentences (int): number of sentences in generated text (default = 5)

          Returns:
                  output_string (str): generated text
  '''
  
  if (word1 == None and word2 != None) or (word2 == None and word1 != None):
    raise ValueError('word1 and word2 must be specified together or not specified at all')

  words = corpora.words()
  trigrams = list(nltk.trigrams(words))
  cfd = nltk.ConditionalFreqDist(((first,second),third)
                                  for (first, second, third) in trigrams[:-2])
  
  if word1==None and word2==None:
    word1, word2, _ = random.choice(trigrams)
  
  output_string = ""

  while sentences > 0:
    output_string += word2
    if word2[0] in ".?!":
      sentences -= 1
      output_string += '\n'
    else:
      output_string += ' '

    index = (word1, word2)
    candidate_words = [word for word, freq in cfd[index].most_common()]
    freqs = [freq for word, freq in cfd[index].most_common()]
    word1 = word2
    word2 = random.choices(candidate_words, weights = freqs)[0]

  return output_string 

Evaluation:

In [48]:
# movie reviews
corpora = movie_reviews
word1 = 'based'
word2 = 'on'

print('Generated text based on words \'{}\' and \'{}\':\n{}'.format(word1, word2, generate(corpora, word1, word2)))

print('Generated text based on random 2 words:\n{}'.format(generate(corpora)))

Generated text based on words 'based' and 'on':
on the straight - laced straight man .
believe me when i saw this in the american political media works , due to all the plot has max asking his best here , her " visions .
because he couldn ' t happy !
" when we think .
betty thomas worked wonders with it !

Generated text based on random 2 words:
named l .
jackson ) , but the desert or astronauts in the us .
president martin van buren ( nigel hawthorne do another voice for a heart operation .
with a very rousing speech before sending his troops .
the sound off .



In [53]:
# web text
corpora = webtext
print('Generated text based on random 2 words:\n{}'.format(generate(corpora)))

Generated text based on random 2 words:
.
** Apricot jam - quite hard and the station ?!
Guy # 2 : The second smartest person in the right , we need a joint .
A good wine for it , then you fart .
Teen girl : Why do they have persuaded me that your phone stolen .



In [49]:
# nps chat
corpora = nps_chat
print('Generated text based on random 2 words:\n{}'.format(generate(corpora)))


Generated text based on random 2 words:
Whats going on U22 ...
PART all the time PART will send the infor JOIN at what time U50 , as an elected official of this room LOL hey U104 !
whip U50 heyyyy U110 U110 ??
PART aww - thanks JOIN PART <333 whats up no offence against the gay JOIN 19 m fl !
seen a girl or people have been lying to me dang it .......



### Task 3
Write a code for Hangman game (https://en.wikipedia.org/wiki/Hangman_(game)). The code should select a random word from a dictionary (e. g. nltk.corpus.words) and show it to the user, replacing letters with dots. The user has to guess the word, naming one letter per move. If the named letter is there within the word, then all its occurrences are shown, otherwise the user loses an attempt. The user wins if (s)he opens all the letters before all attempts are spent, otherwise (s)he fails. You do not have to draw the hangman, just count the attempts left.


In [74]:
game_start = ''' _                                             
| |                                            
| |__   __ _ _ __   __ _ _ __ ___   __ _ _ __  
| '_ \ / _` | '_ \ / _` | '_ ` _ \ / _` | '_ \ 
| | | | (_| | | | | (_| | | | | | | (_| | | | |
|_| |_|\__,_|_| |_|\__, |_| |_| |_|\__,_|_| |_|
                    __/ |                      
                   |___/                       '''
hangman = ['''
  +---+
  |   |
      |
      |
      |
      |
=========''', '''
  +---+
  |   |
  O   |
      |
      |
      |
=========''', '''
  +---+
  |   |
  O   |
  |   |
      |
      |
=========''', '''
  +---+
  |   |
  O   |
 /|   |
      |
      |
=========''', '''
  +---+
  |   |
  O   |
 /|\  |
      |
      |
=========''', '''
  +---+
  |   |
  O   |
 /|\  |
 /    |
      |
=========''', '''
  +---+
  |   |
  O   |
 /|\  |
 / \  |
      |
=========''']

In [80]:
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to

True

In [263]:
from IPython.display import clear_output 

def hangman_word_gen(corpora, min_word_length=4, max_word_length = 10):
  candidate_words = [word for word in corpora.words() if len(word) >= min_word_length and len(word) <= max_word_length
                                                                                    and not word[0].isupper()]
  random_word = random.choice(candidate_words)
  return (list(random_word), list('*'*len(random_word)))

def hangman_letter_check(guess_letter, guess_word):
  letter_indexes = [i for i, ltr in enumerate(guess_word) if ltr == guess_letter]
  if not letter_indexes:
    return (False, [])
  else:
    return (True, letter_indexes)

def hangman_game(corpora=words, min_word_length=4, max_word_length=10):

  print(game_start)
  newgame_input = input('Begin a new game? y/n\n')
  
  if newgame_input == 'y':
    clear_output()
    print('Let\'s begin!')
    print(hangman[0])
    guess_word, my_word = hangman_word_gen(corpora, min_word_length, max_word_length)
    marker = True
    hangman_iter = 0
    guessed_letters = []
    print(guess_word)

    while marker:
      print('Your word consists of {} letters:\n{}'.format(len(guess_word), '-'.join(my_word)))
      my_guess_letter = input('Your guess?\n')
      if len(my_guess_letter) > 1:
        clear_output()
        print(hangman[hangman_iter])
        print('Use only letters!')
        continue
      marker = hangman_letter_check(my_guess_letter, guess_word)[0]

      if not marker:
        clear_output()
        print(hangman[hangman_iter+1])
        print('Oops, you got wrong')
        hangman_iter += 1
        if hangman_iter == 6:
          print()
          print("Hangman didn't discover that the real word was", ''.join(guess_word))
          return 'Game over'
        else:
          marker = True
      elif marker and my_guess_letter not in guessed_letters:
        clear_output()
        print(hangman[hangman_iter])
        print('Your\'re right!')
      elif marker and my_guess_letter in guessed_letters:
        clear_output()
        print(hangman[hangman_iter])
        print('This letter is already used!')

      guessed_letters.append(my_guess_letter)
      print('Used letters:\n', ' '.join(set(guessed_letters)))
      positions = hangman_letter_check(my_guess_letter, guess_word)[1]
      for pos in positions:
        my_word[pos] = my_guess_letter
      if '*' not in my_word:
        print("Hangman discovered that the real word was", ''.join(guess_word))
        return 'You won!!!'

  elif newgame_input == 'n':
    return 'Game over'
  else:
    raise ValueError('Unknown symbol was passed.')

In [264]:
hangman_game(corpora=words, min_word_length=3, max_word_length=5)


  +---+
  |   |
  O   |
      |
      |
      |
Your're right!
Used letters:
 w a i t l
Hangman discovered that the real word was twill


'You won!!!'