In [1]:
import string
import nltk
import gensim.downloader as api
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertForMaskedLM
from difflib import SequenceMatcher
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import homophones

ModuleNotFoundError: No module named 'homophones'

# homophone.com

In [67]:
"""
Python wrapper for the website: https://www.homophone.com/
Gets the homophones of a word.
"""

from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from typing import Dict, List
import re

class Pyphones:
    
    def __init__(self, word):
        self.word = word
        self.url = "https://www.homophone.com/search?page={}&type=&q={}"
        self.homophones = {self.word: []}
        
    def get_the_page(self, page_no = 1):
        """
        Get the page content.

        Returns
            str: the content of the page.
        """
        url = self.url.format(page_no, self.word)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        return soup

    def get_the_page_nos(self):
        """
        Get the total number of pages

        Returns
            int: the total number of the pages.
        """
        soup = self.get_the_page()
        pages = soup.find_all('div', attrs={'class':'col-sm-9'})
        total_pages = pages[0].find('h5').text.split('/')[-1].strip()
        return int(total_pages)

    def get_the_homophones(self):
        """
        Get the homophones of the word.

        Returns
            dict: {word: [list_of_homophones]} against each word.
        """
        total_pages = self.get_the_page_nos()
        for ix in range(total_pages):
            page_no = ix + 1
            soup = self.get_the_page(page_no)
            raw_homophones = soup.find_all('div', attrs={'class': 'well well-lg'})
            for elem in range(len(raw_homophones)):
                raw_homophones_2 = raw_homophones[elem].find_all('a', attrs={'class': 'btn word-btn'})
                list_of_homophones = list(raw_homophones_2)
                if any(list_of_homophones):
                    local_homophones = []
                    for tag_of_homophone in list_of_homophones:
                        homophone = tag_of_homophone.text
                        local_homophones.append(homophone)
                    self.homophones[self.word].append(local_homophones)

        return self.homophones

In [2]:
# CMU Dictionary
nltk.download('cmudict')
cmu = nltk.corpus.cmudict.dict()

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\LG\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [3]:
w2v_model = api.load('word2vec-google-news-300')

In [4]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [5]:
stop_words = set(stopwords.words('english'))
stop_words.update('.', '?', '-', '\'', '\:', ',', '!', '<', '>', '\"', '/', '(', ')',
                  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 's', 't', 're', 'm')

In [6]:
from transformers import BertTokenizer

text = "The quick brown fox was jumping over the lazy dog."
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize(text)
print(tokens)

['the', 'quick', 'brown', 'fox', 'was', 'jumping', 'over', 'the', 'lazy', 'dog', '.']


In [48]:
# Step 1: Remove Punctuation and Tokenize Text
def tokenize_text(text):
    
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    text = ''.join([char for char in text if char not in string.punctuation])
#     words = tokenizer.tokenize(text)
    words = text.split()
    original_sentences = []
    original_sentence = []
    
    vocab = w2v_model.index_to_key

    for i in range(len(words)):
        original_sentence.append(words[i].lower())
    original_sentences.append(original_sentence)

    sentences = []
    sentence = []

    for i in range(len(words)):
        if words[i].lower() not in stop_words and words[i].lower() in vocab:
            sentence.append(words[i].lower())
    sentences.append(sentence)
    
    print('original_sentences',original_sentences[0])
    print(" ")
    print('sentences',sentences[0])
    
    return sentences[0]

In [63]:
# Step 3: Find Homophones
def find_homophones(word, cmu_dict):
    
    homophones = [key for key, value in cmu_dict.items() if value == cmu_dict[word]]
#     temp_homophones = []
    
    for key, value in cmu_dict.items():
        
        if value == cmu_dict[word]:
            print('word', word)
            print('cmu_dict[word]', cmu_dict[word])
            print('key', key)
            print('value', value)
            print("")
#             temp_homophones.append(key)
#     print('temp_homophones', temp_homophones)
    return homophones

# Step 4: Phonetic Analysis
def phonetic_analysis(word, cmu_dict):
    return cmu_dict[word]

# Step 5: Find Pun Keywords
def find_pun_keywords(words, temp_homophones):
    # Implement your pun keyword detection logic here
    vocab = w2v_model.index_to_key
    scores = {}

    for i in range(len(words)-1):
        for j in range(i+1, len(temp_homophones)):
            if temp_homophones[j] not in vocab:
                pass
            else:
                
                sim_score = w2v_model.similarity(words[i], temp_homophones[j])
                scores['{0}-{1}'.format(words[i], temp_homophones[j])] = sim_score
    print('scores',scores)
    if len(scores) >= 5:
        top3 = sorted(zip(scores.values(), scores.keys()), reverse=True)[0:3]
        bottom3 = sorted(zip(scores.values(), scores.keys()), reverse=False)[:3]
        final = top3+bottom3
#             final = list(final)
#             final = tuple(final)
#             print('')
#             print('bottom3',bottom3)
        print('')
        print('final',final)
        print('')
        print('top3',top3)
        print(" ")
        poss = [tup[1].split(sep='-') for tup in top3]
#             poss = [tup[1].split(sep='-') for tup in final]

        print('poss',poss)
        print(" ")
        possible_pun_words = set(poss[0] + poss[1] + poss[2])
#             possible_pun_words = set(poss[0] + poss[1] + poss[2]+poss[3]+poss[4]+poss[5])

        print('possible_pun_words',possible_pun_words)
        print(" ")
    else:
        poss = [pair.split(sep='-') for pair in scores.keys()]
        possible_pun_words = set()
        for i in range(len(poss)):
            possible_pun_words = possible_pun_words.union(set(poss[i]))
        """
        top = sorted(zip(scores.values(), scores.keys()), reverse=True)[:1]
        poss = [tup[1].split(sep='-') for tup in top]
        possible_pun_words = set(poss[0])
        """

    return possible_pun_words
#     return []

# Step 6: Search for Phrases
def search_phrases(word, phrases, error_margin):
    similar_phrases = []
    for phrase in phrases:
        similarity = SequenceMatcher(None, word, phrase).ratio()
        print('word: ', word)
        print('phrase:', phrase)
        print('similarity', similarity)
        print(" ")
        if similarity >= (1 - error_margin):
            similar_phrases.append(phrase)
    return similar_phrases

# Example Usage
text = "The quick brown fox was jumping over the lazy dog."
text2 = "The boating store had its best sail ever."
text3 = "Authors can be very PENsive"
text4= "I lift weights only on Saturday and Sunday because Monday to Friday are weak days."
text5= 'why did the cookie cry, because his mother is awafer too long.'
text6= 'The postmen get together for mail bonding .'
text7= 'People who like gold paint have a gilt complex'
text8 = 'a meowntain is a pile of kittens'

# what do you call a pile of kittens? A meowntain

# text = remove_punctuation(text)

words = tokenize_text(text2)

temp_homophones = []

for word in words:
    homophones = find_homophones(word, cmu)
#     temp_homophones.append(homophones)
    if homophones:
        print(f'Homophones for "{word}": {homophones}')
    else:
        phonetics = phonetic_analysis(word, cmu)
        print(f'Phonetics for "{word}": {phonetics}')
    
    for k in homophones:
        if k in words:
            pass
        else:
            temp_homophones.append(k)

print('temp_homophones', temp_homophones)
# Find Pun Keywords
pun_keywords = find_pun_keywords(words, temp_homophones)
print(f'Pun Keywords: {pun_keywords}')

# Search for Phrases
# phrases = ["The quick brown fox", "The lazy dog", "jumps over", "was jumping"]
# error_margin = 0.2
# for word in words:
#     similar_phrases = search_phrases(word, phrases, error_margin)
#     if similar_phrases:
#         print(f'Similar Phrases for "{word}": {similar_phrases}')

original_sentences ['the', 'boating', 'store', 'had', 'its', 'best', 'sail', 'ever']
 
sentences ['boating', 'store', 'best', 'sail', 'ever']
word boating
cmu_dict[word] [['B', 'OW1', 'T', 'IH0', 'NG']]
key boating
value [['B', 'OW1', 'T', 'IH0', 'NG']]

Homophones for "boating": ['boating']
word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key stoehr
value [['S', 'T', 'AO1', 'R']]

word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key stohr
value [['S', 'T', 'AO1', 'R']]

word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key store
value [['S', 'T', 'AO1', 'R']]

word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key storr
value [['S', 'T', 'AO1', 'R']]

Homophones for "store": ['stoehr', 'stohr', 'store', 'storr']
word best
cmu_dict[word] [['B', 'EH1', 'S', 'T']]
key best
value [['B', 'EH1', 'S', 'T']]

word best
cmu_dict[word] [['B', 'EH1', 'S', 'T']]
key beste
value [['B', 'EH1', 'S', 'T']]

Homophones for "best": ['best', 'beste']
word sail
cmu_dict[word] [['S', 'EY1', 'L']]
key

you can ignore the number

strsimpy - 
metaphone phonetic algorithm
levenshtein distance algorithm


How do you measure the distance if the word is not in CMU dictionary <= Using table

In [30]:
sim_score = w2v_model.similarity('store', 'sale')
print(sim_score)

0.24029954


In [48]:
def phonetic_analysis(word, cmu_dict):
    return cmu_dict[word]

In [49]:
phonetic_analysis('sale', cmu)

[['S', 'EY1', 'L']]

In [50]:
phonetic_analysis('sail', cmu)

[['S', 'EY1', 'L']]

In [51]:
phonetic_analysis('salle', cmu)

[['S', 'EY1', 'L']]

In [52]:
phonetic_analysis('sayle', cmu)

[['S', 'EY1', 'L']]

In [53]:
phonetic_analysis('boating', cmu)

[['B', 'OW1', 'T', 'IH0', 'NG']]

In [54]:
phonetic_analysis('voting', cmu)

[['V', 'OW1', 'T', 'IH0', 'NG']]

In [61]:
phonetic_analysis('meow', cmu)

[['M', 'IY0', 'AW1']]

In [62]:
phonetic_analysis('why did the cookie cry, because his mother is awafer too long.', cmu)

KeyError: 'why did the cookie cry, because his mother is awafer too long.'

In [17]:
tokenize_text('The boating store had its best sail ever.')

['the', 'boating', 'store', 'had', 'its', 'best', 'sail', 'ever', '.']


In [24]:
text2 = "The boating store had its best sail ever."

words = tokenize_text(text2)

for word in words:
#     print('word', word)
    homophones = find_homophones(word, cmu)
    if homophones:
        print(f'Homophones for "{word}": {homophones}')
    else:
        phonetics = phonetic_analysis(word, cmu)
        print(f'Phonetics for "{word}": {phonetics}')

original_sentences ['the', 'boating', 'store', 'had', 'its', 'best', 'sail', 'ever']
 
sentences ['boating', 'store', 'best', 'sail', 'ever']
word boating
cmu_dict[word] [['B', 'OW1', 'T', 'IH0', 'NG']]
key boating
value [['B', 'OW1', 'T', 'IH0', 'NG']]

Homophones for "boating": ['boating']
word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key stoehr
value [['S', 'T', 'AO1', 'R']]

word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key stohr
value [['S', 'T', 'AO1', 'R']]

word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key store
value [['S', 'T', 'AO1', 'R']]

word store
cmu_dict[word] [['S', 'T', 'AO1', 'R']]
key storr
value [['S', 'T', 'AO1', 'R']]

Homophones for "store": ['stoehr', 'stohr', 'store', 'storr']
word best
cmu_dict[word] [['B', 'EH1', 'S', 'T']]
key best
value [['B', 'EH1', 'S', 'T']]

word best
cmu_dict[word] [['B', 'EH1', 'S', 'T']]
key beste
value [['B', 'EH1', 'S', 'T']]

Homophones for "best": ['best', 'beste']
word sail
cmu_dict[word] [['S', 'EY1', 'L']]
key

In [34]:
find_homophones('The boating store had its best sail ever',cmu)

KeyError: 'The boating store had its best sail ever'

In [68]:
py = Pyphones("sail")
homophones = py.get_the_homophones()
print(homophones)

{'sail': [['sail', 'sale'], ['sailer', 'sailor'], ['sails', 'sales']]}
