# Clue Generator

Scrape MadGab clues from a variety of sources. Remove any extra characters, make sure all clues are valid, write to file with category.

In [1]:
import json
import re

from bs4 import BeautifulSoup
import requests

In [2]:
# General functions and setup
all_phrases = []

with open("madgab/word_to_phonemes.json") as file:
        word_to_phoneme = json.load(file)

def process_phrases(phrases):
    """ 
    Process the passed in phrases in the following ways
    
    1. Remove everything other than letters and apostraphes
    2. Make sure the word count is in [2,5]
    3. Make sure every word is in the phoneme dictionary
    """
    
    def valid_phrase(phrase):
        phrase = phrase.lower()
        phrase_clean = re.sub(r"[^a-zA-Z '\.]", "", phrase)
        if len(phrase_clean.split(" ")) != len(phrase.split(" ")):
            return False
        words = phrase_clean.split(" ")
        if not 2 <= len(words) <= 5:
            return False
        return all([word in word_to_phoneme for word in words])
    
    phrases = [re.sub(r"[^a-zA-Z '\.]", "", phrase) for phrase in phrases if valid_phrase(phrase)]
    return list(set(phrases))

### Common Phrases

https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html

In [3]:
phrases_page = requests.get("https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html")
phrases_soup = BeautifulSoup(phrases_page.text)

In [4]:
phrases = [elem.find('a').text for elem in phrases_soup.find_all("p", {"class": "phrase-list"})]
phrases[0:10]

['A bird in the hand is worth two in the bush',
 'A bunch of fives',
 'A chain is only as strong as its weakest link',
 'A change is as good as a rest',
 'A countenance more in sorrow than in anger',
 'A Daniel come to judgement',
 'A diamond in the rough',
 'A diamond is forever',
 'A different kettle of fish',
 'A dish fit for the gods']

In [5]:
processsed_phrases = process_phrases(phrases)

In [6]:
all_phrases += [(phrase, "General Expression") for phrase in processsed_phrases]
print(f"All phrases with phrases: {len(all_phrases)}")

All phrases with phrases: 1486


### Geographic Locations

https://www.listchallenges.com/print-list/13033  
https://www.listchallenges.com/print-list/81758

In [7]:
with open("clues_temp/places.txt", "r") as file:
    places = [place.strip() for place in file.readlines()]

In [8]:
processsed_places = process_phrases(places)

In [9]:
processsed_places[0:10]

['Florence Cathedral',
 'Havana Cuba',
 'Freedom Tower Ground Zero',
 'Blue Lagoon',
 'Lugano Switzerland',
 'Cologne Cathedral',
 'Petronas Twin Towers',
 'Doha Qatar',
 'Lausanne Switzerland',
 'Gateway Arch']

In [10]:
all_phrases += [(phrase, "Geographic Location") for phrase in processsed_places]
print(f"All phrases with locations: {len(all_phrases)}")

All phrases with locations: 1795


### Movies

https://www.imdb.com/list/ls055592025/  
https://www.filmsite.org/boxoffice.html

In [11]:
movie_page = requests.get("https://www.imdb.com/list/ls055592025/")
movie_soup = BeautifulSoup(movie_page.text)

In [12]:
movies = [elem.find('a').text for elem in movie_soup.find_all("h3", {"class": "lister-item-header"})]
movies[0:10]

['The Godfather',
 'The Shawshank Redemption',
 "Schindler's List",
 'Raging Bull',
 'Casablanca',
 'Citizen Kane',
 'Gone with the Wind',
 'The Wizard of Oz',
 "One Flew Over the Cuckoo's Nest",
 'Lawrence of Arabia']

In [13]:
"""
with open("clues_temp/movies.txt", "w+") as file:
    file.write("\n".join(movies))
"""

'\nwith open("clues_temp/movies.txt", "w+") as file:\n    file.write("\n".join(movies))\n'

In [14]:
with open("clues_temp/movies.txt", "r") as file:
    movies_full = file.read().split("\n")

In [15]:
processed_movies = process_phrases(movies_full)
all_phrases += [(phrase, "Movie") for phrase in processed_movies]
print(f"All phrases with movies: {len(all_phrases)}")

All phrases with movies: 1893


### TV Shows

https://www.imdb.com/list/ls066095353/

In [16]:
tv_page = requests.get("https://www.imdb.com/list/ls066095353/")
tv_soup = BeautifulSoup(tv_page.text)

In [17]:
tv = [elem.find('a').text for elem in tv_soup.find_all("h3", {"class": "lister-item-header"})]
tv[0:10]

['The Sopranos',
 'The Wire',
 'Breaking Bad',
 'Mad Men',
 'Seinfeld',
 'The Simpsons',
 'The Twilight Zone',
 'Saturday Night Live',
 'All in the Family',
 'The Daily Show']

In [18]:
"""
with open("clues_temp/tv.txt", "w+") as file:
    file.write("\n".join(tv))
"""

'\nwith open("clues_temp/tv.txt", "w+") as file:\n    file.write("\n".join(tv))\n'

In [19]:
with open("clues_temp/tv.txt", "r") as file:
    tv_full = file.read().split("\n")

In [20]:
processed_tv = process_phrases(tv_full)
all_phrases += [(phrase, "TV Show") for phrase in processed_tv]
print(f"All phrases with tv: {len(all_phrases)}")

All phrases with tv: 1954


### Musical Artists

https://www.billboard.com/charts/greatest-hot-100-artists  
https://en.wikipedia.org/wiki/List_of_best-selling_music_artists

In [21]:
music_page = requests.get("https://www.billboard.com/charts/greatest-hot-100-artists")
music_soup = BeautifulSoup(music_page.text)

In [22]:
music = [elem.find('a').text for elem in music_soup.find_all("span", {"class": "chart-list-item__title-text"}) if elem and elem.find('a')]
music[0:10]

['\nThe Beatles\n',
 '\nMadonna\n',
 '\nElton John\n',
 '\nElvis Presley\n',
 '\nMariah Carey\n',
 '\nStevie Wonder\n',
 '\nJanet Jackson\n',
 '\nMichael Jackson\n',
 '\nWhitney Houston\n',
 '\nRihanna\n']

In [23]:
"""
with open("clues_temp/music.txt", "w+") as file:
    file.write("\n".join(music))
"""

'\nwith open("clues_temp/music.txt", "w+") as file:\n    file.write("\n".join(music))\n'

In [24]:
with open("clues_temp/music.txt", "r") as file:
    music_full = file.read().split("\n")

In [25]:
processed_music = process_phrases(music_full)
all_phrases += [(phrase, "Musical Artists") for phrase in processed_music]
print(f"All phrases with musical artists: {len(all_phrases)}")

All phrases with musical artists: 2054


### Songs

https://www.billboard.com/articles/news/hot-100-turns-60/8468142/hot-100-all-time-biggest-hits-songs-list  
https://en.wikipedia.org/wiki/List_of_best-selling_singles

In [26]:
song_page = requests.get("https://www.billboard.com/articles/news/hot-100-turns-60/8468142/hot-100-all-time-biggest-hits-songs-list")
song_soup = BeautifulSoup(song_page.text)

In [27]:
song = [elem.find('strong').text for elem in song_soup.find_all("p") if elem.find('strong')]
song[0:10]

['Meet Our Pop Experts!',
 'Diane Warren: ',
 'The Twist',
 'Smooth',
 'Mack the Knife',
 'Uptown Funk! ',
 'How Do I Live',
 'Party Rock Anthem',
 'I Gotta Feeling',
 'Macarena (Bayside Boys Mix)']

In [28]:
"""
with open("clues_temp/song.txt", "w+") as file:
    file.write("\n".join(song))
"""

'\nwith open("clues_temp/song.txt", "w+") as file:\n    file.write("\n".join(song))\n'

In [29]:
with open("clues_temp/song.txt", "r") as file:
    song_full = file.read().split("\n")

In [30]:
processed_song = process_phrases(song_full)
all_phrases += [(phrase, "Song") for phrase in processed_song]
print(f"All phrases with song: {len(all_phrases)}")

All phrases with song: 2241


### Famous People

https://www.biographyonline.net/people/famous-100.html

In [31]:
with open("clues_temp/people.txt", "r") as file:
    people = file.read().split("\n")

In [32]:
processed_people = process_phrases(people)
all_phrases += [(phrase, "People") for phrase in processed_people]
print(f"All phrases with people: {len(all_phrases)}")

All phrases with people: 2345


### Books 

https://en.wikipedia.org/wiki/List_of_best-selling_books

In [33]:
with open("clues_temp/books.txt", "r") as file:
    books_full = file.read().split("\n")

In [34]:
processed_book = process_phrases(books_full)
all_phrases += [(phrase, "Book") for phrase in processed_book]
print(f"All phrases with books: {len(all_phrases)}")

All phrases with books: 2456


## Write the results

In [36]:
with open("clues_full.txt", "w+") as file:
    for phrase in all_phrases:
        file.write(f"{phrase[0]} | {phrase[1]}\n")