In [7]:
#   Name: Andrew Cui, Remy Zhang, Haoyang Zhou
# Course: CSCI 185

from bs4 import BeautifulSoup
from urllib.request import urlopen
import math
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

# Scrape the quote listing pages and collect, for every quote, its text,
# its author and its tags.

# root of the site being scraped and the number of listing pages to read
BASE_URL = 'https://quotes.toscrape.com'
PAGE_COUNT = 10

# key = author, val = list of quotes written by the author
quotes_by_author = {}

# total number of quotes seen while scraping
numquotes = 0

# key = quote, val = list of tags attached to the quote
quotes_tag = {}

for i in range(PAGE_COUNT):

    # the first page is served from the site root, every later page
    # from /page/<page number>/
    if i == 0:
        url = BASE_URL
    else:
        url = BASE_URL + '/page/' + str(i + 1) + '/'

    # parse the page while the connection is open; the `with` block closes
    # the HTTP response as soon as parsing is done instead of leaking it
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # every quote on the page sits in its own <div class="quote">
    for quote in soup.find_all("div", class_="quote"):

        # keep track of total number of quotes
        numquotes += 1

        # the actual quote, with its first and last character dropped
        # (presumably the decorative quotation marks the site adds -- verify)
        text = quote.find("span", class_="text").text[1:-1]

        # the author
        author = quote.find("small", class_="author").text

        # save the quote according to its author
        quotes_by_author.setdefault(author, []).append(text)

        # collect the text of every tag link inside the quote's tag container
        tag_container = quote.find("div", class_="tags")
        tags = [link.text for link in tag_container.find_all("a", class_="tag")]

        # save the tags to the corresponding quote
        quotes_tag[text] = tags
        
        

# find term frequency

# The stop-word set is the same for every quote, so it is built exactly once.
# search_by_tokens and search_by_tags read it as a global, so it has to exist
# even when no quote was scraped.
stop_words = set(stopwords.words("english"))

# tf (term frequency) is a 2-D dictionary whose first key is the quote
# and the second key is a unique token in the quote; the value is the token's
# count divided by the count of the most frequent token of that quote
tf = {}
for author in quotes_by_author:
    for quote in quotes_by_author[author]:
        # for each quote:

        # tokens of the quote with the stop words filtered out
        tokens = [word for word in word_tokenize(quote) if word not in stop_words]

        # key = a unique token, val = number of occurrences in the current quote
        term_frequency = {}

        # highest count reached by any single token; deliberately not called
        # `max` so the builtin max() stays usable at module level
        highest_count = 0

        # count the occurrences of each token
        for word in tokens:
            occurrences = term_frequency.get(word, 0) + 1
            term_frequency[word] = occurrences
            if occurrences > highest_count:
                highest_count = occurrences

        # normalize by the most frequent token (skipped automatically when the
        # quote has no tokens left, so there is no division by zero)
        for token in term_frequency:
            term_frequency[token] = term_frequency[token] / highest_count

        # save
        tf[quote] = term_frequency

        
# term frequency by tags
# since the tags of a quote are unique, the term frequency of every tag that is
# attached to a quote is 1, so tf_tag is derived directly from quotes_tag
# tf_tag is a 2-D dictionary: first key = quote, second key = tag, val = 1
# NOTE(review): tf_tag was used below but never defined in this file; if it was
# previously defined somewhere else (e.g. another notebook cell), confirm that
# it had this shape.
tf_tag = {}
for quote in quotes_tag:
    tf_tag[quote] = {tag: 1 for tag in quotes_tag[quote]}


# find document frequency

# key = token, val = number of quotes in tf that contain the token
df = {}
for quote in tf:
    for token in tf[quote]:
        # each quote lists a token once, so this counts quotes, not occurrences
        df[token] = df.get(token, 0) + 1


# find document frequency by tags

# key = tag, val = number of quotes in tf_tag that carry the tag
df_tag = {}
for quote in tf_tag:
    for tag in tf_tag[quote]:
        df_tag[tag] = df_tag.get(tag, 0) + 1

            
# tf_idf maps quote -> token -> tf-idf score, where the score is the token's
# normalized term frequency times log(numquotes / document frequency)
tf_idf = {}

for quote, token_weights in tf.items():
    token_scores = {}
    for token, _tf in token_weights.items():
        # inverse document frequency of the token over all scraped quotes
        _idf = math.log(numquotes / (df[token]))
        token_scores[token] = _tf * _idf
    tf_idf[quote] = token_scores

# tf_idf_tag maps quote -> tag -> tf-idf score; the term frequency of a tag is
# 1 when the tag is listed for the quote in quotes_tag and 0 otherwise
tf_idf_tag = {}

for quote in tf_tag:
    tag_scores = {}
    for tag in tf_tag[quote]:
        _tf = 1 if tag in quotes_tag[quote] else 0
        _idf = math.log(numquotes / (df_tag[tag]))
        tag_scores[tag] = _tf * _idf
    tf_idf_tag[quote] = tag_scores

# token-based search: compares the tf-idf vector of the query with the tf-idf
# vector of every quote using cosine similarity
def search_by_tokens(query):
    """Rank the quotes against ``query`` by cosine similarity over tokens.

    The query is split with ``word_tokenize`` and its stop words are dropped.
    Each remaining term's count is divided by the largest count in the query
    and multiplied by the term's idf; terms that are not in ``df`` get idf 0.
    Reads the module-level ``stop_words``, ``df``, ``numquotes``, ``tf`` and
    ``tf_idf``.

    Returns a list of ``(quote, cosine_similarity)`` tuples ordered from the
    highest similarity to the lowest. Quotes whose dot product with the query
    is 0 are not included.
    """
    # raw term counts of the query (stop words skipped) and the largest count
    query_vector = {}
    largest_count = 0
    for term in word_tokenize(query):
        if term in stop_words:
            continue
        query_vector[term] = query_vector.get(term, 0) + 1
        if query_vector[term] > largest_count:
            largest_count = query_vector[term]

    # replace each count by its tf-idf weight and accumulate |q|^2 on the way
    squared_query_norm = 0
    for term in query_vector:
        if term in df:
            term_idf = math.log(numquotes / (df[term]))
        else:
            # a term that never occurs in any quote carries no weight
            term_idf = 0
        weight = query_vector[term] / largest_count * term_idf
        query_vector[term] = weight
        squared_query_norm = squared_query_norm + weight * weight
    query_norm = math.sqrt(squared_query_norm)

    # key = quote, val = cosine similarity between the quote and the query
    similarity = {}
    for quote in tf:
        squared_norm = 0
        dot_product = 0
        for token in tf[quote]:
            score = tf_idf[quote][token]
            # contribution to the quote's own norm
            squared_norm = squared_norm + score * score
            # contribution to the dot product, only for shared tokens
            if token in query_vector:
                dot_product = dot_product + query_vector[token] * score
        norm = math.sqrt(squared_norm)

        # a zero dot product means zero similarity: the quote is not a hit
        # (this also keeps the division below away from a zero denominator)
        if dot_product == 0:
            continue

        similarity[quote] = dot_product / (norm * query_norm)

    # best matches first
    return sorted(similarity.items(), key=lambda pair: pair[1], reverse=True)

# tag-based search: compares the tf-idf vector of the query with the tag
# tf-idf vector of every quote using cosine similarity
def search_by_tags(query):
    """Rank the quotes against ``query`` by cosine similarity over tags.

    The query is split with ``word_tokenize`` and its stop words are dropped.
    Each remaining term's count is divided by the largest count in the query
    and multiplied by the term's tag idf; terms that are not in ``df_tag`` get
    idf 0. Reads the module-level ``stop_words``, ``df_tag``, ``numquotes``,
    ``tf_tag`` and ``tf_idf_tag``.

    Returns a list of ``(quote, cosine_similarity)`` tuples ordered from the
    highest similarity to the lowest. Quotes whose dot product with the query
    is 0 are not included.
    """
    # raw term counts of the query (stop words skipped) and the largest count
    query_vector = {}
    largest_count = 0
    for term in word_tokenize(query):
        if term in stop_words:
            continue
        query_vector[term] = query_vector.get(term, 0) + 1
        if query_vector[term] > largest_count:
            largest_count = query_vector[term]

    # replace each count by its tf-idf weight and accumulate |q|^2 on the way
    squared_query_norm = 0
    for term in query_vector:
        if term in df_tag:
            term_idf = math.log(numquotes / (df_tag[term]))
        else:
            # a term that is not a known tag carries no weight
            term_idf = 0
        weight = query_vector[term] / largest_count * term_idf
        query_vector[term] = weight
        squared_query_norm = squared_query_norm + weight * weight
    query_norm = math.sqrt(squared_query_norm)

    # key = quote, val = cosine similarity between the quote's tags and the query
    similarity = {}
    for quote in tf_tag:
        squared_norm = 0
        dot_product = 0
        for tag in tf_tag[quote]:
            score = tf_idf_tag[quote][tag]
            # contribution to the quote's own norm
            squared_norm = squared_norm + score * score
            # contribution to the dot product, only for shared tags
            if tag in query_vector:
                dot_product = dot_product + query_vector[tag] * score
        norm = math.sqrt(squared_norm)

        # a zero dot product means the quote has nothing in common with the
        # query (this also keeps the division below away from a zero denominator)
        if dot_product == 0:
            continue

        similarity[quote] = dot_product / (norm * query_norm)

    # best matches first
    return sorted(similarity.items(), key=lambda pair: pair[1], reverse=True)
    



# run a sample token search and print at most the 10 best matches:
# the first 80 characters of the quote, then the similarity as a percentage
# cut to 7 characters
results = search_by_tokens("love and life")
for matched_quote, score in results[:10]:
    print(matched_quote[:80], ': ', str(score * 100)[:7] + '%')


Try not to become a man of success. Rather become a man of value. :  52.9447%
It takes courage to grow up and become who you really are. :  39.7146%
To love at all is to be vulnerable. Love anything and your heart will be wrung a :  10.7231%
