**IMPORTANT NOTE:** Do not change signatures of methods defined below. Those methods will be used while grading your homework.

## IMDB Scraping

In [1]:
import pickle
import re
import requests
import time

from collections import namedtuple
from collections import Counter
from functools import reduce
from heapq import nlargest
from itertools import islice
from math import log

In [2]:
MOVIE_ID_FILE = "movie_ids.csv"
ALL_MOVIE_CONTENTS = "all_movie_contents.pickle"
BASE_URL = "https://www.imdb.com/title/"
NUMBER_WORDS = 5000
NUMBER_RECS = 10
STOPWORDS = []# ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [3]:
def read_csv(path):
    """
    Read csv file
    
    Parameters
    ----------
    path: str
        File to read
    """
    with open(path, "r") as f:
        lines = f.readlines()
    return [line.strip() for line in lines if line]

In [4]:
def get_html_content(imdb_id):
    """Get HTML content of movie
    """
    url = BASE_URL + imdb_id
    response = requests.get(url)
    if response.status_code >= 400:
        raise ValueError("Something went wrong in request !: {}, {}".format(url, response.status_code))

    return response.text

In [5]:
def find_title(html_content):
    # TODO: Write doc-string
    pattern = re.compile(r'<div\s+?class\s*?=\s*?"title_wrapper".*?'
                         + r'<h1\s*?class\s*?=\s*?".*?">(.*?)&nbsp;', re.DOTALL)
    match = pattern.search(html_content)
    
    return match.group(1).strip()

In [6]:
def find_rec_item(html_content):
    # TODO: Write doc-string
    pattern = re.compile(r'<div\s+?class\s*?=\s*?"(?:rec_item|rec_item rec_selected)" data-info="" data-spec=".*?" data-tconst="(.*?)">')
    for match in pattern.finditer(html_content):
        yield match.group(1)
        

In [7]:
def find_recommendations(html_content):
    # TODO: Write doc-string
    return list(find_rec_item(html_content))

In [8]:
def find_storyline(html_content):
    # TODO: Write doc-string
    pattern = re.compile(r'<h2>\s*?Storyline\s*?</h2>.*?'
                         + r'<span>(.*?)</span>', re.DOTALL)

    match = pattern.search(html_content)

    return match.group(1).strip() # + " " + match.group(2).strip() + " " + match.group(3).strip()

In [9]:
def get_movie_contents(imdb_id):
    """
    Gets an imdb id and returns its title, storyline, list of IMDB recommendations respectively.
    """
    html_content = get_html_content(imdb_id)
    try:
        title = find_title(html_content)
        storyline = find_storyline(html_content)
        recommendations = find_recommendations(html_content)
    except AttributeError as e:
        print(html_content)
        raise AttributeError

    return title, storyline, recommendations

In [10]:
def scraping(path, pickle_path, restore=False):
    # TODO: Write doc-string
    movie_ids = read_csv(path)
    if restore:
        with open(pickle_path, "rb") as f:
            all_movie_contents = pickle.load(f)
    else:
        all_movie_contents = {}
    for i, movie_id in enumerate(movie_ids):
        if i % 20 == 0:
            print("Movie id: {}".format(movie_id))
            print("Number of collected movie: {}".format(len(all_movie_contents)))
        if movie_id in all_movie_contents:
            continue
        while True:
            try:
                title, storyline, recs = get_movie_contents(movie_id)
            except ConnectionError as e:
                time.sleep(4)
                continue
            except Exception as e:
                with open(pickle_path, "wb") as f:
                    pickle.dump(all_movie_contents, f)
                raise e
            break

        movie_content = MovieContent(
            title=title,
            storyline=storyline,
            recommendations=recs
        )
        all_movie_contents[movie_id] = movie_content
        for rec_id in recs:
            if rec_id in all_movie_contents:
                continue
            while True:
                try:
                    rec_title, rec_storyline, rec_recs = get_movie_contents(rec_id)
                except ConnectionError as e:
                    time.sleep(10)
                    continue
                except Exception as e:
                    with open(pickle_path, "wb") as f:
                        pickle.dump(all_movie_contents, f)
                    raise e
                break

            rec_movie_content = MovieContent(
                title=rec_title,
                storyline=rec_storyline,
                recommendations=rec_recs
            )
            all_movie_contents[rec_id] = rec_movie_content

    with open(pickle_path, "wb") as f:
        pickle.dump(all_movie_contents, f)

    return all_movie_contents

In [11]:
MovieContent = namedtuple("MovieContent", "title storyline recommendations")

In [None]:
all_movie_contents = scraping(MOVIE_ID_FILE, ALL_MOVIE_CONTENTS, restore=True)

In [12]:
with open(ALL_MOVIE_CONTENTS, "rb") as f:
    all_movie_contents = pickle.load(f)

## Tf-idf model

In [13]:
def remove_htlm_tags(doc):
    """Clean links"""
    pattern = re.compile(r"<[/]?a.*?>", re.DOTALL)
    return re.sub(pattern, "", doc)

In [14]:
def tokenize(doc, stopwords):
    """Tokenize document"""
    doc = doc.lower()
    doc = remove_htlm_tags(doc)
    pattern = re.compile(r"\w+")
    
    return [match.group() for match in pattern.finditer(doc) if match and match.group() not in stopwords]

In [15]:
def construct_corpus(contents, stopwords):
    """Construct corpus with given movie contents"""
    return {
        movie_id: tokenize(content.storyline, stopwords)
        for movie_id, content in contents.items()
    }

In [16]:
def construct_vocabulary(corpus):
    """Construct vocabulary by given corpus"""
    vocab = {}
    for doc_id, doc in corpus.items():
        for word, count in dict(Counter(doc)).items():
            vocab.setdefault(word, []).append((doc_id, count))
    return vocab

In [17]:
def take(n, iterable):
    return dict(islice(iterable, n))

In [18]:
def get_occurences(vocabulary):
    return {
        word: reduce(lambda total_count, doc_count: total_count + doc_count[1], doc_count_list, 0)
        for word, doc_count_list in vocabulary.items()
    }

In [19]:
def top_k_words(k, vocabulary):
    """Returns most frequent k words"""
    vocab_occurences = get_occurences(vocabulary)
    return sorted(vocab_occurences, key=lambda x: vocab_occurences[x], reverse=True)[:k]

In [20]:
def norm(sparse_vector):
    """Calculate L2 norm of given sparse vector"""
    return (reduce(lambda sum_, score: sum_ + score**2, sparse_vector.values(), 0))**0.5

In [21]:
def doc2vec(vocabulary, top_k_words, num_docs):
    """Compute document vectors"""
    doc2vec = {}
    for word in top_k_words:
        doc_count_list = vocabulary[word]
        document_freq = len(doc_count_list)
        for doc, term_freq in doc_count_list:
            doc2vec.setdefault(doc, {}).update({word: term_freq * (log(num_docs) - log(document_freq))})


    for doc_id, raw_tf_idf_vector in doc2vec.items():
        vector_norm = norm(raw_tf_idf_vector)
        doc2vec[doc_id] = {
            word: raw_tf_idf / vector_norm
            for word, raw_tf_idf in raw_tf_idf_vector.items()
        }
        
    return doc2vec

In [22]:
def construct_vector(text):
    tokens = tokenize(text, STOPWORDS)
    raw_vector = {
        word: tokens.count(word)
        for word in set(tokens)
    }
    vector_norm = norm(raw_vector)
    return {
        word: score / vector_norm
        for word, score in raw_vector.items()
    }

In [23]:
def sparse_dot_product(sparse_vec_1, sparse_vec_2):
    common_words = set(sparse_vec_1).intersection(set(sparse_vec_2))
    result = 0
    for word in common_words:
        result += sparse_vec_1[word] * sparse_vec_2[word]
    return result

In [24]:
def get_most_k_similar(k, text_vector, doc_vec):
    similarities = {
        doc_id: sparse_dot_product(doc_vector, text_vector)
        for doc_id, doc_vector in doc_vec.items()
    }
    return list(nlargest(k, similarities, key=similarities.get))

In [25]:
corpus = construct_corpus(all_movie_contents, STOPWORDS)

vocabulary = construct_vocabulary(corpus)

vocab_occurences = get_occurences(vocabulary)

top_n_words = top_k_words(NUMBER_WORDS, vocabulary)

num_docs = len(corpus)

doc_vec = doc2vec(vocabulary, top_n_words, num_docs)

## Recommendation

In [26]:
K = NUMBER_RECS

In [27]:
def recommend(imdb_id):
    """
    Gets an imdb id and returns a list of recommended movie ids for that movie. 
    """
    storyline = all_movie_contents[imdb_id].storyline
    
    text_vector = construct_vector(storyline)
    
    return get_most_k_similar(K, text_vector, doc_vec)

## Evaluation

In [28]:
def precision(rec_movie_ids, relevant_movie_ids):
    """Compute Precision"""
    count = 0
    for movie_id in relevant_movie_ids:
        if movie_id in rec_movie_ids:
            count += 1

    return count / (len(rec_movie_ids) + 10**-8)

In [29]:
def recall(rec_movie_ids, relevant_movie_ids):
    """Compute Recall"""
    count = 0
    for movie_id in rec_movie_ids:
        if movie_id in relevant_movie_ids:
            count += 1
            
    return count / (len(relevant_movie_ids) + 10**-8)

In [30]:
def evaluate_recommendations(rec_movie_ids, relevant_movie_ids, K):
    """
    Gets list of recommended and relevant movie ids and K value.
    
    Returns precision, recall, F1 values for K respectively. 
    """
    prec = precision(rec_movie_ids, relevant_movie_ids) 
    rec = recall(rec_movie_ids, relevant_movie_ids)
    F_1 = 2 * prec * rec / (prec + rec + 10**(-8))
    
    return prec, rec, F_1

In [31]:
def average_precision(rec_movie_ids, relevant_movie_ids):
    """Calculates average precision"""
    sum_prec = 0
    relevant_count = 0
    for i, rec in enumerate(rec_movie_ids):
        if rec in relevant_movie_ids:
            relevant_count += 1
            sum_prec += relevant_count / (i + 1)
    return sum_prec / (len(relevant_movie_ids) + 10**-8)

In [32]:
movie_ids = read_csv(MOVIE_ID_FILE)

In [37]:
precs = []
recalls = []
f_1s = []
for movie_id in movie_ids:
    rec_movie_ids = recommend(movie_id)
    relevant_movie_ids = all_movie_contents[movie_id].recommendations
    prec, rec, f_1 = evaluate_recommendations(rec_movie_ids, relevant_movie_ids, NUMBER_RECS)
    precs.append(prec)
    recalls.append(rec)
    f_1s.append(f_1)
    if prec > 0:
        print("Precision: {}, Recall: {}, F-1: {}".format(prec, rec, f_1)) 

Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.1111111109876543, F-1: 0.10526315279778417
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.12499999984374999, F-1: 0.11111110604938293
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.29999999969999996, Recall: 0.24999999979166665, F-1:

Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.49999999949999996, Recall: 0.41666666631944443, F-1: 0.4545454491735537
Precision: 0.0999999999, Recall: 0.0999999999, F-1: 0.09999999490000026
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.2857142853061225, F-1: 0.23529411252595167
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.14285714265306124, F-1: 0.1

Precision: 0.29999999969999996, Recall: 0.24999999979166665, F-1: 0.2727272675206612
Precision: 0.5999999993999999, Recall: 0.4999999995833333, F-1: 0.5454545399999999
Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.49999999949999996, Recall: 0.7142857132653061, F-1: 0.5882352885813149
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.7999999992, Recall: 0.666666666111111, F-1: 0.7272727216528925
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.29999999969999996, Recall: 0.24999999979166665, F-1: 0.2727272675206612
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.1666666

Precision: 0.49999999949999996, Recall: 0.41666666631944443, F-1: 0.4545454491735537
Precision: 0.49999999949999996, Recall: 0.41666666631944443, F-1: 0.4545454491735537
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.49999999949999996, Recall: 0.41666666631944443, F-1: 0.4545454491735537
Precision: 0.0999999999, Recall: 0.0999999999, F-1: 0.09999999490000026
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.0833333332

Precision: 0.29999999969999996, Recall: 0.24999999979166665, F-1: 0.2727272675206612
Precision: 0.49999999949999996, Recall: 0.41666666631944443, F-1: 0.4545454491735537
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.5999999993999999, Recall: 0.4999999995833333, F-1: 0.5454545399999999
Precision: 0.49999999949999996, Recall: 0.41666666631944443, F-1: 0.4545454491735537
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.1999999996, F-1: 0.13333332871111125
Precision: 0.3999999996, Recall: 0.3333333330555555, F-1: 0.3636363583471074
Precision: 0.0999999999, Recall: 0.08333333326388888, F-1: 0.09090908586776886
Precision: 0.1999999998, Recall: 0.16666666652777776, F-1: 0.181818176694215
Precision: 0.0999999999, Recall: 0.0833333332

In [36]:
sum_ap = 0
for movie_id in movie_ids:
    rec_movie_ids = recommend(movie_id)
    if not rec_movie_ids:
        print(movie_id)
    relevant_movie_ids = all_movie_contents[movie_id].recommendations
    sum_ap += average_precision(rec_movie_ids, relevant_movie_ids)
print("Mean Average Precision :{}".format(sum_ap / len(movie_ids)))

Mean Average Precision :0.039216018491483526
