**IMPORTANT NOTE:** Do not change signatures of methods defined below. Those methods will be used while grading your homework.

## IMDB Scraping

In [1]:
import pickle
import re
import requests
import time

from collections import namedtuple
from collections import Counter
from functools import reduce
from heapq import nlargest
from itertools import islice
from math import log

## CONSTANTS

In [2]:
MOVIE_ID_FILE = "movie_ids.csv"
ALL_MOVIE_CONTENTS = "all_movie_contents.pickle"
BASE_URL = "https://www.imdb.com/title/"
NUMBER_WORDS = 5000
NUMBER_RECS = 12
TF_IDF_THRESHOLD = 10.7

### Reading movie ids from csv file

In [3]:
def read_csv(path):
    """Read csv file
    
    Parameters
    ----------
    path: str
        File to read
    
    Returns
    -------
    list
        List of IMDB ids read from file
    """
    with open(path, "r") as f:
        lines = f.readlines()
    return [line.strip() for line in lines if line]

### Making Request to IMDB

-  It requires `requests` library to make `GET` reqest to IMDB
-  Link format is https://www.imdb.com/title/{imdb_id}

In [4]:
def get_html_content(imdb_id):
    """Get HTML content of movie
    
    Parameters
    ----------
    imdb_id: str
        Movie id

    Returns
    -------
    str
        Raw text, which should be in html format
    """
    url = BASE_URL + imdb_id
    response = requests.get(url)
    if response.status_code >= 400:
        raise ValueError("Something went wrong in request !: {}, {}".format(url, response.status_code))

    return response.text

In [5]:
def find_title(html_content):
    """Find title in given html content

    Parameters
    ----------
    html_content: str
        Raw html text

    Returns
    -------
    str:
        Matched title in html text
    """
    pattern = re.compile(r'<div\s+?class\s*?=\s*?"title_wrapper".*?'
                         + r'<h1\s*?class\s*?=\s*?".*?">(.*?)&nbsp;', re.DOTALL)
    match = pattern.search(html_content)
    
    return match.group(1).strip()

In [6]:
def find_recommendations(html_content):
    """Find recommended items

    It uses regex to extract recommendation item in `html_content`.
    In regex pattern, I have concerned that all recommendation items are
    in a <div> tag which has some attributes specific to recommendation items

    Parameters
    ----------
    html_content: str
        Raw html text

    Returns
    -------
    list
        Matched all recommended items in html text
    """
    pattern = re.compile(r'<div\s+?class\s*?=\s*?"(?:rec_item|rec_item rec_selected)" data-info="" data-spec=".*?" data-tconst="(.*?)">')
    
    return pattern.findall(html_content)

In [7]:
def find_storyline(html_content):
    """Find storylin in given html content
    
    By using regex expression, this function extracts storyline of movie
    from given `html_content`.
    
    Parameters
    ----------
    html_content: str
        Raw html text

    Returns
    -------
    str
        Matched storyline in `html_content`
    
    """
    pattern = re.compile(r'<h2>\s*?Storyline\s*?</h2>.*?'
                         + r'<span>(.*?)</span>', re.DOTALL)

    match = pattern.search(html_content)

    return match.group(1).strip()

In [8]:
def get_movie_contents(imdb_id):
    """
    Gets an imdb id and returns its title, storyline, list of IMDB recommendations respectively.

    Parameters
    ----------
    imdb_id: str
        IMDB id of the movie

    Returns
    -------
    title: str
        Title of the movie
    storyline: str
        Storyline of the movie
    recommendations: list
        List of recommended films on IMDB website
    """
    html_content = get_html_content(imdb_id)
    try:
        title = find_title(html_content)
        storyline = find_storyline(html_content)
        recommendations = find_recommendations(html_content)
    except AttributeError as e:
        print(html_content)
        raise AttributeError

    return title, storyline, recommendations

## Scraping Procedure

In the scraping procedure I have followed the procedure:

1. Get all movie ids from a file
2. Create or Restore a dictionary that keeps movie contents, then for each movie id that is not in this dictionary:
2. Get html content from imdb
3. Get contents from html content, which are title, storyline, recommendations
4. Store it to the dictionary

### Notes

-  IMDB website has a limit on number of requests, that is why in scraping method,if a `ConnectionError` happens, the program will try to get content again, after waiting 10 seconds
-  To be able to scrape partially, if something bad happens when try to getting contents of a movie, scraping method save immediate dictionary, so you could restore it from where it stops
-  As prerequisite, you <b>MUST</b> declare `MovieContent` namedtuple

In [9]:
def scraping(path, pickle_path, restore=False):
    """Whole scraping procedure
    
    It first gets all movie ids from a csv file in `path`,
    then if `restore` is True, it restores pickle file from `pickle_path`, else
    it starts to keep scraped data in new generated dictionary. This dictionary,
    all movie contents that are scraped are kept in following format:
    
    {
        {imdb_id}: MovieContent(
            title={movie_title},
            storyline={movie_storyline},
            recommendations={movie_recommendations}
        )
    }
    
    Since I have used a namedtuple `MovieContent`, before calling this function,
    A namedtuple MovieContent MUST be declared, i.e. see following cells
    
    After declaring `all_movie_contents` dict this method gets all contents for movie ids
    got from csv file and for the movie ids in their recommendation list


    Parameters
    ----------
    path: str
        Path to read movie ids
    pickle_path: str
        Path to pickle file, which is used both restore and save contents
    restore: bool
        Decided restore from `pickle_path` or start from scratch

    Returns
    -------
    all_movie_contens: dict
        All contents of movies
    """
    movie_ids = read_csv(path)
    if restore:
        with open(pickle_path, "rb") as f:
            all_movie_contents = pickle.load(f)
    else:
        all_movie_contents = {}
    for i, movie_id in enumerate(movie_ids):
        if i % 20 == 0:
            print("Movie id: {}".format(movie_id))
            print("Number of collected movie: {}".format(len(all_movie_contents)))
        if movie_id in all_movie_contents:
            continue
        while True:
            try:
                title, storyline, recs = get_movie_contents(movie_id)
            except ConnectionError as e:
                time.sleep(4)
                continue
            except Exception as e:
                with open(pickle_path, "wb") as f:
                    pickle.dump(all_movie_contents, f)
                raise e
            break

        movie_content = MovieContent(
            title=title,
            storyline=storyline,
            recommendations=recs
        )
        all_movie_contents[movie_id] = movie_content
        for rec_id in recs:
            if rec_id in all_movie_contents:
                continue
            while True:
                try:
                    rec_title, rec_storyline, rec_recs = get_movie_contents(rec_id)
                except ConnectionError as e:
                    time.sleep(10)
                    continue
                except Exception as e:
                    with open(pickle_path, "wb") as f:
                        pickle.dump(all_movie_contents, f)
                    raise e
                break

            rec_movie_content = MovieContent(
                title=rec_title,
                storyline=rec_storyline,
                recommendations=rec_recs
            )
            all_movie_contents[rec_id] = rec_movie_content

    with open(pickle_path, "wb") as f:
        pickle.dump(all_movie_contents, f)

    return all_movie_contents

In [10]:
MovieContent = namedtuple("MovieContent", "title storyline recommendations")

# To scrape whole data uncomment below line
# all_movie_contents = scraping(MOVIE_ID_FILE, ALL_MOVIE_CONTENTS, restore=False)

In [11]:
# If you have already scraped data, you can load from pickle file
with open(ALL_MOVIE_CONTENTS, "rb") as f:
    all_movie_contents = pickle.load(f)

## Tf-idf model

Tf-idf model requires some preprocessing on content data

-  Constructing corpus
-  Constructing vocabulary, with document frequencies
-  Selecting words to represent documents
    -  To select words, I have set a threshold for tf-idf score, which is 10.7
    -  I have selected this threshold because if a word occurs once in only one document, their tf-idf scores will be $1 \times \log{\frac{N}{1}} \approx 10.61$ where $N = 2820$ for my corpus
-  Creating document vectors

In [12]:
def remove_htlm_tags(doc):
    """Clean links
    
    Some storylines contain <a> html tag, they should be removed
    
    Parameters
    ----------
    doc: str
        Document to remove <a> html tags

    Returns
    -------
    str
        HTML tag <a> removed version of doc
    """
    pattern = re.compile(r"<[/]?a.*?>", re.DOTALL)
    return re.sub(pattern, "", doc)

In [13]:
def tokenize(doc, stopwords=[]):
    """Tokenize document
    
    Tokenize document and use `stopwords` to exclude some tokens.
    This tokenization process ignores puntuations and it considers
    only portions that contains alphanumeric characters and underscore
    
    Note: Before tokenization process, it lowers all the characters in `doc`
    and removes <a> tag in `doc`
    
    Parameters
    ----------
    doc: str
        Document to tokenize
    stopwords: list
        Lowercased stopword list

    Returns
    -------
    list
        Tokenized version of document as token list
    """
    doc = doc.lower()
    doc = remove_htlm_tags(doc)
    pattern = re.compile(r"\w+")
    
    return [match.group() for match in pattern.finditer(doc) if match and match.group() not in stopwords]

In [14]:
def construct_corpus(contents, stopwords=[]):
    """Construct corpus with given movie contents
    
    For each movie in `contents` dictionary program tokenize its
    storyline, and store it in a dictionary
    
    Parameters
    ----------
    contents: dict
        Dictionary of movie_id: movie_contents
    stopwords: list
        List of stopwords, default is empty

    Returns
    -------
    dict
        Movie id as key, tokenized storyline as value
    """
    return {
        movie_id: tokenize(content.storyline, stopwords)
        for movie_id, content in contents.items()
    }

In [15]:
def construct_vocabulary(corpus):
    """Construct vocabulary by given corpus
    
    For each document in corpus, for each word in the document
    a (document id, term frequency) tuple added to vocabulary dict.
    So, example resulted dictionary would be:
    
    {
        "example_word": [
            N / document frequency of example_word
            ("imdb-id-1", 12),
            ("imdb-id-2", 2),
            ...
        ],
        ...
    }
    
    Parameters
    ----------
    corpus: dict
        Corpus which contains documents
        
    Returns
    -------
    vocab: dict
        Vocabulary dictionary
    """
    vocab = {}
    N = len(corpus)
    for doc_id, doc in corpus.items():
        for word, count in dict(Counter(doc)).items():
            vocab.setdefault(word, []).append((doc_id, count))
    return {
        word: [N / len(postings)] + postings
        for word, postings in vocab.items()
    }

In [16]:
def get_max_tf_idf_score(vocabulary):
    """Get total number of occurences for each word in vocabulary
    
    Parameters
    ----------
    vocabulary: dict
        Vocabulary of words
        
    Returns
    -------
    dict
        Dictionary of word: max tf-idf score of word
    """
    tf_idfs = {}
    for word, postings in vocabulary.items():
        idf = log(postings[0])
        for doc, tf in postings[1:]:
            tf_idfs.setdefault(word, []).append(tf * idf)

    return {
        word: max(tf_idf_scores)
        for word, tf_idf_scores in tf_idfs.items()
    }

In [17]:
def top_k_words(k, vocabulary):
    """Returns top tf-idf scored k words
    
    Parameters
    ----------
    k: int
        Number of words to select
    vocabulary: dict
        Vocabulary of words

    Returns
    -------
    list
        List of most tf-idf scored `k` words
    """
    max_tf_idf_scores = get_max_tf_idf_score(vocabulary)

    return list(nlargest(k, max_tf_idf_scores, key=max_tf_idf_scores.get))

In [18]:
def top_words(vocabulary, threshold):
    """Returns top tf-idf scoerd words
    
    It calculates max tf-idf score of each word, then returns
    words that their tf-idf scores are greater than some `threshold`
    
    Parameters
    ----------
    vocabulary: dict
        Vocabulary of words
    threshold: float
        Threshold for tf-idf scores
        
    Returns
    -------
    list
        List of words that they are above of some `threshold`
    """
    max_tf_idf_scores = get_max_tf_idf_score(vocabulary)
    
    return [word for word, max_tf_idf in max_tf_idf_scores.items() if max_tf_idf > threshold]
    

In [19]:
def construct_document_lengths(vocabulary, feature_words):
    """Construct document length dict
    
    Calculate length of each document, which is normalization factor

    Parameters
    ----------
    vocabulary: dict
        Postings dict of all words
    feature_words
        List of words to encode documents

    Returns 
    -------
    dict
        Length of each document
    """
    
    lengths = {}

    for word in feature_words:
        postings = vocabulary[word]
        idf = log(postings[0])
        for doc, tf in postings[1:]:
            if doc in lengths:
                lengths[doc] += (tf * idf) ** 2
            else:
                lengths[doc] = (tf * idf) ** 2
    
    return {
        doc: norm_squared ** 0.5
        for doc, norm_squared in lengths.items()
    }

In [20]:
corpus = construct_corpus(all_movie_contents)

vocabulary = construct_vocabulary(corpus)

max_tf_idf_scores = get_max_tf_idf_score(vocabulary)

feature_words = top_words(vocabulary, TF_IDF_THRESHOLD)
# To use top `NUMBER_WORDS` as encoding words comment out above line
# Uncomment below line
# feature_words = top_k_words(NUMBER_WORDS ,vocabulary)

lengths = construct_document_lengths(vocabulary, feature_words)

## Recommendation

In the recommendation part,

-  First, program gets storyline from IMDB site for the movie
-  Then, it computes normalized vector representation for this storyline
-  Calculate similarity to each element in the corpus
-  Recommend a most similar K movies from the corpus

In [21]:
def construct_vector(text):
    """Constructs vector for given text
    
    It calculates tf-idf scores and normalize vector
    
    Parameters
    ----------
    text: str
        Text to compute vector

    Returns
    -------
    dict
        Sparse normalized vector of text
    """
    tokens = tokenize(text)
    vector = {}
    for word in feature_words:
        if word in tokens:
            tf = tokens.count(word)
            idf = log(vocabulary[word][0])
            vector[word] = tf * idf

    return vector

In [22]:
def get_most_k_similar(k, text_vector):
    """Gets most similar movies
    
    Calculate similarities between `text_vector` and each document in the corpus
    and returns most similar `k` elements
    
    Parameters
    ----------
    k: int
        Number of recommendations
    text_vector: dict
        Sparse vector representation of a text
        
    Returns
    -------
    list
        List of most similar documents' ids
    """
    scores = {}
    for word, score in text_vector.items():
        idf = log(vocabulary[word][0])
        for doc, tf in vocabulary[word][1:]:
            if doc in scores:
                scores[doc] += tf * idf * score
            else:
                scores[doc] = tf * idf * score
    
    scores = {doc_id: score / lengths[doc_id] for doc_id, score in scores.items()}

    return list(nlargest(k, scores, key=scores.get))

In [23]:
def recommend(imdb_id):
    """
    Gets an imdb id and returns a list of recommended movie ids for that movie. 
    
    Parameters
    ----------
    imdb_id: str
        IMDB id

    Returns
    -------
    list
        List of recommended movie ids for that movie
    """
    _, storyline, _ = get_movie_contents(imdb_id)
    
    text_vector = construct_vector(storyline)
    
    recommendations = get_most_k_similar(NUMBER_RECS + 1, text_vector)
    if imdb_id in recommendations:
        recommendations.remove(imdb_id)
    
    return recommendations

## Evaluation

In the evaluation part, for given recomended movie ids and relevant movie ids(which are IMDB recommendations), I have calculated 3 metrics <b>Precision</b>, <b>Recall</b>, and <b>F-1 score</b>.

In last method, <b>evaluation_system</b>, for a given movie id, I have tested system for different K values and printed out each metric's result for each K

K: $1, 2, 3, 10$

In [24]:
def precision(rec_movie_ids, relevant_movie_ids, K):
    """Compute Precision"""
    count = 0
    for movie_id in relevant_movie_ids:
        if movie_id in rec_movie_ids[:K]:
            count += 1

    return count / K

In [25]:
def recall(rec_movie_ids, relevant_movie_ids, K):
    """Compute Recall"""
    count = 0
    for movie_id in rec_movie_ids[:K]:
        if movie_id in relevant_movie_ids:
            count += 1
            
    return count / (len(relevant_movie_ids) + 10**-8)

In [26]:
def evaluate_recommendations(rec_movie_ids, relevant_movie_ids, K):
    """
    Gets list of recommended and relevant movie ids and K value.
    
    Returns precision, recall, F1 values for K respectively. 
    """
    prec = precision(rec_movie_ids, relevant_movie_ids, K) 
    rec = recall(rec_movie_ids, relevant_movie_ids, K)
    F_1 = 2 * prec * rec / (prec + rec + 10**(-8))
    
    return prec, rec, F_1

In [27]:
def evaluate_system(movie_id):
    """Evaluate system for given movie id"""
    rec_movie_ids = recommend(movie_id)
    relevant_movie_ids = all_movie_contents[movie_id].recommendations

    for K in (1, 2, 3, 10):
        prec, rec, f_1 = evaluate_recommendations(rec_movie_ids, relevant_movie_ids, K)
        print("Precision: {}, Recall: {}, F-1: {}".format(prec, rec, f_1))

In [28]:
movie_ids = read_csv(MOVIE_ID_FILE)

In [29]:
evaluate_system(movie_ids[30])

Precision: 0.0, Recall: 0.0, F-1: 0.0
Precision: 0.5, Recall: 0.08333333326388888, F-1: 0.14285714030612245
Precision: 0.3333333333333333, Recall: 0.08333333326388888, F-1: 0.13333333004444453
Precision: 0.4, Recall: 0.3333333330555555, F-1: 0.3636363585123967
