# Term Similarity
## subsection of _Text Similarity and Clustering_

* Analyzing Document Similarity
* Building a Movie Recommender
    1. Load and View Dataset
    2. Text Preprocessing
    3. Extract TF-IDF Features
    4. Cosine Similarity for Pairwise Document Similarity
    5. Find Top Similar Movies for a Sample Movie
    6. Build a Movie Recommender
    7. Get a List of Popular Movies
    8. Okapi BM25 Ranking for Pairwise Document Similarity

# Building a Movie Recommender

## Load and View Dataset

In [None]:
import pandas as pd

# Location of the gzip-compressed TMDB 5000 movies CSV.
filepath = '/data/tmdb_5000_movies.csv.gz' # NOTE(review): presumably the dataset must be imported/uploaded to this path first - confirm
# Read the compressed CSV straight into a DataFrame.
df = pd.read_csv(filepath, compression='gzip')
# Summarise columns, dtypes and non-null counts.
df.info()

In [None]:
# preview the first rows of the raw dataset
df.head()

In [None]:
# combine text content from movie tagline and overview columns into a new column called description
# take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of the original (avoids chained-assignment problems)
df = df[['title', 'tagline', 'overview', 'genres', 'popularity']].copy()
# assign the filled column back: an in-place fillna through attribute access is a
# chained assignment, which warns on recent pandas and has no effect under Copy-on-Write
df['tagline'] = df['tagline'].fillna('')
df['description'] = df['tagline'].map(str) + ' ' + df['overview']
# rows still containing missing values (e.g. no overview) cannot be described; drop them
df.dropna(inplace=True)
df.info()

In [None]:
# preview the first rows, now including the combined description column
df.head()

## Text Preprocessing

In [None]:
# preprocessing on movie descriptions before building features
import nltk
import re
import numpy as np

# English stop words, stored as a set because they are only used for per-token
# membership tests (constant-time lookup instead of scanning a list)
stop_words = set(nltk.corpus.stopwords.words('english'))

def normalize_document(doc):
    """Normalize a single document string before feature extraction.

    Removes every character that is not an ASCII letter, digit or whitespace,
    lower-cases and strips the text, tokenizes it with NLTK and discards the
    tokens found in the module-level `stop_words`.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A positionally only capped the
    # number of removed characters and never applied the flags
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# wrap the per-document function so it can be applied element-wise to a whole corpus
normalize_corpus = np.vectorize(normalize_document)

# normalize every movie description; the result is a NumPy array of cleaned strings
norm_corpus = normalize_corpus(list(df['description']))

# number of normalized documents
len(norm_corpus)

## Extract TF-IDF Features

In [None]:
# use TF-IDF to vectorize preprocessed movie descriptions and converting them into numeric vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2) requests unigram and bigram features; min_df=2 sets the
# minimum document frequency a term needs in order to be kept
tf = TfidfVectorizer(ngram_range=(1,2), min_df=2)
# learn the vocabulary and transform the normalized descriptions in one step
tfidf_matrix = tf.fit_transform(norm_corpus)
# (number of documents, number of features)
tfidf_matrix.shape

## Cosine Similarity for Pairwise Document Similarity

In [None]:
# compute Cosine similarity scores for documents instead of terms
from sklearn.metrics.pairwise import cosine_similarity

# pairwise cosine similarity between all rows (documents) of the TF-IDF matrix
doc_sim = cosine_similarity(tfidf_matrix)
# wrap in a DataFrame; rows and columns are positional document indices
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

In [None]:
# build an array of all movie titles in the dataset
# (taken from the same df as the descriptions, so positions line up with the similarity matrix)
movies_list = df['title'].values
movies_list, movies_list.shape

## Find Top Similar Movies for a Sample Movie

In [None]:
# find movie ID
# np.where yields the positions whose title equals 'Minions'; [0][0] takes the first match
movie_idx = np.where(movies_list == 'Minions')[0][0]
movie_idx

In [None]:
# get movie similarities
# use the positional index to obtain the vector of pairwise similarities
# between the movie Minions and every movie in the dataset
movie_similarities = doc_sim_df.iloc[movie_idx].values
movie_similarities

In [None]:
# get top five similar movie IDs
# rank all movies by descending similarity, then drop the query movie by its
# index: skipping position 0 is unreliable because ties (e.g. duplicate
# descriptions) mean the query movie is not guaranteed to be ranked first
similar_movie_idxs = np.argsort(-movie_similarities)
similar_movie_idxs = similar_movie_idxs[similar_movie_idxs != movie_idx][:5]
similar_movie_idxs

In [None]:
# get top five similar movies
# index the title array with the selected positions to turn IDs into titles
similar_movies = movies_list[similar_movie_idxs]
similar_movies

## Build a Movie Recommender

In [None]:
# build movie recommender function
def movie_recommender(movie_title, movies=movies_list, doc_sims=doc_sim_df, top_n=5):
    """Recommend the movies most similar to `movie_title`.

    Parameters
    ----------
    movie_title : str
        Title to look up in `movies`; the first exact match is used.
    movies : numpy.ndarray
        Array of movie titles, positionally aligned with the rows of `doc_sims`.
    doc_sims : pandas.DataFrame
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    numpy.ndarray
        Up to `top_n` titles ordered by descending similarity. The query
        movie itself is never part of the result.

    Raises
    ------
    IndexError
        If `movie_title` does not occur in `movies`.
    """
    # find movie id
    matches = np.where(movies == movie_title)[0]
    if matches.size == 0:
        raise IndexError(f'movie title not found: {movie_title!r}')
    movie_idx = matches[0]
    # get movie similarities
    movie_similarities = doc_sims.iloc[movie_idx].values
    # rank every movie by descending similarity
    ranked_idxs = np.argsort(-movie_similarities)
    # remove the query movie by index instead of assuming it is ranked first:
    # with ties, or with asymmetric scores such as BM25, another movie can
    # outrank it, and slicing [1:] would then drop a real recommendation
    similar_movie_idxs = ranked_idxs[ranked_idxs != movie_idx][:top_n]
    # return the top movies
    return movies[similar_movie_idxs]

In [None]:
# get a list of popular movies:
# sort the movies dataset by popularity score, highest first, so that some of
# the most popular titles can be picked as sample inputs for the recommender
pop_movies = df.sort_values(by='popularity', ascending=False)
pop_movies.head()

In [None]:
# hand-picked popular titles used as sample queries for the recommender
popular_movies = [
    'Minions',
    'Interstellar',
    'Deadpool',
    'Jurassic World',
    'Pirates of the Caribbean: The Curse of the Black Pearl',
    'Dawn of the Planet of the Apes',
    'The Hunger Games: Mockingjay - Part 1',
    'Terminator Genisys',
    'Captain America: Civil War',
    'The Dark Knight',
    'The Martian',
    'Batman v Superman: Dawn of Justice',
    'Pulp Fiction',
    'The Godfather',
    'The Shawshank Redemption',
    'The Lord of the Rings: The Fellowship of the Ring',
    'Harry Potter and the Chamber of Secrets',
    'Star Wars',
    'The Hobbit: The Battle of the Five Armies',
    'Iron Man',
]

In [None]:
# get top five recommended movies for each of these movies using movie recommender function
# (uses the recommender's default similarity matrix)
for movie in popular_movies:
    print('Movie:', movie)
    print('Top 5 recommended Movies:', movie_recommender(movie_title=movie))
    # blank line between movies for readability
    print()

## Okapi BM25 Ranking for Pairwise Document Similarity

In [None]:
"""
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""

import math
from six import iteritems
from six.moves import xrange

# k1: weights the term-frequency part of the BM25 score (see BM25.get_score)
PARAM_K1 = 2.5
# b: weights the document-length ratio doc_len / avgdl in the BM25 denominator
PARAM_B = 0.85
# factor applied to the average idf to replace a negative idf value
EPSILON = 0.2

class BM25(object):
    """Implementation of the Best Matching 25 (Okapi BM25) ranking function.

    Attributes
    ----------
    corpus_size : int
        Size of corpus (number of documents).
    avgdl : float
        Average length of document in `corpus` (0.0 for an empty corpus).
    corpus : list of list of str
        Corpus of documents.
    f : list of dicts of int
        Term frequencies for each document in `corpus`. Words used as keys and frequencies as values.
    df : dict
        Document frequencies for the whole `corpus`: for each word, the number of documents containing it.
    idf : dict
        Inverse document frequencies for the whole `corpus`. Words used as keys and idf values as values.
    doc_len : list of int
        List of document lengths.
    """

    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.
        """
        self.corpus_size = len(corpus)
        total_length = sum(float(len(document)) for document in corpus)
        # an empty corpus has no average length; avoid dividing by zero
        self.avgdl = total_length / self.corpus_size if self.corpus_size else 0.0
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []
        self.initialize()

    def initialize(self):
        """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
        for document in self.corpus:
            # term frequencies of this document
            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_len.append(len(document))
            self.f.append(frequencies)

            # each distinct word of the document counts once towards its document frequency
            for word in frequencies:
                self.df[word] = self.df.get(word, 0) + 1

        for word, freq in self.df.items():
            # negative when the word occurs in more than half of the documents
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.

        Parameters
        ----------
        document : list of str
            Document to be scored.
        index : int
            Index of document in corpus selected to score with `document`.
        average_idf : float
            Average idf in corpus.

        Returns
        -------
        float
            BM25 score (0 when the two documents share no word).
        """
        score = 0
        term_freqs = self.f[index]
        for word in document:
            if word not in term_freqs:
                continue
            # negative idf values are replaced by a small positive floor
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * term_freqs[word] * (PARAM_K1 + 1)
                      / (term_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        """Computes and returns BM25 scores of given `document` in relation to
        every item in corpus.

        Parameters
        ----------
        document : list of str
            Document to be scored.
        average_idf : float
            Average idf in corpus.

        Returns
        -------
        list of float
            BM25 scores, one per corpus document in corpus order.
        """
        return [self.get_score(document, index, average_idf)
                for index in range(self.corpus_size)]


def get_bm25_weights(corpus):
    """Returns BM25 scores (weights) of documents in corpus.

    Each document is scored, as a query, against every document in the
    given corpus.

    Parameters
    ----------
    corpus : list of list of str
        Corpus of documents.

    Returns
    -------
    list of list of float
        BM25 scores; ``result[i][j]`` is the score of document ``i`` against
        document ``j``. An empty corpus yields an empty list.

    Examples
    --------
    >>> corpus = [
    ...     ["black", "cat", "white", "cat"],
    ...     ["cat", "outer", "space"],
    ...     ["wag", "dog"]
    ... ]
    >>> result = get_bm25_weights(corpus)
    """
    # use len() rather than truthiness so that array-like corpora also work
    if len(corpus) == 0:
        return []

    bm25 = BM25(corpus)
    # a corpus made only of empty documents has no vocabulary; avoid dividing by zero
    if bm25.idf:
        average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    else:
        average_idf = 0.0

    return [bm25.get_scores(doc, average_idf) for doc in corpus]

In [None]:
# before using the function, the corpus needs to be tokenized first
# keep the tokens as a plain list of lists: the documents have different
# lengths, and np.array() on such a ragged sequence raises ValueError on
# NumPy >= 1.24 (the BM25 code only needs a sequence of token lists)
norm_corpus_tokens = [nltk.word_tokenize(doc) for doc in norm_corpus]
norm_corpus_tokens[:3]

In [None]:
%%time
# use function to build pairwise document similarity matrix
# wts[i][j] is the BM25 score of document i, used as the query, against document j
wts = get_bm25_weights(norm_corpus_tokens)

In [None]:
# viewing our pairwise similarity matrix
# wrap the nested score lists in a DataFrame so rows can be selected by position
bm25_wts_df = pd.DataFrame(wts)
bm25_wts_df.head()

In [None]:
# use movie recommender function to get top five movie recommendations for popular movies selected earlier
# same loop as for cosine similarity, but ranking with the BM25 score matrix
for movie in popular_movies:
    print('Movie:', movie)
    print('Top 5 recommended Movies:', movie_recommender(movie_title=movie, doc_sims=bm25_wts_df))
    # blank line between movies for readability
    print()