# Week 8 – Document Similarity and Clustering
## Analyzing Term Similarity - Starting on page 459

In [5]:
import numpy as np
import pandas as pd
from scipy.stats import itemfreq

def vectorize_terms(terms):
    terms = [term.lower() for term in terms]
    terms = [np.array(list(term)) for term in terms]
    terms = [np.array([ord(char) for char in term]) 
                for term in terms]
    return terms

root = 'Believe'
term1 = 'beleive'
term2 = 'bargain'
term3 = 'Elephant'

terms = [root, term1, term2, term3]
print('Terms:\n', terms, '\n')

# Character vectorization
term_vectors = vectorize_terms(terms)

# show vector representations
vec_df = pd.DataFrame(term_vectors, index=terms)
print('Term vectors:\n', vec_df, '\n')

root_term = root
other_terms = [term1, term2, term3]

root_term_vec = vec_df[vec_df.index == root_term].dropna(axis=1).values[0]
other_term_vecs = [vec_df[vec_df.index == term].dropna(axis=1).values[0]
                   for term in other_terms]

Terms:
 ['Believe', 'beleive', 'bargain', 'Elephant'] 

Term vectors:
             0    1    2    3    4    5    6      7
Believe    98  101  108  105  101  118  101    NaN
beleive    98  101  108  101  105  118  101    NaN
bargain    98   97  114  103   97  105  110    NaN
Elephant  101  108  101  112  104   97  110  116.0 



## Hamming Distance starting on page 461

In [6]:
print('Hamming Distance:')
def hamming_distance(u, v, norm=False):
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')
    return (u != v).sum() if not norm else (u != v).mean()

# compute Hamming distance
for term, term_vector in zip(other_terms, other_term_vecs):
    try:
        print('Hamming distance between root: {} and term: {} is {}'.
              format(root_term, term, hamming_distance(root_term_vec, term_vector,
                                                       norm=False)))
    except ValueError as ve:
        print('An error occurred:' + str(ve))
        continue

# computer normalized Hamming distance - I caught the exception unlike the book
for term, term_vector in zip(other_terms, other_term_vecs):
    try:
        print('Normalized Hamming distance between root: {} and term: {} is {}'
              .format(root_term, term,round(hamming_distance(root_term_vec,
                                                             term_vector, norm=True), 2)))
    except ValueError as ve:
        print('An error occurred:' + str(ve))
        continue

Hamming Distance:
Hamming distance between root: Believe and term: beleive is 2
Hamming distance between root: Believe and term: bargain is 6
An error occurred:The vectors must have equal lengths.
Normalized Hamming distance between root: Believe and term: beleive is 0.29
Normalized Hamming distance between root: Believe and term: bargain is 0.86
An error occurred:The vectors must have equal lengths.


## Manhattan Distance - Page 463

In [7]:
print('\nManhattan Distance:')
def manhattan_distance(u, v, norm=False):
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')
    return abs(u - v).sum() if not norm else abs(u - v).mean()

# compute Manhattan distance
for term, vector_term in zip(other_terms, other_term_vecs):
    try:
        print('Manhattan distance between root: {} and term: {} is {}'
              .format(root_term, term, manhattan_distance(root_term_vec,
                                                          term_vector, norm=False)))
    except ValueError as ve:
        print('An error occurred:' + str(ve))
        continue

# computer normalized Manhattan distance
for term, term_vector in zip(other_terms, other_term_vecs):
    try:
        print('Normalized Manhattan distance between root: {} and term: {} is {}'
              .format(root_term, term, round(manhattan_distance(root_term_vec, term_vector,
                                                                norm=True), 2)))
    except ValueError as ve:
        print('An error occurred:' + str(ve))
        continue


Manhattan Distance:
An error occurred:The vectors must have equal lengths.
An error occurred:The vectors must have equal lengths.
An error occurred:The vectors must have equal lengths.
Normalized Manhattan distance between root: Believe and term: beleive is 1.14
Normalized Manhattan distance between root: Believe and term: bargain is 5.43
An error occurred:The vectors must have equal lengths.


## Euclidean Distance starting on page 463

In [8]:
print('\nEuclidean Distance:')
def euclidean_distance(u,v):
    if u.shape != v.shape:
        raise ValueError('The vectors must have equal lengths.')
    distance = np.sqrt(np.sum(np.square(u - v)))
    return distance

# compute Euclidean distance
for term, term_vector in zip(other_terms, other_term_vecs):
    try:
        print('Euclidean distance between root: {} and term: {} is {}'
              .format(root_term, term, round(euclidean_distance(root_term_vec,
                                                                term_vector), 2)))
    except ValueError as ve:
        print('An error occurred:' + str(ve))
        continue


Euclidean Distance:
Euclidean distance between root: Believe and term: beleive is 5.66
Euclidean distance between root: Believe and term: bargain is 17.94
An error occurred:The vectors must have equal lengths.


## Levenshtein Edit Distance starting on page 467

In [9]:
print('\nLevenshtein Edit Distance')
import copy
def levenshtein_edit_distance(u, v):
    # convert to lower case
    u = u.lower()
    v = v.lower()
    # base cases
    if u == v: return 0
    elif len(u) == 0: return len(v)
    elif len(v) == 0: return len(u)
    # initialize edit distance matrix
    edit_matrix = []
    # initialize two distance matrices 
    du = [0] * (len(v) + 1)
    dv = [0] * (len(v) + 1)
    # du: the previous row of edit distances
    for i in range(len(du)):
        du[i] = i
    # dv : the current row of edit distances    
    for i in range(len(u)):
        dv[0] = i + 1
        # compute cost as per algorithm
        for j in range(len(v)):
            cost = 0 if u[i] == v[j] else 1
            dv[j + 1] = min(dv[j] + 1, du[j + 1] + 1, du[j] + cost)
        # assign dv to du for next iteration
        for j in range(len(du)):
            du[j] = dv[j]
        # copy dv to the edit matrix
        edit_matrix.append(copy.copy(dv))
    # compute the final edit distance and edit matrix    
    distance = dv[len(v)]
    edit_matrix = np.array(edit_matrix)
    edit_matrix = edit_matrix.T
    edit_matrix = edit_matrix[1:,]
    edit_matrix = pd.DataFrame(data=edit_matrix,
                               index=list(v),
                               columns=list(u))
    return distance, edit_matrix

# Computer Levenshtein distance
for term, term_vector in zip(other_terms, other_term_vecs):
    edit_d, edit_m = levenshtein_edit_distance(root_term_vec, term_vector)
    print('Computing distance between root: {} and term: {}'.format(root_term,
                                                                    term))
    print('Levenshtein edit distance is {}'.format(edit_d))
    print('The complete edit distance matrix is depicted below')
    print(edit_m)
    print('-'*30)

print('\n')



Levenshtein Edit Distance


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

## Cosine Distance and Similarity starting on page 473

In [10]:
print('\nCosine Distance:')
def boc_term_vectors(word_list):
    word_list = [word.lower() for word in word_list]
    unique_chars = np.unique(np.hstack([list(word)
                                        for word in word_list]))
    word_list_term_counts = [{char: count for char, count in np.stack(
        np.unique(list(word), return_counts=True), axis=1)} for word in word_list]

    boc_vectors = [np.array([int(word_term_counts.get(char, 0))
                             for char in unique_chars])
                   for word_term_counts in word_list_term_counts]
    return list(unique_chars), boc_vectors

feature_names, feature_vectors = boc_term_vectors(terms)
boc_df = pd.DataFrame(feature_vectors, columns=feature_names, index=terms)
print('Bag of characters vectors:\n', boc_df)

def cosine_distance(u, v):
    distance = 1.0 - (np.dot(u, v) / 
                        (np.sqrt(sum(np.square(u))) * np.sqrt(sum(np.square(v)))))
    return distance

root_term_boc = boc_df[vec_df.index == root_term].values[0]
other_term_bocs = [boc_df[vec_df.index == term].values[0]
                   for term in other_terms]

# Compute Cosine Distance
print('\nCompute Cosine Distance:')
for term, boc_term in zip(other_terms, other_term_bocs):
    print('Analyzing similarity between root: {} and term: {}'.format(root_term, term))
    distance = round(cosine_distance(root_term_boc, boc_term), 2)
    similarity = 1 - distance                                                           
    print('Cosine distance  is {}'.format(distance))
    print('Cosine similarity  is {}'.format(similarity))
    print('-'*40)


Cosine Distance:
Bag of characters vectors:
           a  b  e  g  h  i  l  n  p  r  t  v
Believe   0  1  3  0  0  1  1  0  0  0  0  1
beleive   0  1  3  0  0  1  1  0  0  0  0  1
bargain   2  1  0  1  0  1  0  1  0  1  0  0
Elephant  1  0  2  0  1  0  1  1  1  0  1  0

Compute Cosine Distance:
Analyzing similarity between root: Believe and term: beleive
Cosine distance  is -0.0
Cosine similarity  is 1.0
----------------------------------------
Analyzing similarity between root: Believe and term: bargain
Cosine distance  is 0.82
Cosine similarity  is 0.18000000000000005
----------------------------------------
Analyzing similarity between root: Believe and term: Elephant
Cosine distance  is 0.39
Cosine similarity  is 0.61
----------------------------------------


## Building a Movie Recommender - starting on page 477

In [11]:
df = pd.read_csv('./data/tmdb_5000_movies.csv')
df.info()
print('Columns\n', df.columns, '\n')

df = df[['title', 'tagline', 'overview', 'genres', 'popularity']]
df.tagline.fillna('', inplace=True)
df['description'] = df['tagline'].map(str) + ' ' + df['overview']
df.dropna(inplace=True)
df.info()
print('\nSimplified DF:\n', df.head(), '\n')

# Text preprocessing - starting on page 480
import nltk
import numpy as np
import re

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special chars/whitespace
    doc = re.sub('[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize doc
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

df['description'] = \
        df['description'].apply(lambda x: normalize_corpus(x))
norm_corpus = df['description']
print('Length of normalized corpus:', len(norm_corpus), '\n')
print(df.info(), '\n')

# Save this updated corpus df
df.to_csv('./data/norm_corpus.csv')

# Extract TF-IDF Features - page 481
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(norm_corpus)
print('TFIDF matrix shape:\n', tfidf_matrix.shape, '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

### Cosine similarity and pairwise doc similarity page 482

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
doc_sim = cosine_similarity(tfidf_matrix)
doc_sim_df = pd.DataFrame(doc_sim)
print('Document similarity df:\n', doc_sim_df.head(),'\n')

# Movie list page 482
movies_list = df['title'].values
print('Movies list:\n', movies_list, movies_list.shape, '\n')

# Find top similar movies for a sample movie page 483
movie_idx = np.where(movies_list == 'Minions')[0][0]
print('Movie like Minions:\n', movie_idx, '\n')

# Movie similarities
movie_similarities = doc_sim_df.iloc[movie_idx].values
print('Movie similarities, like Minions:\n', movie_similarities, '\n')

# Top five similar movie IDs
similar_movie_idxs = np.argsort(-movie_similarities)[1:6]
print('Similar movie indices like Minions:\n', similar_movie_idxs, '\n')

# Get top five similar movies page 484
similar_movies = movies_list[similar_movie_idxs]
print('Similar movies to Minions:\n', similar_movies, '\n')

Document similarity df:
        0         1         2         3         4         5         6     \
0  1.000000  0.010701  0.000000  0.019030  0.028687  0.024901  0.000000   
1  0.010701  1.000000  0.011891  0.000000  0.041623  0.000000  0.014564   
2  0.000000  0.011891  1.000000  0.000000  0.000000  0.000000  0.000000   
3  0.019030  0.000000  0.000000  1.000000  0.008793  0.000000  0.015976   
4  0.028687  0.041623  0.000000  0.008793  1.000000  0.000000  0.022912   

       7         8         9     ...      4790  4791      4792      4793  \
0  0.026516  0.000000  0.007420  ...  0.009702   0.0  0.023336  0.033549   
1  0.027122  0.034688  0.007614  ...  0.009956   0.0  0.004818  0.000000   
2  0.022242  0.015854  0.004891  ...  0.042617   0.0  0.000000  0.000000   
3  0.023172  0.027452  0.073610  ...  0.000000   0.0  0.009667  0.000000   
4  0.028676  0.000000  0.023538  ...  0.014800   0.0  0.000000  0.000000   

       4794      4795  4796      4797      4798      4799  
0  0.00

### Build a movie recommender page 484

In [13]:
def movie_recommender(movie_title, movies=movies_list, doc_sims=doc_sim_df):
    # find movie id
    movie_idx = np.where(movies == movie_title)[0][0]
    # get movie similarities
    movie_similarities = doc_sims.iloc[movie_idx].values
    # get top 5 similar movie ids
    similar_movie_idxs = np.argsort(-movie_similarities)[1:6]
    # get top 5 movies
    similar_movies = movies[similar_movie_idxs]
    # return the top 5 movies
    return similar_movies

popular_movies = df.sort_values(by='popularity', ascending=False)
print('Popular movies:\n', popular_movies, '\n')

# Just 5 movies
for movie in popular_movies['title'][0:5]:
    print('Movie:', movie)
    print('Top 5 recommended movies:', movie_recommender(movie_title=movie), '\n')

Popular movies:
                            title  \
546                      Minions   
95                  Interstellar   
788                     Deadpool   
94       Guardians of the Galaxy   
127           Mad Max: Fury Road   
...                          ...   
4625            Midnight Cabaret   
4118      Hum To Mohabbat Karega   
4727                Penitentiary   
3361                  Alien Zone   
4553  America Is Still the Place   

                                                tagline  \
546        Before Gru, they had a history of bad bosses   
95    Mankind was born on Earth. It was never meant ...   
788             Witness the beginning of a happy ending   
94                          All heroes start somewhere.   
127                                  What a Lovely Day.   
...                                                 ...   
4625                The hot spot where Satan's waitin'.   
4118                                                      
4727  There's only 