In [5]:
import pickle

# Load the precomputed per-episode data from disk (binary mode, as pickle requires).
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only run this against a pickle file you produced yourself or otherwise trust.
# Presumably a season -> episode mapping; later cells index it as
# episodes[s_num][ep_num] -- TODO confirm against the notebook that wrote it.
with open('simpsons_scripts_tfidf.pickle', 'rb') as pickle_file:
    episodes = pickle.load(pickle_file)

In [10]:
# Flatten out episode data for easier comparisons
def _flatten_episodes(nested):
    """Turn the nested ``nested[s_num][ep_num]`` mapping into a flat list.

    Each returned record holds:
      * 'id'           -- label built as 'S<s_num>E<ep_num>'
      * 'top_words'    -- set of the 'phrase' values from the episode's 'top_words'
      * 'tfidf_vector' -- the episode's 'tfidf_vector', passed through unchanged

    Records appear in the iteration order of the outer, then inner, mapping.
    """
    records = []
    for s_num in nested:
        for ep_num in nested[s_num]:
            episode = nested[s_num][ep_num]
            records.append({
                'id': 'S{s_num}E{ep_num}'.format(s_num=s_num, ep_num=ep_num),
                'top_words': {top_word['phrase'] for top_word in episode['top_words']},
                'tfidf_vector': episode['tfidf_vector'],
            })
    return records


flat_eps = _flatten_episodes(episodes)

In [11]:
# Perform pairwise comparisons of Jaccard and Cosine Similarity

import numpy as np

def jaccard_similarity(phrases1, phrases2):
    """Jaccard similarity of two sets: size of intersection over size of union.

    Args:
        phrases1: A set of phrases.
        phrases2: Another set of phrases.

    Returns:
        ``len(phrases1 & phrases2) / len(phrases1 | phrases2)``.
        When both sets are empty the union is empty and the ratio is
        undefined; 0.0 is returned in that case (mirroring how
        zero-length vectors are scored by the cosine measure) instead of
        raising ZeroDivisionError.
    """
    union_size = len(phrases1.union(phrases2))
    if not union_size:
        # Both inputs are empty -- nothing to compare, avoid dividing by zero.
        return 0.0
    return len(phrases1.intersection(phrases2)) / union_size

def cosine_similarity(ep1_vec, ep2_vec):
    """Cosine similarity of two vectors: A.B / (norm(A) * norm(B)).

    Args:
        ep1_vec: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        ep2_vec: Second vector, same length as ``ep1_vec``.

    Returns:
        The dot product divided by the product of the two norms.
        If either vector has zero norm the quotient is undefined and
        0.0 is returned instead.
    """
    numerator = np.dot(ep1_vec, ep2_vec)
    # The norms are scalars, so an ordinary product is what is meant here
    # (np.dot on two scalars merely happens to multiply them).
    denominator = np.linalg.norm(ep1_vec) * np.linalg.norm(ep2_vec)

    if not denominator:
        # Return a float so every code path yields the same kind of value.
        return 0.0
    return numerator / denominator

# Score every unordered pair of episodes exactly once: the inner index
# always starts one past the outer index, so (a, b) is never repeated as (b, a)
# and no episode is compared with itself.
similarities = []
for ep1 in range(len(flat_eps) - 1):
    first = flat_eps[ep1]
    for ep2 in range(ep1 + 1, len(flat_eps)):
        second = flat_eps[ep2]
        comparison = {
            'id1': first['id'],
            'id2': second['id'],
            'jaccard_similarity': jaccard_similarity(first['top_words'], second['top_words']),
            'cosine_similarity': cosine_similarity(first['tfidf_vector'], second['tfidf_vector']),
        }
        similarities.append(comparison)

In [24]:
# Number of highest-scoring pairs to list for each metric.
top_n = 25

# Rank every compared pair from most to least similar, once per metric.
top_jaccard_similarities = sorted(similarities, key=lambda x: x['jaccard_similarity'], reverse=True)
top_cosine_similarities = sorted(similarities, key=lambda x: x['cosine_similarity'], reverse=True)

print('Most similar episodes:')
print('Jaccard\t\t\t\tCosine')
# Iterate over slices instead of indexing range(top_n): a slice simply stops
# at the end of the list, so having fewer than top_n pairs prints what exists
# rather than raising IndexError. Both rankings have the same length, so zip
# pairs row i of one with row i of the other.
for jaccard_pair, cosine_pair in zip(top_jaccard_similarities[:top_n],
                                     top_cosine_similarities[:top_n]):
    print('{j1}, {j2}\t\t\t{c1}, {c2}'.format(j1=jaccard_pair['id1'],
                                              j2=jaccard_pair['id2'],
                                              c1=cosine_pair['id1'],
                                              c2=cosine_pair['id2']))

Most similar episodes:
Jaccard				Cosine
S16E1, S16E2			S13E10, S15E14
S4E12, S9E11			S3E20, S19E16
S1E9, S6E3			S13E18, S22E14
S5E9, S6E3			S2E9, S7E18
S5E8, S9E11			S2E21, S7E2
S10E5, S13E17			S6E15, S7E15
S22E12, S22E20			S25E15, S29E12
S7E1, S7E10			S13E12, S17E22
S4E15, S6E3			S9E18, S19E10
S23E13, S24E10			S1E12, S6E15
S10E18, S12E17			S11E22, S30E22
S12E18, S12E21			S6E4, S7E18
S12E17, S12E21			S14E19, S26E17
S13E14, S20E9			S6E5, S8E16
S2E11, S4E18			S4E12, S9E11
S22E7, S22E10			S7E18, S8E14
S5E2, S10E18			S1E3, S27E19
S24E10, S25E14			S8E16, S19E8
S22E7, S22E12			S16E1, S16E2
S22E7, S25E14			S6E5, S19E8
S22E7, S24E10			S24E10, S30E18
S23E12, S25E14			S6E5, S17E8
S20E14, S20E21			S8E19, S15E17
S10E18, S12E21			S17E8, S19E8
S23E12, S23E13			S1E12, S7E15
