In [2]:
import pickle

# Load the pickled per-episode TF-IDF data.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
with open('simpsons_scripts_tfidf.pickle', 'rb') as tfidf_file:
    episodes = pickle.load(tfidf_file)

In [23]:
# Build one flat record per episode (an ID plus the set of its top phrases)
# so that episodes can be compared pairwise without walking the nested mapping.
flat_eps = [
    {
        'id': 'S{s_num}E{ep_num}'.format(s_num=season, ep_num=episode),
        'top_words': {entry['phrase'] for entry in episodes[season][episode]['top_words']},
    }
    for season in episodes
    for episode in episodes[season]
]

In [24]:
# Perform pairwise comparisons of Jaccard Similarity

def jaccard_similarity(phrases1, phrases2):
    """Return the Jaccard similarity (intersection over union) of two sets.

    Args:
        phrases1: First set of phrases.
        phrases2: Second set of phrases.

    Returns:
        The size of the intersection divided by the size of the union,
        as a float between 0.0 and 1.0. Returns 0.0 when both sets are
        empty, where the ratio would otherwise be 0/0.
    """
    union_size = len(phrases1.union(phrases2))
    if union_size == 0:
        # Both inputs are empty: report "no similarity" instead of raising
        # ZeroDivisionError and aborting the whole pairwise comparison.
        return 0.0
    return len(phrases1.intersection(phrases2)) / union_size

# Score every unordered pair of distinct episodes exactly once (first index < second index).
episode_count = len(flat_eps)
similarities = [
    {
        'id1': flat_eps[first]['id'],
        'id2': flat_eps[second]['id'],
        'jaccard_similarity': jaccard_similarity(
            flat_eps[first]['top_words'], flat_eps[second]['top_words']
        ),
    }
    for first in range(episode_count - 1)
    for second in range(first + 1, episode_count)
]

In [22]:
# Rank the comparisons from most to least similar, then preview the top 25.
top_similarities = list(similarities)
top_similarities.sort(key=lambda pair: pair['jaccard_similarity'], reverse=True)
top_similarities[:25]

[{'id1': 'S16E1', 'id2': 'S16E2', 'jaccard_similarity': 0.16346713205351948},
 {'id1': 'S4E12', 'id2': 'S9E11', 'jaccard_similarity': 0.13219741480611047},
 {'id1': 'S1E9', 'id2': 'S6E3', 'jaccard_similarity': 0.09950522264980759},
 {'id1': 'S5E9', 'id2': 'S6E3', 'jaccard_similarity': 0.0958904109589041},
 {'id1': 'S5E8', 'id2': 'S9E11', 'jaccard_similarity': 0.08502252252252253},
 {'id1': 'S10E5', 'id2': 'S13E17', 'jaccard_similarity': 0.08401084010840108},
 {'id1': 'S22E12', 'id2': 'S22E20', 'jaccard_similarity': 0.07642626480086114},
 {'id1': 'S7E1', 'id2': 'S7E10', 'jaccard_similarity': 0.07469102632993015},
 {'id1': 'S4E15', 'id2': 'S6E3', 'jaccard_similarity': 0.07353730542136339},
 {'id1': 'S23E13', 'id2': 'S24E10', 'jaccard_similarity': 0.07123583378305451},
 {'id1': 'S10E18', 'id2': 'S12E17', 'jaccard_similarity': 0.0711864406779661},
 {'id1': 'S12E18', 'id2': 'S12E21', 'jaccard_similarity': 0.07111111111111111},
 {'id1': 'S12E17', 'id2': 'S12E21', 'jaccard_similarity': 0.0710

In [26]:
# Persist the ranked comparisons to disk.
# The file handle gets its own name: binding it to `similarities` would replace
# the list of pairwise comparisons with a (closed) file object, so re-running
# the sorting cell afterwards would fail.
with open('simpsons_scripts_similarities.pickle', 'wb') as similarities_file:
    pickle.dump(top_similarities, similarities_file, protocol=pickle.HIGHEST_PROTOCOL)