In [2]:
import pickle

In [3]:
# Load episode data from previous scraping exercise.
# The cells below index the result as episodes[season][episode] and read the
# 'script', 'title' and 'summary' keys of each entry.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load this file if it came from a trusted run of the scraper.
with open('simpsons_scripts.pickle', 'rb') as eps_file:
    episodes = pickle.load(eps_file)

In [4]:
# Flatten the season -> episode mapping into a list of scripts for the vectorizer.
# ep_script_lookup maps each script's position in ep_scripts back to the
# season, episode number and title it came from.
ep_scripts = []
ep_script_lookup = {}
for season in episodes:
    season_eps = episodes[season]
    for episode in season_eps:
        ep_info = season_eps[episode]
        ep_scripts.append(ep_info['script'])
        # Key is the index the script was just stored at
        ep_script_lookup[len(ep_scripts) - 1] = {
            'season_num': season,
            'ep_num': episode,
            'ep_title': ep_info['title'],
        }

In [5]:
# Use Scikit-Learn TF-IDF feature
from sklearn.feature_extraction.text import TfidfVectorizer

# TODO: experiment with parameters
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # drop common English stop words
    lowercase=True,
    ngram_range=(1, 3),    # unigrams through trigrams
    min_df=2,              # skip phrases found in fewer than 2 scripts
    max_df=0.5,            # skip phrases found in more than half of the scripts
)
# One row per script in ep_scripts, one column per phrase
X = tfidf_vectorizer.fit_transform(ep_scripts)
# Dense copy of the matrix; the next cell indexes it row by row
X_array = X.toarray()

In [71]:
import numpy as np

# Pick out top words for each episode and store them, highest score first,
# under episodes[season_num][ep_num]['top_words'].
# Each entry is {'index': column in X_array, 'phrase': text, 'score': TF-IDF value}.
MAX_TOP_WORDS = 1000  # upper bound on phrases kept per episode

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    features = tfidf_vectorizer.get_feature_names_out()
else:
    features = tfidf_vectorizer.get_feature_names()

# Rows of X_array are in the order scripts were appended to ep_scripts, so this
# loop must walk `episodes` in that same season/episode order.
current_ep_idx = 0
for season_num in episodes:
    for ep_num in episodes[season_num]:
        # Row is only read, so a view is enough (no copy needed)
        current_ep = X_array[current_ep_idx]
        current_ep_words = np.nonzero(current_ep)[0]  # Take only first dimension since there's only one
        words_dict = [{'index': idx, 'phrase': features[idx], 'score': current_ep[idx]} for idx in current_ep_words]
        max_num_words = min(MAX_TOP_WORDS, len(words_dict))
        # sorted() is stable, so phrases with equal scores keep ascending column order
        top_words = sorted(words_dict, key=lambda x: x['score'], reverse=True)[:max_num_words]
        episodes[season_num][ep_num]['top_words'] = top_words
        current_ep_idx += 1

In [76]:
# Save the episode data, now carrying a 'top_words' list per episode (at most
# 1000 phrases each, highest TF-IDF score first), for later analysis.

with open('simpsons_scripts_tfidf.pickle', 'wb') as eps_file_tfidf:
    pickle.dump(episodes, eps_file_tfidf, protocol=pickle.HIGHEST_PROTOCOL)

In [77]:
# Write episode summaries with top 5 phrases to file

# Titles and summaries may contain non-ASCII characters; pin the encoding so
# the write does not depend on the platform's default codec.
with open('simpsons_scripts_summaries.txt', 'w', encoding='utf-8') as eps_summaries:
    for season_num in episodes:
        eps_summaries.write('Season {season_num}\n'.format(season_num=season_num))
        for ep_num in episodes[season_num]:
            ep_info = episodes[season_num][ep_num]
            eps_summaries.write('\tTitle: {title}\n'.format(title=ep_info['title']))
            eps_summaries.write('\tSummary: {summary}\n'.format(summary=ep_info['summary']))

            # 'top_words' is stored sorted by descending score, so the first
            # five entries are the five highest-scoring phrases.
            top_5_words_info = ep_info['top_words'][:5]
            top_5_words = [x['phrase'] for x in top_5_words_info]

            # The list is written using its Python repr, e.g. ['a', 'b']
            eps_summaries.write('\tTop Words: {top_words}\n\n'.format(top_words=top_5_words))

        eps_summaries.write('\n\n')