In [1]:
import numpy as np
import json
import glob


#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spaCy and NLTK
import spacy
from nltk.corpus import stopwords

#visualization
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
# Helpers for loading and saving JSON data
def load_data(file):
    """Read a UTF-8 encoded JSON file and return its decoded contents.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(file, "r", encoding="utf-8") as handle:
        return json.load(handle)


def write_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into a UTF-8 text file.

    Args:
        file: Path of the file to write; opened in "w" mode, so existing
            contents are replaced.
        data: Object to serialize with ``json.dump``.
    """
    with open(file, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [3]:
# `stopwords` starts out as the NLTK corpus reader imported at the top of the
# notebook and is rebound here to the list of English stop words. The guard
# keeps this cell re-runnable: once the name holds a list it has no `.words`
# attribute, and calling it again would raise AttributeError.
if hasattr(stopwords, "words"):
    stopwords = stopwords.words("english")


In [4]:
# Episode records (season, episode number, plot summary) loaded from disk.
episodes = load_data("data/plots.json")

# One key per episode in the form "Season;Episode".
names = [
    ";".join((str(ep["Season"]), str(ep["No. inseason"])))
    for ep in episodes
]

# Reverse lookup: episode key -> its position in `names`.
names_map = {}
for index, name in enumerate(names):
    names_map[name] = index

# Plot summary of every episode, in file order.
plots = [ep["plot"] for ep in episodes]

# Key and summary combined as "Season;Episode\nSummary".
summaries = [
    name + "\n" + str(ep["plot"])
    for name, ep in zip(names, episodes)
]

# Gather the script dialogue, grouped by episode title.
episodes_data = {}
data = load_data("data/scripts.json")

for record in data:
    title = record["episode_name"]
    spoken = record["dialogue"]
    # Every dialogue line is followed by one space so lines stay separated.
    episodes_data[title] = episodes_data.setdefault(title, "") + (spoken + " ")

# One combined script string per episode, in order of first appearance.
# Each dictionary value is already a single string, so it is taken as-is.
episodes_array = list(episodes_data.values())

# Tidy each episode script and append it to the plot summary stored at the
# same index: literal backslash-n sequences become spaces, remaining
# backslashes are dropped, and double spaces are reduced in a single pass.
# NOTE(review): the pairing is purely positional -- it presumes `plots` and
# `episodes_array` list the episodes in the same order; confirm against the
# two data files.
for i, script in enumerate(episodes_array):
    cleaned = script.replace("\\n", " ").replace("\\", "").replace("  ", " ")
    episodes_array[i] = cleaned
    plots[i] = plots[i] + " " + cleaned
    
    

In [7]:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV"), model="en_core_web_sm"):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Tag names follow https://spacy.io/api/annotation.

    Args:
        texts: Iterable of raw text strings, one per document.
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to keep.
            The default is an immutable tuple so it cannot be mutated
            between calls.
        model: Name or path of the spaCy pipeline to load. Defaults to the
            installed ``en_core_web_sm`` package rather than an absolute,
            machine-specific path, so the notebook runs on other machines.

    Returns:
        A list with one string per input text: the kept tokens' lemmas
        joined by single spaces.
    """
    texts = list(texts)
    if not texts:
        # Nothing to process; avoid loading the model at all.
        return []

    # The parser and NER components are not needed for lemmas / POS tags.
    nlp = spacy.load(model, disable=["parser", "ner"])

    # nlp.pipe processes the documents as a batch and yields them in order.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]

# Status marker printed before the lemmatization pass starts.
print("path successfully hit")
lemmatized_texts = lemmatization(texts=plots)
# Preview: first 90 characters of the first lemmatized document.
print(lemmatized_texts[0][:90])

path successfully hit
unsuccessful visit high iq sperm bank return home find aspire actress penny new neighbor h


In [11]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of text strings.

    Returns:
        A list holding, for each input text, the token list returned by
        ``simple_preprocess(text, deacc=True)``.
    """
    return [
        gensim.utils.simple_preprocess(document, deacc=True)
        for document in texts
    ]

# Tokenize the lemmatized documents into lists of words.
data_words = gen_words(texts=lemmatized_texts)

# Preview: first ten tokens of the first document.
print(data_words[0][:10])

['unsuccessful', 'visit', 'high', 'iq', 'sperm', 'bank', 'return', 'home', 'find', 'aspire']


In [14]:
# Vocabulary built from the tokenized documents (token <-> integer id).
id2word = corpora.Dictionary(data_words)

# Bag-of-words corpus, built in a single pass so that each document appears
# exactly once. Appending every document a second time would double the
# corpus that the model is trained on and that is visualised later.
corpus = [id2word.doc2bow(text) for text in data_words]

# Each entry is a (token id, frequency) pair.
print(corpus[0][0:10])


[(0, 3), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
actress


In [15]:
# Train the LDA topic model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
    per_word_topics=True,
)

In [16]:
# Switch pyLDAvis to inline notebook rendering, then build the visualisation
# data from the trained model, the corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
# Bare last expression so the notebook displays the visualisation.
vis

