In [34]:
import numpy as np
import json
import glob


#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# #spacy
import spacy
from nltk.corpus import stopwords

#visualization
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


from gensim.models import TfidfModel


In [35]:
#prepping data
def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    return data


def write_data(file, data):
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [36]:
stopwords = stopwords.words("english") + ["go", "re", "just", "get", "want", "know", "so", "need", "knock", "look", "guy", "work", "say", "let", "come", "here", "make", "see",  "tell", "thing", "talk", "ve", "really"]


In [37]:
# loads descriptions of episodes
episodes = load_data("data/plots.json")

# episode titles in the form of "Season;Episode"
names = [str(episode["Season"]) + ";" + str(episode["No. inseason"])
         for episode in episodes]

# maps episode titles to their index in the names array
names_map = {}
for index in range(len(names)):
    names_map[names[index]] = index

# episode summaries
plots = [episode["plot"] for episode in episodes]

# episode summaries in the form of "Season;Episode\nSummary"
summaries = [str(episode["Season"]) + ";" + str(episode["No. inseason"]
                                                ) + "\n" + str(episode["plot"]) for episode in episodes]

# loads episode scripts
episodes_data = {}
data = load_data("data/scripts.json")

for line in data:
    episode_title = line["episode_name"]
    script_line = line["dialogue"]

    if episode_title not in episodes_data:
        episodes_data[episode_title] = ""

    episodes_data[episode_title] += script_line + " "

episodes_array = []
# Iterate through the episodes_data dictionary and extract the script lines
for lines in episodes_data.values():
    # Combine all script lines into a single string for each episode
    episode_string = "".join(lines)
    episodes_array.append(episode_string)

# clean up the episode scripts and combine them with the episode summaries
for i in range(len(episodes_array)):
    episodes_array[i] = episodes_array[i].replace("\\n", " ")
    episodes_array[i] = episodes_array[i].replace("\\", "")
    episodes_array[i] = episodes_array[i].replace("  ", " ")
    plots[i] = plots[i] + " " + episodes_array[i]
    
    

In [50]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """https://spacy.io/api/annotation"""
    nlp = spacy.load("/Users/anishpalakurthi/opt/anaconda3/lib/python3.8/site-packages/en_core_web_sm/en_core_web_sm-3.6.0", disable=["parser", "ner"])
    texts_out = []
    for sent in texts:
        #contains metadata about the word
        doc = nlp(sent)
        #lemmatizes each word by appending allowed tokens from the doc object's metadata
        texts_out.append(
            " ".join([token.lemma_ for token in doc if token.pos_ in allowed_postags]))
        
    for word in texts_out:
        if word in stopwords:
            texts_out.remove(word)
    return texts_out

print("path successfully hit")
lemmatized_texts = lemmatization(plots)
print(lemmatized_texts[0])

path successfully hit
unsuccessful visit high iq sperm bank return home find aspire actress penny new neighbor hall apartment think immediately interested chase dream never catch invite apartment indian food ask use shower break wrap towel get meet visit friend wannabe lady man try hit unable speak suffer selective mutism presence woman so infatuate help use shower agree retrieve tv ex - boyfriend however physical superiority overwhelm combined iq return pant tv penny feel bad offer take guy dinner initiate friendship corridor sperm bank so photon direct plane slit slit observe go slit ’ unobserved however ’ observe ’ leave plane hit target go slit agree ’ point ’ point just think ’ good idea tee shirt excuse hang across aegean down down move finger phylum make au see capital idea ’ port au help high sperm bank ask maybe here think place fill thank right back take time just finish crossword puzzle wait think kid ’re semi - pro commit genetic fraud ’ guarantee sperm go generate high iq 

In [51]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
        
    return final

#preprocess our lemmatized keywords
data_words = gen_words(lemmatized_texts)

print(data_words[0][0:10])

['unsuccessful', 'visit', 'high', 'iq', 'sperm', 'bank', 'return', 'home', 'find', 'aspire']


In [52]:
#bigrams and trigrams
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)


def make_bigrams(texts):
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram[bigram[doc]] for doc in texts]

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_words)


print(data_bigrams_trigrams[0])


['unsuccessful', 'visit', 'high', 'iq', 'sperm', 'bank', 'return', 'home', 'find', 'aspire', 'actress', 'penny', 'new', 'neighbor', 'hall', 'apartment', 'think', 'immediately', 'interested', 'chase', 'dream', 'never', 'catch', 'invite', 'apartment', 'indian', 'food', 'ask', 'use', 'shower', 'break', 'wrap', 'towel', 'get', 'meet', 'visit', 'friend', 'wannabe', 'lady', 'man', 'try', 'hit', 'unable', 'speak', 'suffer', 'selective_mutism', 'presence', 'woman', 'so', 'infatuate', 'help', 'use', 'shower', 'agree', 'retrieve', 'tv', 'ex_boyfriend', 'however', 'physical', 'superiority', 'overwhelm', 'combined', 'iq', 'return', 'pant', 'tv', 'penny', 'feel', 'bad', 'offer', 'take', 'guy', 'dinner', 'initiate', 'friendship', 'corridor', 'sperm', 'bank', 'so', 'photon', 'direct', 'plane', 'slit', 'slit', 'observe', 'go', 'slit', 'unobserved', 'however', 'observe', 'leave', 'plane', 'hit', 'target', 'go', 'slit', 'agree', 'point', 'point', 'just', 'think', 'good', 'idea', 'tee_shirt', 'excuse', '

In [53]:
#tfidf removal

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []

words_missing_in_tfidf = []

for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    
    drops = low_value_words + words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
        
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]
    
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    
    corpus[i] = new_bow



In [54]:
id2word = corpora.Dictionary(data_words)

corpus = [id2word.doc2bow(text) for text in data_words]

for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

#[index, frequency]
print(corpus[0][0:10])


[(0, 3), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


In [56]:
#generate LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word = id2word, num_topics=5, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

In [57]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds = "mmds", R = 30)
vis

