In [1]:
# Imports
import spacy
import pandas as pd
import pickle


In [2]:
# Load the spaCy pipeline 'en_trf_bertbaseuncased_lg' (BERT word embeddings via spaCy).
# `nlp` is the callable used in the cells below to turn raw text into spaCy objects.
nlp = spacy.load('en_trf_bertbaseuncased_lg')

In [3]:
# Load the original Seinfeld scripts from ./data/scripts.csv (relative to the working
# directory), using the file's first column as the row index.
df = pd.read_csv('./data/scripts.csv', index_col=0)

In [4]:
# Cleaning the dialogue
# Strip parenthesised text (e.g. stage directions) from every line of dialogue.
# regex=True is passed explicitly: pandas 2.0+ treats the pattern as a literal string by
# default, which would leave the text untouched. The non-greedy `.*?` removes each
# parenthetical on its own; a greedy `.*` would also delete any spoken words sitting
# between the first "(" and the last ")" on a line.
df['clean_dialogue'] = df['Dialogue'].str.replace(r"\(.*?\)", "", regex=True)
# Drop rows with a missing value in any column, then renumber the rows from 0.
df = df.dropna().reset_index(drop=True)

In [7]:
# Print the working directory (relies on IPython automagic for `%pwd`); the relative
# './data/...' paths used in this notebook resolve against it.
pwd

'/Users/alexander.fioto/personal_github/Seinfeld-Chatbot'

In [8]:
# Save the cleaned frame (including the new 'clean_dialogue' column) to its own CSV,
# separate from the scripts.csv file that was loaded above.
df.to_csv('./data/clean_scripts.csv')

In [10]:
# All unique episode IDs from the 'SEID' column, in order of first appearance.
# Used further down to loop over every episode.
episodes = df['SEID'].unique()

## Episode Dictionary

In [11]:
def get_episode_dialogue(df, sep=''):
    '''
    Build a dictionary mapping each episode ID to that episode's complete dialogue.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'SEID' column (episode ID) and a 'clean_dialogue' column of strings.
    sep : str, default ''
        Text inserted between consecutive lines of dialogue. The default joins the lines
        back-to-back (the original behaviour); pass ' ' to keep the last word of one line
        from running into the first word of the next.

    Returns
    -------
    dict
        Episode IDs (keys), in order of first appearance in `df`, mapped to the
        concatenated dialogue of that episode (values), with lines kept in row order.
    '''
    # sort=False keeps episodes in order of first appearance, matching df['SEID'].unique().
    lines_by_episode = df['clean_dialogue'].groupby(df['SEID'], sort=False)
    # One pass over the frame and one join per episode, instead of re-filtering the whole
    # frame for every episode and growing a string one row at a time.
    return {episode: sep.join(lines) for episode, lines in lines_by_episode}
    

In [12]:
# Build the {episode ID: full dialogue} dictionary from the cleaned frame and keep it
# for the cells below.
episode_dialogues = get_episode_dialogue(df)

In [65]:
# Saving the episode dictionary to the data folder

# Left commented out; the pickle is read back from ./data/episode_dialogues.pkl further down.
# with open('./data/episode_dialogues.pkl', 'wb') as f:
#     pickle.dump(episode_dialogues, f)

In [91]:
# Pull out the dialogue for episode 'S01E01' as a sample to inspect.
example = episode_dialogues['S01E01']

In [14]:
# Parse a short sample chat message into a spaCy object with the loaded pipeline.
chat_doc = nlp('I like coffee. Can we be freinds? I like Jerry.')

In [24]:
# Reload the pickled episode dictionary from disk.
# NOTE(review): despite the name, `seinfeld_vectors` appears to hold raw dialogue text
# (see the string displayed below), not vectors. Only unpickle files you created
# yourself: pickle.load can execute arbitrary code from a crafted file.
with open('./data/episode_dialogues.pkl', 'rb') as f:
    seinfeld_vectors = pickle.load(f)

In [25]:
# Spot-check the loaded dictionary by displaying the entry for episode 'S01E01'.
seinfeld_vectors['S01E01']

"Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are.  Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Where ever you are in life, its my feeling, youve gotta go. See, to me, that button is in the worst possibl

## Retrieving spaCy objects for quicker similarity scores

One of the problems I was having with my chatbot is that gathering the word embeddings every time they were needed took far too long. Here I create a dictionary that uses episode IDs as keys and, as values, the associated spaCy objects built with the BERT word embeddings, in the hope of speeding up the process.

In [13]:
# Parse each episode's full dialogue once with the BERT pipeline and keep the resulting
# spaCy objects in a dictionary keyed by episode ID.
seinfeld_bert_spacy = {
    episode_id: nlp(episode_dialogues[episode_id]) for episode_id in episodes
}

In [19]:
# Sanity check: confirm the parsed 'S01E01' episode reports having a vector.
seinfeld_bert_spacy['S01E01'].has_vector

True

In [20]:
# Re-create the sample chat message as a spaCy object (same text as the earlier cell).
chat_doc = nlp('I like coffee. Can we be freinds? I like Jerry.')

In [21]:
# Similarity between the sample chat message and the pre-parsed 'S01E01' episode.
chat_doc.similarity(seinfeld_bert_spacy['S01E01'])

0.7984366573068519

In [22]:
# Persist the dictionary of parsed episodes to ./data/seinfeld_bert_spacy.pkl.
with open('./data/seinfeld_bert_spacy.pkl', 'wb') as f:
    pickle.dump(seinfeld_bert_spacy, f)

## Testing Episode Recommendation Function

In [54]:
def recommend_episode(chat_dialogue, episode_docs=None):
    '''
    Rank every episode by how similar its dialogue is to a piece of chat text.

    Parameters
    ----------
    chat_dialogue : str
        Text to compare against each episode.
    episode_docs : dict, optional
        Mapping of episode ID -> already-parsed spaCy object (for example the
        `seinfeld_bert_spacy` dictionary). When given, those objects are compared
        directly, skipping the per-episode `nlp(...)` call. When omitted, each
        episode's text in `episode_dialogues` is parsed on the fly, as before.

    Returns
    -------
    list
        (episode ID, similarity score) tuples, highest score first.
    '''
    # The chat text is the same for every comparison, so parse it once up front.
    query_doc = nlp(chat_dialogue)
    similarity_scores = []
    for episode in episodes:
        if episode_docs is not None:
            episode_doc = episode_docs[episode]
        else:
            episode_doc = nlp(episode_dialogues[episode])
        similarity_scores.append((episode, query_doc.similarity(episode_doc)))
    # list.sort() sorts in place and returns None, so returning its result handed
    # callers None; sorted() returns the ordered list itself.
    return sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        
        

In [55]:
# Try the recommender on a sample chat message and keep the result in `recs`.
recs = recommend_episode('Hi Jerry. How are you doing? Want to get some coffee? I like you so much. Your show is my favorite')

In [58]:
# Order the (episode, score) pairs by score, highest first. list.sort() works in place,
# so this needs `recs` to be a list.
recs.sort(key=lambda x: x[1], reverse = True)

In [15]:
# Show the first five entries of `recs`.
recs[:5]

NameError: name 'recs' is not defined