# spaCy Recommender

In [28]:
# Imports
import pickle
from pathlib import Path

import gensim
import pandas as pd
import spacy
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize

In [2]:
# Load the spaCy pipeline named 'en_trf_bertbaseuncased_lg'. The resulting `nlp`
# callable is used in later cells to turn text into spaCy objects.
# NOTE(review): presumably this is the BERT base uncased package from
# spacy-transformers, which has to be installed separately — confirm.
nlp = spacy.load('en_trf_bertbaseuncased_lg')

In [3]:
# Load the original Seinfeld scripts CSV, using its first column as the index.
df = pd.read_csv('./data/scripts.csv', index_col=0)

In [4]:
# Cleaning the dialogue: strip parenthesised text such as "(laughs)".
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the dialogue unchanged.
# The character class [^)]* keeps each match inside a single pair of
# parentheses, so dialogue between two separate parentheticals on the same
# line is preserved instead of being swallowed by a greedy match.
df['clean_dialogue'] = df['Dialogue'].str.replace(r"\([^)]*\)", "", regex=True)
# Drop rows with any missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [8]:
# Save the cleaned frame (including the `clean_dialogue` column) to its own CSV,
# leaving the original scripts.csv untouched.
df.to_csv('./data/clean_scripts.csv')

In [10]:
# Array of all unique episode IDs taken from the `SEID` column; later cells
# iterate over it to process one episode at a time.
episodes = df['SEID'].unique()

## Episode Dictionary

In [11]:
def get_episode_dialogue(df, sep=''):
    '''
    Build a dictionary mapping each episode ID to that episode's complete dialogue.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'SEID' column (episode ID) and a 'clean_dialogue'
        column holding one string per row.
    sep : str, optional
        Text inserted between consecutive lines of dialogue. Defaults to ''
        so lines are concatenated back-to-back, matching the previous
        behaviour.

    Returns
    -------
    dict
        Episode IDs (keys, in order of first appearance in df) mapped to the
        joined dialogue of every row with that ID (values, in row order).
        Rows whose 'SEID' is missing are skipped.
    '''
    episodes_dialogue = {}
    # A single grouped pass replaces re-filtering the whole frame once per
    # episode; sort=False keeps episodes in the order they first appear.
    for episode, lines in df.groupby('SEID', sort=False)['clean_dialogue']:
        # str.join builds the text in one step instead of repeated `+=`.
        episodes_dialogue[episode] = sep.join(lines)
    return episodes_dialogue
    

In [12]:
# Build the episode ID -> full dialogue text dictionary from the cleaned frame
# and keep it for the cells below.
episode_dialogues = get_episode_dialogue(df)

In [65]:
# Saving the episode dictionary to the data folder as a pickle, so it can be
# reloaded later without rebuilding it from the CSV.

with open('./data/episode_dialogues.pkl', 'wb') as f:
    pickle.dump(episode_dialogues, f)

In [91]:
# Pull the full dialogue text of one episode (S01E01) as a sample.
# NOTE(review): `example` is not referenced again in the visible cells.
example = episode_dialogues['S01E01']

In [14]:
# Run a short sample chat message through the pipeline to get a spaCy object.
# NOTE(review): the same statement is repeated in a later cell, and 'freinds'
# is misspelled in the sample text — confirm whether that is intentional.
chat_doc = nlp('I like coffee. Can we be freinds? I like Jerry.')

## Retrieving spaCy objects for quicker similarity scores

One of the problems I was having with my chatbot is that gathering word embeddings took far too long every time it was needed. Here I create a dictionary that maps each episode ID (key) to the spaCy object built from that episode's dialogue using the BERT word embeddings (value), in the hope of speeding up the process.

In [13]:
# Creating spaCy objects and assigning them as values to their associated episode keys.
# Each episode's full dialogue text is passed through `nlp` once here, so the
# resulting objects can be reused instead of being recomputed per comparison.
seinfeld_bert_spacy = {}
for episode in episodes:
    seinfeld_bert_spacy[episode] = nlp(episode_dialogues[episode])

In [19]:
# Sanity check: does the stored S01E01 object report that it has a vector?
seinfeld_bert_spacy['S01E01'].has_vector

True

In [20]:
# Sample chat message turned into a spaCy object, used for the similarity check
# in the next cell.
# NOTE(review): this repeats an identical statement from an earlier cell.
chat_doc = nlp('I like coffee. Can we be freinds? I like Jerry.')

In [21]:
# Similarity score between the sample chat message and the precomputed S01E01
# object.
chat_doc.similarity(seinfeld_bert_spacy['S01E01'])

0.7984366573068519

In [22]:
# Saving the spaCy BERT vectors

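# NOTE: this writes a very large file, so the dump below is left commented out;
# uncomment it only when the pickle needs to be regenerated.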

# with open('./data/seinfeld_bert_spacy.pkl', 'wb') as f:
#     pickle.dump(seinfeld_bert_spacy, f)

In [26]:
# Load the pickled episode objects from the current user's ~/Models directory.
# The location is resolved from the home directory instead of a hard-coded
# absolute path, so the cell is not tied to a single user account.
# NOTE(review): presumably this file holds the dictionary written by the
# commented-out dump in the previous cell — confirm.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that you created yourself.
SEINFELD_VECTORS_PATH = Path.home() / 'Models' / 'seinfeld_bert_spacy.pkl'
with open(SEINFELD_VECTORS_PATH, 'rb') as f:
    seinfeld_vectors = pickle.load(f)

## Testing Episode Recommendation Function

In [54]:
def recommend_episode(chat_dialogue, episode_docs=None):
    '''
    Rank episodes by how similar their dialogue is to a chat message.

    Parameters
    ----------
    chat_dialogue : str
        The chat text to compare against every episode.
    episode_docs : dict, optional
        Precomputed mapping of episode ID -> spaCy object for that episode's
        dialogue. When given, these objects are reused and no episode text is
        re-processed. When omitted, every episode listed in the notebook-level
        `episodes` is processed from `episode_dialogues` with `nlp`.

    Returns
    -------
    list of (episode_id, similarity) tuples
        Sorted by similarity, highest first.
    '''
    # The chat text does not change between episodes, so process it once.
    query_doc = nlp(chat_dialogue)

    if episode_docs is None:
        # Lazily process one episode at a time so only a single episode object
        # is alive during scoring.
        candidates = ((episode, nlp(episode_dialogues[episode])) for episode in episodes)
    else:
        candidates = episode_docs.items()

    similarity_scores = [
        (episode, query_doc.similarity(episode_doc))
        for episode, episode_doc in candidates
    ]
    # list.sort() sorts in place and returns None, so sort first and then
    # return the list itself.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    return similarity_scores
        
        

In [55]:
# Try the recommender on a sample chat message and keep the result in `recs`.
recs = recommend_episode('Hi Jerry. How are you doing? Want to get some coffee? I like you so much. Your show is my favorite')

In [58]:
# Sort `recs` in place by the second element of each item, largest first.
recs.sort(key=lambda x: x[1], reverse = True)

In [15]:
# Show the first five entries of `recs`.
# NOTE(review): the recorded output of this cell is a NameError and its
# execution count (15) is lower than that of the cell that defines `recs`,
# so it was last run out of order — re-run the notebook top to bottom.
recs[:5]

NameError: name 'recs' is not defined