In [64]:
import spacy
import pandas as pd
import pickle


In [2]:
nlp = spacy.load('en_core_web_lg')

In [5]:
df = pd.read_csv('./data/scripts.csv', index_col=0)

In [21]:
df['clean_dialogue'] = df['Dialogue'].str.replace(r"\(.*\)","")

In [36]:
df = df.dropna()

In [37]:
df.reset_index(inplace=True, drop=True)

In [67]:
pwd

'/Users/alexander.fioto/personal_github/Seinfeld-Chatbot'

In [69]:
df.to_csv('./data/clean_scripts.csv')

In [18]:
episodes = df['SEID'].unique()

## Episode Dictionary

In [38]:
def get_episode_dialogue(df):
    episodes = df['SEID'].unique()
    episodes_dialogue = {}
    for episode in episodes:
        episode_df = df[df['SEID'] == episode]
        episode_df.reset_index(inplace=True, drop=True)
        dialogue = ''
        for i in range(len(episode_df)):
            dialogue += episode_df['clean_dialogue'][i]
            
        episodes_dialogue[episode] = dialogue
    return episodes_dialogue
    

In [40]:
episode_dialogues = get_episode_dialogue(df)

In [65]:
with open('./data/episode_dialogues.pkl', 'wb') as f:
    pickle.dump(episode_dialogues, f)

In [54]:
def recommend_episode(chat_dialogue):
    similarity_scores = []
    for episode in episodes:
        doc1 = nlp(chat_dialogue)
        doc2 = nlp(episode_dialogues[episode])
        similarity_scores.append((episode, doc1.similarity(doc2)))
    return similarity_scores.sort(key=lambda x: x[1], reverse = True)
        
        

In [55]:
recs = recommend_episode('Hi Jerry. How are you doing? Want to get some coffee? I like you so much. Your show is my favorite')

In [58]:
recs.sort(key=lambda x: x[1], reverse = True)

In [59]:
recs

[('S07E19', 0.9769975004944613),
 ('S04E24', 0.9751695629129786),
 ('S07E24', 0.9745268665977951),
 ('S08E21', 0.9737905365652617),
 ('S05E02', 0.9733777757620253),
 ('S05E01', 0.9733359976936261),
 ('S06E20', 0.9733254368537856),
 ('S07E23', 0.9730422240040872),
 ('S05E18', 0.972853797835285),
 ('S09E13', 0.9726466322239913),
 ('S06E09', 0.9726069619938952),
 ('S07E09', 0.9719457817219286),
 ('S03E12', 0.9719307664444864),
 ('S06E11', 0.9719271595757075),
 ('S02E07', 0.9718875811356588),
 ('S05E09', 0.9718633626383872),
 ('S08E05', 0.9718188507186672),
 ('S07E03', 0.9717059810737815),
 ('S03E07', 0.9716866564086488),
 ('S05E13', 0.9716734079354518),
 ('S01E03', 0.9715858499893288),
 ('S09E03', 0.9715723803149826),
 ('S05E15', 0.9715685264128173),
 ('S07E20', 0.9715504577728449),
 ('S06E19', 0.9712860191472017),
 ('S08E17', 0.9711897892935125),
 ('S05E10', 0.9710401204450736),
 ('S07E11', 0.9710400809260007),
 ('S04E02', 0.9709997951957259),
 ('S04E23', 0.9709856102101107),
 ('S04E05',

In [23]:
example = df[df['SEID'] == 'S01E01']

In [25]:
dia = ''
for i in range(len(example)):
    dia += df['clean_dialogue'][i]

In [27]:
doc1 = 'Hi Jerry. How are you doing? Want to get some coffee? I like you so much. Your show is my favorite'
doc2 = dia

In [28]:
doc1 = nlp(doc1)
doc2 = nlp(dia)

In [29]:
doc1.similarity(doc2)

0.9665833862893434