<h4> Objectives</h4>
<ul>
<li> Finding Similar characters based on their dialogues
</ul>

In [70]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn
import os
from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pickle
import gensim
from gensim.models.doc2vec import TaggedDocument
import random
import numpy as np
%matplotlib inline

In [2]:
path=os.path.abspath("vectors.bin")

In [3]:
model = KeyedVectors.load_word2vec_format(path,binary=True)

In [4]:
lines = pd.read_json("dataset"+os.sep+"movie_lines.json")

In [5]:
lines.head(3)

Unnamed: 0,character_id,character_name,line_id,movie_id,text
0,u0,BIANCA,L1045,m0,They do not!
1,u2,CAMERON,L1044,m0,They do to!
10,u2,CAMERON,L868,m0,"The ""real you""."


In [6]:
user_dialogues = lines[["character_id","movie_id","text"]].groupby(["character_id","movie_id"]).apply(lambda x: ','.join(x.text))

In [7]:
user_dialogues.head(3)

character_id  movie_id
u0            m0          They do not!,Patrick -- is that- a.,Is that wo...
u1            m0          Just sent 'em through.,Never,Didn't have you p...
u10           m0          Absolutely not.,Your daughters went to the pro...
dtype: object

In [8]:
user_dialogues = user_dialogues.reset_index()

In [9]:
user_dialogues.head(3)

Unnamed: 0,character_id,movie_id,0
0,u0,m0,"They do not!,Patrick -- is that- a.,Is that wo..."
1,u1,m0,"Just sent 'em through.,Never,Didn't have you p..."
2,u10,m0,"Absolutely not.,Your daughters went to the pro..."


In [10]:
user_dialogues.columns=["character_id","movie_id","text"]

In [69]:
def clean_text(corpus):
    punctuation = """.,?!:;(){}[]-"""
    corpus = [z.lower().replace("\n","").strip() for z in corpus.split(" ")]
    corpus = [z for z in corpus if not z in stopwords.words('english')]
    for c in punctuation:
        corpus = [z.replace(c," ") for z in corpus]
    return corpus

In [41]:
user_dialogues["dialogue_text"] = user_dialogues["text"].apply(lambda x : clean_text(x))

In [42]:
user_dialogues.head(3)

Unnamed: 0,character_id,movie_id,text,dialogue_text
0,u0,m0,"They do not!,Patrick -- is that- a.,Is that wo...","[they, do, not patrick, , is, that , a is,..."
1,u1,m0,"Just sent 'em through.,Never,Didn't have you p...","[just, sent, 'em, through never didn't, have,..."
2,u10,m0,"Absolutely not.,Your daughters went to the pro...","[absolutely, not your, daughters, went, to, t..."


In [43]:
user_dialogues.to_json("dataset"+os.sep+"user_dialogues_agg.json")

In [61]:
x_train = [ TaggedDocument(words = row["dialogue_text"],tags = [row["movie_id"]+"_"+row["character_id"]])
            for idx,row in user_dialogues.iterrows()] 

In [62]:
size=400
model_dm = gensim.models.Doc2Vec(min_count=1,window=10,size=size,sample=1e-3,negative=3,workers=8)

In [63]:
model_dm.build_vocab(x_train)

In [None]:
'''
Running this will take a lot of time
'''
for epoch in range(10):
    perm = np.random.permutation(np.array(x_train).shape[0])
    model_dm.train(x_train[perm])

In [96]:
model_dm.train(x_train)

In [73]:
with open("chracter_model.pkl","wb") as f:
    pickle.dump(model_dm,f,pickle.HIGHEST_PROTOCOL)

With a model now in place, we can use it to find similar characters. The below is the code for achieving it.

In [79]:
user_dialogues.iloc[10]

character_id                                                 u1006
movie_id                                                       m65
text             I would say that's a very safe assumption.,Thi...
dialogue_text    [i, would, say, that's, a, very, safe, assumpt...
Name: 10, dtype: object

In [94]:
tokens =user_dialogues.dialogue_text[10]
new_vector = model_dm.infer_vector(tokens)
sims = model_dm.docvecs.most_similar(positive=[new_vector],topn=3)

The result is that we have the top 3 similar characters and thei cosine similarity scores

In [95]:
sims

[('m269_u4050', 0.6910540461540222),
 ('m611_u8966', 0.6833600997924805),
 ('m186_u2836', 0.6817970275878906)]

Now lets try to find a character that resembles Indiana Jones !! 

In [97]:
chracters = pd.read_json("dataset"+os.sep+"movie_characters_metadata.json")

In [98]:
chracters.head(3)

Unnamed: 0,character_id,character_name,gender,movie_id,movie_title,position_credits
0,u0,BIANCA,f,m0,10 things i hate about you,4
1,u1,BRUCE,?,m0,10 things i hate about you,?
10,u10,SHARON,?,m0,10 things i hate about you,?


In [105]:
for idx,row in chracters.iterrows():
    name = row["character_name"].strip().lower()
    if "indiana" in name:
        print(row)

character_id                                       u1463
character_name                                   INDIANA
gender                                                 m
movie_id                                             m99
movie_title         indiana jones and the temple of doom
position_credits                                       1
Name: 1463, dtype: object


In [106]:
indiana_id = "m99_u1463"

In [107]:
new_vector = model_dm.docvecs[indiana_id]

In [108]:
sims = model_dm.docvecs.most_similar(positive=[new_vector],topn=3)

In [109]:
sims

[('m99_u1463', 1.0),
 ('m89_u1352', 0.8716634511947632),
 ('m29_u482', 0.8518658876419067)]

In [110]:
chracters[(chracters.movie_id=="m89") & (chracters.character_id=="u1352")]

Unnamed: 0,character_id,character_name,gender,movie_id,movie_title,position_credits
1352,u1352,TAVERN MAN #2,?,m89,highlander,?


In [111]:
chracters[(chracters.movie_id=="m29") & (chracters.character_id=="u482")]

Unnamed: 0,character_id,character_name,gender,movie_id,movie_title,position_credits
482,u482,OSBORNE,f,m29,basic,2


<ol>
<li>Highlander: http://www.imdb.com/title/tt0091203/
<li>Basic: http://www.imdb.com/title/tt0264395/
</ol>