In [54]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

## Game of Thrones meets Flair

In [2]:
# Load the full transcript into memory as a single string.
with open("GoT_script.txt", 'r') as script_file:
    script = script_file.read()

In [3]:
script[:1000]

'[First scene opens with three Rangers riding through a tunnel, leaving the Wall, and going into the woods. (Eerie music in background) One Ranger splits off and finds a campsite full of mutilated bodies, including a child hanging from a tree branch. A birds-eye view shows the bodies arranged in a shield-like pattern. The Ranger rides back to the other two.]\n\nWAYMAR ROYCE: What dyou expect? Theyre savages. One lot steals a goat from another lot and before you know it, theyre ripping each other to pieces.\n\nWILL: Ive never seen wildlings do a thing like this. Ive never seen a thing like this, not ever in my life.\n\nWAYMAR ROYCE: How close did you get?\n\nWILL: Close as any man would.\n\nGARED: We should head back to the wall.\n\nROYCE: Do the dead frighten you?\n\nGARED: Our orders were to track the wildlings. We tracked them. They wont trouble us no more.\n\nROYCE: You dont think hell ask us how they died? Get back on your horse.\n\n[GARED grumbles.]\n\nWILL: Whatever did it to the

In [4]:
def clean_script(script):
    """Keep only the spoken lines of a raw script.

    A line is kept when it starts with a speaker label made up solely of
    upper-case letters and whitespace, followed by a colon (for example
    ``"WAYMAR ROYCE: ..."``). This drops [scene setters], (parentheticals),
    blank lines, and speakers whose label contains other characters, such as
    ``"MAN #2"``.

    Parameters
    ----------
    script : str
        The raw script text, one utterance or stage direction per line.

    Returns
    -------
    list of str
        The lines that start with a speaker label, in their original order.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    speaker_label = re.compile(r'^[A-Z\s]+:')
    return [line for line in script.split('\n') if speaker_label.match(line) is not None]

In [5]:
script = clean_script(script)

In [6]:
script[:5]

['WAYMAR ROYCE: What dyou expect? Theyre savages. One lot steals a goat from another lot and before you know it, theyre ripping each other to pieces.',
 'WILL: Ive never seen wildlings do a thing like this. Ive never seen a thing like this, not ever in my life.',
 'WAYMAR ROYCE: How close did you get?',
 'WILL: Close as any man would.',
 'GARED: We should head back to the wall.']

In [7]:
# Summary statistics for the cleaned script. Every entry looks like
# "SPEAKER: dialogue", so split on the FIRST colon only: dialogue that itself
# contains a colon would otherwise be cut short and its words under-counted.
print('# of lines in script: ', len(script))
print('# of words in corpus: ', len(" ".join([line.split(':', 1)[1] for line in script]).split()))
print('# of unique characters: ', len(set(line.split(':', 1)[0] for line in script)))

# of lines in script:  11806
# of words in corpus:  139477
# of unique characters:  375


### NER

<img src="imgs/Titan_of_Braavos.jpg">

In [8]:
ex = [x for x in script if 'Braavos' in x][2]
print(ex)

JAQEN: If the day comes when you must find me again, just give that coin to any man from Braavos and say these words to him, "Valar Morghulis."


In [9]:
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence(ex)

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-02-24 00:50:03,873 loading file C:\Users\nick\.flair\models\en-ner-conll03-v0.4.pt


[Sentence: "JAQEN: If the day comes when you must find me again, just give that coin to any man from Braavos and say these words to him, "Valar Morghulis."" - 28 Tokens]

In [10]:
for entity in sentence.get_spans('ner'):
    print(entity)

LOC-span [20]: "Braavos"
PER-span [27,28]: ""Valar Morghulis.""


### POS Tagging

In [11]:
# load the basic POS tagger
pos_tagger = SequenceTagger.load('pos')

# run POS tagger over sentence (same as ner)
pos_tagger.predict(sentence)

2019-02-24 00:50:08,273 loading file C:\Users\nick\.flair\models\en-pos-ontonotes-v0.2.pt


[Sentence: "JAQEN: If the day comes when you must find me again, just give that coin to any man from Braavos and say these words to him, "Valar Morghulis."" - 28 Tokens]

In [12]:
print(sentence.to_tagged_string())

JAQEN: <NNP> If <IN> the <DT> day <NN> comes <VBZ> when <WRB> you <PRP> must <MD> find <VB> me <PRP> again, <IN> just <RB> give <VB> that <DT> coin <NN> to <IN> any <DT> man <NN> from <IN> Braavos <S-LOC/NNP> and <CC> say <VB> these <DT> words <NNS> to <IN> him, <PRP> "Valar <B-PER/NNP> Morghulis." <E-PER/NNP>


### Embeddings

In [13]:
from flair.embeddings import StackedEmbeddings, WordEmbeddings, FlairEmbeddings, BertEmbeddings, ELMoEmbeddings

In [14]:
# load and concatenate embeddings
# Only the BERT embedding is active here; the commented-out entries are
# alternative embeddings that can be stacked alongside it (or swapped in).
embeddings = StackedEmbeddings([
                                #WordEmbeddings('glove'), 
                                #FlairEmbeddings('news-forward'), 
                                #FlairEmbeddings('news-backward'),
                                BertEmbeddings('bert-base-cased'), 
                                #ELMoEmbeddings('small')
                               ])

Word Embeddings

In [15]:
ex = "I drink and I know things."

sentence = Sentence(ex)

# just embed a sentence using the StackedEmbedding as you would with any single embedding.
embeddings.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding, '\n')

Token: 1 I
tensor([ 0.3647, -0.7599,  0.8395,  ...,  0.7779, -0.2211,  0.7162]) 

Token: 2 drink
tensor([ 0.4605, -0.1452, -0.0958,  ..., -0.3545, -0.3075,  0.4771]) 

Token: 3 and
tensor([ 0.6379, -0.5159, -0.3396,  ..., -0.3434, -0.4052, -0.6125]) 

Token: 4 I
tensor([ 0.5343, -0.9951,  0.5720,  ...,  1.1661, -0.3379,  0.2322]) 

Token: 5 know
tensor([ 0.8331, -0.4658, -0.1732,  ...,  0.2154, -0.7959,  0.4447]) 

Token: 6 things.
tensor([ 0.2392, -0.0800,  0.4743,  ..., -0.3976, -0.7713,  0.2037]) 



In [16]:
from torch.nn import functional as F

# see which is most similar to the first 'I'
compare_word = sentence[0]

# Read each word from the token itself rather than re-splitting the raw
# string: zipping `ex.split()` with the tokens would silently pair the wrong
# word with an embedding if the sentence were tokenized differently.
for token in sentence[1:]:
    similarity = F.cosine_similarity(compare_word.embedding, token.embedding, dim=0)
    print("Similarity between  '{}'  and  '{}'  : {:0.2f}".format(
          compare_word.text, token.text, similarity), '\n')

Similarity between  'I'  and  'drink'  : 0.52 

Similarity between  'I'  and  'and'  : 0.57 

Similarity between  'I'  and  'I'  : 0.87 

Similarity between  'I'  and  'know'  : 0.51 

Similarity between  'I'  and  'things.'  : 0.49 



Document Embeddings<br>
1. We'll treat all of a character's lines, concatenated together, as a single document.
2. Then we'll compare characters by the similarity of their document embeddings.

In [17]:
# Every distinct speaker label in the script (the text before the first colon).
characters = list(set(line.split(':')[0] for line in script))

# Each character starts with an empty "document" ...
documents = dict.fromkeys(characters, '')

# ... which grows by one utterance for every line that character speaks.
for line in script:
    speaker, spoken = line.split(':', 1)  # split once only: dialogue may contain ':'
    documents[speaker] = documents[speaker] + spoken

In [18]:
assert len(documents) == len(characters)

In [19]:
documents['TYRION'][:100]

' Mmh. It is true what they say about the Northern girls. I did hear something about that. And the ot'

Get Document Embeddings
* Either pool the word embeddings (taking their 'mean', 'min', or 'max'),
* or use an RNN (feed the word embeddings through an additional RNN layer).

In [20]:
from flair.embeddings import DocumentPoolEmbeddings

# initialize the word embeddings
# NOTE(review): glove_embedding is still loaded here even though it is
# commented out of the pooled list below, so it is currently unused.
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# Document embedding = pooled word embeddings; mode='mean' selects the 'mean'
# pooling option (as opposed to 'min' or 'max').
doc_embeddings = DocumentPoolEmbeddings([#glove_embedding,
                                         flair_embedding_backward,
                                         flair_embedding_forward],
                                        mode = 'mean')

In [21]:
# Map each character name to the pooled embedding of everything they say.
char_embeddings = {}
for char in tqdm(characters):
    # get the corpus for character (we're referring to as document)
    char_lines = documents[char]
    # initialize for flair
    char_lines = Sentence(char_lines)
    # embed; the document embedding is stored on the Sentence object
    doc_embeddings.embed(char_lines)
    embedded_char_lines = char_lines.get_embedding()

    # transform to numpy so we can play with it. Detach from autograd and copy
    # to host memory first, so this also works when the tensor lives on a GPU
    # (on the CPU both calls are no-ops).
    embedded_char_lines = embedded_char_lines.detach().cpu().numpy()

    char_embeddings[char] = embedded_char_lines

HBox(children=(IntProgress(value=0, max=375), HTML(value='')))




In [22]:
char_embeddings['TYRION'].shape

(4096,)

In [93]:
def cosine_similarity(vA, vB):
    """Return the cosine of the angle between the vectors ``vA`` and ``vB``."""
    return np.dot(vA, vB) / (np.linalg.norm(vA) * np.linalg.norm(vB))

def most_similar_chars(char, N=5, embeddings=None):
    """Rank the other characters by cosine similarity to ``char``.

    Parameters
    ----------
    char : str
        Name of the character to compare everyone else against.
    N : int, default 5
        Number of most-similar characters to return.
    embeddings : dict, optional
        Mapping of character name -> embedding vector. Defaults to the
        notebook-level ``char_embeddings``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Character`` and ``Cosine_Similarity``, sorted from most to
        least similar, with at most ``N`` rows and a fresh 0..N-1 index.

    Raises
    ------
    KeyError
        If ``char`` has no entry in ``embeddings``.
    """
    if embeddings is None:
        embeddings = char_embeddings
    target = embeddings[char]

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = [
        {'Character': other_char,
         'Cosine_Similarity': cosine_similarity(target, other_embedding)}
        for other_char, other_embedding in embeddings.items()
        if other_char != char
    ]
    # Passing `columns` keeps both columns present even when there are no rows.
    similar_chars = pd.DataFrame(records, columns=['Character', 'Cosine_Similarity'])

    print('Most Similar Characters to {}:\n'.format(char))
    return similar_chars.sort_values(by='Cosine_Similarity', ascending=False).head(n=N).reset_index(drop=True)
    
most_similar_chars('TYRION', N=5)

Most Similar Characters to TYRION:



Unnamed: 0,Character,Cosine_Similarity
0,CERSEI,0.998411
1,JAIME,0.997206
2,LITTLEFINGER,0.995935
3,OLENNA,0.995551
4,BRIENNE,0.994879


TSNE Visualization

In [47]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: fix the seed so the 2-D layout (and every plot built
# from it) is reproducible across kernel restarts.
# The iteration count is left at the library default of 1000, the same value
# that was previously passed explicitly as `n_iter`; newer scikit-learn
# releases renamed that keyword to `max_iter`.
tsne = TSNE(n_components=2, random_state=42)

# One row per character, in the key order of `char_embeddings`.
tsne_result = tsne.fit_transform(pd.DataFrame(char_embeddings).T)

In [102]:
# One row per character: its 2-D t-SNE coordinates plus its name. The names
# are taken in the key order of `char_embeddings`.
char_names = list(char_embeddings)
df_plot = pd.DataFrame(tsne_result, columns=['x', 'y']).assign(char_name=char_names)

print(df_plot.shape)
df_plot.head()

(375, 3)


Unnamed: 0,x,y,char_name
0,3.320947,-1.171026,LORD MORMONT
1,2.07687,-2.836047,PROTESTER
2,-0.816712,-3.576952,CRESSEN
3,-7.633727,1.440769,MYRCELLA
4,-0.253823,-5.430227,FARMER HAMLET


In [103]:
# Collect the file names of the character portraits (JPEGs only).
char_img_dir = os.path.join('imgs', 'char_imgs')
char_imgs = list(filter(lambda file_name: file_name.endswith('.jpg'), os.listdir(char_img_dir)))
print('HBO has pictures for {} characters'.format(len(char_imgs)))
char_imgs[:5]

HBO has pictures for 83 characters


['alliser-thorne.jpg',
 'archmaester-ebrose.jpg',
 'arya-stark.jpg',
 'balon-greyjoy.jpg',
 'barristan-selmy.jpg']

In [104]:
# Manual overrides for script names that the automatic matching below does not
# resolve to the right image file.
# NOTE(review): the first key is spelled 'PYRCELLE' while its file is
# 'pycelle' -- confirm the key matches the speaker label used in the script.
missed_names = {'GRAND MAESTER PYRCELLE':'grand-maester-pycelle.jpg', 'NED':'eddard-ned-stark.jpg', 
                'PRIESTESS':'high-priestess.jpg', 'BLACKFISH':'brynden-the-blackfish-tully.jpg'}

# Build the name -> image lookup once instead of re-deriving every image's
# name for every row. An image is reachable both by its full cleaned name
# ('jon-snow.jpg' -> 'JON SNOW') and by its first word ('JON'); when several
# images claim the same key, the last one listed wins.
img_by_name = {}
for img in char_imgs:
    clean_img = img.replace('.jpg', '').replace('-', ' ').upper()
    name_parts = clean_img.split()
    if not name_parts:
        # nothing left to match on (e.g. a file literally named '.jpg')
        continue
    img_by_name[clean_img] = img
    img_by_name[name_parts[0]] = img

# Manual overrides take precedence over automatic matches; characters with no
# image at all get an empty path.
df_plot['img_path'] = [
    missed_names.get(name, img_by_name.get(name, ''))
    for name in df_plot['char_name']
]

In [105]:
# Keep only the characters that were matched to an image, then renumber rows.
has_image = df_plot['img_path'].ne('')
df_plot = df_plot.loc[has_image].reset_index(drop=True)
print(df_plot.shape)
df_plot.head()

(85, 4)


Unnamed: 0,x,y,char_name,img_path
0,-7.633727,1.440769,MYRCELLA,myrcella-baratheon.jpg
1,-2.424271,-0.583655,RENLY,renly-baratheon.jpg
2,3.823956,-4.120924,BARRISTAN,barristan-selmy.jpg
3,-6.401622,-5.000396,ORELL,orell.jpg
4,-0.270835,-2.012497,JORAH,jorah-mormont.jpg


In [149]:
# One plotly layout-image spec per character, centred on its t-SNE position.
# Plotly treats "source" as a URL, so the path is joined with forward slashes;
# os.path.join would produce backslashes on Windows.
img_json = {'images': [
    {
        "source": "/".join(('imgs', 'char_imgs', img_path)),
        "xref": "x",
        "yref": "y",
        "x": x,
        "y": y,
        "sizex": 1.0,
        "sizey": 1.0,
        "opacity": 1,
        "xanchor": "center",
        "yanchor": "middle"
    }
    for img_path, x, y in zip(df_plot['img_path'], df_plot['x'], df_plot['y'])
]}

In [119]:
# `math` is not imported anywhere else in the notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
import math

# Integer x positions from floor(min x) up to, but not including, ceil(max x).
list(range(math.floor(df_plot['x'].min()),math.ceil(df_plot['x'].max())))

[-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [151]:
import plotly.offline as offline
import plotly.graph_objs as go
import plotly.io as pio  # only needed by the commented-out export at the bottom

# Set up plotly's offline mode for rendering inside the notebook.
offline.init_notebook_mode(connected=True)

# One marker per character at its t-SNE coordinates ...
trace1= go.Scatter(x=list(df_plot['x']), 
                   y=list(df_plot['y']), 
                   mode='markers')
# ... with the character images from `img_json` attached to the layout.
layout= go.Layout(images= img_json['images'])
fig=go.Figure(data=[trace1],layout=layout)
offline.iplot(fig)

# Optional: write the figure to an image file.
#pio.write_image(fig, os.path.join('imgs', 'GoT_tsne.jpeg'))