In [302]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import re
%matplotlib inline

In [301]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aliya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Pipeline Design

The original paper uses the BookNLP pipeline to get the attributes of interest from the text. We will try either replicating that or building something gets the elements we need to create the story kernels. This notebook attempts to solve this text preprocessing problem.  

[BookNLP](https://github.com/dbamman/book-nlp) is a natural language processing pipeline that scales to books and other long documents (in English), including:
- Part-of-speech tagging (Stanford)
- Dependency parsing (MaltParser)
- Named entity recognition (Stanford)
- Character name clustering (e.g., "Tom", "Tom Sawyer", "Mr. Sawyer", "Thomas Sawyer" -> TOM_SAWYER)
- Quotation speaker identification
- Pronominal coreference resolution

Reference: David Bamman, Ted Underwood and Noah Smith, "A Bayesian Mixed Effects Model of Literary Character," ACL 2014.

### Example Processed File
This is what the paper provided in their data source. We will take a look at this to understand what is being done. 

In [333]:
df = pd.read_csv("long_df.csv")
df.head()

Unnamed: 0,cluster_id,movie_id,title,plot_summary,num_words,num_sents
0,1,14141235.0,12_(2007_film),The jury decides whether a young Chechen boy i...,178,10
1,1,11094452.0,Ek_Ruka_Hua_Faisla,The story begins in a courtroom where a teenag...,1676,56
2,1,92605.0,12_Angry_Men_(1957_film),The story begins in a courtroom where an 18-ye...,1193,42
3,1,11081144.0,12_Angry_Men_(1997_film),After the final closing arguments have been pr...,912,34
4,2,21798180.0,13_(2010_film),"Vincent ""Vince"" Ferro overhears people talkin...",535,24


In [129]:
p1 = pd.read_csv("data/processedSummaries/14141235.processed", sep = '\t',  error_bad_lines=False)

In [131]:
text = df.iloc[0,3]

In [132]:
text

"The jury decides whether a young Chechen boy is guilty of the murder of his stepfather, a Russian military officer. Initially it seems that the boy was the murderer. However, one of the jurors  votes in favour of acquittal. Since the verdict must be rendered unanimously, the jurors review the case, and one by one come to the conclusion that the boy was framed. The murder was performed by criminals involved in the construction business. The discussion is repeatedly interrupted by flashbacks from the boy's wartime childhood. In the end the foreman states that he was sure the boy did not commit the crime but he will not vote in favour of acquittal since the acquitted boy will be subsequently killed by the same criminals. In addition, the foreman reveals that he is a former intelligence agency officer. After a brief argument, the foreman agrees to join the majority. Later the foreman tells the boy that he will find the murderers."

## Spacy Pipelines


In [133]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

In [134]:
nlp = spacy.load("en_core_web_lg")

In [135]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x26083321588>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x26084abea08>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x260832e1d08>)]

### Part 1 : Plot Similarity
1. We model events mentioned in a story by identifying all verbs occurring in the text of the narrative. 
2. We capture entities and their properties by identifying nouns and the adjectives that modify them.

This is for text entities that do not represent a character mention.

3. We represent the plot of a narrative using a bag-of-word representation of its events and entities (and their characteristics) as described above.
4. We then define Splot(si, sj ) as the cosine similarity between these representations for narratives si and sj. 

In [272]:
def preprocessing(text):
    doc = nlp(text)
    
    token_list = []
    token_ent = []
    token_lemmas = []
    token_pos =[]
    token_tags = []
    token_dep = []
    token_head_text = []
    token_head_pos = []
    token_children = []
    token_punct = []
    
    for token in doc:
        token_list.append(token.text)
        token_ent.append(token.ent_type_)
        token_lemmas.append(token.lemma_)
        token_pos.append(token.pos_)
        token_tags.append(token.tag_)
        token_dep.append(token.dep_)
        token_head_text.append(token.head)
        token_head_pos.append(token.head.pos_)
        token_children.append([child for child in token.children])
        token_punct.append(token.is_punct)
    
    processed = pd.DataFrame(token_list, token_lemmas).reset_index()
    processed.columns = ['tokens','lemmas']
    processed['ent'] = token_ent
    processed['pos'] = token_pos
    processed['tags'] = token_tags
    processed['dep'] = token_dep
    processed['head_text'] = token_head_text
    processed['head_pos'] = token_head_pos
    processed['children'] = token_children
    processed['punct'] = token_punct
    
    return processed

In [273]:
df.head()

Unnamed: 0,cluster_id,movie_id,title,plot_summary
0,1,14141235.0,12_(2007_film),The jury decides whether a young Chechen boy i...
1,1,11094452.0,Ek_Ruka_Hua_Faisla,The story begins in a courtroom where a teenag...
2,1,92605.0,12_Angry_Men_(1957_film),The story begins in a courtroom where an 18-ye...
3,1,11081144.0,12_Angry_Men_(1997_film),After the final closing arguments have been pr...
4,2,21798180.0,13_(2010_film),"Vincent ""Vince"" Ferro overhears people talkin..."


In [274]:
story1 = df.iloc[1,3]
story2 = df.iloc[2,3]

In [276]:
story1_p = preprocessing(story1)
story2_p = preprocessing(story2)

In [216]:
def cos_sim(a, b):
    """Takes 2 vectors a, b and returns the cosine similarity according 
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [218]:
def similarity_calculator(a,b):
    N1 = len(a)
    N2 = len(b)

    counts = pd.DataFrame(a).reset_index()
    counts.columns = ['word', 'count1']

    df_temp = pd.DataFrame(b).reset_index()
    df_temp.columns = ['word', 'count2']

    final_counts = pd.merge(counts, df_temp, on ='word', how ='outer')
    final_counts.fillna(0, inplace = True)
    final_counts['count1'] = final_counts['count1']/N1
    final_counts['count2'] = final_counts['count2']/N2
    
    return cos_sim(final_counts['count1'].values,final_counts['count2'].values )

In [228]:
def s_plot(story1, story2):
    story1_p = preprocessing(story1)
    story2_p = preprocessing(story2)
    
    adj_counts1 = story1_p[(story1_p['pos']=='ADJ')&(story1_p['head_pos']=='NOUN')]['lemmas'].value_counts()
    adj_counts2 = story2_p[(story2_p['pos']=='ADJ')&(story2_p['head_pos']=='NOUN')]['lemmas'].value_counts()
    
    verb_counts1 = story1_p[story1_p['pos']=='VERB']['lemmas'].value_counts()
    verb_counts2 = story2_p[story2_p['pos']=='VERB']['lemmas'].value_counts()
    
    noun_counts1 = story1_p[(story1_p['pos']=='NOUN')&(story1_p['dep']!='nsubj')]['lemmas'].value_counts()
    noun_counts2 = story2_p[(story2_p['pos']=='NOUN')&(story2_p['dep']!='nsubj')]['lemmas'].value_counts()
    
    adj_similarity = similarity_calculator(adj_counts1, adj_counts2)
    verb_similarity = similarity_calculator(verb_counts1, verb_counts2)
    noun_similarity = similarity_calculator(noun_counts1, noun_counts2)
    
    return  np.mean(np.asarray(adj_similarity, verb_similarity, noun_similarity))

In [229]:
s_plot(story1, story2)

0.8709296863229077

### Part 2 : Character Similarity

In [323]:
main_chars1 = story1_p[(story1_p['tags']=='NN')&(story1_p['head_pos']=='VERB')&(story1_p['dep']=='nsubj')]['lemmas'].value_counts()
main_chars2 = story2_p[(story2_p['tags']=='NN')&(story2_p['head_pos']=='VERB')&(story2_p['dep']=='nsubj')]['lemmas'].value_counts()

In [324]:
temp1 = pd.DataFrame(main_chars1).reset_index()
temp1.columns = ['word', 'count1']
temp1['count1'] = temp1['count1']/len(main_chars1)

In [325]:
def counter(ls, w):
    lemmatizer = WordNetLemmatizer()
    count = 0
    for word in ls:
        if w.lower() == word.lower(): count += 1  
    return count

In [326]:
story_len = len(story1_p[story1_p['punct']==False])
tokenized_story = story1_p[story1_p['punct']==False].lemmas.str.lower().tolist()

In [327]:
counter(tokenized_story,'juror')

38

In [329]:
temp1

Unnamed: 0,word,count1
0,Juror,0.263158
1,defendant,0.263158
2,boy,0.157895
3,witness,0.157895
4,story,0.105263
5,vote,0.105263
6,judge,0.105263
7,jury,0.105263
8,storm,0.052632
9,murder,0.052632


In [330]:
total_mentions = 0
counts = []
for word in temp1['word'].tolist():
    count = counter(tokenized_story, word)
    counts.append(count)
    total_mentions += count

In [331]:
temp1['total_counts'] = counts
temp1['prominence']

Unnamed: 0,word,count1,total_counts
0,Juror,0.263158,38
1,defendant,0.263158,8
2,boy,0.157895,8
3,witness,0.157895,6
4,story,0.105263,2
5,vote,0.105263,20
6,judge,0.105263,2
7,jury,0.105263,5
8,storm,0.052632,1
9,murder,0.052632,8


In [246]:
len(temp1)*.2

6.0

In [None]:


df_temp = pd.DataFrame(main_chars2).reset_index()
df_temp.columns = ['word', 'count2']

final_counts = pd.merge(counts, df_temp, on ='word', how ='outer')
final_counts.fillna(0, inplace = True)

In [237]:
pd.DataFrame(main_chars1).reset_index()

Unnamed: 0,index,lemmas
0,defendant,8
1,Juror,7
2,jurors,6
3,boy,6
4,witness,4
5,vote,4
6,judge,2
7,murder,2
8,jury,2
9,Jurors,2
