# Understanding the sentence structure

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
from islabtextminer.detection.event import PathClassifier

In [3]:
import pymongo

In [4]:
# Handle on the 'lines' collection of the 'movie-dialogs' database, reached
# through a MongoClient created with its default connection settings.
db = (
    pymongo.MongoClient()
    .get_database('movie-dialogs')
    .get_collection('lines')
)

In [5]:
# Keep only the 'text' field of every record whose character.movie.id is 'm42'.
sample = [
    record['text']
    for record in db.find({'character.movie.id': 'm42'})
]

In [6]:
# Show the first ten retrieved dialogue lines.
sample[:10]

["-- He's just a lucky guy.",
 'Monsieur Rick, I --',
 "Yes, well, everybody in Casablanca has problems. Yours may work out. You'll excuse me.",
 'Oh, but if you knew what it means to us to leave Europe, to get to America! Oh, but if Jan should find out! He is such a boy. In many ways I am so much older than he is.',
 'Go back to Bulgaria.',
 'Oh, yes, please.',
 'You want my advice?',
 "And he never knew, and the girl kept this bad thing locked in her heart? That would be all right, wouldn't it?",
 'Nobody ever loved me that much.',
 'He always has.']

## Parsing of dialogues

In [7]:
from nltk.tokenize import sent_tokenize
import spacy
from spacy import displacy
# Load the spaCy pipeline "en_core_web_sm" once; this `nlp` object is the one
# handed to every PathClassifier built further down.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Settings shared by every PathClassifier call in this notebook.
# pattern_map starts empty and is passed as the third positional argument.
pattern_map = dict()

# Labels passed as `noun_tags=` (they look like spaCy dependency labels --
# TODO confirm against PathClassifier).
ntags = [
    'amod',
    'compound',
    'prep',
    'pobj',
    'conj',
    'cc',
    'acl',
    'nmod',
    'det',
    'appos',
    'poss',
]

# Labels passed as `verb_tags=`.
vtags = [
    'aux',
    'xcomp',
    'acomp',
    'neg',
    'attr',
    'advmod',
    'auxpass',
]

In [12]:
# Take the third dialogue line of the sample and split it into sentences.
k = sample[2]
sentences = list(sent_tokenize(k))

In [13]:
# Build a PathClassifier for each sentence and render its `doc` with displaCy.
# S is rebound on every pass, so once the loop finishes only the classifier
# for the last sentence is still bound to S.
for sent in sentences:
    S = PathClassifier(
        sent,
        nlp,
        pattern_map,
        subj=['nsubj', 'nsubjpass'],
        obj=['dobj', 'prep'],
        noun_tags=ntags,
        verb_tags=vtags,
    )
    displacy.render(S.doc)

In [14]:
# S.parse() yields two collections; each item unpacks into three parts,
# printed here as `s p o`. S is whatever classifier is currently bound.
predicates, missing = S.parse()

for heading, triples in (
    ('==========predicates==========', predicates),
    ('==========missing==========', missing),
):
    print('\n', heading)
    for s, p, o in triples:
        print(s, p, o)


[[You]] ['ll, excuse] [[me]]

[] ['ll] []


## Naive bigram language model (LM)

In [21]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [22]:
# Bigram counts: I[prev][nxt] is how many times token `nxt` directly follows
# token `prev`. Every sentence is wrapped in the boundary markers '#S' / '#E'.
I = defaultdict(lambda: defaultdict(int))
for dialogue_line in sample:
    for sentence_text in sent_tokenize(dialogue_line):
        padded = ['#S', *word_tokenize(sentence_text), '#E']
        for prev, nxt in nltk.bigrams(padded):
            I[prev][nxt] += 1

In [26]:
# Share of all bigrams starting with 'I' whose second token is 'mean',
# i.e. count('I', 'mean') divided by the total count of tokens following 'I'.
I['I']['mean'] / sum(I['I'].values())

0.008968609865470852

In [37]:
# Probe sentence; the following cells tokenise it and look up each of its
# bigrams in the counts stored in I.
test = 'good idea is think this is a I this.'

In [43]:
# nltk.ngrams returns a one-shot iterator. Materialise it as a list so the
# bigrams can be inspected and the scoring cell re-run without silently
# producing an empty result on the second pass.
bgrams = list(nltk.ngrams(['#S'] + word_tokenize(test) + ['#E'], n=2))

In [44]:
def bigram_probability(counts, prev, nxt):
    """Relative frequency of `nxt` among the tokens counted after `prev`.

    Parameters
    ----------
    counts : mapping of token -> mapping of following token -> count
    prev, nxt : the two tokens of the bigram

    Returns
    -------
    float
        count(prev, nxt) / sum of all counts recorded after `prev`, or 0.0
        when `prev` has no recorded followers.

    Lookups go through .get() so that probing an unseen token does not add
    empty entries to the nested defaultdicts, and an unseen `prev` gives 0.0
    instead of raising ZeroDivisionError.
    """
    followers = counts.get(prev)
    if not followers:
        return 0.0
    total = sum(followers.values())
    return followers.get(nxt, 0) / total if total else 0.0


# One conditional relative frequency per bigram of the probe sentence.
[bigram_probability(I, a, b) for a, b in bgrams]

[0.15295815295815296,
 0.01569506726457399,
 0.043478260869565216,
 0.11428571428571428,
 0.07964601769911504,
 0.0330188679245283,
 0.058823529411764705,
 0.3333333333333333,
 0.9935543278084714]

## Parsing of argumentative texts

In [None]:
# Handle on the 'brat_corpus' collection of the 'argumentmining' database,
# reached through a MongoClient created with its default connection settings.
brat = (
    pymongo.MongoClient()
    .get_database('argumentmining')
    .get_collection('brat_corpus')
)

In [None]:
# Collect the 'text' field of every record whose doc_id matches `document`.
# NOTE: this rebinds `sample`, which earlier cells filled with movie dialogue.
document = 'essay01'
sample = [
    record['text']
    for record in brat.find({'doc_id': document})
]

In [None]:
# Build a PathClassifier for every text in `sample` and render its `doc` with
# displaCy. S is rebound on each pass, so only the last classifier remains
# bound after the loop.
for sent in sample:
    S = PathClassifier(
        sent,
        nlp,
        pattern_map,
        subj=['nsubj', 'nsubjpass'],
        obj=['dobj', 'prep'],
        noun_tags=ntags,
        verb_tags=vtags,
    )
    displacy.render(S.doc)

In [None]:
# Classifier for the first text in `sample`. Nothing in this cell uses it, and
# the next cell rebinds S before calling parse().
S = PathClassifier(
    sample[0],
    nlp,
    pattern_map,
    subj=['nsubj', 'nsubjpass'],
    obj=['dobj', 'prep'],
    noun_tags=ntags,
    verb_tags=vtags,
)

In [None]:
# Analyse the seventh text in `sample`: echo it, parse it, list both result
# collections, then render the parse with displaCy.
sent = sample[6]
print(sent)

S = PathClassifier(
    sent,
    nlp,
    pattern_map,
    subj=['nsubj', 'nsubjpass'],
    obj=['dobj', 'prep'],
    noun_tags=ntags,
    verb_tags=vtags,
)

# S.parse() yields two collections; each item unpacks into three parts.
predicates, missing = S.parse()
for heading, triples in (
    ('==========predicates==========', predicates),
    ('==========missing==========', missing),
):
    print('\n', heading)
    for s, p, o in triples:
        print(s, p, o)
displacy.render(S.doc)