# Sentiment Classifier - Feature Extractor

Load text data and preprocessing pipeline.

In [1]:
import os
import sys
# Resolve the directory two levels above the working directory and register it
# with the import system so the `src` package can be imported by later cells.
preproc_path = os.path.abspath('../..')
print(preproc_path)
# Re-running the cell must not add the same entry twice.
if sys.path.count(preproc_path) == 0:
    sys.path.append(preproc_path)

/Users/andrew/Documents/college/cs8803-css/replication-project/code


In [2]:
from src import util
# Path (relative to the working directory) of the single MPQA document used
# as the running example in this notebook.
test_file = '../../data/database.mpqa.3.0/docs/20010926/23.17.57-23406'
with open(test_file, 'r') as f:
    # load_from_text yields three values; `sentences` and `coref_chains` are
    # the ones the annotation and feature cells below operate on.
    parsed, sentences, coref_chains = util.load_from_text(f)

In [21]:
import importlib
from src.preproc import entity_extractor, annotate
from src.base_models import sentiment_lexicon

In [22]:
annotate.annotate(sentences, coref_chains)

# create new EntityExtractor after annotating coref mentions
ee = entity_extractor.EntityExtractor.from_sentences(sentences)

In [10]:
entity_ids = list(set(eid for _, eid in ee.ids))
entity_ids, len(entity_ids)

([('wikidata', 'Q207'),
  ('wikidata', 'Q127840'),
  ('wikidata', 'Q42295'),
  ('wikidata', 'Q956'),
  ('wikidata', 'Q824'),
  ('wikidata', 'Q40464'),
  ('wikidata', 'Q30'),
  ('wikidata', 'Q740345'),
  ('wikidata', 'Q22368'),
  ('wikidata', 'Q865'),
  ('wikidata', 'Q1867'),
  ('wikidata', 'Q1317'),
  ('wikidata', 'Q1124'),
  ('wikidata', 'Q148'),
  ('wikidata', 'Q804988'),
  ('manual', 0),
  ('wikidata', 'Q889'),
  ('wikidata', 'Q9317972'),
  ('wikidata', 'Q846570'),
  ('wikidata', 'Q61')],
 20)

In [11]:
# Create an empty feature dict for every ordered pair of distinct entities:
# both (eid, eid2) and (eid2, eid) become keys, so n entities give n * (n - 1) keys.
features = {}
for i, eid in enumerate(entity_ids):
    # Only look ahead of position i so each unordered pair is visited once.
    for eid2 in entity_ids[i + 1:]:
        features[eid, eid2] = {}
        features[eid2, eid] = {}
len(features)

380

In [12]:
test_pair = list(features.keys())[0]
test_pair

(('wikidata', 'Q207'), ('wikidata', 'Q127840'))

In [13]:
sent_lex_path = '../../data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff'
sl, _, _ = sentiment_lexicon.SentimentLexicon.from_mpqa_file(sent_lex_path)

## Utilities

In [49]:
def eid_sents(eid):
    """Return the set of sentence indices in which entity `eid` occurs."""
    sentence_indices = set()
    for sent_idx, _, _ in ee.occurances[eid]:
        sentence_indices.add(sent_idx)
    return sentence_indices


def span_to_slice(span):
    """Return the tokens covered by `span`, a (sentence index, start, end) triple."""
    sent_idx, start, end = span[0], span[1], span[2]
    return list(sentences[sent_idx]['tokens'][start:end])

## Dependency Features



In [119]:
from src.preproc.util import get_text

In [121]:
get_text(sentences[0]['tokens'])

'McCully said any action against Henry is a matter entirely for TVNZ'

Map dependency parse to tokens so we can do a search.

In [124]:
depparse = sentences[0]['enhancedPlusPlusDependencies']
depparse[:5]

[{'dep': 'ROOT',
  'dependent': 2,
  'dependentGloss': 'said',
  'governor': 0,
  'governorGloss': 'ROOT'},
 {'dep': 'nsubj',
  'dependent': 1,
  'dependentGloss': 'McCully',
  'governor': 2,
  'governorGloss': 'said'},
 {'dep': 'det',
  'dependent': 3,
  'dependentGloss': 'any',
  'governor': 4,
  'governorGloss': 'action'},
 {'dep': 'nsubj',
  'dependent': 4,
  'dependentGloss': 'action',
  'governor': 9,
  'governorGloss': 'matter'},
 {'dep': 'case',
  'dependent': 5,
  'dependentGloss': 'against',
  'governor': 6,
  'governorGloss': 'Henry'}]

In [129]:
# Write the dependency graph of the first sentence onto its tokens so that
# it can be searched: every token gets a 'governor' entry and every governing
# token accumulates a 'dependents' set.
for dep in depparse:
    dep_type = dep['dep']
    # Indices in the parse are 1-based (0 denotes ROOT); shift them so they
    # address positions in the token list.
    gov_idx = dep['governor'] - 1
    dep_idx = dep['dependent'] - 1
    if dep_type != 'ROOT':
        gov = sentences[0]['tokens'][gov_idx]
        gov.setdefault('dependents', set()).add((dep_idx, dep_type))
    else:
        # NOTE(review): dep_root keeps the parse's 1-based index, unlike the
        # 0-based indices stored on the tokens — confirm this is intended.
        sentences[0]['dep_root'] = dep['dependent']
    # The root token ends up with governor (-1, 'ROOT').
    sentences[0]['tokens'][dep_idx]['governor'] = (gov_idx, dep_type)

sentences[0]['tokens'][:1]

[{'after': ' ',
  'before': '',
  'characterOffsetBegin': 0,
  'characterOffsetEnd': 7,
  'entity_id': ('wikidata', 'Q36594850'),
  'governor': (1, 'nsubj'),
  'index': 1,
  'lemma': 'McCully',
  'ner': 'PERSON',
  'originalText': 'McCully',
  'pos': 'NNP',
  'speaker': 'PER0',
  'word': 'McCully'}]

In [130]:
from enum import Enum, auto as enum_auto

class DepDirection(Enum):
    """Direction in which a dependency edge is traversed when walking between tokens."""
    # Step from a token up to its governor.
    GOV = enum_auto()
    # Step from a token down to one of its dependents.
    DEP = enum_auto()

def find_dep_path(tokens, source_idx, dest_idx):
    """Breadth-first search for a shortest dependency path between two tokens.

    Args:
        tokens: sequence of token dicts. A token may carry a 'governor' entry
            ``(index, relation)`` and a 'dependents' collection of
            ``(index, relation)`` pairs; a token with neither is a dead end.
        source_idx: index in `tokens` where the search starts.
        dest_idx: index in `tokens` to reach.

    Returns:
        A list of ``((direction, relation), token_index)`` steps. The first
        step is always ``((None, None), source_idx)``; every later step names
        the DepDirection and relation of the edge followed to reach that
        token. Returns None when `dest_idx` cannot be reached.
    """
    from collections import deque  # local import keeps the cell self-contained

    start = [((None, None), source_idx)]
    if source_idx == dest_idx:
        return start

    # Mark nodes when they are queued (not when they are expanded) so that no
    # token is ever queued or expanded more than once.
    seen = {source_idx}
    frontier = deque([(source_idx, start)])
    while frontier:
        curr, path = frontier.popleft()
        token = tokens[curr]

        neighbours = []
        # A token without governor information is treated like the root.
        gov, gov_type = token.get('governor', (None, 'ROOT'))
        if gov_type != 'ROOT':
            neighbours.append((DepDirection.GOV, gov_type, gov))
        # 'dependents' is a set; sort it so that ties between equally short
        # paths are broken the same way on every run.
        for dep, dep_type in sorted(token.get('dependents', ())):
            neighbours.append((DepDirection.DEP, dep_type, dep))

        for direction, relation, nxt in neighbours:
            if nxt in seen:
                continue
            step = path + [((direction, relation), nxt)]
            if nxt == dest_idx:
                return step
            seen.add(nxt)
            frontier.append((nxt, step))
    return None

find_dep_path(sentences[0]['tokens'], 0,1)

[((None, None), 0), ((<DepDirection.GOV: 1>, 'nsubj'), 1)]

In [131]:
get_text(sentences[0]['tokens'])

'McCully said any action against Henry is a matter entirely for TVNZ'

In [132]:
for token in sentences[0]['tokens']:
    print(token['index'] - 1, token['originalText'], token['entity_id'] if 'entity_id' in token else None)

0 McCully ('wikidata', 'Q36594850')
1 said None
2 any None
3 action None
4 against None
5 Henry ('wikidata', 'Q1158477')
6 is None
7 a None
8 matter None
9 entirely None
10 for None
11 TVNZ ('wikidata', 'Q1186424')


In [133]:
find_dep_path(sentences[0]['tokens'], 0, 5)

[((None, None), 0),
 ((<DepDirection.GOV: 1>, 'nsubj'), 1),
 ((<DepDirection.DEP: 2>, 'ccomp'), 8),
 ((<DepDirection.DEP: 2>, 'nsubj'), 3),
 ((<DepDirection.DEP: 2>, 'nmod:against'), 5)]

In [134]:
# One bucket per sentence; each bucket collects (span, entity id) pairs for
# the entity mentions located in that sentence.
occurs_by_sentence = [set() for sentence in sentences]
for eid, occurs in ee.occurances.items():
    for occur in occurs:
        # occur[0] is the sentence index of the mention span.
        occurs_by_sentence[occur[0]].add((occur, eid))

occurs_by_sentence[0]

{((0, 0, 1), ('wikidata', 'Q36594850')),
 ((0, 5, 6), ('wikidata', 'Q1158477')),
 ((0, 11, 12), ('wikidata', 'Q1186424'))}

In [159]:
def dep_path_to_token_slice(sentence, dep_path):
    """Return the tokens of `sentence` visited by `dep_path`, in path order.

    Each path element is an (edge, token index) pair; only the index is used.
    """
    return [sentence[token_idx] for _edge, token_idx in dep_path]

def feat_sentiment_dobj_nsubjrev(sentence, dep_path):
    """Sentiment label for a short path that uses both an nsubj and a dobj edge.

    Returns None when the path has more than three elements, or when it lacks
    either a governor-ward 'nsubj' step or a dependent-ward 'dobj' step.
    Otherwise returns the first item of `sl.get_sentiment_label` applied to
    the tokens along the path.
    """
    if len(dep_path) > 3:
        return None
    edges = tuple(edge for edge, _ in dep_path)
    required = ((DepDirection.DEP, 'dobj'), (DepDirection.GOV, 'nsubj'))
    if not all(edge in edges for edge in required):
        return None
    path_tokens = dep_path_to_token_slice(sentence, dep_path)
    return sl.get_sentiment_label(path_tokens)[0]

def feat_sentiment_nsubj_ccomp_nsubj(sentence, dep_path):
    """Sentiment label for paths shaped nsubj (up) ... ccomp (down) ... nsubj (down).

    The path qualifies when a dependent-ward 'ccomp' step lies between the
    first governor-ward 'nsubj' step and the last dependent-ward 'nsubj' step.
    Returns None for paths with fewer than three elements or without that
    shape; otherwise returns the first item of `sl.get_sentiment_label`
    applied to the tokens along the path.
    """
    up_nsubj = (DepDirection.GOV, 'nsubj')
    down_ccomp = (DepDirection.DEP, 'ccomp')
    down_nsubj = (DepDirection.DEP, 'nsubj')

    edges = [edge for edge, _ in dep_path]
    if len(edges) < 3:
        return None

    # Position of the first upward nsubj and of the last downward nsubj.
    first_up = next((i for i, edge in enumerate(edges) if edge == up_nsubj), None)
    last_down = next(
        (i for i in range(len(edges) - 1, -1, -1) if edges[i] == down_nsubj),
        None,
    )
    if first_up is None or last_down is None:
        return None
    # The window is empty when the last downward nsubj precedes the first
    # upward one, which also rejects the path.
    if down_ccomp not in edges[first_up:last_down + 1]:
        return None

    path_tokens = dep_path_to_token_slice(sentence, dep_path)
    return sl.get_sentiment_label(path_tokens)[0]

def feat_sentiment_no_named_entity(sentence, dep_path):
    """Sentiment label of the tokens strictly between the two path endpoints.

    Returns None when any of those inner tokens carries an 'entity_id' key;
    otherwise returns the first item of `sl.get_sentiment_label` applied to
    the inner tokens.
    """
    # Drop the first and last tokens: they are the endpoints of the path.
    inner_tokens = dep_path_to_token_slice(sentence, dep_path)[1:-1]
    if any('entity_id' in token for token in inner_tokens):
        return None
    return sl.get_sentiment_label(inner_tokens)[0]

def feat_indicator_nmod_against(sentence, dep_path):
    """Return True when any step of `dep_path` follows an 'nmod:against' relation.

    The direction of the step is ignored; `sentence` is unused and only
    present so the function matches the signature of the other extractors.
    """
    for (_direction, relation), _token_idx in dep_path:
        if relation == 'nmod:against':
            return True
    return False

# Registry of dependency-path feature extractors. Keys are the feature names
# reported in the output; each value is called as f(sentence, dep_path).
feat_map = {
    'sentiment_dobj_nsubjrev': feat_sentiment_dobj_nsubjrev,
    'sentiment_nsubj_ccomp_nsubj': feat_sentiment_nsubj_ccomp_nsubj,
    'sentiment_no_named_entity': feat_sentiment_no_named_entity,
    'indicator_nmod_against': feat_indicator_nmod_against
}
def dependency_features(sentence, dep_path):
    """Run every extractor in `feat_map` and keep the features that fired.

    Returns a dict mapping feature name to extractor result, in `feat_map`
    order. Falsy results (None, False, ...) are omitted.
    """
    extracted = {}
    for name, extractor in feat_map.items():
        value = extractor(sentence, dep_path)
        if value:
            extracted[name] = value
    return extracted

In [160]:
# Try every ordered pair of entity mentions in the first sentence and show the
# dependency path between them together with the features that path yields.
for (_, source_idx, _), s_eid in occurs_by_sentence[0]:
    for (_, dest_idx, _), d_eid in occurs_by_sentence[0]:
        # Skip pairing a mention with one that starts at the same token.
        if source_idx == dest_idx:
            continue
        # NOTE(review): find_dep_path returns None when no path exists; that
        # value is passed on to dependency_features unguarded — confirm a path
        # always exists here.
        dep_path = find_dep_path(sentences[0]['tokens'], source_idx, dest_idx)
        print(s_eid, d_eid, dep_path)
        print(dependency_features(sentences[0]['tokens'], dep_path), '\n')

('wikidata', 'Q1186424') ('wikidata', 'Q1158477') [((None, None), 11), ((<DepDirection.GOV: 1>, 'nmod:for'), 8), ((<DepDirection.DEP: 2>, 'nsubj'), 3), ((<DepDirection.DEP: 2>, 'nmod:against'), 5)]
{'indicator_nmod_against': True} 

('wikidata', 'Q1186424') ('wikidata', 'Q36594850') [((None, None), 11), ((<DepDirection.GOV: 1>, 'nmod:for'), 8), ((<DepDirection.GOV: 1>, 'ccomp'), 1), ((<DepDirection.DEP: 2>, 'nsubj'), 0)]
{} 

('wikidata', 'Q1158477') ('wikidata', 'Q1186424') [((None, None), 5), ((<DepDirection.GOV: 1>, 'nmod:against'), 3), ((<DepDirection.GOV: 1>, 'nsubj'), 8), ((<DepDirection.DEP: 2>, 'nmod:for'), 11)]
{'indicator_nmod_against': True} 

('wikidata', 'Q1158477') ('wikidata', 'Q36594850') [((None, None), 5), ((<DepDirection.GOV: 1>, 'nmod:against'), 3), ((<DepDirection.GOV: 1>, 'nsubj'), 8), ((<DepDirection.GOV: 1>, 'ccomp'), 1), ((<DepDirection.DEP: 2>, 'nsubj'), 0)]
{'indicator_nmod_against': True} 

('wikidata', 'Q36594850') ('wikidata', 'Q1186424') [((None, None), 0

In [171]:
print(sl.get_sentiment('against', '*'))

None


## Document Features

1. Flag feature if both entities are:
    - mentioned in the headline
    - mentioned only once in the document
2. When both are the most frequent entities, add the document sentiment label as a feature
3. When the entities do not co-occur in any sentence, add the mention-count rank of the holder and of the target

### Headline mentions

In [10]:
# TODO headline mention

### Single appearance

Number of appearances in the document for each entity:

In [11]:
len(ee.occurances[test_pair[0]]), len(ee.occurances[test_pair[1]])

(1, 3)

In [12]:
len(ee.occurances[test_pair[0]]) == 1 and len(ee.occurances[test_pair[1]]) == 1

False

### Most frequent entities

In [13]:
set([eid for eid, _ in ee.most_common(2)]) == set(test_pair)

False

In [19]:
doc_slice = (token for sentence in sentences for token in sentence['tokens'])
doc_sent, lex_match_count = sl.get_sentiment_label(doc_slice)
doc_sent, lex_match_count
# if 2 most frequent entities, add doc_sentiment as feature for pair

(<Sentiment.NEGATIVE: 'negative'>,
 Counter({None: 288, <Sentiment.NEGATIVE: 'negative'>: 5}))

### Co-occurrence and Rank

Check whether the two entities co-occur in any sentence

In [63]:
eid_sents(test_pair[0]), eid_sents(test_pair[1])

({0, 2, 3, 5, 6, 8}, {1, 2, 4, 5})

In [77]:
cooccurs = eid_sents(test_pair[0]) & eid_sents(test_pair[1])
cooccurs, len(cooccurs) == 0

({2, 5}, False)

In [95]:
# order eids by descending number of occurances
eid_by_num_occurs = sorted(ee.occurances.keys(), 
                           key=lambda eid: len(ee.occurances[eid]), 
                           reverse=True)
eid_by_num_occurs

[('wikidata', 'Q865'),
 ('wikidata', 'Q30'),
 ('wikidata', 'Q1867'),
 ('wikidata', 'Q804988'),
 ('wikidata', 'Q61'),
 ('wikidata', 'Q889'),
 ('wikidata', 'Q42295'),
 ('wikidata', 'Q40464'),
 ('wikidata', 'Q22368'),
 ('wikidata', 'Q207'),
 ('wikidata', 'Q824'),
 ('wikidata', 'Q740345'),
 ('wikidata', 'Q846570'),
 ('manual', 0),
 ('wikidata', 'Q127840'),
 ('wikidata', 'Q1317'),
 ('wikidata', 'Q9317972'),
 ('wikidata', 'Q1124'),
 ('wikidata', 'Q148'),
 ('wikidata', 'Q956')]

In [96]:
def occur_rank(eid):
    """Return the 0-based position of `eid` in `eid_by_num_occurs`.

    Raises ValueError when `eid` is not in that list.
    """
    return eid_by_num_occurs.index(eid)
occur_rank(test_pair[0]), occur_rank(test_pair[1])

(9, 14)

In [101]:
occur_rank(('wikidata', 'Q956'))

19

## Quotation Features

**TODO**

## e2e Test

In [175]:
from src import util
from src.base_models import sentiment
importlib.reload(sentiment);

In [176]:
util.write_deps_to_tokens(sentences)
features = sentiment.get_features(sl, sentences, ee) 

In [177]:
for key, feats in features.items():
    print(key, '\n', feats, '\n')

(('wikidata', 'Q1867'), ('wikidata', 'Q61')) 
 [] 

(('wikidata', 'Q1867'), ('wikidata', 'Q40464')) 
 [] 

(('wikidata', 'Q1867'), ('wikidata', 'Q30')) 
 [] 

(('wikidata', 'Q1867'), ('wikidata', 'Q865')) 
 [] 

(('wikidata', 'Q1867'), ('wikidata', 'Q22368')) 
 [] 

(('wikidata', 'Q1867'), ('wikidata', 'Q889')) 
 [('indicator_nmod_against', True), ('indicator_nmod_against', True)] 

(('wikidata', 'Q61'), ('wikidata', 'Q1867')) 
 [] 

(('wikidata', 'Q61'), ('wikidata', 'Q40464')) 
 [] 

(('wikidata', 'Q61'), ('wikidata', 'Q30')) 
 [] 

(('wikidata', 'Q61'), ('wikidata', 'Q865')) 
 [] 

(('wikidata', 'Q61'), ('wikidata', 'Q22368')) 
 [] 

(('wikidata', 'Q61'), ('wikidata', 'Q889')) 
 [('indicator_nmod_against', True)] 

(('wikidata', 'Q40464'), ('wikidata', 'Q1867')) 
 [] 

(('wikidata', 'Q40464'), ('wikidata', 'Q61')) 
 [] 

(('wikidata', 'Q40464'), ('wikidata', 'Q30')) 
 [] 

(('wikidata', 'Q40464'), ('wikidata', 'Q865')) 
 [] 

(('wikidata', 'Q40464'), ('wikidata', 'Q22368')) 
 [('ind

 [('indicator_occurs_once', True), ('rank_no_cooccur_holder', 17), ('rank_no_cooccur_target', 14)] 

(('wikidata', 'Q1124'), ('wikidata', 'Q1317')) 
 [('indicator_occurs_once', True), ('rank_no_cooccur_holder', 17), ('rank_no_cooccur_target', 15)] 

(('wikidata', 'Q1124'), ('wikidata', 'Q9317972')) 
 [('indicator_occurs_once', True), ('rank_no_cooccur_holder', 17), ('rank_no_cooccur_target', 16)] 

(('wikidata', 'Q1124'), ('wikidata', 'Q956')) 
 [('indicator_occurs_once', True), ('rank_no_cooccur_holder', 17), ('rank_no_cooccur_target', 19)] 

(('wikidata', 'Q148'), ('wikidata', 'Q1867')) 
 [('rank_no_cooccur_holder', 18), ('rank_no_cooccur_target', 2)] 

(('wikidata', 'Q148'), ('wikidata', 'Q40464')) 
 [('indicator_occurs_once', True), ('rank_no_cooccur_holder', 18), ('rank_no_cooccur_target', 7)] 

(('wikidata', 'Q148'), ('wikidata', 'Q865')) 
 [('rank_no_cooccur_holder', 18), ('rank_no_cooccur_target', 0)] 

(('wikidata', 'Q148'), ('wikidata', 'Q22368')) 
 [('indicator_occurs_once',

In [178]:
len(features)

380

In [179]:
from collections import Counter
# Tally how often each individual feature occurs across all entity pairs;
# the pair keys themselves are irrelevant for the count.
feat_count = Counter(
    feat
    for feats in features.values()
    for feat in feats
)
feat_count.most_common()

[(('indicator_occurs_once', True), 156),
 (('indicator_nmod_against', True), 22),
 (('rank_no_cooccur_target', 17), 17),
 (('rank_no_cooccur_target', 18), 17),
 (('rank_no_cooccur_holder', 17), 17),
 (('rank_no_cooccur_holder', 18), 17),
 (('rank_no_cooccur_target', 12), 16),
 (('rank_no_cooccur_target', 13), 16),
 (('rank_no_cooccur_target', 14), 16),
 (('rank_no_cooccur_target', 15), 16),
 (('rank_no_cooccur_target', 19), 16),
 (('rank_no_cooccur_holder', 12), 16),
 (('rank_no_cooccur_holder', 13), 16),
 (('rank_no_cooccur_holder', 14), 16),
 (('rank_no_cooccur_holder', 15), 16),
 (('rank_no_cooccur_holder', 19), 16),
 (('rank_no_cooccur_target', 9), 15),
 (('rank_no_cooccur_target', 10), 15),
 (('rank_no_cooccur_target', 11), 15),
 (('rank_no_cooccur_target', 6), 15),
 (('rank_no_cooccur_target', 16), 15),
 (('rank_no_cooccur_holder', 9), 15),
 (('rank_no_cooccur_holder', 10), 15),
 (('rank_no_cooccur_holder', 11), 15),
 (('rank_no_cooccur_holder', 16), 15),
 (('rank_no_cooccur_hold