# Base Models

## Sentiment Classifier Feature Extraction

In [82]:
from glob import glob
import os
# Directory holding the preprocessed MPQA documents that the feature
# extraction cell iterates over.
preproc_train_data_dir = './data/processed/mpqa'
# glob() yields matches in arbitrary, filesystem-dependent order; sorting
# makes the processing order (and the "file #i" log lines) reproducible.
filepaths = sorted(glob(os.path.join(preproc_train_data_dir, '*.json')))
len(filepaths)

54

In [83]:
from src.base_models import sentiment_lexicon
# Location of the MPQA subjectivity-clue lexicon file.
sent_lex_path = 'data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff'
# from_mpqa_file returns three values; only the first (the lexicon object,
# used later as `sl`) is kept, the other two are discarded.
lexicon_parts = sentiment_lexicon.SentimentLexicon.from_mpqa_file(sent_lex_path)
sl, _, _ = lexicon_parts

In [152]:
from enum import Enum
import json

from src import util
from src.base_models import sentiment
from src.preproc import entity_extractor

# Destination for the per-document feature files: one output JSON per input
# document, written under the same base filename as the input.
outfiledir = './data/base_model/sentiment/mpqa'
# Create the destination up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(outfiledir, exist_ok=True)

print("Extracting features for {} documents".format(len(filepaths)))
for i, infilepath in enumerate(filepaths):
    infilename = os.path.basename(infilepath)
    print("Processing file #{}\t{}".format(i, infilename))

    # Only the JSON parse needs the input handle; release it before the
    # (potentially slow) feature extraction runs.
    with open(infilepath, 'r') as infile:
        sentences = json.load(infile)['sentences']

    # Return value is ignored, so this presumably annotates the token dicts
    # in place with dependency information — confirm in src.util.
    util.write_deps_to_tokens(sentences)
    ee = entity_extractor.EntityExtractor.from_sentences_with_entity_ids(sentences)
    # Mapping of (holder, target) -> iterable of (key, value) feature pairs,
    # as implied by how it is unpacked below.
    pair_features = sentiment.get_features(sl, sentences, ee)

    outfilepath = os.path.join(outfiledir, infilename)
    print(outfilepath)

    # Flatten into a JSON-serialisable list of records. Enum members are not
    # serialisable by json, so they are replaced by their underlying .value.
    features = [
        {
            'holder': holder,
            'target': target,
            'features': [
                (key, val.value if isinstance(val, Enum) else val)
                for key, val in feats
            ],
        }
        for (holder, target), feats in pair_features.items()
    ]
    # Plain write mode: the file is only written, never read back here.
    with open(outfilepath, 'w') as outfile:
        json.dump(features, outfile)

Extracting features for 54 documents
Processing file #0	04.22.14-2532.json
./data/base_model/sentiment/mpqa/04.22.14-2532.json
Processing file #1	04.51.05-27505.json
./data/base_model/sentiment/mpqa/04.51.05-27505.json
Processing file #2	05.20.33-11163.json
./data/base_model/sentiment/mpqa/05.20.33-11163.json
Processing file #3	06.10.04-18139.json
./data/base_model/sentiment/mpqa/06.10.04-18139.json
Processing file #4	06.12.31-26764.json
./data/base_model/sentiment/mpqa/06.12.31-26764.json
Processing file #5	06.28.56-23638.json
./data/base_model/sentiment/mpqa/06.28.56-23638.json
Processing file #6	07.05.30-9348.json
./data/base_model/sentiment/mpqa/07.05.30-9348.json
Processing file #7	08.21.04-13527.json
./data/base_model/sentiment/mpqa/08.21.04-13527.json
Processing file #8	08.36.15-7509.json
./data/base_model/sentiment/mpqa/08.36.15-7509.json
Processing file #9	08.39.09-12713.json
./data/base_model/sentiment/mpqa/08.39.09-12713.json
Processing file #10	08.40.56-18707.json
./data/ba

In [151]:
# Development-only helper cell: re-executes the project modules after their
# source has been edited, so the kernel does not need a restart.
# `sentiment` and `util` must already be imported (the feature-extraction
# cell imports them), otherwise the reload calls raise NameError.
# The trailing semicolons suppress the module repr in the cell output.
# NOTE(review): consider `%load_ext autoreload` / `%autoreload 2` in a config
# cell instead, and drop this cell from the final notebook.
from importlib import reload
reload(sentiment);
reload(util);
from src.preproc import entity_extractor
reload(entity_extractor);

In [85]:
# Inspect the token record at list position 30 of the second sentence.
# `sentences` is whatever the extraction loop assigned last, so the result
# depends on which document was processed most recently.
sentences[1]['tokens'][30]

{'after': ' ',
 'before': ' ',
 'characterOffsetBegin': 455,
 'characterOffsetEnd': 462,
 'entity_id': ['wikidata', 'Q600040'],
 'governor': (32, 'compound'),
 'index': 31,
 'lemma': 'Foreign',
 'ner': 'ORGANIZATION',
 'originalText': 'Foreign',
 'pos': 'NNP',
 'speaker': 'PER0',
 'word': 'Foreign'}

In [86]:
# Inspect the token record at list position 13 of the second sentence
# (same leaked `sentences` variable as the other inspection cells).
sentences[1]['tokens'][13]

{'after': ' ',
 'before': ' ',
 'characterOffsetBegin': 362,
 'characterOffsetEnd': 365,
 'entity_id': ['wikidata', 'Q843'],
 'governor': (14, 'det'),
 'index': 14,
 'lemma': 'the',
 'ner': 'O',
 'originalText': 'the',
 'pos': 'DT',
 'speaker': '22',
 'word': 'the'}

In [92]:
# Dependency path from token position 27 to token position 13.
# Judging by the recorded output, each step looks like
# ((direction, relation), token position) — confirm against src.util.
util.find_dep_path(sentences[1]['tokens'], 27, 13)

[((None, None), 27),
 ((<DepDirection.GOV: 1>, 'nsubj'), 25),
 ((<DepDirection.DEP: 2>, 'ccomp'), 3),
 ((<DepDirection.DEP: 2>, 'xcomp'), 5),
 ((<DepDirection.DEP: 2>, 'dobj'), 9),
 ((<DepDirection.DEP: 2>, 'nmod:by'), 14),
 ((<DepDirection.DEP: 2>, 'det'), 13)]

In [94]:
# Same pair of tokens in the opposite direction (13 -> 27); in the recorded
# output the same positions are visited in reverse with GOV/DEP swapped.
util.find_dep_path(sentences[1]['tokens'], 13, 27)

[((None, None), 13),
 ((<DepDirection.GOV: 1>, 'det'), 14),
 ((<DepDirection.GOV: 1>, 'nmod:by'), 9),
 ((<DepDirection.GOV: 1>, 'dobj'), 5),
 ((<DepDirection.GOV: 1>, 'xcomp'), 3),
 ((<DepDirection.GOV: 1>, 'ccomp'), 25),
 ((<DepDirection.DEP: 2>, 'nsubj'), 27)]

In [95]:
# Path query from position 30 to position 27.
# NOTE(review): no output was recorded for this cell, while the reverse
# query (27 -> 30) displays a path — presumably this call returned None;
# confirm whether that asymmetry is intended.
util.find_dep_path(sentences[1]['tokens'], 30, 27)

In [93]:
# Path query from position 27 to position 30 (the reverse of the 30 -> 27
# query, which displayed nothing in the recorded output).
util.find_dep_path(sentences[1]['tokens'], 27, 30)

[((None, None), 27),
 ((<DepDirection.DEP: 2>, 'acl'), 28),
 ((<DepDirection.DEP: 2>, 'nmod:by'), 32),
 ((<DepDirection.DEP: 2>, 'compound'), 30)]

In [89]:
# Show the 'dep_root' entry stored on the second sentence.
# NOTE(review): presumably written by util.write_deps_to_tokens — confirm.
sentences[1]['dep_root']

26

In [98]:
# Inspect the token record at list position 27 of the second sentence; the
# recorded output shows both 'governor' and 'dependents' fields on it.
sentences[1]['tokens'][27]

{'after': ' ',
 'before': ' ',
 'characterOffsetBegin': 435,
 'characterOffsetEnd': 444,
 'dependents': {(26, 'det'), (28, 'acl')},
 'governor': (25, 'nsubj'),
 'index': 28,
 'lemma': 'statement',
 'ner': 'O',
 'originalText': 'statement',
 'pos': 'NN',
 'speaker': 'PER0',
 'word': 'statement'}