In [42]:
import nltk
import gensim
import numpy as np
import pandas as pd

import pickle

from nltk.corpus import gutenberg, brown, reuters, stopwords

from collections import Counter, defaultdict
from pprint import pprint
from functools import reduce
from operator import itemgetter

from utils.nltk_utils import *
from utils.ngrams import *
from utils.grammar import parse_phrases
from utils.eval import get_test_keywords
stop_words = stopwords.words('english')

In [3]:
import traceback
from typing import List, Tuple

from tqdm import tqdm

from utils.grammar_utils import tags_seq_to_symbols
from utils.io import Cache
from utils.ngrams import ngram2str


observed_tags = Cache.load_observed_tags()
terminal_rules = Cache.load_terminal_rules()


def parse_phrases(tt_ngrams) -> Tuple[List, List[List[Tuple]]]:
    global observed_tags, terminal_rules
    
    phrases = []
    phrases_types = []

    if observed_tags is None:
        observed_tags = dict()

    for tt_gram in tt_ngrams:
        symbols = tuple(tags_seq_to_symbols([tag
                                             for _, tag in tt_gram]))

        phrase = tt_gram

        # check if tags phrase has been already observed
        tags_str = ngram2str(symbols)

        if tags_str in observed_tags:
            if observed_tags[tags_str] is not None:
                phrases.append(phrase)
                phrases_types.append(observed_tags[tags_str])

            continue

        p_types_dict = terminal_rules.get(tags_str)

        if p_types_dict:
            p_type = max(p_types_dict, key=lambda k: p_types_dict[k])

            phrases.append(phrase)
            phrases_types.append(p_type)

            observed_tags[tags_str] = p_type

        else:
            observed_tags[tags_str] = None

#     try:
#         Cache.save_observed_tags(observed_tags)

#     except Exception as e:
#         traceback.print_exc(e)

    return phrases_types, phrases


# Preparing corpora

## Load data from NLTK 

In [4]:
sents = gutenberg.sents() + brown.sents() + reuters.sents()

In [5]:
sents = [tuple(s) for s in sents]

words = [w.lower() 
         for s in sents for w in s]

vocab = sorted(list(set(words)))

print('Sents:', len(sents))
print('Words:', len(words))
print('Vocab:', len(vocab))

Sents: 210608
Words: 5503894
Vocab: 87046


In [6]:
# creating indexes

# {sentence: sent_index}
sents_ind = {sents[i]: i for i in range(len(sents))}

# {word: word_index}
vocab_ind = {vocab[j]: j for j in range(len(vocab))}

## Create mapping from words to sentences

In [7]:
stemmer = Stemmer()

stems = [stemmer.stem(word) for word in vocab]

# {word_index: stem}
stems_map = {vocab_ind[word]: stem 
             for word, stem in zip(vocab, stems)}

print('Stems:', len(set(stems)))

Stems: 58858


In [8]:
stem2word = {stem: vocab[ind]
             for ind, stem in stems_map.items()}

## Parsing corpora

In [None]:
# load

with open('data/cache/stems_phrases', mode='rb') as fp:
    stems_phrases = pickle.load(fp)

In [9]:
parser = NGramsParser()

sents_words_indexes = parser.parse_sents_tokens(sents)

In [10]:
# {stem: Set[sentence_ind]}
stems_phrases = defaultdict(dict)

for i, (s_words, words_indexes) in tqdm(enumerate(zip(sents, sents_words_indexes), start=1)):
    words_ttokens = nltk.pos_tag([w.lower() for w in s_words])
    
    tt_ngrams = [ngr
                 for i in range(1, 5 + 1) 
                 for ngr in n_grams(words_ttokens, i, words_indexes, pad_left=False)]

    types, phrases = parse_phrases(tt_ngrams)

    # format and store phrases
    for t, p in zip(types, phrases):
        phrase_inds = tuple(vocab_ind[token] for token, _ in p)
        
        for word_ind in phrase_inds:
            if vocab[word_ind] not in stop_words:
                stem = stems_map[word_ind]
                
                if not stems_phrases[stem].get(t):
                    stems_phrases[stem][t] = defaultdict(set)
                
                stem_phr_t = stems_phrases[stem][t]
                
                stem_phr_t[phrase_inds].add(sents_ind[s_words])

210608it [07:18, 480.01it/s]


In [11]:
def get_phrases(word, stemmed=True):
    if stemmed:
        stem = word
    else:
        stem = stemmer.stem(word)
        
    phrs = stems_phrases[stem]
    
    if not phrs:
        return {}
    
    phrases_ = defaultdict(set)
    
    for p_type, phr_dict in phrs.items():
        for phrase, sents in phr_dict.items():
            phrase_ = tuple(vocab[ind] for ind in phrase)
            
            phrases_[p_type].add(phrase_)
            
    return phrases_

In [12]:
# save

with open('data/cache/stems_phrases', mode='wb') as fp:
    pickle.dump(stems_phrases, fp)

## Word2Vec

In [25]:
# loading
model = gensim.models.Word2Vec.load('data/w2v/CBOW_300_10_all')

In [14]:
sents_stemmed = [[stems_map[vocab_ind[w.lower()]] for w in s] for s in sents]

In [33]:
model = gensim.models.Word2Vec(sentences=sents_stemmed, size=300, window=15, min_count=1, hs=1, negative=0)
model.save('data/w2v/CBOW_300_10_hs_all')

In [34]:
# testing

print(model.wv.most_similar(positive=[stemmer.stem('woman'), stemmer.stem('king')], negative=[stemmer.stem('man')]))
print(model.wv.similarity(stemmer.stem('campus'), stemmer.stem('dormitory')))
print(model.wv.doesnt_match([stemmer.stem(w) for w in "dormitory bring campus".split()]))

[('queen', 0.3553354740142822), ('daughter', 0.35418325662612915), ('esther', 0.3361766040325165), ('husband', 0.3246108889579773), ('absalom', 0.3200189769268036), ('mordecai', 0.31594496965408325), ('samaria', 0.3088769316673279), ('vashti', 0.30666205286979675), ('hebron', 0.298782080411911), ('selleth', 0.29791125655174255)]
0.20989265750352995
bring


In [35]:
w2v = model.wv

In [19]:
del model

## Working with keywords

In [20]:
kws = get_test_keywords('data/lingualeo_words.csv')

print('Keywords: ', len(kws))

Keywords:  874


In [21]:
TEST_SIZE = 5
np.random.seed = 0

all_kws_found = False

while not all_kws_found:    
    test_kws = list(np.random.choice(list(kws), size=TEST_SIZE, replace=False))

    print('Test keywords:', test_kws)

    test_kws = [stemmer.stem(kw) for kw in test_kws]
    kws_phrases = [stems_phrases.get(kw) for kw in test_kws]
    print('Test keywords stems:', test_kws)

    all_kws_found = all(kws_phrases)
    
    print('All kws found:', all_kws_found)
    print()

Test keywords: ['advice', 'drab', 'shame', 'proclivity', 'admissible']
Test keywords stems: ['advic', 'drab', 'shame', 'procliv', 'admiss']
All kws found: True



In [22]:
def get_pairs_counter(test_kws):
    pairs = []

    for i, kw in enumerate(test_kws):
        cp = list(test_kws)
        cp.remove(kw)

        dsts = w2v.distances(kw, cp)
        print(kw, dsts)
        max_ind = np.argmin(dsts)
        print(i, max_ind)
        sim_kw = test_kws[max_ind + (1 if i <= max_ind else 0)]
        print(kw, sim_kw)

        pair = (kw, sim_kw)

        if (sim_kw, kw) in pairs:
            pair = (sim_kw, kw)

        pairs.append(pair)

    return Counter(pairs)

get_pairs_counter(test_kws)

advic [1.0484983  0.5876609  0.67798024 0.6448846 ]
0 1
advic shame
drab [1.0484983  0.85772705 0.5455078  0.85652786]
1 2
drab procliv
shame [0.5876609  0.85772705 0.9682493  0.80564225]
2 0
shame advic
procliv [0.67798024 0.5455078  0.9682493  0.66142946]
3 1
procliv drab
admiss [0.6448846  0.85652786 0.8056423  0.66142946]
4 0
admiss advic


Counter({('admiss', 'advic'): 1,
         ('advic', 'shame'): 2,
         ('drab', 'procliv'): 2})

In [48]:
phrases_ = get_phrases('advice', stemmed=False)

scored_nps = defaultdict(list)

for p_type, phrases in phrases_.items():
    if p_type != 'NP':
        continue
    
    for phr in list(phrases):
        sc = model.score([[stemmer.stem(w) for w in phr]])
        
        scored_nps[len(phr)].append((phr, sc[0]))
        
scored_nps

defaultdict(list,
            {1: [(('advice',), -10.578289)],
             2: [(('advice', 'the'), -16.795265),
              (('our', 'advice'), -15.781087),
              (('my', 'advice'), -10.6507635),
              (('advice', '."'), -18.548414),
              (('earnest', 'advice'), -22.291565),
              (('outside', 'advice'), -23.856993),
              (('further', 'advice'), -17.321638),
              (('thy', 'advice'), -18.719074),
              (('reiterating', 'advice'), -19.005373),
              (('personalized', 'advice'), -24.278847),
              (('considerate', 'advice'), -20.146019),
              (('which', 'advice'), -20.232025),
              (('advice', ',"'), -19.815565),
              (("hetman's", 'advice'), -22.865692),
              (('and', 'advice'), -13.891079),
              (('advice', 'i'), -17.141329),
              (('aid', 'advice'), -21.918184),
              (('wholesome', 'advice'), -21.598211),
              (('this', 'advice'), -14.548

In [47]:
for i in range(1, 5 + 1):
    nps = scored_nps[i]
    best_nps = sorted(nps, key=itemgetter(1), reverse=True)[:3]
    
    best_nps = [(' '.join(p), s) for p, s in best_nps]
    
    print(i)
    pprint(best_nps)
    print()

1
[('shames', -9.87307), ('shamed', -9.87307), ('shame', -9.87307)]

2
[('my shame', -13.203539),
 ('my shames', -13.203539),
 ('their shame', -13.206809)]

3
[('their own shame', -15.359652),
 ('their shame with', -16.79156),
 ('thine own shame', -18.197964)]

4
[('bear thine own shame', -25.143646),
 ('such a shameful sight', -27.247032),
 ('the shamed and angry', -27.456865)]

5
[('the measureless shame and humiliation', -41.16781),
 ('an ignominy and shame beneath', -41.686546),
 ('a prudent man covereth shame', -43.493618)]



In [51]:
for i in range(1, 5 + 1):
    nps = scored_nps[i]
    best_nps = sorted(nps, key=itemgetter(1), reverse=True)[:5]
    
    best_nps = [(' '.join(p), s) for p, s in best_nps]
    
    print(i)
    pprint(best_nps)
    print()

1
[('advice', -10.578289)]

2
[('my advice', -10.6507635),
 ('give advice', -11.619597),
 ('wanted advice', -13.039914),
 ('and advice', -13.891079),
 ('him advice', -13.957809)]

3
[('such advice as', -12.637161),
 ('of the advice', -18.26074),
 ('the advice of', -18.26074),
 ('their advice for', -18.533184),
 ('their advice ."', -18.64547)]

4
[('for some other advice', -23.814346),
 ('your advice and counsel', -25.596104),
 ('her kindness and advice', -27.856691),
 ('the free advice of', -28.051847),
 ("the hetman's advice of", -29.096107)]

5
[('the close-in support and advice', -41.098873)]



## Using processed corpora

In [None]:
# test

t = 'grievance'

s = stemmer.stem(t)
# stems_phrases[s]
vocab_ind['grievance']

In [None]:
# test

t = 'grievance'

s = stemmer.stem(t)

ss = []
for k, v in stems_phrases[s]['NP'].items():
    ss.extend([s_id for s_id in v])
    
Counter(ss)