In [6]:
import nltk
import gensim
import numpy as np
import pandas as pd

import pickle

from nltk.corpus import gutenberg, brown, reuters, stopwords

from collections import Counter, defaultdict
from pprint import pprint
from functools import reduce
from operator import itemgetter
from itertools import product

from utils.nltk_utils import *
from utils.ngrams import *
from utils.eval import get_test_keywords

In [2]:
import traceback
from typing import List, Tuple

from tqdm import tqdm

from utils.grammar_utils import tags_seq_to_symbols
from utils.io import Cache
from utils.ngrams import ngram2str


# Module-level state shared with parse_phrases() below:
#   observed_tags  - memo of tag-sequence keys already resolved; parse_phrases() guards
#                    against this being None and mutates it in place
#   terminal_rules - looked up with .get(tags_str) inside parse_phrases()
# NOTE(review): the exact structure returned by the Cache loaders is not visible here - confirm.
observed_tags = Cache.load_observed_tags()
terminal_rules = Cache.load_terminal_rules()


def parse_phrases(tt_ngrams) -> Tuple[List, List[List[Tuple]]]:
    """
        Keep the n-grams whose tag sequence maps to a known phrase type.

        Each element of ``tt_ngrams`` is an iterable of ``(token, tag)`` pairs.
        Its tags are converted to symbols, serialised to a string key and
        resolved against ``terminal_rules``; when several phrase types are
        listed for a key, the one with the largest value wins.  Every
        resolution (including "no phrase type") is memoised in the module-level
        ``observed_tags`` dict, which this function mutates.

        Returns ``(phrase_types, phrases)`` - two parallel lists.
    """
    global observed_tags, terminal_rules

    if observed_tags is None:
        observed_tags = dict()

    matched_types = []
    matched_phrases = []

    for tt_gram in tt_ngrams:
        tag_symbols = tuple(tags_seq_to_symbols([tag for _, tag in tt_gram]))
        tags_key = ngram2str(tag_symbols)

        if tags_key in observed_tags:
            # already resolved earlier: None marks "not a phrase"
            resolved = observed_tags[tags_key]
            keep = resolved is not None
        else:
            candidates = terminal_rules.get(tags_key)
            keep = bool(candidates)
            resolved = max(candidates, key=candidates.__getitem__) if keep else None
            observed_tags[tags_key] = resolved

        if keep:
            matched_phrases.append(tt_gram)
            matched_types.append(resolved)

    # Persisting observed_tags back through Cache is intentionally not done here.
    return matched_types, matched_phrases


# Preparing corpora

## Load data from NLTK 

In [8]:
# Fetch every NLTK resource this notebook relies on, so that a fresh environment can run it:
#   - the three corpora and the punkt tokenizer models
#   - 'stopwords', read by stopwords.words() right below
#   - 'averaged_perceptron_tagger', the model behind nltk.pos_tag() used when parsing the corpora
#     NOTE(review): newer NLTK releases may expect a differently named tagger resource - confirm.
for resource in ['gutenberg', 'brown', 'reuters', 'punkt', 'stopwords', 'averaged_perceptron_tagger']:
    nltk.download(resource)

stop_words = stopwords.words('english')

# sentences of all three corpora, concatenated in this order
sents = gutenberg.sents() + brown.sents() + reuters.sents()

[nltk_data] Downloading package gutenberg to /Users/tk/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to /Users/tk/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to /Users/tk/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Freeze every sentence into a tuple so sentences can be used as dict keys later on.
sents = list(map(tuple, sents))

# All tokens of the corpus, lower-cased, in corpus order.
words = [token.lower() for sentence in sents for token in sentence]

# Sorted list of distinct lower-cased tokens.
vocab = sorted(set(words))

print('Sents:', len(sents))
print('Words:', len(words))
print('Vocab:', len(vocab))

Sents: 210608
Words: 5503894
Vocab: 87046


In [15]:
# Positional lookup tables for sentences and vocabulary entries.

# {sentence: sent_index}; a sentence that occurs several times keeps its last index
sent2ind = {sentence: idx for idx, sentence in enumerate(sents)}

# {word: word_index}
word2ind = dict(zip(vocab, range(len(vocab))))

## Create mapping from words to stems

In [16]:
stemmer = Stemmer()

# stem of every vocabulary entry, aligned position-by-position with `vocab`
stems = list(map(stemmer.stem, vocab))

# {word_index: stem}
stems_map = {word2ind[w]: s for w, s in zip(vocab, stems)}

print('Stems:', len(set(stems)))

Stems: 58858


In [17]:
# {stem: word}; when several words share a stem, the one visited last in stems_map is kept
stem2word = dict((stem, vocab[ind]) for ind, stem in stems_map.items())

## Word2Vec model

In [25]:
# loading
VECTOR_SIZE = 300
WINDOW = 15
w2v_file_name = 'data/cache/w2v_CBOW_%d_%d_hs_all' % (VECTOR_SIZE, WINDOW)
try:
    model = gensim.models.Word2Vec.load(w2v_file_name)
    print('Loaded from file %s' % w2v_file_name)
except FileNotFoundError:
    # train on stemmed, lower-cased sentences
    sents_stemmed = [[stems_map[word2ind[w.lower()]]
                      for w in s]
                     for s in sents]
    # The hyper-parameters come from the constants above so that the cache file name
    # (which embeds VECTOR_SIZE and WINDOW) always describes the model stored in it.
    # hs=1 / negative=0 selects hierarchical softmax, which Word2Vec.score() (used for
    # phrase and sentence scoring further down) depends on.
    model = gensim.models.Word2Vec(sentences=sents_stemmed, size=VECTOR_SIZE, window=WINDOW,
                                   min_count=1, hs=1, negative=0)
    model.save(w2v_file_name)
    print('Trained and saved to %s' % w2v_file_name)

# testing
print(model.wv.most_similar(positive=[stemmer.stem('woman'), stemmer.stem('king')], negative=[stemmer.stem('man')]))
print(model.wv.similarity(stemmer.stem('campus'), stemmer.stem('dormitory')))
print(model.wv.doesnt_match([stemmer.stem(w) for w in "dormitory bring campus".split()]))

# keyed vectors only; used for distance queries between keyword stems
w2v = model.wv

Trained and saved to data/cache/w2v_CBOW_300_15_hs_all
[('esther', 0.4287969768047333), ('queen', 0.40083709359169006), ('daughter', 0.3828454911708832), ('princess', 0.3554416000843048), ('theatre-by-the-sea', 0.3367374837398529), ('sister', 0.33462944626808167), ('sarason', 0.33333131670951843), ('candac', 0.33151155710220337), ('jarmuth', 0.32717573642730713), ('hegai', 0.32047152519226074)]
0.28550296254314844
bring


## Parsing corpora

In [27]:
file_name = 'data/cache/stems_phrases'
try:
    # NOTE: pickle.load can execute arbitrary code - only load cache files written by this notebook.
    with open(file_name, mode='rb') as fp:
        stems_phrases = pickle.load(fp)
    print('Loaded from file: %s' % file_name)
except FileNotFoundError:
    parser = NGramsParser()
    sents_words_indexes = parser.parse_sents_tokens(sents)

    # {stem: {phrase_type: {phrase_word_indexes: Set[sentence_ind]}}}
    stems_phrases = defaultdict(dict)

    # set membership is O(1); stop_words itself is a list
    stop_words_set = set(stop_words)

    # total= gives tqdm a percentage / ETA
    # (assumes one entry of sents_words_indexes per sentence - TODO confirm)
    for s_words, words_indexes in tqdm(zip(sents, sents_words_indexes), total=len(sents)):
        words_ttokens = nltk.pos_tag([w.lower() for w in s_words])

        # all 2- to 5-grams of (token, tag) pairs of the sentence
        tt_ngrams = [ngr
                     for n in range(2, 5 + 1)
                     for ngr in n_grams(words_ttokens, n, words_indexes, pad_left=False)]

        types, phrases = parse_phrases(tt_ngrams)

        # format and store phrases
        for t, p in zip(types, phrases):
            phrase_inds = tuple(word2ind[token] for token, _ in p)

            # register the phrase under the stem of each of its non-stop-words
            for word_ind in phrase_inds:
                if vocab[word_ind] in stop_words_set:
                    continue

                stem = stems_map[word_ind]

                if t not in stems_phrases[stem]:
                    stems_phrases[stem][t] = defaultdict(set)

                stems_phrases[stem][t][phrase_inds].add(sent2ind[s_words])

    # save
    with open(file_name, mode='wb') as fp:
        pickle.dump(stems_phrases, fp)


210608it [07:23, 475.29it/s]


{'NP': defaultdict(set,
             {(8628, 77456): {187,
               66413,
               78058,
               81366,
               115127,
               120880,
               125006,
               131072,
               133693,
               133696,
               144948,
               157951,
               159366,
               178610,
               182963,
               197604,
               200077,
               203270,
               203391},
              (8628, 4142): {10854,
               90541,
               91285,
               97039,
               102810,
               124984,
               128804,
               129251,
               146735,
               192731,
               210054},
              (6841, 8629): {67418, 129544, 141032},
              (8629, 5870): {67418},
              (6841, 8629, 5870): {67418},
              (8629, 5870, 54834): {67418},
              (85626, 6841, 8629, 5870): {67418},
              (6841, 8629, 5870, 54834

In [41]:
def explore(word, p_type='NP'):
    """
        Lazily walk the phrases of type ``p_type`` recorded for ``word``.

        Yields ``(phrase_text, sentences)`` pairs, where ``phrase_text`` is the
        phrase's words joined by spaces and ``sentences`` lists the corpus
        sentences the phrase was recorded for.
    """
    typed_phrases = stems_phrases[stemmer.stem(word)][p_type]

    for phrase_inds, sent_ids in typed_phrases.items():
        phrase_text = ' '.join(vocab[ind] for ind in phrase_inds)
        yield phrase_text, [sents[sid] for sid in sent_ids]

g = explore('assume', 'NP')

In [59]:
next(g)

('that bowsman assumed',
 [('It',
   'so',
   'chanced',
   ',',
   'that',
   'after',
   'the',
   'Parsee',
   "'",
   's',
   'disappearance',
   ',',
   'I',
   'was',
   'he',
   'whom',
   'the',
   'Fates',
   'ordained',
   'to',
   'take',
   'the',
   'place',
   'of',
   'Ahab',
   "'",
   's',
   'bowsman',
   ',',
   'when',
   'that',
   'bowsman',
   'assumed',
   'the',
   'vacant',
   'post',
   ';',
   'the',
   'same',
   ',',
   'who',
   ',',
   'when',
   'on',
   'the',
   'last',
   'day',
   'the',
   'three',
   'men',
   'were',
   'tossed',
   'from',
   'out',
   'of',
   'the',
   'rocking',
   'boat',
   ',',
   'was',
   'dropped',
   'astern',
   '.')])

## Count number of phrases

In [28]:
# data for analysis

# {phrase_len: {phrase_type: count}} over all stems
phrases_count = defaultdict(dict)
# {phrase_len: count}
phrases_lengths_count = defaultdict(int)
# {phrase_type: count}
phrases_types_count = defaultdict(int)
# {stem: {phrase_len: {phrase_type: count}}}
stems_phrases_count = defaultdict(dict)

# Every count is weighted by the number of sentences a phrase was recorded for.
for stem, stem_phrases_dict in stems_phrases.items():
    for phrase_type, phrases_dict in stem_phrases_dict.items():
        for phrase_tuple, s_ids in phrases_dict.items():
            phrase_len, phrase_sents_count = len(phrase_tuple), len(s_ids)

            # per-stem count for this (length, type)
            by_length = stems_phrases_count[stem]
            if phrase_len not in by_length:
                by_length[phrase_len] = defaultdict(int)
            by_length[phrase_len][phrase_type] += phrase_sents_count

            # corpus-wide count for this (length, type)
            type_totals = phrases_count[phrase_len]
            type_totals[phrase_type] = type_totals.get(phrase_type, 0) + phrase_sents_count

            # marginal counts by length and by type
            phrases_lengths_count[phrase_len] += phrase_sents_count
            phrases_types_count[phrase_type] += phrase_sents_count

all_phrases_count = sum(phrases_types_count.values())

In [29]:
all_phrases_count

4694125

In [30]:
WORD = 'strive'

stems_phrases_count[stemmer.stem(WORD)]

{2: defaultdict(int, {'NP': 28, 'ADJP': 12, 'ADVP': 8, 'VP': 9}),
 3: defaultdict(int, {'NP': 12, 'ADJP': 1, 'VP': 1}),
 4: defaultdict(int, {'NP': 4})}

In [31]:
def phrases_model(word, stemmed=True):
    """
        Score every (phrase type, phrase length) combination seen for a word.

        ``word`` is taken as a stem when ``stemmed`` is true, otherwise it is
        stemmed first.  For each combination the log-probability is the sum of
        three logs: share of this length among all phrases, share of this type
        among phrases of this length, and share of this word among phrases of
        this type and length.

        Returns a list of ``((p_type, p_length), log_prob)`` sorted from the
        highest log-probability down; an empty list for an unknown stem.
    """
    stem = word if stemmed else stemmer.stem(word)

    stem_phrases = stems_phrases_count.get(stem)
    if not stem_phrases:
        return []

    phrases_probs = []

    for p_length, p_types_count in stem_phrases.items():
        for p_type, p_type_count in p_types_count.items():
            length_total = phrases_lengths_count[p_length]
            length_type_total = phrases_count[p_length][p_type]

            prob_len = length_total / all_phrases_count
            prob_len_type = length_type_total / length_total
            prob_word_len_type = p_type_count / length_type_total

            log_prob = np.log(prob_word_len_type) + np.log(prob_len_type) + np.log(prob_len)

            phrases_probs.append(((p_type, p_length), log_prob))

    return sorted(phrases_probs, key=itemgetter(1), reverse=True)

phrases_model(WORD, stemmed=False)

[(('NP', 2), -12.02961777460343),
 (('NP', 3), -12.876915634990633),
 (('ADJP', 2), -12.876915634990635),
 (('VP', 2), -13.164597707442416),
 (('ADVP', 2), -13.2823807430988),
 (('NP', 4), -13.975527923658742),
 (('ADJP', 3), -15.361822284778633),
 (('VP', 3), -15.361822284778635)]

## Scoring phrases (experiments)

In [None]:
# Experimental, cell-level version of the phrase scoring (the same steps are wrapped in
# get_scored_phrases() further down the notebook).
# NOTE(review): get_phrases() is defined in a cell BELOW this one, so this cell only runs
# after that cell has been executed - it fails on a top-to-bottom run of a fresh kernel.

# multiplier applied to a phrase score, keyed by phrase length in words
LENGTH_PEN = {
    2: 1.75,
    3: 1.25,
    4: 0.75,
    5: 0.25
}

# passed as `reverse=` when the scored phrases are sorted in the following cells
REVERSE = False
# sign applied to the summed score before the length multiplier
SCORE_SIGN = -1
# number of phrases kept per group in the following cells
TOP_N = 5

# {(phrase_type, phrase_length): log-prob} for WORD
P_TYPE_SCORE = dict(phrases_model(WORD, stemmed=False))

# {phrase_type: set of phrases (tuples of words)} for WORD
phrases_ = get_phrases(WORD, stemmed=False)

# {phrase_type: {phrase_length: [(phrase, score), ...]}}
all_scored_phrases = defaultdict(dict)

for p_type, phrases in phrases_.items():
    # NOTE: rebinds the name `phrases_` while the loop iterates over the items of the
    # original dict; the iteration itself is unaffected, but after the loop `phrases_`
    # holds the last list instead of the dict.
    phrases_ = list(phrases)
    
    # one score per phrase from the Word2Vec model, computed on the stemmed words
    scores = np.array(model.score([[stemmer.stem(w) for w in phr] 
                                   for phr in phrases_]))

    for phr, sc in zip(phrases_, scores):
        if not all_scored_phrases[p_type].get(len(phr)):
            all_scored_phrases[p_type][len(phr)] = []
        
        # model score plus the (type, length) log-prob, signed and scaled by the length multiplier
        score = SCORE_SIGN * (sc + P_TYPE_SCORE[(p_type, len(phr))]) * LENGTH_PEN[len(phr)] 
        
        all_scored_phrases[p_type][len(phr)].append((phr, score))

In [None]:
# Print, per phrase type and phrase length, the TOP_N phrases in sort order.
for p_type, scored_phrases in all_scored_phrases.items():
    print('%s\n' % p_type)
    for n, nps in scored_phrases.items():
        ranked = sorted(nps, key=itemgetter(1), reverse=REVERSE)
        best_phr = [(' '.join(p), s) for p, s in ranked[:TOP_N]]

        print(n)
        pprint(best_phr)
        print()

In [None]:
# group different length together

# {phrase_type: [(phrase_text, score), ...]}
candidate_phrases = defaultdict(list)

for p_type, scored_phrases in all_scored_phrases.items():
    # take the TOP_N phrases of every length first ...
    for n, nps in scored_phrases.items():
        top_for_length = sorted(nps, key=itemgetter(1), reverse=REVERSE)[:TOP_N]
        candidate_phrases[p_type].extend((' '.join(p), s) for p, s in top_for_length)

    # ... then keep the overall TOP_N across lengths
    # TODO: filter similar phrases and phrases with keywords
    candidate_phrases[p_type] = sorted(candidate_phrases[p_type], key=itemgetter(1), reverse=REVERSE)[:TOP_N]

candidate_phrases

## Implementation of scoring phrases 

In [26]:
def get_phrases(word, stemmed=True):
    """
        Collect the phrases recorded for a word, grouped by phrase type.

        ``word`` is taken as a stem when ``stemmed`` is true, otherwise it is
        stemmed first.

        Returns ``{phrase_type: set of phrases}`` where each phrase is a tuple
        of vocabulary words; an empty dict when nothing is recorded for the stem.
    """
    stem = word if stemmed else stemmer.stem(word)

    # .get() rather than indexing: stems_phrases is built as a defaultdict(dict), so
    # indexing would silently insert an empty entry for every unknown stem queried.
    phrs = stems_phrases.get(stem)

    if not phrs:
        return {}

    phrases_ = defaultdict(set)

    for p_type, phr_dict in phrs.items():
        # only the phrase keys are needed here; the sentence ids are ignored
        for phrase in phr_dict:
            phrases_[p_type].add(tuple(vocab[ind] for ind in phrase))

    return phrases_

In [None]:
# TODO: add probabilistic CFG

# Scoring parameters read by get_scored_phrases() below.
# NOTE: these rebind the names set in the experiments section above with different values
# (the length-5 multiplier and TOP_N differ).

# multiplier applied to a phrase score, keyed by phrase length in words
LENGTH_PEN = {
    2: 1.75,
    3: 1.25,
    4: 0.75,
    5: 0.5
}

# passed as `reverse=` to sorted() when ranking scored phrases
REVERSE = False
# sign applied to the summed score before the length multiplier
SCORE_SIGN = -1
# number of phrases kept per (phrase type, phrase length) group
TOP_N = 10

def get_scored_phrases(word, include_scores=True):
    """
        Rank the phrases recorded for ``word``, grouped by phrase type.

        Every phrase gets ``SCORE_SIGN * (model score + (type, length) log-prob)
        * LENGTH_PEN[length]``.  For each phrase type the ``TOP_N`` phrases of
        every length are pooled and sorted by score (direction given by
        ``REVERSE``).

        ``word`` is stemmed internally.  Returns ``{phrase_type: [(phrase_text,
        score), ...]}``, or ``{phrase_type: [phrase_text, ...]}`` when
        ``include_scores`` is false.
    """
    type_len_scores = dict(phrases_model(word, stemmed=False))
    phrases_by_type = get_phrases(word, stemmed=False)

    # {phrase_type: {phrase_length: [(phrase, score), ...]}}
    all_scored_phrases = defaultdict(dict)

    for p_type, type_phrases in phrases_by_type.items():
        phrase_list = list(type_phrases)
        stemmed_phrases = [[stemmer.stem(w) for w in phr] for phr in phrase_list]
        lm_scores = np.array(model.score(stemmed_phrases))

        for phr, lm_score in zip(phrase_list, lm_scores):
            n = len(phr)
            by_length = all_scored_phrases[p_type]

            if not by_length.get(n):
                by_length[n] = []

            score = SCORE_SIGN * (lm_score + type_len_scores[(p_type, n)]) * LENGTH_PEN[n]

            by_length[n].append((phr, score))

    # group different length together

    candidate_phrases = defaultdict(list)

    for p_type, scored_phrases in all_scored_phrases.items():
        for nps in scored_phrases.values():
            top_for_length = sorted(nps, key=itemgetter(1), reverse=REVERSE)[:TOP_N]
            candidate_phrases[p_type].extend((' '.join(p), s) for p, s in top_for_length)

        # TODO: filter similar phrases and phrases with keywords
        ranked = sorted(candidate_phrases[p_type], key=itemgetter(1), reverse=REVERSE)

        candidate_phrases[p_type] = ranked if include_scores else [p for p, _ in ranked]

    return candidate_phrases

In [None]:
stems_phrases['assum']['ADJP']

##  LSTM RNN

In [None]:
import keras

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras import layers
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.layers import Activation
from keras.utils import np_utils

In [None]:
# whole corpus as one space-separated string
data = ' '.join(nltk.flatten(sents))

# characters dropped from the text before it is used for the character-level model
escape_chars = ['æ', 'æ','è','é','î','ü', '\n', '\x1a']

# strip the listed characters one after another
data = reduce(lambda text, ch: text.replace(ch, ''), escape_chars, data)

In [None]:
SEQ_LENGTH = 100

# distinct characters of the corpus text, in sorted order
chars = sorted(set(data))
VOCAB_SIZE = len(chars)

# number of whole SEQ_LENGTH-sized chunks that fit into the text
SEQS_NUM = len(data) // SEQ_LENGTH

# index <-> character lookup tables
ix_to_char = dict(enumerate(chars))
char_to_ix = {char: ix for ix, char in ix_to_char.items()}

print('SEQ_LENGTH =', SEQ_LENGTH, '\nNumber of sequences:', SEQS_NUM, '\nVOCAB_SIZE =', VOCAB_SIZE)

In [None]:
HIDDEN_DIM = 500
LAYER_NUM = 3

# Layers are created in the order they are stacked: LAYER_NUM LSTMs that return full
# sequences, then a per-time-step dense layer with a softmax over the character vocabulary.
rnn_layers = [layers.LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True)]
rnn_layers += [layers.LSTM(HIDDEN_DIM, return_sequences=True) for _ in range(LAYER_NUM - 1)]
rnn_layers += [layers.TimeDistributed(layers.Dense(VOCAB_SIZE)), layers.Activation('softmax')]

rnn = Sequential()

for layer in rnn_layers:
    rnn.add(layer)

rnn.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [None]:
# load

# Restores weights from a checkpoint file into the network built in the previous cell.
# The path is relative, so it is resolved against the notebook's working directory.
rnn.load_weights('trained_large/checkpoint_500_all_epoch_2.hdf5')

In [None]:
def generate_from_context(mdl, context, length=100):
    """
        Greedily extend ``context`` by ``length`` characters using ``mdl``.

        The context is one-hot encoded with ``char_to_ix``; at every step the
        model is run on everything encoded so far and the arg-max of its last
        time step is appended as the next character.  Characters are echoed to
        stdout as they are produced.

        Returns the context followed by the generated characters as one string.
        Raises ``ValueError`` for an empty context and ``KeyError`` for a
        context character missing from ``char_to_ix``.
    """
    if not context:
        raise ValueError('context must contain at least one character')

    context_ix = [char_to_ix[c] for c in context]

    text = list(context)

    total_len = len(context) + length
    X = np.zeros((1, total_len, VOCAB_SIZE))

    # encode and echo the seed text
    for pos, ind in enumerate(context_ix):
        X[0, pos, ind] = 1

        print(ix_to_char[ind], end="")

    for pos in range(len(context), total_len):
        # Predict from the characters encoded so far (positions < pos) and write the winner
        # at `pos`. The earlier version first re-inserted the last context character at
        # position len(context), so the model saw (and the cell printed) it twice.
        next_ix = np.argmax(mdl.predict(X[:, :pos, :])[0], 1)[-1]

        X[0, pos, next_ix] = 1

        next_char = ix_to_char[next_ix]
        print(next_char, end="")
        text.append(next_char)

    return ''.join(text)

generate_from_context(rnn, 'to recreational facilities')

## Working with keywords

In [None]:
# Keyword pool that the random test keywords are sampled from (see the cells below).
# NOTE(review): the container type returned by get_test_keywords is not visible here;
# the callers only need len() and list() to work on it.
kws = get_test_keywords('data/lingualeo_words.csv')

print('Keywords: ', len(kws))

In [None]:
def gen_random_kws(cnt):
    """
        Draw ``cnt`` distinct random keywords that all have recorded phrases.

        Samples without replacement from ``kws`` and stems the sample; the draw
        is repeated until every stem has a non-empty entry in ``stems_phrases``.

        Returns the list of keyword STEMS (not the original words).
    """
    while True:
        sampled = list(np.random.choice(list(kws), size=cnt, replace=False))
        sampled_stems = [stemmer.stem(kw) for kw in sampled]

        if all(stems_phrases.get(stem) for stem in sampled_stems):
            return sampled_stems

In [None]:
TEST_SIZE = 5
# Seed NumPy's global RNG so the sampled keywords are reproducible.
# (Assigning `np.random.seed = 0` would only replace the function with an int and seed nothing.)
np.random.seed(0)

all_kws_found = False

# Re-sample until every sampled keyword has recorded phrases.
while not all_kws_found:
    test_kws = list(np.random.choice(list(kws), size=TEST_SIZE, replace=False))

    # original (unstemmed) words, kept for display
    word_kws = list(test_kws)

    print('Test keywords:', word_kws)

    # from here on test_kws holds STEMS
    test_kws = [stemmer.stem(kw) for kw in test_kws]
    kws_phrases = [stems_phrases.get(kw) for kw in test_kws]
    print('Test keywords stems:', test_kws)

    all_kws_found = all(kws_phrases)

    print('All kws found:', all_kws_found)
    print()
    

In [None]:
def cluster_keywords(kws):
    """
        Pair up keywords that are each other's nearest neighbour.

        For every keyword the closest other keyword (by ``w2v.distances``) is
        looked up; two keywords that pick each other form a pair, all remaining
        keywords become singleton clusters.

        Returns a list of tuples: the mutual pairs first, then one 1-tuple per
        unpaired keyword, in input order.
    """
    kws = list(kws)

    # With fewer than two keywords there is nothing to compare. Passing an empty list to
    # w2v.distances would make gensim compare against its whole vocabulary instead.
    if len(kws) < 2:
        return [(kw,) for kw in kws]

    pairs = []

    for i, kw in enumerate(kws):
        # drop the keyword by POSITION so that `others` stays aligned with the
        # distances even when the same keyword occurs more than once
        others = kws[:i] + kws[i + 1:]

        dsts = w2v.distances(kw, others)
        sim_kw = others[int(np.argmin(dsts))]

        pair = (kw, sim_kw)

        # store a mutual pair under the orientation seen first, so Counter sees it twice
        if (sim_kw, kw) in pairs:
            pair = (sim_kw, kw)

        pairs.append(pair)

    final_clusters = []

    remaining = list(kws)

    for kws_pair, cnt in Counter(pairs).items():
        if cnt > 1:
            final_clusters.append(kws_pair)
            for w in kws_pair:
                remaining.remove(w)

    return final_clusters + [(kw,) for kw in remaining]

cluster_keywords(test_kws)

# Text generation

In [None]:
# Sentence-level rules: each key is a serialized symbol sequence (decoded with str2ngram()
# in gen_sents_candidates) and each value is a mapping that has an 'S' entry.
# NOTE(review): presumably the 'S' entry is an occurrence count - confirm against Cache.
sents_rules = Cache.load_terminal_rules_sents()

# sum of the 'S' entries over all rules; used to normalise a rule's 'S' value
total_sents = sum(v['S'] for v in sents_rules.values())

def rank_sents(ss, kwss, top_n=30):
    """
        Rank candidate sentences and return the best ``top_n``.

        ``ss`` is an iterable of ``(prior_probability, sentence_text)`` pairs and
        ``kwss`` the keyword stems.  Sentences of at most two tokens are dropped.
        The score of a sentence is the sum of the Word2Vec model score of its
        stemmed tokens, the log of its prior (1e-8 if zero) and the log of the
        share of tokens that are neither stop words nor keywords, divided by 10
        (1e-8 if zero).

        Returns a list of ``(sentence_text, score)`` sorted from best to worst.
    """
    def stem_of(token):
        # The vocabulary index is keyed by LOWER-CASED words, so lower-case before the
        # lookup; tokens missing from the vocabulary fall back to their lower-cased form
        # instead of raising KeyError.
        key = token.lower()
        return stems_map.get(word2ind.get(key), key)

    # materialise once so the scores and the sentences are walked in the same order
    ss = list(ss)

    sents_stems = [[stem_of(w) for w in s.split(' ')]
                   for _, s in ss]

    if not sents_stems:
        return []

    scores = model.score(sents_stems)

    stop_words_set = set(stop_words)
    kw_stems = set(kwss)

    res_sents = []

    for (s_prob, s), score in zip(ss, scores):
        s_spl = s.split(' ')

        if len(s_spl) > 2:
            # tokens that carry content beyond the keywords themselves
            not_sws = [w
                       for w in s_spl
                       if w.lower() not in stop_words_set and stem_of(w) not in kw_stems]

            not_sw_prob = len(not_sws) / len(s_spl) / 10

            res_sents.append((s, score + np.log(s_prob or 1e-8) + np.log(not_sw_prob or 1e-8)))

    return sorted(res_sents, key=itemgetter(1), reverse=True)[:top_n]
    
def combine_elements(*args):
    """Return the Cartesian product of the given iterables as a list of tuples."""
    combos = product(*args)
    return list(combos)

def gen_sents_candidates(kws, kws_phrases):
    """
        Build candidate sentences for a cluster of keyword stems.

        ``kws`` is a sequence of keyword stems and ``kws_phrases`` the matching
        sequence of ``{phrase_type: phrases}`` mappings (one per keyword).

        * One keyword: every corpus sentence recorded for the keyword in
          ``stems_phrases`` is returned with prior 0.
        * Several keywords: for every sentence rule in ``sents_rules`` that
          passes the probability threshold, each symbol of the rule is filled
          either with a punctuation mark or with a phrase of that type from one
          of the keywords; only combinations that involve at least two
          different keywords are kept.

        Returns a set of ``(prior_probability, sentence_text)`` tuples.
    """
    # rule symbols that stand for punctuation, with the text inserted for them
    marks = {'COMMA': [","],
             'COLON': [":"],
             'SEMICOLON': [";"],
             # 'DOT': ["."],
             'QUESTION': ["?"],
             'EXCLAM': ["!"],
             'DASH': ["-"]}

    res_sents = set()

    if len(kws) == 1:
        # .get(): do not insert an empty entry into the defaultdict for an unknown stem
        for p_type_sents in stems_phrases.get(kws[0], {}).values():
            res_sents.update((0, ' '.join(sents[s_id]))
                             for s_ids_set in p_type_sents.values()
                             for s_id in s_ids_set)

        return res_sents

    # {phrase_type: {keyword: phrases}}
    phrases = defaultdict(dict)

    for kw, kw_phrases in zip(kws, kws_phrases):
        for p_type in kw_phrases:
            phrases[p_type][kw] = kw_phrases[p_type]

    skipped = 0

    for sents_rule, sents_count in sents_rules.items():
        # NOTE(review): the 'S' value is floored at 1000 before normalising and rules below
        # 0.03 are skipped; both constants are kept as in the original - confirm intent.
        s_prob = max(sents_count['S'], 1000) / total_sents

        if s_prob < 0.03:
            skipped += 1
            continue

        sent_symbols = str2ngram(sents_rule)

        # one list of (keyword, options) pairs per symbol of the rule
        sent_phrases = []

        for p_type in sent_symbols:
            if p_type not in marks:
                type_phrases = phrases.get(p_type, {})
                sent_phrases.append([(kw, type_phrases[kw])
                                     for kw in kws
                                     if type_phrases.get(kw)])
            else:
                # A punctuation slot has exactly one choice, owned by no keyword. It must be
                # a one-element LIST of (keyword, options) pairs: a bare tuple would be
                # iterated by product() as the two separate choices `None` and `[mark]`.
                sent_phrases.append([(None, marks[p_type])])

        # choose, for every slot, which keyword (or mark) fills it
        for kws_comb in product(*sent_phrases):
            used_kws = {kw for kw, _ in kws_comb if kw is not None}

            # a candidate must combine at least two different keywords
            if len(used_kws) <= 1:
                continue

            slot_options = [options for _, options in kws_comb]

            # expand every slot into its concrete phrases / marks
            res_sents.update((s_prob, ' '.join(s)) for s in product(*slot_options))

    print('Skipped: %s out of %s' % (skipped, len(sents_rules)))
    return res_sents

In [None]:
def generate_test(keywords):
    """
        Generate candidate texts for a list of keywords.

        The keywords are clustered, up to three best-ranked sentences are
        produced per cluster, and the i-th sentences of all clusters are joined
        with '. ' into the i-th text (each sentence capitalised).  The number of
        texts is limited by the cluster with the fewest sentences.

        Returns a list of strings.
    """
    clusters = cluster_keywords(keywords)

    print('Clusters:', clusters)

    best_sents = []

    for kws_tuple in clusters:
        # NOTE(review): the keywords passed in below are already stems, while
        # get_scored_phrases() stems its argument again - confirm this is intended.
        cluster_phrases = [get_scored_phrases(kw, include_scores=False) for kw in kws_tuple]
        candidates = gen_sents_candidates(kws_tuple, cluster_phrases)

        ranked_sents = rank_sents(candidates, keywords)

        best_sents.append([s for s, _ in ranked_sents[:3]])

    texts = []

    for ss in zip(*best_sents):
        texts.append('. '.join(s.capitalize() for s in ss))

    return texts

generate_test(test_kws)

## DEMO

In [None]:
# Put a value in place of the empty string to choose the keywords by hand; while it is
# empty (falsy), six random keyword stems are drawn instead.
# NOTE(review): generate_test() hands the keywords to cluster_keywords(), which indexes
# them - a manual value should therefore be a list of stems, not a plain string; confirm.
KEYWORDS = '' or gen_random_kws(6)

print('Keywords:', KEYWORDS)

print('\nCandidate text:')

generate_test(KEYWORDS)

