In [1]:
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import gutenberg, brown, reuters, stopwords

from collections import Counter, defaultdict
from pprint import pprint

from utils.nltk_utils import *
from utils.ngrams import *
from utils.grammar import parse_phrases

# English stop-word list from NLTK; later cells use it for `token not in stop_words`
# checks when filtering phrases and building vocabularies.
stop_words = stopwords.words('english')

ImportError: No module named 'nltk'

In [2]:
# For every Gutenberg text print three rounded ratios followed by the file id:
#   raw characters per word token, word tokens per sentence,
#   and word tokens per distinct lowercased token.
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    # Vocabulary size is case-insensitive: tokens are lowercased before de-duplication
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)

5 25 26 austen-emma.txt
5 26 17 austen-persuasion.txt
5 28 22 austen-sense.txt
4 34 79 bible-kjv.txt
5 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 18 12 burgess-busterbrown.txt
4 20 13 carroll-alice.txt
5 20 12 chesterton-ball.txt
5 23 11 chesterton-brown.txt
5 18 11 chesterton-thursday.txt
4 21 25 edgeworth-parents.txt
5 26 15 melville-moby_dick.txt
5 52 11 milton-paradise.txt
4 12 9 shakespeare-caesar.txt
4 12 8 shakespeare-hamlet.txt
4 12 7 shakespeare-macbeth.txt
5 36 12 whitman-leaves.txt


In [5]:
# Sentences of the Gutenberg, Brown and Reuters corpora, concatenated in that order.
sents = gutenberg.sents() + brown.sents() + reuters.sents()

# Flat list of every token, lowercased.
words = [w.lower() for s in sents for w in s]

# sorted() instead of list(): the iteration order of a set of strings changes
# between kernel restarts (hash randomization). `vocab` is later fed to
# nltk.pos_tag as a token sequence, so an unstable order would make the tags,
# and everything derived from them, differ from run to run.
vocab = sorted(set(words))

print('Sents:', len(sents))
print('Words:', len(words))
print('Vocab:', len(vocab))

Sents: 210608
Words: 5503894
Vocab: 87046


# Check corpus coverage of the Lingualeo dictionary

In [6]:
# Keep only the first column of the header-less CSV (one dictionary entry per row).
df = pd.read_csv('data/lingualeo_words.csv', header=None)[[0]]

# First column as a flat 1-D array. DataFrame.as_matrix() was removed in
# pandas 1.0; `.values` works on both old and current pandas versions.
d_words = df[0].values

# Split multi-word entries on spaces, strip commas/whitespace and lowercase,
# keeping each distinct token once.
d_words = {p.lower().replace(',', '').strip() for w in d_words for p in w.split(' ')}

# Double spaces or a lone comma leave an empty token behind; it is not a word.
d_words.discard('')

print('Dictionary words:', len(d_words))

Dictionary words: 874


In [7]:
# Share of dictionary words found in the corpus vocabulary.
# Note: despite the "percent" label the printed value is a fraction in [0, 1].
print('Common percent:', len(d_words.intersection(vocab)) / len(d_words))

# Dictionary words that never occur in the corpus vocabulary
absent_words = d_words.difference(vocab)

print('Absent words: %s out of %s' % (len(absent_words), len(d_words)))
# pprint(absent_words)

Common percent: 0.9233409610983981
Absent words: 67 out of 874


# Experiments with lemmas and stems

## Lemmatizing 

In [8]:
# POS-tag the vocabulary; the whole vocabulary list is passed to the tagger
# as if it were a single token sequence.
ttokens = nltk.pos_tag(vocab)

# NOTE(review): Lemmatizer is not defined in this notebook; presumably it comes
# from one of the `utils` star-imports. Confirm its lemmatize(token, tag) contract.
lemmatizer = Lemmatizer()

# {token: lemma}
lemmas = {token: lemmatizer.lemmatize(token, tag) 
          for token, tag in ttokens}

# Number of distinct lemmas the vocabulary collapses to
print('Lemmas:', len(set(lemmas.values())))

Lemmas: 72467


In [9]:
# Inverted index {lemma: Set[sentence_ind]}: for every lemma, the positions
# (in `sents`) of the sentences in which one of its word forms occurs.
lemmas_sents = defaultdict(set)

for sent_idx, sentence in enumerate(sents):
    for word in sentence:
        lemmas_sents[lemmas[word.lower()]].add(sent_idx)

print('Lemmas have sentences:', len(lemmas_sents))

Lemmas have sentences: 72467


In [49]:
tw = 'grievance'

# Use .get() rather than indexing: lemmas_sents is a defaultdict, so
# lemmas_sents[tw] would silently insert an empty set for an unknown lemma and
# grow the index (which is counted above and pickled further down).
sw = list(lemmas_sents.get(tw, ()))

print(len(sw))

# Show the third matching sentence; fall back to the last match (or an empty
# string) when the lemma occurs in fewer than three sentences.
' '.join(sents[sw[min(2, len(sw) - 1)]]) if sw else ''

17


'They said Nigeria , which holds the OPEC conference presidency , and Qatar probably have the biggest grievances about price differentials making some of their crudes uncompetitive .'

## Check vocab coverage with lemmas

In [10]:
# d_words is a set, whose iteration order changes between kernel restarts.
# pos_tag treats its argument as an ordered token sequence, so the tags (and the
# lemmas derived from them) would not be reproducible; sort to fix the order.
d_words_lemmas = {lemmatizer.lemmatize(token, tag)
                  for token, tag in nltk.pos_tag(sorted(d_words))}

In [11]:
# All lemmas produced for the corpus vocabulary (a dict view, may contain duplicates)
_lemmas = lemmas.values()

# Share of dictionary lemmas that also appear among the corpus lemmas.
# Note: despite the "percent" label the printed value is a fraction in [0, 1].
print('Common percent:', len(d_words_lemmas.intersection(_lemmas)) / len(d_words_lemmas))

# Dictionary lemmas with no counterpart in the corpus (overwrites the earlier absent_words)
absent_words = d_words_lemmas.difference(_lemmas)

print('Absent words: %s out of %s' % (len(absent_words), len(d_words_lemmas)))
# pprint(absent_words)

Common percent: 0.9237089201877934
Absent words: 65 out of 852


## Stemming

In [12]:
# NOTE(review): Stemmer is not defined in this notebook; presumably it comes
# from one of the `utils` star-imports. Confirm.
stemmer = Stemmer()

# {token: stem}
# Iterates the tagged vocabulary only for its tokens; the POS tag is ignored.
stems = {token: stemmer.stem(token) 
         for token, _ in ttokens}

# Number of distinct stems the vocabulary collapses to
print('Stems:', len(set(stems.values())))

Stems: 58856


In [13]:
# Inverted index {stem: Set[sentence_ind]}: for every stem, the positions
# (in `sents`) of the sentences in which one of its word forms occurs.
stems_sents = defaultdict(set)

for sent_idx, sentence in enumerate(sents):
    for word in sentence:
        stems_sents[stems[word.lower()]].add(sent_idx)

print('Stems have sentences:', len(stems_sents))

Stems have sentences: 58856


## Check vocab coverage with stems

In [14]:
# Distinct stems of the dictionary words
d_words_stems = {stemmer.stem(w) for w in d_words}

In [15]:
# All stems produced for the corpus vocabulary (a dict view, may contain duplicates)
_stems = stems.values()

# Share of dictionary stems that also appear among the corpus stems.
# Note: despite the "percent" label the printed value is a fraction in [0, 1].
print('Common percent:', len(d_words_stems.intersection(_stems)) / len(d_words_stems))

# Dictionary stems with no counterpart in the corpus (overwrites the earlier absent_words)
absent_words = d_words_stems.difference(_stems)

print('Absent words: %s out of %s' % (len(absent_words), len(d_words_stems)))
# pprint(absent_words)

Common percent: 0.9577804583835947
Absent words: 35 out of 829


# Phrases extraction

In [60]:
# NOTE(review): NGramsParser is not defined in this notebook; presumably it
# comes from `utils.ngrams`. Confirm.
parser = NGramsParser()
# POS-tag the flat list of lowercased corpus tokens (all sentences joined into one sequence)
words_ttokens = nltk.pos_tag(words)

# Passed to n_grams() in the phrase-extraction cell below.
# NOTE(review): exact structure of the returned indexes is defined in utils.ngrams; verify there
tokens_indexes = parser.parse_tokens(words)

In [91]:
# {phrase_type: Set[phrase]}
# Keys are the phrase types returned by parse_phrases (the cells below iterate
# them as `phr_type`), not individual words.
words_phrases = defaultdict(set)

# Collect phrases from n-grams of size 1 through 5
for i in range(1, 5 + 1):
    tt_ngrams = list(n_grams(words_ttokens, i, tokens_indexes, pad_left=False))

    # Parallel sequences: types[k] is the phrase type of phrases[k]
    types, phrases = parse_phrases(tt_ngrams, i)

    for t, p in zip(types, phrases):
        words_phrases[t].add(p)

parsing phrases from 1-grams: 100%|██████████| 4768859/4768859 [00:30<00:00, 154577.58it/s]
parsing phrases from 2-grams: 100%|██████████| 4057518/4057518 [00:28<00:00, 141489.05it/s]
parsing phrases from 3-grams: 100%|██████████| 3456422/3456422 [00:21<00:00, 159277.54it/s]
parsing phrases from 4-grams: 100%|██████████| 2921233/2921233 [00:26<00:00, 110953.44it/s]
parsing phrases from 5-grams: 100%|██████████| 2451678/2451678 [00:27<00:00, 90633.10it/s]
parsing phrases from 6-grams: 100%|██████████| 2051602/2051602 [00:24<00:00, 85017.92it/s]


## Caching data

In [63]:
import pickle

# Persist the {phrase_type: set of phrases} mapping so the slow phrase
# extraction does not have to be repeated. The target directory must already exist.
with open('data/cache/words_phrases', mode='wb') as fp:
    pickle.dump(words_phrases, fp)

In [None]:
# Persist the {token: lemma} mapping. Relies on `pickle` imported in an earlier cell.
with open('data/cache/lemmas', mode='wb') as fp:
    pickle.dump(lemmas, fp)

# Persist the {lemma: set of sentence indexes} inverted index
with open('data/cache/lemmas_sents', mode='wb') as fp:
    pickle.dump(lemmas_sents, fp)

In [64]:
# Persist the {token: stem} mapping. Relies on `pickle` imported in an earlier cell.
with open('data/cache/stems', mode='wb') as fp:
    pickle.dump(stems, fp)

# Persist the {stem: set of sentence indexes} inverted index
with open('data/cache/stems_sents', mode='wb') as fp:
    pickle.dump(stems_sents, fp)

## Experiments with phrases  

In [67]:
# {lemma: Set[(token, tag)]}: every tagged word form observed in the corpus,
# grouped under the lemma of its token.
lemmas_ttokens = defaultdict(set)

for tagged_token in words_ttokens:
    token, tag = tagged_token
    lemmas_ttokens[lemmas[token]].add((token, tag))

In [75]:
def only_stopwords_phrase(phrase, phr_intersection):
    """Tell whether `phrase` has a non-stopword token outside `phr_intersection`.

    NOTE(review): the behaviour is the opposite of what the name suggests.
    The result is True when at least one tagged token of the phrase is neither
    a member of `phr_intersection` nor a stop word, i.e. when the phrase is
    NOT made up solely of the target tokens plus stop words. Existing callers
    rely on this behaviour, so it must not be flipped in isolation.

    :param phrase: iterable of (token, tag) pairs
    :param phr_intersection: collection of (token, tag) pairs to ignore
    :return: True if some remaining token is not in `stop_words`, else False
    """
    remaining = []

    for tagged_token in phrase:
        if tagged_token in phr_intersection:
            continue
        if tagged_token[0] in stop_words:
            continue
        remaining.append(tagged_token)

    return any(remaining)
    
def get_phrases_containing(lemma: str, size=None):
    """Find the extracted phrases that contain any word form of `lemma`.

    The word forms are the (token, tag) pairs stored for `lemma` in
    `lemmas_ttokens`; a phrase matches when it shares at least one pair with them.

    :param lemma: lemma to look up
    :param size: currently unused
    :return: a pair of ``defaultdict(list)`` keyed by phrase type:
        first, the matching phrases for which `only_stopwords_phrase` is truthy
        (phrases with at least one other non-stopword token); second, the
        remaining matching phrases. Both are empty for an unknown lemma.
    """
    # TODO: options which phrases to return, e.g. filter by `size`, or by a
    # phrase type derived from the POS tag (N* -> 'NP', V* -> 'VP',
    # IN/TO -> 'PP', JJ/RB -> 'ADJP', anything else -> NotImplementedError).

    with_other_content = defaultdict(list)
    stopword_companions_only = defaultdict(list)

    # .get() keeps the lookup from inserting unknown lemmas into the index
    word_forms = set(lemmas_ttokens.get(lemma, []))
    if not word_forms:
        return with_other_content, stopword_companions_only

    for phr_type, phrases in words_phrases.items():
        for phrase in phrases:
            shared = word_forms.intersection(phrase)
            if not shared:
                continue

            if only_stopwords_phrase(phrase, shared):
                bucket = with_other_content
            else:
                bucket = stopword_companions_only
            bucket[phr_type].append(phrase)

    return with_other_content, stopword_companions_only

In [100]:
def explore_vocab():
    """Build a histogram of phrase lengths for every phrase type.

    Reads the module-level `words_phrases` mapping.

    :return: dict ``{phrase_type: defaultdict(int) {phrase_length: count}}``
    """
    length_histograms = {}

    for phr_type, phrases in words_phrases.items():
        histogram = defaultdict(int)
        for phrase in phrases:
            histogram[len(phrase)] += 1
        length_histograms[phr_type] = histogram

    return length_histograms

explore_vocab()

{'ADJP': defaultdict(int, {1: 25575, 2: 90057, 3: 18988, 4: 1814, 5: 87}),
 'ADVP': defaultdict(int, {1: 9387, 2: 106481, 3: 15773, 4: 117}),
 'NP': defaultdict(int,
             {1: 65971, 2: 481015, 3: 430109, 4: 130492, 5: 12662, 6: 845}),
 'PP': defaultdict(int, {1: 1580, 2: 868, 3: 4153}),
 'VP': defaultdict(int, {1: 35510, 2: 19966, 3: 2581})}

In [104]:
# get_phrases_containing returns two dicts keyed by phrase type. As the code
# stands, the first holds phrases in which 'dormitory' is accompanied by at
# least one other non-stopword token, the second holds the rest.
# NOTE(review): `sw` was used earlier for a list of sentence indexes; it is rebound here.
not_sw, sw = get_phrases_containing('dormitory')

not_sw

defaultdict(list,
            {'NP': [(('build', 'VB'), ('dormitories', 'NNS')),
              (('campus', 'NN'), ('dormitories', 'NNS')),
              (('the', 'DT'), ('dormitory', 'NN'), ('gapt', 'NN')),
              (('the', 'DT'),
               ('dormitory', 'NN'),
               ('and', 'CC'),
               ('dining', 'VBG')),
              (('the', 'DT'), ('campus', 'NN'), ('dormitories', 'NNS')),
              (('dormitory', 'NN'), ('gapt', 'NN')),
              (('dormitory', 'NN'), ('and', 'CC'), ('dining', 'VBG'))]})

In [101]:
def extract_phrases_length(phrases_dict):
    """Regroup phrases by their length, ignoring the phrase type.

    :param phrases_dict: mapping ``{phrase_type: iterable of phrases}``
    :return: ``defaultdict(list)`` mapping ``len(phrase)`` to the phrases of
        that length, in the order they were encountered
    """
    by_length = defaultdict(list)

    for phrases in phrases_dict.values():
        for phrase in phrases:
            by_length[len(phrase)].append(phrase)

    return by_length

extract_phrases_length(not_sw)

defaultdict(list,
            {2: [(('plant', 'NN'), ('facilities', 'NNS')),
              (('merger', 'NN'), ('facility', 'NN')),
              (('last', 'JJ'), ('facility', 'NN')),
              (('trade', 'NN'), ('facility', 'NN')),
              (('recreational', 'VB'), ('facilities', 'NNS')),
              (('coop', 'NN'), ('facility', 'NN')),
              (('leased', 'VBN'), ('facilities', 'NNS')),
              (('food-preservation', 'NN'), ('facilities', 'NNS')),
              (('private-school', 'NN'), ('facilities', 'NNS')),
              (('facility', 'NN'), ('dynalectron', 'NNP')),
              (('better', 'JJR'), ('facilities', 'NNS')),
              (('waterfront', 'NN'), ('facility', 'NN')),
              (('combustion', 'NN'), ('facility', 'NN')),
              (('100', 'CD'), ('facility', 'NN')),
              (('28', 'CD'), ('facilities', 'NNS')),
              (('credit', 'NN'), ('facility', 'NN')),
              (('housing', 'NN'), ('facilities', 'NNS')),
        

In [105]:
def extract_vocab(phrases_dict):
    """Count the non-stopword tokens that occur in the given phrases.

    :param phrases_dict: mapping ``{phrase_type: iterable of phrases}``, each
        phrase being a sequence of (token, tag) pairs
    :return: ``Counter`` of tokens not present in `stop_words`; a token is
        counted once per occurrence across all phrases and phrase types
    """
    return Counter(
        token
        for phrases in phrases_dict.values()
        for phrase in phrases
        for token, _tag in phrase
        if token not in stop_words
    )

extract_vocab(not_sw)

Counter({'build': 1,
         'campus': 2,
         'dining': 2,
         'dormitories': 3,
         'dormitory': 4,
         'gapt': 2})

# WordNet experiments

In [98]:
from nltk.corpus import wordnet as wn