In [11]:
import os
import sys
def add_sys_path(p):
    p = os.path.abspath(p)
    if p not in sys.path:
        sys.path.append(p)
add_sys_path('..')

In [13]:
from importlib import reload
import word_mover_grammar as wmg

In [165]:

import word_mover_grammar as wmg
reload(wmg)
reload(wmg.extended_grammar);
reload(wmg.earley);

In [83]:
flies = """
S : NP VP
NP: N | A NP
VP: V | VP NP | VP PP
PP: P NP
N: fruit | flies | bananas
A: fruit
V: like | flies
P: like
"""

In [68]:
productions = [
    ['^', ('S',)],
    ['S', ('NP', 'VP',)],
    ['NP', ('N',)],
    ['NP', ('A', 'NP',)],
    ['VP', ('V',)],
    ['VP', ('VP', 'NP',)],
    ['VP', ('VP', 'PP',)],
    ['PP', ('P', 'NP',)],
    ['N', ('fruit',)],
    ['N', ('flies',)],
    ['N', ('bananas',)],
    ['A', ('fruit',)],
    ['V', ('like',)],
    ['V', ('flies',)],
    ['P', ('like',)],
]

parser = wmg.earley.EarleyParser(productions)

In [72]:
symbols = wmg.extended_grammar.rules2symbols(productions)

In [52]:
words = 'fruit flies like bananas'.split()
result = parser.parse(words)
print(result.success)

True


In [53]:
result.final_state in result.forest

True

In [54]:
result.print()

 ^ -> S                        (0, 4)
   S -> NP VP                  (0, 4)
     NP -> A NP                (0, 2)
       A -> fruit              (0, 1)
       NP -> N                 (1, 2)
         N -> flies            (1, 2)
     VP -> VP NP               (2, 4)
       VP -> V                 (2, 3)
         V -> like             (2, 3)
       NP -> N                 (3, 4)
         N -> bananas          (3, 4)
     ____
     NP -> N                   (0, 1)
       N -> fruit              (0, 1)
     VP -> VP PP               (1, 4)
       VP -> V                 (1, 2)
         V -> flies            (1, 2)
       PP -> P NP              (2, 4)
         P -> like             (2, 3)
         NP -> N               (3, 4)
           N -> bananas        (3, 4)


In [55]:
tree = result.sample_a_tree()
wmg.earley.print_tree(tree, result.final_state)

|                   ^                   |
|                   S                   |
|        NP         |        VP         |
|    A    |   NP    |   VP    |   NP    |
|         |    N    |    V    |    N    |
|  fruit  |  flies  |  like   | bananas |


In [56]:
wmg.earley.print_tree_vertically(tree, result.final_state)

 ^ -> S                        (0, 4)
   S -> NP VP                  (0, 4)
     NP -> A NP                (0, 2)
       A -> fruit              (0, 1)
       NP -> N                 (1, 2)
         N -> flies            (1, 2)
     VP -> VP NP               (2, 4)
       VP -> V                 (2, 3)
         V -> like             (2, 3)
       NP -> N                 (3, 4)
         N -> bananas          (3, 4)


In [62]:
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                   ^                   |
|                   S                   |
|   NP    |             VP              |
|    N    |   VP    |        NP         |
|         |    V    |    A    |   NP    |
|         |         |         |    N    |
|  косой  |  косил  |  косой  |  косой  |
|                   ^                   |
|                   S                   |
|   NP    |             VP              |
|    N    |        VP         |   NP    |
|         |   VP    |   NP    |    N    |
|         |    V    |    N    |         |
|  косой  |  косил  |  косой  |  косой  |


In [61]:
words = 'косой косил косой косой'.split()

productions = [
    ['^', ('S',)],
    ['S', ('NP', 'VP',)],
    ['NP', ('N',)],
    ['NP', ('A', 'NP',)],
    ['VP', ('V',)],
    ['VP', ('VP', 'NP',)],
    ['N', ('заяц',)],
    ['N', ('косой',)],
    ['A', ('кривой',)],
    ['A', ('косой',)],
    ['V', ('косил',)],
]
parser = wmg.earley.EarleyParser(productions)
result = parser.parse(words)
print(result.success)

for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

True
|                   ^                   |
|                   S                   |
|   NP    |             VP              |
|    N    |   VP    |        NP         |
|         |    V    |    A    |   NP    |
|         |         |         |    N    |
|  косой  |  косил  |  косой  |  косой  |
|                   ^                   |
|                   S                   |
|   NP    |             VP              |
|    N    |        VP         |   NP    |
|         |   VP    |   NP    |    N    |
|         |    V    |    N    |         |
|  косой  |  косил  |  косой  |  косой  |


# Создание грамматики из текста

In [93]:
import yaml

In [92]:
# начнём с классического Yaml. Это почти гранет, только с дефисами. 

grammar_text = """
^:
  - S
S: 
  - NP, VP
VP:
  - V
  - VP NP
  - VP PP
NP:
  - N
  - A NP
PP: 
  - P NP
N: 
  - fruit
  - flies
  - bananas
A: 
  - fruit
V: 
  - like
  - flies
P:
  - like
"""

grammar_dict = yaml.safe_load(grammar_text)
rules = []
for k, vs in grammar_dict.items():
    for v in vs:
        rules.append((k, tuple(v.split())))

In [89]:
rules

[('^', ('S',)),
 ('S', ('NP,', 'VP')),
 ('VP', ('V',)),
 ('VP', ('VP', 'NP')),
 ('VP', ('VP', 'PP')),
 ('NP', ('N',)),
 ('NP', ('A', 'NP')),
 ('PP', ('P', 'NP')),
 ('N', ('fruit',)),
 ('N', ('flies',)),
 ('N', ('bananas',)),
 ('A', ('fruit',)),
 ('V', ('like',)),
 ('V', ('flies',)),
 ('P', ('like',))]

In [90]:
wmg.extended_grammar.rules2symbols(rules)

{'^': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3cc2548>,
 'S': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3cc2c48>,
 'NP,': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3c63908>,
 'VP': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3c63c48>,
 'V': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3c636c8>,
 'NP': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3c632c8>,
 'PP': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3c63308>,
 'N': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3a76a08>,
 'A': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3a769c8>,
 'P': <word_mover_grammar.extended_grammar.NonTerminal at 0x3ef3a76a48>,
 'fruit': <word_mover_grammar.extended_grammar.Terminal at 0x3ef3cd2648>,
 'flies': <word_mover_grammar.extended_grammar.Terminal at 0x3ef3cd2808>,
 'bananas': <word_mover_grammar.extended_grammar.Terminal at 0x3ef3cd27c8>,
 'like': <word_mover_grammar.extended_gra

In [91]:
symbols['^'].deep_sample()

['fruit', 'fruit', 'fruit', 'flies', 'fruit', 'bananas', 'like', 'fruit']

# w2v

In [112]:
# начнём с классического Yaml. Это почти гранет, только с дефисами. 

grammar_text = """
^:
  - S
S: 
  - TURN COLOR_OF_DEVICE COLOR
COLOR:
  - синий
  - зеленый
TURN:
  - включи
  - сделай
  - поставь
COLOR_OF_DEVICE:
  - COLOR_N
  - COLOR_N DEVICE
COLOR_N:
  - цвет
  - свет
DEVICE:
  - лампочки
"""

grammar_dict = yaml.safe_load(grammar_text)
rules = []
for k, vs in grammar_dict.items():
    for v in vs:
        rules.append((k, tuple(v.split())))

In [115]:
rules

[('^', ('S',)),
 ('S', ('TURN', 'COLOR_OF_DEVICE', 'COLOR')),
 ('COLOR', ('синий',)),
 ('COLOR', ('зеленый',)),
 ('TURN', ('включи',)),
 ('TURN', ('сделай',)),
 ('TURN', ('поставь',)),
 ('COLOR_OF_DEVICE', ('COLOR_N',)),
 ('COLOR_OF_DEVICE', ('COLOR_N', 'DEVICE')),
 ('COLOR_N', ('цвет',)),
 ('COLOR_N', ('свет',)),
 ('DEVICE', ('лампочки',))]

unable to import 'smart_open.gcs', disabling that module


In [117]:
grammar = wmg.extended_grammar.rules2symbols(rules, w2v=w2v)

In [118]:
parser = wmg.earley.EarleyParser(grammar)

In [130]:
result = parser.parse('сделай цвет лампочки оранжевым'.split())
print(result.success)

True


In [277]:
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                             .                             |
|                           root                            |
|    включи    |    $What     |           $Where            |
|              | кондиционер  |      на      |    кухне     |
|    включи    |   радиатор   |      на      |    кухне     |


# Парсер гранета

In [19]:
import compress_fasttext

small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_100K_20K_pq_100.bin'
)

small_model.init_sims()

def w2v(text):
    return small_model.word_vec(text, use_norm=True)

unable to import 'smart_open.gcs', disabling that module


In [55]:
from pymorphy2 import MorphAnalyzer
analyzer = MorphAnalyzer()
def lemmer(text):
    return [p.normal_form for p in analyzer.parse(text)]

In [175]:
text = """
root:
    включи $What $Where
$What:
    %w2v
    свет | кондиционер
    %regex
    .+[аеиюя]т[ое]р
$Where:
    в $Room
    на $Room
$Room:
    %lemma
    ванна | кухня | спальня
"""

In [176]:
reload(wmg)
reload(wmg.grammar);
reload(wmg.earley);
reload(wmg.text_to_grammar);

In [177]:
grammar = wmg.text_to_grammar.load_granet(text)

In [178]:
parser = wmg.earley.EarleyParser(grammar, w2v=w2v, lemmer=lemmer)

In [192]:
tokens = 'включи компьютер в спальне'.split()
result = parser.parse(tokens)
print(result.success)
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state, w=16)
    print('=======')

True
|                               .                               |
|                             root                              |
|    включи     |     $What     |            $Where             |
|               |.+[аеиюя]т[ое]р|       в       |     $Room     |
|               |               |               |    спальня    |
|    включи     |   компьютер   |       в       |    спальне    |


In [188]:
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state, w=16)
    print('=======')

|                               .                               |
|                             root                              |
|    включи     |     $What     |            $Where             |
|               |  кондиционер  |       в       |     $Room     |
|               |               |               |    спальня    |
|    включи     |    пылесос    |       в       |    спальне    |


In [186]:
small_model.most_similar('кондиционер')

[('вентилятор', 0.6503194570541382),
 ('вентиляция', 0.6157035827636719),
 ('компрессор', 0.588625967502594),
 ('генератор', 0.577711820602417),
 ('радиатор', 0.5775179862976074),
 ('оборудование', 0.5729262232780457),
 ('дизельный', 0.5674254894256592),
 ('двигатель', 0.5639591813087463),
 ('сгорание', 0.5580621957778931),
 ('пылесос', 0.5563299655914307)]

### Неоднозначные фразы

In [249]:
reload(wmg)
reload(wmg.grammar);
reload(wmg.earley);
reload(wmg.text_to_grammar);

In [209]:
flies = """
S : NP VP
NP: N | A NP
VP: V | VP NP | VP PP
PP: P NP
N: fruit | flies | bananas
A: fruit
V -> like | flies | are
V:
  %regex
  .+ed
P: like
"""

In [210]:
g = wmg.text_to_grammar.load_granet(flies)
parser = wmg.earley.EarleyParser(g)

In [211]:
result = parser.parse('bananas are fruit'.split())
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                     ^                      |
|                     S                      |
|      NP      |             VP              |
|      N       |      VP      |      NP      |
|   bananas    |      V       |      N       |
|              |     are      |    fruit     |
|   bananas    |     are      |    fruit     |


In [167]:
result = parser.parse('fruit flies like bananas'.split())
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                             .                             |
|                             S                             |
|      NP      |                     VP                     |
|      N       |      VP      |             PP              |
|    fruit     |      V       |      P       |      NP      |
|              |    flies     |     like     |      N       |
|              |              |              |   bananas    |
|    fruit     |    flies     |     like     |   bananas    |
|                             .                             |
|                             S                             |
|             NP              |             VP              |
|      A       |      NP      |      VP      |      NP      |
|    fruit     |      N       |      V       |      N       |
|              |    flies     |     like     |   bananas    |
|    fruit     |    flies     |     like     |   bananas    |


In [161]:
result = parser.parse('fruit jumped like bananas'.split())
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                             .                             |
|                             S                             |
|      NP      |                     VP                     |
|      N       |      VP      |             PP              |
|    fruit     |      V       |      P       |      NP      |
|              |     .+ed     |     like     |      N       |
|              |              |              |   bananas    |
|    fruit     |    jumped    |     like     |   bananas    |


In [197]:
''.split('#', 1)

['']

In [198]:
'ab -> cd'.split('->')

['ab ', ' cd']

In [199]:
'->' in 'abc'

False

In [218]:
print("'kek'".strip("8"))

'kek'


In [248]:
reload(wmg)
reload(wmg.grammar);
reload(wmg.earley);
reload(wmg.text_to_grammar);


pajamas = wmg.text_to_grammar.load_granet(
    """
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
"""
)

parser = wmg.earley.EarleyParser(pajamas)
result = parser.parse(['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas'])
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                                                   ^                                                    |
|                                                   S                                                    |
|      NP      |                                           VP                                            |
|      I       |                     VP                     |                     PP                     |
|              |      V       |             NP              |      P       |             NP              |
|              |     shot     |     Det      |      N       |      in      |     Det      |      N       |
|              |              |      an      |   elephant   |              |      my      |   pajamas    |
|      I       |     shot     |      an      |   elephant   |      in      |      my      |   pajamas    |
|                                                   ^                                                    |
|                                    

In [242]:

pajamas = wmg.text_to_grammar.load_granet(
    """
S -> я тебя .
"""
)

parser = wmg.earley.EarleyParser(pajamas)
result = parser.parse('я тебя зарежу'.split())
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|                     ^                      |
|                     S                      |
|      я       |     тебя     |      .       |
|      я       |     тебя     |    зарежу    |


In [252]:

pajamas = wmg.text_to_grammar.load_granet(
    """
S: 
  - привет
  - пока
"""
)

parser = wmg.earley.EarleyParser(pajamas)
result = parser.parse('привет'.split())
for tree in result.iter_trees():
    wmg.earley.print_tree(tree, result.final_state)
    print('=======')

|      ^       |
|      S       |
|    привет    |
|    привет    |


In [253]:
pajamas

{'S': <word_mover_grammar.grammar.NonTerminal at 0x2cfdf6afc8>,
 'привет': <word_mover_grammar.grammar.Terminal at 0x2cfdf6a708>,
 'пока': <word_mover_grammar.grammar.Terminal at 0x2cfdf6a608>}