# Grammar blah blah

In [234]:
import numpy as np
import random
import re
from typing import List, Tuple, Dict
from gensim.models import Word2Vec

## Define nontrivial polish language grammar

In [570]:
NONTERMINALS = {
    'S': [
        ('VERB_PHRASE',),
    ],
    'VERB_PHRASE': [
        ('VERB_IMPS',),
        ('ADV_PHRASE', 'VERB_IMPS'),
        ('VERB_IMPS', 'ACC_PHRASE_SG_M1'),
        ('VERB_IMPS', 'ACC_PHRASE_PL_F'),
        ('VERB_IMPS', 'ACC_PHRASE_PL_N1'),
        ('ADV_PHRASE', 'VERB_IMPS', 'ACC_PHRASE_SG_M1'),
        ('ADV_PHRASE', 'VERB_IMPS', 'ACC_PHRASE_PL_F'),
        ('ADV_PHRASE', 'VERB_IMPS', 'ACC_PHRASE_PL_N2'),
        
    ],
    'ADV_PHRASE': [
        ('ADV',),
        ('ADV', 'ADV',),
    ],
    'ACC_PHRASE_SG_M1': [
        ('SUBST_SG_ACC_M1',),
        ('ADJ_PHRASE_SG_ACC_M1', 'SUBST_SG_ACC_M1',),
    ],
    'ACC_PHRASE_PL_F': [
        ('SUBST_PL_ACC_F',),
        ('ADJ_PHRASE_PL_ACC_F_N2', 'SUBST_PL_ACC_F'),
    ],
    'ACC_PHRASE_PL_N2': [
        ('SUBST_PL_ACC_N2',),
        ('ADJ_PHRASE_PL_ACC_F_N2', 'SUBST_PL_ACC_N2'),
    ],
    'ADJ_PHRASE_SG_ACC_M1': [
        ('ADJ_SG_ACC_M1',),
        ('ADJ_SG_ACC_M1', 'ADJ_SG_ACC_M1'),
    ],
    'ADJ_PHRASE_PL_ACC_F_N2': [
        ('ADJ_PL_ACC_F_N2',),
        ('ADJ_PL_ACC_F_N2', 'ADJ_PL_ACC_F_N2'),
    ],
    
    # Productions with terminals
    'VERB_IMPS': [
        ('verb:imps',),
    ],
    'SUBST_SG_ACC_M1': [
        ('subst:sg:acc:m1',),
    ],
    'SUBST_PL_ACC_F': [
        ('subst:pl:acc:f',),
    ],
    'SUBST_PL_ACC_N2': [
        ('subst:pl:acc:n2',),
    ],
    'ADJ_SG_ACC_M1': [
        ('adj:sg:acc:m1',),
    ],
    'ADJ_PL_ACC_F_N2': [
        ('adj:pl:acc:m2.m3.f.n1.n2.p2.p3',),
    ],
    'ADV': [
        ('adv:',),
    ]
}  

In [571]:
TERMINALS = (
    'verb:imps',
    'subst:sg:acc:m1',
    'subst:pl:acc:f',
    'subst:pl:acc:n2',
    'adj:sg:acc:m1',
    'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
    'adv:',
)

In [572]:
class NoNonterminal(Exception):
    pass

In [573]:
class Symbol:
    def __init__(self, symbol: str):
        self.symbol = symbol

In [574]:
class Terminal(Symbol):
    pass

In [575]:
class Nonterminal(Symbol):
    def __init__(self, symbol: str, productions: List[Symbol]):
        super().__init__(symbol)
        self.productions = productions
        
    def production(self) -> Tuple[Symbol]:

        def create_new_symbol(symbol) -> Symbol:
           if symbol in NONTERMINALS:
               return Nonterminal(symbol, NONTERMINALS[symbol])
           else:
               return Terminal(symbol)
        
        # Draw the production
        rand_prod_ind = np.random.choice(len(self.productions))
        rand_prod = self.productions[rand_prod_ind]
            
        return list(map(create_new_symbol, rand_prod))

In [576]:
class Generator:
    
    def expand_terminal(self, symbols: List[Symbol]) -> List:        

        # Extract nonterminals
        nonterminals = [symbol for symbol in symbols
                        if isinstance(symbol, Nonterminal)]
            
        if not nonterminals:
            raise NoNonterminal
            
        # Expand random nonterminal
        expand_ind = np.random.choice(len(nonterminals))
        nonterminal = nonterminals[expand_ind]
        new_symbols = nonterminal.production()
            
        # Swap nonterminal with new symbols
        nonterminals_processed = 0
        for ind in range(len(symbols)):
            if isinstance(symbols[ind], Nonterminal):
                nonterminals_processed += 1
                    
                if nonterminals_processed-1 == expand_ind:
                        
                    # Delete the old nonterminal
                    symbols.pop(ind)
                                
                    # Insert new ones
                    symbols = symbols[:ind] + new_symbols + symbols[ind:]

        return symbols
    
    def symbols_to_strings(self, symbols: List[Symbol]) -> List:
        return [symbol.symbol for symbol in symbols]
        
    def gen_terminals(self, start_symbol: Symbol) -> List:
        symbols = [start_symbol]
        
        # Expand until there is any nonterminal in the symbols
        while True:
            try:
                symbols = self.expand_terminal(symbols)
            except NoNonterminal:
                return self.symbols_to_strings(symbols)
            

## Generate some sentence schemas and group them by the number of tokens

In [577]:
gen = Generator()

In [578]:
def create_schemas(n_iter: int = 1000, schemas: Dict = {}) -> Dict:
    for i in range(n_iter):
        
        start_symbol = Nonterminal('S', NONTERMINALS['S'])
        
        schema = tuple(gen.gen_terminals(start_symbol))
        schema_len = len(schema)
        
        # Update schemas
        if schema_len in schemas:
            schemas[schema_len].add(schema)
        else:
            schemas[schema_len] = {schema}
            
    # Map sets to tuples to enable drawing
    schemas = {key: tuple(val) for key, val in schemas.items()}
            
    return schemas

In [579]:
schemas = create_schemas()

# Show some schemas
dict(list(schemas.items())[:2])

{4: (('adv:', 'adv:', 'verb:imps', 'subst:sg:acc:m1'),
  ('verb:imps', 'adj:sg:acc:m1', 'adj:sg:acc:m1', 'subst:sg:acc:m1'),
  ('adv:', 'adv:', 'verb:imps', 'subst:pl:acc:n2'),
  ('adv:', 'adv:', 'verb:imps', 'subst:pl:acc:f'),
  ('verb:imps',
   'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
   'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
   'subst:pl:acc:f'),
  ('adv:', 'verb:imps', 'adj:sg:acc:m1', 'subst:sg:acc:m1'),
  ('adv:', 'verb:imps', 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3', 'subst:pl:acc:n2'),
  ('adv:', 'verb:imps', 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3', 'subst:pl:acc:f')),
 5: (('adv:',
   'verb:imps',
   'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
   'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
   'subst:pl:acc:f'),
  ('adv:', 'verb:imps', 'adj:sg:acc:m1', 'adj:sg:acc:m1', 'subst:sg:acc:m1'),
  ('adv:',
   'adv:',
   'verb:imps',
   'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
   'subst:pl:acc:f'),
  ('adv:',
   'adv:',
   'verb:imps',
   'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
   'subst:pl:acc:n2'),
  ('adv:', 'adv:', 'verb:imps', 'adj:sg:acc:

## Extract the grammar categories found in the grammar schemas

In [580]:
class PolimorfGen:
    POLIMORF_PATH = './data/polimorfologik-2.1.txt'
    
    def __init__(self):
        self.grammar_cats = dict((terminal, [])
                                  for terminal in TERMINALS)
    
    def __iter__(self):
        with open(self.POLIMORF_PATH) as f:
            yield from f
            
    def find_terminal_occ(self, line: str):
        """
        Search for each pattern (terminal)
        in the line of the polimorfologik file
        """
        
        base, token, grammar_cats = line.split(';')
        
        for terminal in self.grammar_cats:
            pattern = re.compile(terminal)
        
            if pattern.search(grammar_cats):
                self.grammar_cats[terminal].append((base, token))


In [581]:
polimorf = PolimorfGen()

# Extract the categories
for line in polimorf:
    polimorf.find_terminal_occ(line)

## Generate some sentences of length n without using the embeddings 

In [582]:
def core_sent_gen(n: int) -> str:
    
    # Draw the sentence schema
    try:
        schemas_n_len = schemas[n]
    except KeyError:
        print('Sentence length too large')  
        return None
        
    schema = random.choice(schemas_n_len)
    
    # Draw the tokens
    tokens_with_bases = [random.choice(polimorf.grammar_cats[category])
                         for category in schema]
    
    bases, tokens = list(zip(*tokens_with_bases))
    
    return ' '.join(bases), ' '.join(tokens)
    

In [583]:
core_sent_gen(3)

('policentrycznie rozpić aforystka', 'policentrycznie rozpito aforystki')

In [584]:
core_sent_gen(5)

('szczęśliwicko poprzypasywać przystępny nierozsiewczy kanterberyjka',
 'szczęśliwicko poprzypasywano przystępne nierozsiewcze kanterberyjki')

In [587]:
core_sent_gen(6)

('siekierkowsko nieciągliwie ubruttowić szerokoekranowy przekształcający lampiarz',
 'siekierkowsko nieciągliwie ubruttowiono szerokoekranowego przekształcającego lampiarza')

In [586]:
core_sent_gen(7)

Sentence length too large


## Prepare the Word2Vec struct

In [None]:
class CorpusGen:
    CORPUS_PATH = './data/task3_train_segmented.txt'
    
    def __init__(self, n_sent):
        self.n_sent = n_sent
    
    def __iter__(self):
        with open(self.CORPUS_PATH) as f:
            for line, _ in zip(f, range(self.n_sent)):
                yield line.split()

In [None]:
sentences = CorpusGen(10_000_000)
model = Word2Vec(sentences, min_count=1)
model.save('./data/word2vec.model')

In [591]:
model

In [596]:
os.path.exists('./data/word2vec.model')

True