# Grammar-based sentence generation

In [239]:
import numpy as np
from typing import List, Tuple
from gensim.models import Word2Vec

## Define a nontrivial Polish language grammar

In [240]:
# TODO: the grammar is still trivial; replace it with a nontrivial one.
# Maps every nonterminal name to the list of its alternative productions.
# A production is a tuple of symbol names; names that are not keys of this
# dict are treated as terminals when a production is applied.
NONTERMINALS = {
    'S': [
        ('NP', 'VP'),
        ('VP',),
    ],
    'NP': [
        ('N',),
        ('A', 'NP'),
    ],
    'VP': [
        ('V',),
        ('V', 'PP'),
    ],
}

In [241]:
class NoNonterminal(Exception):
    """Signals that a symbol sequence contains no nonterminal left to expand."""

In [242]:
class Symbol:
    """Base class of grammar symbols; a symbol is identified by its name."""

    def __init__(self, symbol: str):
        """Store the symbol's name.

        Args:
            symbol: Name of the grammar symbol, e.g. ``'NP'`` or ``'V'``.
        """
        self.symbol = symbol

    def __repr__(self) -> str:
        """Return a debuggable form such as ``Symbol('NP')``.

        The concrete class name is used, so subclasses are shown under their
        own name.
        """
        return f"{type(self).__name__}({self.symbol!r})"

In [243]:
class Terminal(Symbol):
    """Grammar symbol that is never expanded any further."""

In [249]:
class Nonterminal(Symbol):
    """Grammar symbol that can be rewritten with one of its productions."""

    def __init__(self, symbol: str, productions: List[Tuple[str, ...]]):
        """Create a nonterminal.

        Args:
            symbol: Name of the nonterminal.
            productions: Alternative right-hand sides for this symbol; each
                production is a tuple of symbol names.
        """
        super().__init__(symbol)
        self.productions = productions

    def production(self) -> List[Symbol]:
        """Apply one randomly drawn production.

        Returns:
            A list with one new symbol object per name of the drawn
            production: a ``Nonterminal`` (sharing the production list stored
            in ``NONTERMINALS``) for names that are keys of ``NONTERMINALS``,
            a ``Terminal`` for every other name.
        """
        # Draw the production
        drawn_ind = np.random.choice(len(self.productions))
        drawn = self.productions[drawn_ind]

        return [
            Nonterminal(name, NONTERMINALS[name]) if name in NONTERMINALS
            else Terminal(name)
            for name in drawn
        ]

In [250]:
class Generator:
    """Derives sequences of terminal names by randomly expanding nonterminals."""

    def expand_terminal(self, symbols: List[Symbol]) -> List[Symbol]:
        """Expand one randomly chosen nonterminal of ``symbols``.

        Despite its name, the method rewrites a *nonterminal*: the chosen
        symbol is replaced, in place of its position, by the symbols returned
        from its ``production()``.

        Args:
            symbols: Current sequence of symbols. It is not modified.

        Returns:
            A new list with the chosen nonterminal replaced.

        Raises:
            NoNonterminal: If ``symbols`` contains no ``Nonterminal``.
        """
        # Positions of all nonterminals, in order of appearance
        positions = [ind for ind, symbol in enumerate(symbols)
                     if isinstance(symbol, Nonterminal)]

        if not positions:
            raise NoNonterminal

        # Pick a random nonterminal, then let it draw its production
        # (the position is drawn first, the production second).
        target = positions[np.random.choice(len(positions))]
        new_symbols = symbols[target].production()

        # Build a fresh list instead of popping from the caller's one; slicing
        # also stays correct when the production is empty.
        return symbols[:target] + list(new_symbols) + symbols[target + 1:]

    def symbols_to_strings(self, symbols: List[Symbol]) -> List[str]:
        """Return the name (``.symbol``) of every symbol, in order."""
        return [symbol.symbol for symbol in symbols]

    def gen_terminals(self, start_symbol: Symbol) -> List[str]:
        """Derive a terminal-only sequence from ``start_symbol``.

        Args:
            start_symbol: Symbol the derivation starts from.

        Returns:
            The names of the symbols left once no nonterminal remains.
        """
        symbols = [start_symbol]

        # Keep expanding while any nonterminal is left; expand_terminal
        # raises NoNonterminal once the sequence holds none.
        while True:
            try:
                symbols = self.expand_terminal(symbols)
            except NoNonterminal:
                return self.symbols_to_strings(symbols)
            

## Generate some sentence schemas and group them by the number of tokens

In [251]:
# Generator instance used below to derive the sentence schemas.
gen = Generator()

In [252]:
# Maps a schema length (number of tokens) to the set of distinct schemas
# (tuples of symbol names) of that length.
dict_of_schemas = {}

In [253]:
# Draw 100 random derivations and bucket the distinct schemas by their length.
for i in range(100):
    # Every derivation starts from a fresh 'S' holding its own copy of the
    # production list.
    start_symbol = Nonterminal('S', NONTERMINALS['S'].copy())
    schema = tuple(gen.gen_terminals(start_symbol))

    # Create the bucket on first use; the set drops repeated schemas.
    schema_len = len(schema)
    dict_of_schemas.setdefault(schema_len, set()).add(schema)

In [254]:
dict_of_schemas

{1: {('V',)},
 2: {('N', 'V'), ('V', 'PP')},
 3: {('A', 'N', 'V'), ('N', 'V', 'PP')},
 4: {('A', 'A', 'N', 'V'), ('A', 'N', 'V', 'PP')},
 5: {('A', 'A', 'A', 'N', 'V'), ('A', 'A', 'N', 'V', 'PP')},
 6: {('A', 'A', 'A', 'N', 'V', 'PP')},
 7: {('A', 'A', 'A', 'A', 'A', 'N', 'V'),
  ('A', 'A', 'A', 'A', 'N', 'V', 'PP')},
 8: {('A', 'A', 'A', 'A', 'A', 'N', 'V', 'PP')},
 9: {('A', 'A', 'A', 'A', 'A', 'A', 'N', 'V', 'PP')}}

## Generate real sentences using a bank of tokens and the generated grammatical-category schemas

## Prepare the Word2Vec model

In [259]:
class CorpusGen:
    """Re-iterable stream over the first ``n_sent`` lines of the corpus file.

    The file is reopened on every iteration, so one instance can be iterated
    any number of times.
    """

    CORPUS_PATH = './data/task3_train_segmented.txt'
    # Explicit encoding so that reading does not depend on the platform's
    # locale default. NOTE(review): the corpus is assumed to be UTF-8 —
    # confirm, or override this attribute.
    CORPUS_ENCODING = 'utf-8'

    def __init__(self, n_sent):
        """Remember the line limit.

        Args:
            n_sent: Maximum number of lines yielded per iteration.
        """
        self.n_sent = n_sent

    def __iter__(self):
        """Yield each line as a list of whitespace-separated tokens."""
        with open(self.CORPUS_PATH, encoding=self.CORPUS_ENCODING) as f:
            # range() comes first so that zip stops before reading a line
            # beyond the limit.
            for _, line in zip(range(self.n_sent), f):
                yield line.split()

In [266]:
# Stream at most 1,000,000 corpus lines. CorpusGen reopens the file each time
# it is iterated, so the same object can be consumed more than once.
sentences = CorpusGen(1_000_000)
# NOTE(review): min_count=1 presumably keeps even words that occur once in
# the vocabulary — confirm against the gensim Word2Vec documentation.
model = Word2Vec(sentences, min_count=1)
# Persist the model under the same ./data directory as the corpus.
model.save('./data/word2vec.model')