# Generating Polish sentences from a context-free grammar

In [611]:
import os
import random
import re
import warnings
from itertools import islice
from operator import itemgetter
from typing import Dict, List, Optional, Tuple

import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score

In [535]:
# Silence warnings for the rest of the session. gensim is the noisy library
# here, but note that this filter is global: it hides warnings from every
# module, not only from gensim.
warnings.filterwarnings("ignore")

## Define a nontrivial grammar of the Polish language

In [536]:
# Context-free grammar used to generate sentence schemas.
# Each key is the name of a nonterminal; its value lists the alternative
# productions, each being a tuple of symbol names. When a production is
# expanded, a name that is itself a key of this dict becomes a Nonterminal
# and any other name becomes a Terminal (see Nonterminal.production).
# 'S' is the start symbol used by create_schemas.
NONTERMINALS = {
    'S': [
        ('VERB_PHRASE_IMPS',),
        ('VERB_PHRASE_IMPS', 'CONJ', 'VERB_PHRASE_IMPS'),
        ('NOM_PHRASE_SG_M', 'VERB_PHRASE_SG_FIN_M_TER'),
        ('NOM_PHRASE_SG_M', 'VERB_PHRASE_SG_FIN_M_TER',
         'CONJ', 'VERB_PHRASE_SG_FIN_M_TER'),
        ('NOM_PHRASE_SG_M', 'VERB_PHRASE_SG_FIN_M_TER',
         'CONJ', 'NOM_PHRASE_SG_M', 'VERB_PHRASE_SG_FIN_M_TER'),
    ],
    'VERB_PHRASE_IMPS': [
        ('VERB_IMPS',),
        ('ADV', 'VERB_IMPS'),
        ('VERB_IMPS', 'ACC_PHRASE_SG_M1'),
        ('VERB_IMPS', 'ACC_PHRASE_PL_F'),
        ('VERB_IMPS', 'ACC_PHRASE_PL_N2'),
        ('ADV', 'VERB_IMPS', 'ACC_PHRASE_SG_M1'),
        ('ADV', 'VERB_IMPS', 'ACC_PHRASE_PL_F'),
        ('ADV', 'VERB_IMPS', 'ACC_PHRASE_PL_N2'),

    ],
    'VERB_PHRASE_SG_FIN_M_TER': [
        ('VERB_SG_FIN_M_TER',),
        ('ADV', 'VERB_SG_FIN_M_TER'),
        ('VERB_SG_FIN_M_TER', 'ACC_PHRASE_SG_M1'),
        ('VERB_SG_FIN_M_TER', 'ACC_PHRASE_PL_F'),
        ('VERB_SG_FIN_M_TER', 'ACC_PHRASE_PL_N2'),
        ('ADV', 'VERB_SG_FIN_M_TER', 'ACC_PHRASE_SG_M1'),
        ('ADV', 'VERB_SG_FIN_M_TER', 'ACC_PHRASE_PL_F'),
        ('ADV', 'VERB_SG_FIN_M_TER', 'ACC_PHRASE_PL_N2'),
    ],
    'NOM_PHRASE_SG_M': [
        ('SUBST_SG_NOM_M',),
        ('SUBST_SG_NOM_M', 'PREP_ACC_PHRASE'),
        ('ADJ_PHRASE_SG_NOM_M', 'SUBST_SG_NOM_M'),
        ('ADJ_PHRASE_SG_NOM_M', 'SUBST_SG_NOM_M', 'PREP_ACC_PHRASE'),
    ],
    'ACC_PHRASE_SG_M1': [
        ('SUBST_SG_ACC_M1',),
        ('ADJ_PHRASE_SG_ACC_M1', 'SUBST_SG_ACC_M1',),
    ],
    'ACC_PHRASE_PL_F': [
        ('SUBST_PL_ACC_F',),
        ('ADJ_PHRASE_PL_ACC_F_N2', 'SUBST_PL_ACC_F'),
    ],
    'ACC_PHRASE_PL_N2': [
        ('SUBST_PL_ACC_N2',),
        ('ADJ_PHRASE_PL_ACC_F_N2', 'SUBST_PL_ACC_N2'),
    ],
    'ADJ_PHRASE_SG_NOM_M': [
        ('ADJ_SG_NOM_M',),
        ('ADJ_SG_NOM_M', 'ADJ_SG_NOM_M'),
    ],
    'ADJ_PHRASE_SG_ACC_M1': [
        ('ADJ_SG_ACC_M1',),
        ('ADJ_SG_ACC_M1', 'ADJ_SG_ACC_M1'),
    ],
    'ADJ_PHRASE_PL_ACC_F_N2': [
        ('ADJ_PL_ACC_F_N2',),
        ('ADJ_PL_ACC_F_N2', 'ADJ_PL_ACC_F_N2'),
    ],
    'PREP_ACC_PHRASE': [
        ('PREP_ACC', 'ACC_PHRASE_SG_M1'),
        ('PREP_ACC', 'ACC_PHRASE_PL_F'),
        ('PREP_ACC', 'ACC_PHRASE_PL_N2'),
    ],

    # Productions with terminals: each of these nonterminals rewrites to a
    # single terminal, which must also be listed in TERMINALS
    'VERB_SG_FIN_M_TER': [('verb:fin:sg:ter.*:refl',)],
    'VERB_IMPS': [('verb:imps',)],
    'SUBST_SG_NOM_M': [('subst:sg:nom:m',)],
    'SUBST_SG_ACC_M1': [('subst:sg:acc:m1',)],
    'SUBST_PL_ACC_F': [('subst:pl:acc:f',)],
    'SUBST_PL_ACC_N2': [('subst:pl:acc:n2',)],
    'ADJ_SG_NOM_M': [('adj:sg:nom.voc:m1.m2.m3',)],
    'ADJ_SG_ACC_M1': [('adj:sg:acc:m1',)],
    'ADJ_PL_ACC_F_N2': [('adj:pl:acc:m2.m3.f.n1.n2.p2.p3',)],
    'ADV': [('adv:',)],
    'PREP_ACC': [('prep:acc',)],
    'CONJ': [('^conj$',)],
}

In [537]:
# Terminal symbols of the grammar. Each entry is used as a regular
# expression: PolimorfGen.find_terminal_occ searches for it in the tag field
# of a dictionary line, so '.', '.*', '^' and '$' carry their regex meaning.
TERMINALS = (
    'verb:fin:sg:ter.*:refl',
    'verb:imps',
    'subst:sg:nom:m',
    'subst:sg:acc:m1',
    'subst:pl:acc:f',
    'subst:pl:acc:n2',
    'adj:sg:nom.voc:m1.m2.m3',
    'adj:sg:acc:m1',
    'adj:pl:acc:m2.m3.f.n1.n2.p2.p3',
    'adv:',
    'prep:acc',
    '^conj$',
)

In [538]:
class NoNonterminal(Exception):
    """Signals that a symbol sequence holds no nonterminal left to expand."""

In [539]:
class TooLongSent(Exception):
    """Signals that no sentence schema exists for the requested length."""

In [540]:
class Symbol:
    """A grammar symbol identified by its name.

    Attributes:
        symbol: the name of the symbol (a nonterminal name or a terminal
            pattern).
    """

    def __init__(self, symbol: str):
        self.symbol = symbol

    def __repr__(self) -> str:
        # Use the runtime class name so subclasses print as themselves,
        # which makes lists of mixed symbols readable while debugging.
        return f"{type(self).__name__}({self.symbol!r})"

In [541]:
class Terminal(Symbol):
    """A grammar symbol that is never expanded any further."""

In [542]:
class Nonterminal(Symbol):
    """A grammar symbol that can be rewritten with one of its productions.

    Attributes:
        productions: the alternative right-hand sides of this symbol, each a
            tuple of symbol names.
    """

    def __init__(self, symbol: str, productions: List[Tuple[str, ...]]):
        super().__init__(symbol)
        self.productions = productions

    def production(self) -> List[Symbol]:
        """Draw one production uniformly at random and instantiate it.

        Returns:
            A list with one Symbol per name of the drawn production: a
            Nonterminal when the name is a key of NONTERMINALS, a Terminal
            otherwise.
        """
        # Draw the production
        rand_prod = self.productions[np.random.choice(len(self.productions))]

        return [
            Nonterminal(name, NONTERMINALS[name]) if name in NONTERMINALS
            else Terminal(name)
            for name in rand_prod
        ]

In [543]:
class Generator:
    """Derives random terminal sequences from the grammar."""

    def expand_terminal(self, symbols: List[Symbol]) -> List[Symbol]:
        """Rewrite one randomly chosen nonterminal of ``symbols``.

        Despite its name, this method expands a *nonterminal*: it picks one
        uniformly at random and substitutes it with a freshly drawn
        production.

        Args:
            symbols: the current sequence of symbols; it is left unmodified.

        Returns:
            A new list in which the chosen nonterminal is replaced by the
            symbols of its production.

        Raises:
            NoNonterminal: if ``symbols`` contains no nonterminal.
        """
        # Positions of the nonterminals within the sequence
        positions = [ind for ind, symbol in enumerate(symbols)
                     if isinstance(symbol, Nonterminal)]

        if not positions:
            raise NoNonterminal

        # Expand random nonterminal
        position = positions[np.random.choice(len(positions))]
        new_symbols = list(symbols[position].production())

        # Build a new list instead of popping from the caller's one
        return symbols[:position] + new_symbols + symbols[position + 1:]

    def symbols_to_strings(self, symbols: List[Symbol]) -> List[str]:
        """Return the names of the given symbols, in order."""
        return [symbol.symbol for symbol in symbols]

    def gen_terminals(self, start_symbol: Symbol) -> List[str]:
        """Expand ``start_symbol`` until only terminals remain.

        Returns:
            The names of the terminals of the derived sequence.
        """
        symbols = [start_symbol]

        # Keep expanding as long as any nonterminal is left; running out of
        # them is signalled by NoNonterminal
        while True:
            try:
                symbols = self.expand_terminal(symbols)
            except NoNonterminal:
                return self.symbols_to_strings(symbols)
            

## Generate some sentence schemas and group them by the number of tokens

In [544]:
gen = Generator()

In [545]:
def create_schemas(n_iter: int = 1000,
                   schemas: Optional[Dict] = None) -> Dict:
    """Generate random sentence schemas grouped by their number of tokens.

    Args:
        n_iter: how many schemas to draw from the grammar (duplicates are
            collapsed).
        schemas: optional previously collected schemas to extend, mapping a
            length to an iterable of schemas (sets, or the tuples returned by
            an earlier call). The argument itself is never modified.

    Returns:
        A dict mapping a schema length to a tuple of the distinct schemas of
        that length; tuples are used so that a schema can be drawn with
        ``random.choice``.
    """
    # Work on private sets: a shared mutable default would leak schemas
    # between calls, and copying also accepts the tuple-valued dict that
    # this function returns.
    grouped = {length: set(group)
               for length, group in (schemas or {}).items()}

    for _ in range(n_iter):
        start_symbol = Nonterminal('S', NONTERMINALS['S'])
        schema = tuple(gen.gen_terminals(start_symbol))
        grouped.setdefault(len(schema), set()).add(schema)

    # Map sets to tuples to enable drawing
    return {length: tuple(group) for length, group in grouped.items()}

In [546]:
schemas = create_schemas()

# Show some schemas
schemas[3]

(('adv:', 'verb:imps', 'subst:sg:acc:m1'),
 ('verb:imps', 'adj:sg:acc:m1', 'subst:sg:acc:m1'),
 ('verb:imps', 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3', 'subst:pl:acc:f'),
 ('subst:sg:nom:m', 'adv:', 'verb:fin:sg:ter.*:refl'),
 ('verb:imps', '^conj$', 'verb:imps'),
 ('subst:sg:nom:m', 'verb:fin:sg:ter.*:refl', 'subst:pl:acc:n2'),
 ('adv:', 'verb:imps', 'subst:pl:acc:f'),
 ('subst:sg:nom:m', 'verb:fin:sg:ter.*:refl', 'subst:sg:acc:m1'),
 ('adv:', 'verb:imps', 'subst:pl:acc:n2'),
 ('verb:imps', 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3', 'subst:pl:acc:n2'),
 ('adj:sg:nom.voc:m1.m2.m3', 'subst:sg:nom:m', 'verb:fin:sg:ter.*:refl'),
 ('subst:sg:nom:m', 'verb:fin:sg:ter.*:refl', 'subst:pl:acc:f'))

## Extract the grammar categories found in the grammar schemas

In [547]:
class PolimorfGen:
    """Collects, for every grammar terminal, the matching dictionary words.

    Iterating over an instance yields the raw lines of the dictionary file.

    Attributes:
        grammar_cats: maps each terminal pattern to the list of
            ``(base, token)`` pairs whose tag field matched it.
    """

    POLIMORF_PATH = './data/polimorfologik-2.1.txt'

    def __init__(self):
        self.grammar_cats = {terminal: [] for terminal in TERMINALS}

    def __iter__(self):
        # Explicit encoding: the dictionary holds Polish diacritics and must
        # not depend on the platform's default encoding.
        with open(self.POLIMORF_PATH, encoding='utf-8') as f:
            yield from f

    def find_terminal_occ(self, line: str):
        """
        Search for each pattern (terminal)
        in the line of the polimorfologik file.

        The line is expected to have three ';'-separated fields; the first
        two are stored as a ``(base, token)`` pair under every terminal whose
        regular expression is found in the third field. Blank lines are
        ignored.
        """
        if not line.strip():
            return

        base, token, grammar_cats = line.split(';')

        for terminal, occurrences in self.grammar_cats.items():
            # re caches compiled patterns, so no explicit compile is needed
            if re.search(terminal, grammar_cats):
                occurrences.append((base, token))


In [548]:
polimorf = PolimorfGen()

# Extract the categories: scan the dictionary file line by line and store
# each (base, token) pair under every terminal pattern its tags match
for line in polimorf:
    polimorf.find_terminal_occ(line)

## Generate some sentences of length n without using the embeddings 

In [549]:
class SentGen:
    """Draws random sentences that follow a schema of a requested length.

    Relies on the module-level ``schemas`` (length -> schemas) and
    ``polimorf`` (terminal -> (base, token) pairs) objects.
    """

    def rand_schema(self, n: int) -> Tuple:
        """Draw a random sentence schema made of ``n`` tokens.

        Raises:
            TooLongSent: if no schema of length ``n`` is known.
        """
        # Only the lookup can fail with KeyError; keep the try minimal
        try:
            schemas_n_len = schemas[n]
        except KeyError:
            raise TooLongSent from None

        return random.choice(schemas_n_len)

    def core_sent_gen(self, n: int) -> Optional[str]:
        """Generate a random sentence made of ``n`` tokens.

        Returns:
            The space-joined tokens, one random word per schema category, or
            None (after printing a message) when no schema of length ``n``
            exists.
        """
        try:
            schema = self.rand_schema(n)
        except TooLongSent:
            print('Sentence too long')
            return None

        # Draw a (base, token) pair per category and keep the token only
        tokens = [random.choice(polimorf.grammar_cats[category])[1]
                  for category in schema]

        return ' '.join(tokens)
    

In [550]:
sent_gen = SentGen()

In [551]:
sent_gen.core_sent_gen(5)

'nierozbielały zamachowiec zeszarpie więc wykomentuje'

In [552]:
sent_gen.core_sent_gen(6)

'ikt niezjawiskowo podtrzyma alić niegrzybowsko mistycyzuje'

In [553]:
sent_gen.core_sent_gen(7)

'wjadano niedolnoniemieckie jednonarodowościowe żołędności póty upośledzono nielepowości'

In [554]:
sent_gen.core_sent_gen(9)

'współbrzmienny niepółfonetyczny puaz popod grząskie magnetosferyczne odwapnienia niekusicielsko skameralizuje'

In [555]:
sent_gen.core_sent_gen(20)

'hydrant po nienapotne niespadkowe następstwa pysznicko stropikalizuje góreckie niemaoryskie bezcześci jakokolwiek niestaroczesny nieborowiczkowski Strumph-Wojtkiewicz przed hydrazynowego reeksportera prześpiewuje nieskulskie swoje'

In [556]:
sent_gen.core_sent_gen(30)

Sentence too long


## Prepare the Word2Vec model

In [557]:
class CorpusGen:
    """Re-iterable stream of tokenized sentences read from the corpus file.

    Every iteration reopens the file and yields at most ``n_sent`` lines,
    each split on whitespace into a list of tokens.
    """

    CORPUS_PATH = './data/task3_train_segmented.txt'

    def __init__(self, n_sent):
        self.n_sent = n_sent

    def __iter__(self):
        # Explicit encoding: the corpus is Polish text and must not depend on
        # the platform's default encoding.
        with open(self.CORPUS_PATH, encoding='utf-8') as f:
            # islice stops right after n_sent lines, without reading an
            # extra one
            for line in islice(f, self.n_sent):
                yield line.split()

In [558]:
if not os.path.isfile('./data/word2vec.model'):
    # Perform the embeddings only during the first session:
    # train on at most the first 10 million corpus lines (min_count=1, so
    # rare tokens are not dropped by the frequency threshold) and cache the
    # trained model on disk for later sessions

    sentences = CorpusGen(10_000_000)
    model = Word2Vec(sentences, min_count=1)
    model.save('./data/word2vec.model')
else:
    # The model exists

    # Gensim fails in case of loading the model for the second time,
    # so reuse `model` when it is already bound in this session and
    # load it from disk only when the name is still undefined
    try:
        model
    except NameError:
        model = Word2Vec.load('./data/word2vec.model')

In [559]:
len(model.wv.vocab)

2640650

## Generate thematic sentences from the grammar

In [560]:
# Topics used to steer the generation. Each topic is a tuple of seed words;
# TopicSentGen.choose_best_token draws one of them at random as the word
# that candidate tokens are compared against.
TOPICS = (
    ('malina', 'koszyk', 'zazdrość', 'morderstwo'),
    ('programowanie', 'błąd', 'zmienna', 'deklaracja'),
    ('lotniskowiec', 'łódź', 'podwodny',
     'tonąć', 'atak', 'torpeda', 'ocean'),
)

In [561]:
class TopicSentGen(SentGen):
    """Generates sentences whose words are biased towards a given topic.

    Attributes:
        model: a trained Word2Vec model; its ``wv.vocab`` and
            ``wv.similarity`` are used to score candidate words.
    """

    def __init__(self, model):
        self.model = model

    def choose_best_token(self, tokens: List, topic: Tuple) -> str:
        """Pick the candidate that is most similar to a random topic word.

        Args:
            tokens: candidate ``(base, token)`` pairs.
            topic: the seed words of the topic; one is drawn at random.

        Returns:
            The token of the best-scoring pair. A pair's score is the sum of
            similarity(base, topic_word) and similarity(token, topic_word);
            a form missing from the model's vocabulary contributes 0. Ties
            are resolved in favour of the earliest candidate.
        """
        # Draw the topic token
        topic_token = random.choice(topic)

        # Use the model this instance was given, not a module-level one
        word_vectors = self.model.wv

        def similarity(word: str) -> float:
            if word in word_vectors.vocab:
                return word_vectors.similarity(word, topic_token)
            return 0

        # dict.fromkeys collapses repeated candidates, keeping first-seen
        # order
        scores = {(base, token): similarity(base) + similarity(token)
                  for base, token in dict.fromkeys(tokens)}

        _, best_token = max(scores.items(), key=itemgetter(1))[0]

        return best_token

    def topic_sent_gen(self, n: int, topic: Tuple,
                       n_to_choose: int = 1000) -> Optional[List[str]]:
        """Generate an ``n``-token sentence related to ``topic``.

        For every category of a random schema, ``n_to_choose`` candidates
        are sampled (with replacement) and the best one is kept.

        Returns:
            The list of chosen tokens, or None (after printing a message)
            when no schema of length ``n`` exists.
        """
        try:
            schema = self.rand_schema(n)
        except TooLongSent:
            print('Sentence too long')
            return None

        categories = [random.choices(polimorf.grammar_cats[category],
                                     k=n_to_choose)
                      for category in schema]

        return [self.choose_best_token(category, topic)
                for category in categories]

    def topic_sent_gen_n_times(self, n: int, topic: Tuple,
                               n_to_choose: int = 1000,
                               n_times: int = 100) -> List:
        """Call ``topic_sent_gen`` ``n_times`` and return all the results.

        The list holds None entries when no schema of length ``n`` exists.
        """
        return [self.topic_sent_gen(n, topic, n_to_choose)
                for _ in range(n_times)]

In [562]:
topic_sent_gen = TopicSentGen(model)

### Check some examples

In [563]:
topic_sent_gen.topic_sent_gen(5, TOPICS[0])

['gwałcono', 'ylang-ylang', 'tedy', 'złorzeczono', 'czerwienie']

In [564]:
topic_sent_gen.topic_sent_gen(10, TOPICS[1])

['pijany',
 'Borówka',
 'co',
 'grafiki',
 'mikro',
 'łuszczy',
 'odchylenia',
 'więc',
 'pewnie',
 'obliczy']

In [565]:
topic_sent_gen.topic_sent_gen(15, TOPICS[2])

['świt',
 'popod',
 'rwące',
 'minowe',
 'działka',
 'spogląda',
 'tłuste',
 'frontalne',
 'podwodne',
 'jakoż',
 'nieprzyjacielski',
 'ponton',
 'popod',
 'Yamahy',
 'zestrzeli']

## Choose the best topic sentence with Positive Pointwise Mutual Information (PPMI)

### Create the unigram and bigram structures

In [566]:
class NGrams:
    """Builds unigram and bigram frequency tables from the 2-grams file."""

    DATA_PATH = './data/poleval_2grams.txt'

    def create_bigrams_unigrams(self, k: int = 100) -> Tuple[Dict, Dict]:
        """Read the 2-grams file and keep the bigrams seen at least ``k`` times.

        Every non-blank line must consist of three whitespace-separated
        fields: frequency, predecessor, successor.

        Args:
            k: minimal frequency of a bigram to be taken into account.

        Returns:
            A ``(unigrams, bigrams)`` pair. ``bigrams`` maps
            ``(predecessor, successor)`` to the bigram frequency;
            ``unigrams`` maps a token to the summed frequency of the kept
            bigrams it occurs in, counting both positions.

        Raises:
            ValueError: on a non-blank line that does not have exactly
                three fields or whose frequency is not an integer.
        """
        unigrams, bigrams = {}, {}

        # Explicit encoding: the tokens are Polish words and must not
        # depend on the platform's default encoding.
        with open(self.DATA_PATH, encoding='utf-8') as poleval:
            for line in poleval:
                # Tolerate blank lines (e.g. a trailing one)
                if not line.strip():
                    continue

                freq, predecesor, successor = line.split()
                count = int(freq)

                if count < k:
                    continue

                # Update bigrams and unigrams
                bigrams[(predecesor, successor)] = count
                unigrams[predecesor] = unigrams.get(predecesor, 0) + count
                unigrams[successor] = unigrams.get(successor, 0) + count

        return unigrams, bigrams


In [567]:
ngrams = NGrams()

In [568]:
unigrams, bigrams = ngrams.create_bigrams_unigrams()

In [569]:
# Part of unigrams
dict(list(unigrams.items())[:5])

{':': 1701656,
 'dalszego': 15414,
 'i': 12077277,
 'prowadzenia': 76467,
 'richard': 1695}

In [570]:
# Part of bigrams
dict(list(bigrams.items())[:5])

{(':', 'richard'): 104,
 ('bo', 'chcemy'): 247,
 ('dalszego', 'prowadzenia'): 349,
 ('i', 'stosunek'): 137,
 ('określonych', 'ustawą'): 599}

In [575]:
class PMI:
    """Scores sentences by a bigram association measure.

    Attributes:
        unigrams: maps a token to its frequency.
        bigrams: maps a ``(predecessor, successor)`` pair to its frequency.
    """

    def __init__(self, unigrams: Dict, bigrams: Dict):
        self.unigrams = unigrams
        self.bigrams = bigrams

    def measure_pmi(self, sentence: List) -> float:
        """Sum the scores of all adjacent token pairs of ``sentence``.

        The sentence is padded with '<BOS>' and '<EOS>'. Each pair scores
        ``log(bigram / (unigram(predecessor) * unigram(successor)))``,
        computed on raw frequencies, with 1 substituted for any frequency
        that is missing from the tables. The argument is not modified.
        """
        tokens = list(sentence)

        def pair_score(predecesor: str, successor: str) -> float:
            # Read this instance's tables rather than module-level globals
            numerator = self.bigrams.get((predecesor, successor), 1.)
            denominator = (self.unigrams.get(predecesor, 1.)
                           * self.unigrams.get(successor, 1.))

            return np.log(numerator / denominator)

        sent_bigrams = zip(['<BOS>'] + tokens, tokens + ['<EOS>'])

        return sum(pair_score(predecesor, successor)
                   for predecesor, successor in sent_bigrams)

    def choose_highest_pmi(self, sentences: List) -> str:
        """Return the best-scoring sentence as a single string.

        Args:
            sentences: the candidate sentences, each a list of tokens.

        Returns:
            The tokens of the highest-scoring sentence joined with spaces,
            with the first character upper-cased.

        Raises:
            ValueError: if ``sentences`` is empty.
        """
        _, sent = max((self.measure_pmi(sent), sent) for sent in sentences)

        text = ' '.join(sent)

        # Upper-case the first character only: str.capitalize() would also
        # lower-case the rest and thus damage proper nouns.
        return text[:1].upper() + text[1:]

### Generate a random sequence of topics and the corresponding sentences

In [576]:
pmi = PMI(unigrams, bigrams)

In [577]:
topics = random.choices(TOPICS, k=10)

In [579]:
TOPICS

(('malina', 'koszyk', 'zazdrość', 'morderstwo'),
 ('programowanie', 'błąd', 'zmienna', 'deklaracja'),
 ('lotniskowiec', 'łódź', 'podwodny', 'tonąć', 'atak', 'torpeda', 'ocean'))

In [578]:
for topic in topics:
    print(pmi.choose_highest_pmi(topic_sent_gen.topic_sent_gen_n_times(10, topic)))

Materializm popod bezkarne zwątpienia podstępnie chwyci pedofila vel okrutnie błaga
Rozdzierano morganatycznego wadliwego namiastka póty asymptotycznie rozdzierano niepoczytalnego odredakcyjnego samca
Zestrzelono ruchliwego goniącego astronautę jakoż złotawo rozbito owocowego zielonkawego mausera
Żyrokompas popod paleniska wietrzy czujki jakoż zaatakuje kabinowe malownicze hydrazyny
Ścigacz jmie armijne jednofunkcyjne infiltracje jakoż dokuje jednostrzałowe rurowe diabolo
Ostrogocki spryt zrumieni vel zbiorniczek brunatno gani przerażonego glinianego warzywnego
Poradziecki klakson przywabia jakoż pług galwanicznie zdmuchnie domorosłego śmiercionośnego sulejmana
Niezdecydowanie obwiniano potworne obyczajowe spóźnienia tedy złośliwie nasączono poręczne emmy
Upośledzono niejadalnego przeświadczonego suwerena póty wyszukanie przesłodzono dipolowego wektorowego instalatora
Krzem nieomylnie indukuje tau póty wykładniczo zezna tensorowe ochrowe wola


In [614]:
# True label of every generated sentence: the index of its topic in TOPICS.
# (topics.index(topic) would return the position of the topic's first
# occurrence in the sampled sequence, which is not a topic label.)
real = [TOPICS.index(topic) for topic in topics]

In [615]:
# Type your predictions here: one label per generated sentence, in the
# order in which the sentences were printed, comparable with `real`
predictions = []

if len(predictions) == len(topics):
    # Print explicitly: the value of a bare expression nested in an `if`
    # body is not displayed. accuracy_score expects (y_true, y_pred).
    print(accuracy_score(real, predictions))