# Natural Language Processing

### List 2

In [1]:
from src.parsers.tag_bigram_parser import TagBigramParser
from src.parsers.tag_parser import TagParser
from src.parsers.bigram_parser import BigramParser
from src.bigram_tag_bigram_shuffler import BigramTagBigramShuffler
from src.jackard_index import Jackard
from src.parsers.short_sentence_parser import ShortSentenceParser
import numpy as np

## Create required objects, structs etc.

In [2]:
bigram_parser = BigramParser('poleval_2grams.txt')
simple_bigrams = bigram_parser.create_simple_bigrams_struct()

In [3]:
# Some examples of simple_bigrams
dict(list(simple_bigrams.bigrams.items())[:2])

{('rozdrobniona', 'sieć'): '11', ('świadectwem', ','): '87'}

In [4]:
tag_parser = TagParser()

In [5]:
tagged_tokens = tag_parser.create_tag_token_pairs()

In [6]:
tag_bigram_parser = TagBigramParser()

In [7]:
tag_bigrams = tag_bigram_parser.create_tag_bigrams_dict(tagged_tokens)

In [8]:
# Some examples of tag_bigrams
dict(list(tag_bigrams.tag_bigrams.items())[:2])

{('T3618', 'T2906'): 16044, ('T37', 'T77'): 6571}

In [9]:
bigram_tagbigram_shuffler = BigramTagBigramShuffler()

In [10]:
short_sentence_parser = ShortSentenceParser()
short_sentences = short_sentence_parser.extract_sentences(threshold=6)

In [11]:
len(short_sentences)

1685510

#### Check BigramTagBigramShuffler

In [12]:
bigram_tagbigram_shuffler.get_the_best_perm_bigram('operatora zarówno pełnienie jakiejś',
                                                    simple_bigrams)

('jakiejś operatora pełnienie zarówno', 0.9999877116507956)

In [13]:
bigram_tagbigram_shuffler.get_the_best_perm_tag_bigram('operatora zarówno pełnienie jakiejś',
                                                        tag_bigrams, tagged_tokens)

('zarówno jakiejś operatora pełnienie', 1.0)

In [14]:
jackard_measure = Jackard()

## Tune the $\alpha$ on the validation data

In [15]:
validation_sentences = short_sentences[:100]

In [16]:
def measure_jackard_alpha(sentence: str, alpha: float) -> float:
    """
    Return jackard index for the sentence with respect to
    the found alpha
    """
    
    # Handle sentence based on bigrams
    gen_bigrams_sent, gen_bigrams_naturalness = \
        bigram_tagbigram_shuffler.get_the_best_perm_bigram(sentence,
                                                               simple_bigrams)  
    vote_bigram = alpha * gen_bigrams_naturalness
        
    # Handle sentence based on tag bigrams
    gen_tag_bigrams_sent,  gen_tag_bigrams_naturalness = \
        bigram_tagbigram_shuffler.get_the_best_perm_tag_bigram(sentence,
                                                                   tag_bigrams,
                                                                   tagged_tokens)       
    vote_tag_bigram = alpha * (1-alpha) * gen_tag_bigrams_naturalness
        
    if vote_bigram > vote_tag_bigram:
            
        # Add bigrams sent to jackard for the alpha
        jackard = jackard_measure.jackard_for_sentences(sentence, gen_bigrams_sent)
    else:
        
        # Add tag bigrams to jackard for the alpha
        jackard = jackard_measure.jackard_for_sentences(sentence, gen_tag_bigrams_sent)
        
    return jackard

In [17]:
best_jackard, best_alpha = 0, 0.

for alpha in np.linspace(0., 1., 101):
    jackard_sum = 0.
    
    for sentence in validation_sentences:
        
        jackard = measure_jackard_alpha(sentence, alpha)
        
        jackard_sum += jackard
        
    # Update the best alpha
    if jackard_sum > best_jackard:
        best_jackard = jackard_sum
        best_alpha = alpha

#### Found parameter alpha and coresponding jackard

In [18]:
best_jackard / len(validation_sentences), best_alpha

(0.45547619047619053, 0.01)

## Tests

In [19]:
measure_jackard_alpha('Owocniki pojawiają się od września do listopada.', best_alpha)

0.25

In [20]:
measure_jackard_alpha('Na skrzyżowaniu skręć w lewo', best_alpha)

0.16666666666666666

In [21]:
measure_jackard_alpha('Zaraz będzie koniec.', best_alpha)

1.0

In [22]:
measure_jackard_alpha('zaraz będzie koniec', best_alpha)

0.25

In [23]:
measure_jackard_alpha('Komisja wnosi o przyjęcie tej poprawki.', best_alpha)

0.14285714285714285

In [24]:
measure_jackard_alpha('komisja wnosi o przyjęcie tej poprawki', best_alpha)

1.0