In [None]:
import string
import re
from collections import Counter, defaultdict
import random

import requests

# Scrape data

In [None]:
# URL of the plain-text file that is downloaded with `requests` and used as
# the source text for the rest of the notebook.
DATA_URL = 'https://wolnelektury.pl/media/book/txt/nasza-szkapa.txt'

In [None]:
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops us from treating an HTTP error page as the book.
response = requests.get(DATA_URL, timeout=30)
response.raise_for_status()
text = response.text
# Preview only: the whole document is too long to be a useful cell output.
text[:500]

In [None]:
# Data cleaning

In [None]:
# Header expected at the very start of the text once whitespace is normalised.
prefix = 'Maria Konopnicka Nasza szkapa ISBN 978-83-288-2363-1'

# Collapse every run of whitespace (including '\r\n' line breaks) into a single
# space. A single replace('  ', ' ') pass would leave runs of 3+ spaces behind.
text = ' '.join(text.split())

# Remove the header only when it is really there, so that re-running this cell
# does not cut further into the text.
# (On Python 3.9+ this is simply: text = text.removeprefix(prefix))
if text.startswith(prefix):
    text = text[len(prefix):]

# Keep only the part before the first '-----' separator.
text = text.split('-----')[0]

## Split data into sentences 
Test 2 methods:

* using re module

* char by char iteration over text 

In [1]:
# Compiled once at definition time; matches one or more sentence terminators.
_SENTENCE_TERMINATORS_RE = re.compile(r'[.!?]+')


def sentence_split_using_re(text):
    """Split ``text`` into sentences on '.', '!' and '?' using a regular expression.

    Empty fragments (produced by consecutive terminators such as '?!' or by a
    terminator at the very start/end of the text) are discarded, so the result
    matches what the character-by-character ``sentence_split`` returns.

    :param text: string to split
    :return: list of non-empty sentence strings, terminators removed
    """
    return [part for part in _SENTENCE_TERMINATORS_RE.split(text) if part]


def sentence_split(text):
    """Split ``text`` into sentences by scanning it one character at a time.

    A sentence ends at '.', '!' or '?'. The terminator itself is not kept and
    empty sentences (e.g. between two adjacent terminators) are skipped.

    :param text: string to split
    :return: list of non-empty sentence strings
    """
    terminators = '.!?'
    found = []
    buffer = []  # characters of the sentence currently being collected
    for symbol in text:
        if symbol not in terminators:
            buffer.append(symbol)
            continue
        # Terminator reached: flush the buffer unless nothing was collected.
        if buffer:
            found.append(''.join(buffer))
            buffer = []
    # Text that is not followed by a terminator still counts as a sentence.
    if buffer:
        found.append(''.join(buffer))
    return found

In [None]:
%%timeit
# Benchmark the character-by-character splitter on `text`.
sentence_split(text)

In [None]:
%%timeit
# Benchmark the regular-expression splitter on the same `text` for comparison.
sentence_split_using_re(text)

## Tokenize text

In [None]:
def tokenize(sentence):
    """Turn a sentence into a list of lower-cased, purely alphabetic tokens.

    Every character that is not a letter or a digit (ASCII *and* Unicode
    punctuation such as quotation marks or dashes, plus the underscore) is
    treated as a separator. Tokens that still contain a non-letter afterwards
    (e.g. digits) are dropped entirely.

    :param sentence: string to tokenize
    :return: list of lower-case tokens
    """
    # string.punctuation only covers ASCII, so words glued to Unicode
    # punctuation used to fail isalpha() and were silently lost. [\W_] covers
    # all of string.punctuation and every other non-alphanumeric character.
    words = re.sub(r'[\W_]+', ' ', sentence).lower().split()
    return [word for word in words if word.isalpha()]

In [None]:
# One token list per sentence of the text.
tokenized = [tokenize(sentence) for sentence in sentence_split(text)]
# Show a short preview instead of dumping every sentence into the output.
tokenized[:5]

# Create n-grams

In [None]:
def get_ngrams(tokens, n):
    """Build (context, token) pairs for an n-gram model.

    The token sequence is left-padded with ``n - 1`` ``'<START>'`` markers so
    that the first real token also has a full-length context.

    :param tokens: sequence of tokens of one sentence
    :param n: order of the n-gram (context length is ``n - 1``)
    :return: list of ``(context, token)`` pairs where ``context`` is a tuple of
        the ``n - 1`` preceding tokens and ``token`` is the token that
        immediately follows them
    :raises ValueError: if ``n`` is smaller than 1
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    padded = ['<START>'] * (n - 1) + list(tokens)
    # The predicted token sits directly after the context, i.e. at i + n - 1
    # (not i + n), and the last valid start index is len(padded) - n.
    return [
        (tuple(padded[i:i + n - 1]), padded[i + n - 1])
        for i in range(len(padded) - n + 1)
    ]

In [None]:
# (context, token) pairs with n=3 for every tokenized sentence, grouped per sentence.
n_grams = [get_ngrams(sentence, 3) for sentence in tokenized]
# n_grams is a list of lists (unhashable), so flatten it before counting, e.g.:
# Counter(pair for sentence_pairs in n_grams for pair in sentence_pairs).most_common(10)
# Show a short preview instead of the full structure.
n_grams[:5]

# N-gram model 

In [None]:
class NgramModel:
    """Count-based n-gram language model that can generate random text.

    ``context`` maps a context tuple to the list of all tokens observed after
    it (with repetitions); ``ngram_counter`` counts each (context, token) pair.
    """

    def __init__(self, n):
        """
        :param n: order of the model; contexts consist of ``n - 1`` tokens
        :raises ValueError: if ``n`` is smaller than 1
        """
        if n < 1:
            raise ValueError('n must be a positive integer')
        self.n = n
        self.context = defaultdict(list)
        self.ngram_counter = Counter()

    def update(self, sentence: str) -> None:
        """
        Tokenizes a sentence and adds its n-grams to the model
        :param sentence: raw sentence text
        """
        for context, token in get_ngrams(tokenize(sentence), self.n):
            self.ngram_counter[(context, token)] += 1
            self.context[context].append(token)

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :param context: tuple of preceding tokens
        :param token: candidate next token
        :return: conditional probability (0.0 for a context that was never seen)
        """
        count_of_token = self.ngram_counter[(context, token)]
        # .get() keeps a lookup from inserting an empty entry into the defaultdict.
        count_of_context = len(self.context.get(context, ()))
        if count_of_context > 0:
            return count_of_token / count_of_context
        return 0.0

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence
        :param context: tuple of preceding tokens
        :return: a token drawn according to its conditional probability, or
            None when the context has no known continuation
        """
        r = random.random()
        candidates = sorted(set(self.context.get(context, ())))

        cumulative = 0
        for token in candidates:
            cumulative += self.prob(context, token)
            if cumulative > r:
                return token
        # Floating-point rounding can leave the cumulative sum marginally below
        # r; fall back to the last candidate instead of reporting "no token".
        return candidates[-1] if candidates else None

    def generate_text(self, token_count: int):
        """
        Generates text by repeatedly sampling the next token from the current
        context. When a context has no continuation, '<START>' markers are fed
        into the context until a known context is reached again.

        :param token_count: number of words to be produced
        :return: generated text; shorter than requested (possibly empty) when
            the model has nothing to generate from the start context
        """
        start_context = (self.n - 1) * ['<START>']
        context_queue = list(start_context)
        result = []
        while len(result) < token_count:
            predicted_token = self.random_token(tuple(context_queue))
            if predicted_token is None:
                if context_queue == start_context:
                    # Even the start context has no continuation (e.g. the
                    # model is untrained); looping further would never end.
                    break
                predicted_token = '<START>'
            else:
                result.append(predicted_token)
            # Slide the window; slicing (unlike pop(0)) also works for n == 1,
            # where the context is always empty.
            context_queue = (context_queue + [predicted_token])[1:]
        return ' '.join(result)

## Results 

In [None]:
%%time
# Build a model with n=2 and feed it `text` one sentence at a time;
# %%time reports how long the whole cell takes.
model = NgramModel(2)
for sentence in sentence_split(text):
    model.update(sentence) 

In [None]:
# Show only the most frequent (context, token) pairs rather than the whole counter.
model.ngram_counter.most_common(20)

In [None]:
# Seed the global RNG so that Restart & Run All reproduces the same sample.
random.seed(42)
model.generate_text(10)