In [44]:
from io import BytesIO
import pickle
import sys
import random
import numpy as np

def savePickleFile(filename, rule):
    """Serialize ``rule`` to ``filename`` with pickle, logging progress to stdout.

    Args:
        filename: Path of the file to create or overwrite.
        rule: The object to pickle.

    Raises:
        OSError: If the file cannot be opened for writing.
        Exception: Whatever ``pickle.dump`` raises for an unpicklable object
            (typically ``pickle.PicklingError``).
    """
    print("> saving file:", filename)
    # The context manager closes the handle even when pickle.dump raises.
    with open(filename, "wb") as f:
        pickle.dump(rule, f)
    print("> saved")
    
def loadPickleFile(filename):
    """Load and return the object pickled in ``filename``, logging progress to stdout.

    Args:
        filename: Path of an existing pickle file.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened for reading.
        Exception: Whatever ``pickle.load`` raises for corrupt or truncated data.
    """
    print("> loading file:", filename)
    # SECURITY: unpickling can execute arbitrary code -- only load files from a trusted source.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, "rb") as file:
        data = pickle.load(file)
    print("> loaded")
    return data


In [45]:
class NGrams:
    """Next-word predictor that backs off from four-grams down to unigrams.

    The corpus is an ordered sequence of word tokens. The sentence being
    completed is split on whitespace and its trailing tokens serve as the
    lookup context for the n-gram models.
    """

    def __init__(self, words, sentence):
        """Store the corpus and tokenize the starting sentence.

        Args:
            words: Ordered, indexable sequence of corpus tokens.
            sentence: Starting text; split on whitespace into tokens.
        """
        self.words = words
        self.sentence = sentence
        self.tokens = sentence.split()

    def get_tokens(self):
        """Return the current token list (the live list, not a copy)."""
        return self.tokens

    def add_tokens(self, value):
        """Append ``value`` to the sentence tokens and return the token list."""
        self.tokens.append(value)
        return self.tokens

    def _followers(self, context_size):
        """Collect every corpus word that follows the last ``context_size`` tokens.

        Returns an empty list when the sentence has fewer tokens than the
        context requires, so callers can back off to a lower-order model
        instead of hitting an IndexError.
        """
        if context_size < 1 or len(self.tokens) < context_size:
            return []
        context = self.tokens[-context_size:]
        # Always scan the instance's own corpus, never a same-named global.
        corpus = self.words
        return [
            corpus[start + context_size]
            for start in range(len(corpus) - context_size)
            # Element-wise comparison short-circuits on the first mismatch.
            if all(corpus[start + k] == context[k] for k in range(context_size))
        ]

    def unigram_model(self):
        """Draw three corpus tokens at random as context-free candidates.

        Returns:
            A NumPy array of three words. Unlike the ranked output of
            ``get_top_3_next_words`` this carries no probabilities.
        """
        self.next_words = np.random.choice(self.words, size=3)
        return self.next_words

    def bigram_model(self):
        """Return all corpus words that follow the last sentence token."""
        self.next_words = self._followers(1)
        return self.next_words

    def trigram_model(self):
        """Return all corpus words that follow the last two sentence tokens."""
        self.next_words = self._followers(2)
        return self.next_words

    def fourgram_model(self):
        """Return all corpus words that follow the last three sentence tokens."""
        self.next_words = self._followers(3)
        return self.next_words

    def get_top_3_next_words(self, next_words):
        """Rank candidate words by relative frequency and keep the best three.

        Args:
            next_words: List of candidate words, with repetitions.

        Returns:
            Up to three ``(word, probability)`` tuples, with the probability
            rounded to two decimals. Ordering is descending by probability,
            then descending by word to break ties.
        """
        total = len(next_words)
        counts = {}
        for word in next_words:
            counts[word] = counts.get(word, 0) + 1

        # An empty candidate list yields an empty ranking, so total is never a zero divisor.
        ranked = [(word, np.round(count / total, 2)) for word, count in counts.items()]
        ranked.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
        return ranked[:3]

    def model_selection(self):
        """Predict next-word candidates with the highest-order model that matches.

        Tries the four-gram, trigram and bigram models in that order and
        prints the name of the model that produced the result. Each model is
        evaluated once per call.

        Returns:
            The ranked ``(word, probability)`` list of the first model with
            at least one match, or the NumPy array from ``unigram_model``
            when none of them matches.
        """
        backoff_chain = (
            ("fourgram-model", self.fourgram_model),
            ("trigram-model", self.trigram_model),
            ("bigram-model", self.bigram_model),
        )
        for label, model in backoff_chain:
            next_words = model()
            if len(next_words) > 0:
                top_words = self.get_top_3_next_words(next_words)
                print(label)
                return top_words

        top_words = self.unigram_model()
        print("unigram-model")
        return top_words

In [46]:
# Corpus handed to NGrams as `words`; presumably a flat sequence of word tokens -- TODO confirm.
words = loadPickleFile("corpus_news.pkl")

> loading file: corpus_news.pkl
> loaded


In [58]:
# Seed text that the model will extend.
start_sentence = 'o sorteio da'
# NOTE(review): `tokens` is not read by any cell visible here; NGrams splits the sentence itself.
tokens = start_sentence.split()

In [64]:
# Build the predictor from the loaded corpus and the seed sentence.
model = NGrams(words=words, sentence=start_sentence)

In [65]:
# Show the whitespace-split tokens of the seed sentence.
model.get_tokens()

['o', 'sorteio', 'da']

In [66]:
 # Top next-word candidates for the current tokens; also prints which n-gram model matched.
 model.model_selection()

fourgram-model


[('ordem', 1.0)]

In [28]:
# Manually append one token to the sentence held by `model` (mutates it in place).
# NOTE(review): this cell's execution count (28) is lower than the cell that builds
# `model` (64), and the output recorded under it shows a different sentence, so that
# output appears to come from an earlier run -- re-run top to bottom to confirm.
model.add_tokens('nesta')

['Presidente', 'Bolsonaro', 'deve', 'ser', 'divulgado', 'nesta']

In [29]:
 # Candidates after the manual append above.
 # NOTE(review): execution count 29 is lower than the cell that builds `model` (64);
 # the recorded output is presumably from an earlier run -- confirm by re-running.
 model.model_selection()

fourgram-model


[('quarta', 0.75), ('semana', 0.25)]

In [67]:
# Grow the sentence by 30 words: ask the back-off model for its top candidates and
# pick one uniformly at random (the probabilities are not used as weights).
for i in range(30):
    values = model.model_selection()
    print(values)
    c = random.choice(values)
    # N-gram back-offs yield (word, probability) tuples, while the unigram fallback
    # yields bare words. Indexing a bare word with [0] would append only its first
    # character, so branch on the type rather than on the length.
    if isinstance(c, tuple):
        value = c[0]
    else:
        value = str(c)
    model.add_tokens(value)

fourgram-model
[('ordem', 1.0)]
fourgram-model
[('dos', 1.0)]
fourgram-model
[('desfiles', 0.5), ('depoimentos', 0.5)]
fourgram-model
[('do', 0.5), ('das', 0.5)]
fourgram-model
[('escolas', 0.33), ('Dragões', 0.33), ('13', 0.33)]
fourgram-model
[('escolas', 1.0)]
fourgram-model
[('do', 0.5), ('deveriam', 0.5)]
fourgram-model
[('carnaval', 1.0)]
fourgram-model
[('2020', 1.0)]
fourgram-model
[('Duas', 1.0)]
fourgram-model
[('serão', 1.0)]
fourgram-model
[('rebaixadas', 1.0)]
fourgram-model
[('para', 1.0)]
fourgram-model
[('que', 1.0)]
fourgram-model
[('em', 1.0)]
fourgram-model
[('períodos', 0.33), ('2021', 0.33), ('2018', 0.33)]
fourgram-model
[('o', 1.0)]
fourgram-model
[('Grupo', 1.0)]
fourgram-model
[('Especial', 1.0)]
fourgram-model
[('tenha', 0.2), ('reunia', 0.2), ('em', 0.2)]
fourgram-model
[('2020', 0.5), ('2019', 0.17), ('2018', 0.17)]
fourgram-model
[('que', 0.33), ('depois', 0.33), ('Veja', 0.33)]
fourgram-model
[('o', 1.0)]
fourgram-model
[('resultado', 1.0)]
fourgram-model


In [68]:
# Render the seed sentence plus every appended token as one space-separated string.
print(" ".join(model.get_tokens()))

o sorteio da ordem dos desfiles das 13 escolas do carnaval 2020 Duas serão rebaixadas para que em 2021 o Grupo Especial em 2020 Veja o resultado final será divulgado em 8 de
