In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import pickle
%matplotlib inline

In [0]:
# Mount Google Drive inside the Colab VM so the dataset under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Load the comments table from the mounted Drive; its 'text' column is the training corpus.
df = pd.read_csv('/content/drive/My Drive/SHIFT/готовые csv/comment.csv')

In [0]:
# Drop missing comments before converting to str: str(NaN) is the literal text "nan",
# which would otherwise enter the corpus as if it were a real comment.
lines = [str(line) for line in df['text'].dropna()]

In [0]:
# Sanity check: look at the shortest comments only, instead of dumping the whole corpus
# into the cell output.
sorted(lines, key=len)[:10]

In [0]:
from nltk.tokenize import WordPunctTokenizer

# Normalise the corpus: lower-case every comment, tokenize it, and re-join the tokens
# with single spaces (the "space-separated tokens" format count_ngrams expects).
tokenizer = WordPunctTokenizer()
lines = [' '.join(tokenizer.tokenize(comment.lower())) for comment in lines]

In [0]:
from tqdm import tqdm
from collections import defaultdict, Counter

# Special tokens:
# - UNK pads a prefix that is shorter than (n - 1) tokens,
# - EOS is appended after the last token of every sequence.

UNK, EOS = "_UNK_", "_EOS_"

def count_ngrams(lines, n):
    """
    Count how many times each token occurred after its (n - 1) preceding tokens.

    :param lines: an iterable of strings with space-separated tokens
    :param n: n-gram order; every token is counted against the (n - 1) tokens before it
    :returns: a dictionary { tuple(prefix_tokens): {next_token_1: count_1, next_token_2: count_2}}

    Edge cases:
    - a prefix shorter than (n - 1) tokens is left-padded with UNK. For n=3,
      empty prefix: "" -> (UNK, UNK)
      short prefix: "the" -> (UNK, the)
      long prefix: "the new approach" -> (new, approach)
    - EOS is appended to every sequence and counted like any other token:
      "... with deep neural networks ." -> (..., with, deep, neural, networks, ., EOS)
    """
    # ngram_counts[(word1, word2)][word3] = how many times word3 followed (word1, word2)
    ngram_counts = defaultdict(Counter)
    context_size = n - 1
    padding = [UNK] * context_size

    for text in tqdm(lines):
        tokens = padding + text.split() + [EOS]
        # Start right after the padding so every real token (and EOS) gets a full context.
        for position in range(context_size, len(tokens)):
            context = tuple(tokens[position - context_size : position])
            ngram_counts[context][tokens[position]] += 1

    return ngram_counts

In [0]:
class NGramLanguageModel:
    """
    Count-based n-gram language model.

    Stores P(next_token | previous n - 1 tokens) as relative frequencies of the
    n-gram counts produced by count_ngrams.
    """

    def __init__(self, lines, n):
        """
        Train a simple count-based language model:
        compute probabilities P(w_t | prefix) given n-gram counts.

        :param lines: an iterable of strings with space-separated tokens
        :param n: computes probability of next token given (n - 1) previous words
        """
        assert n >= 1
        self.n = n

        counts = count_ngrams(lines, self.n)

        # probs[(word1, word2)][word3] = P(word3 | word1, word2)
        # NOTE: this is a defaultdict, so indexing it with an unknown key inserts an
        # empty Counter; read it with .get() to leave the model untouched.
        self.probs = defaultdict(Counter)

        for prefix, next_counts in counts.items():
            total = sum(next_counts.values())
            for token, count in next_counts.items():
                self.probs[prefix][token] = count / total

    def get_possible_next_tokens(self, prefix):
        """
        :param prefix: string with space-separated prefix tokens
        :returns: a dictionary {token : its probability} for all tokens with positive
            probabilities. For a known context this is the model's own Counter (treat it
            as read-only); for an unseen context it is a new empty Counter.
        """
        tokens = prefix.split()
        context_size = self.n - 1

        # Keep only the last (n - 1) tokens, left-padding with UNK when there are fewer.
        tokens = tokens[max(0, len(tokens) - context_size):]
        missing = context_size - len(tokens)
        if missing > 0:
            tokens = [UNK] * missing + tokens

        # .get() instead of [] so that querying an unseen context does not add an
        # empty entry to self.probs (which would grow the model and its pickle).
        known = self.probs.get(tuple(tokens))
        return known if known is not None else Counter()

    def get_next_token_prob(self, prefix, next_token):
        """
        :param prefix: string with space-separated prefix tokens
        :param next_token: the next token to predict probability for
        :returns: P(next_token|prefix) a single number, 0 <= P <= 1
        """
        return self.get_possible_next_tokens(prefix).get(next_token, 0)

In [0]:
# Train a 4-gram model on the corpus: each token is predicted from the 3 tokens before it.
lm = NGramLanguageModel(lines, n=4)

In [0]:
def get_next_token(lm, prefix, temperature=0.4):
    """
    Sample the token that follows ``prefix`` according to ``lm``.

    :param lm: language model exposing ``get_possible_next_tokens(prefix)``, which
        returns a mapping {token: probability}
    :param prefix: string with space-separated prefix tokens
    :param temperature: tokens are drawn with probability proportional to
        P(token | prefix) ^ (1 / temperature), so values below 1 favour likely tokens.
        If temperature == 0, always takes the most likely token (the first one on ties).
    :returns: the chosen token
    :raises ValueError: if the model offers no continuation for ``prefix``
    """
    probs_dict = lm.get_possible_next_tokens(prefix)
    if len(probs_dict) == 0:
        raise ValueError("no known continuation for prefix: %r" % (prefix,))

    tokens = list(probs_dict.keys())
    probs = np.array(list(probs_dict.values()), dtype=float)

    if temperature == 0:
        return tokens[np.argmax(probs)]

    # Divide by the largest probability before exponentiating: the ratios between
    # weights are unchanged, but the top token's weight is exactly 1, so a small
    # temperature can no longer underflow every weight to 0 and yield a NaN distribution.
    weights = (probs / probs.max()) ** (1 / temperature)
    weights /= weights.sum()
    return tokens[np.random.choice(len(tokens), p=weights)]

In [0]:
from collections import Counter

prefix = 'космос' # <- your ideas :)

# Extend the seed by at most 100 tokens, one sampled token at a time.
for step in range(100):
    # Look for continuations *before* sampling: a context the model never saw has no
    # candidates, and sampling from an empty set raises instead of ending cleanly.
    if len(lm.get_possible_next_tokens(prefix)) == 0:
        break
    prefix += ' ' + get_next_token(lm, prefix)
    if prefix.endswith(EOS):
        break

print(prefix)

In [0]:
# Serialise the trained model into the working directory.
with open('lmodel.pkl', mode='wb') as model_file:
    pickle.dump(lm, model_file)

In [0]:
# Send the pickled model written above from the Colab VM to the local machine.
files.download('lmodel.pkl')