In [219]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

In [220]:
#@title Importing dataset

data = pd.read_csv("ted_main.csv")
data = data[['description', 'title']]
(data)

Unnamed: 0,description,title
0,Sir Ken Robinson makes an entertaining and pro...,Do schools kill creativity?
1,With the same humor and humanity he exuded in ...,Averting the climate crisis
2,New York Times columnist David Pogue takes aim...,Simplicity sells
3,"In an emotionally charged talk, MacArthur-winn...",Greening the ghetto
4,You've never seen data presented like this. Wi...,The best stats you've ever seen
...,...,...
2545,"Between 2008 and 2016, the United States depor...",What we're missing in the debate about immigra...
2546,How can you study Mars without a spaceship? He...,The most Martian place on Earth
2547,Science fiction visions of the future show us ...,What intelligent machines can learn from a sch...
2548,In an unmissable talk about race and politics ...,A black man goes undercover in the alt-right


In [221]:
data['description'][0]

'Sir Ken Robinson makes an entertaining and profoundly moving case for creating an education system that nurtures (rather than undermines) creativity.'

In [222]:
lines = data.apply(lambda row: row['title'] + " ; " + row['description'], axis=1).tolist()

lines[0]

'Do schools kill creativity? ; Sir Ken Robinson makes an entertaining and profoundly moving case for creating an education system that nurtures (rather than undermines) creativity.'

In [223]:
#@title Tokenization
"""
Turn lines to lowercase, then tokenize,
then concat tokens together,
into space-separated string
"""
from nltk.tokenize import WordPunctTokenizer

def tokenize(raw_lines):
  tokenizer = WordPunctTokenizer()
  tokens = [tokenizer.tokenize(line.lower()) for line in raw_lines]
  return tokens

In [224]:
#checking format of data
t=tokenize(lines[:5])
for line in t:
  print(line)

['do', 'schools', 'kill', 'creativity', '?', ';', 'sir', 'ken', 'robinson', 'makes', 'an', 'entertaining', 'and', 'profoundly', 'moving', 'case', 'for', 'creating', 'an', 'education', 'system', 'that', 'nurtures', '(', 'rather', 'than', 'undermines', ')', 'creativity', '.']
['averting', 'the', 'climate', 'crisis', ';', 'with', 'the', 'same', 'humor', 'and', 'humanity', 'he', 'exuded', 'in', '"', 'an', 'inconvenient', 'truth', ',"', 'al', 'gore', 'spells', 'out', '15', 'ways', 'that', 'individuals', 'can', 'address', 'climate', 'change', 'immediately', ',', 'from', 'buying', 'a', 'hybrid', 'to', 'inventing', 'a', 'new', ',', 'hotter', 'brand', 'name', 'for', 'global', 'warming', '.']
['simplicity', 'sells', ';', 'new', 'york', 'times', 'columnist', 'david', 'pogue', 'takes', 'aim', 'at', 'technology', '’', 's', 'worst', 'interface', '-', 'design', 'offenders', ',', 'and', 'provides', 'encouraging', 'examples', 'of', 'products', 'that', 'get', 'it', 'right', '.', 'to', 'funny', 'things',

In [225]:
"""
Count how many times each word occured after (n - 1) previous words
:param lines: an iterable of strings with space-separated tokens
:returns: a dictionary { tuple(prefix_tokens): {next_token_1: count_1, next_token_2: count_2}}

When building counts, please consider the following two edge cases:
- if prefix is shorter than (n - 1) tokens, it should be padded with UNK. For n=3,
  empty prefix: "" -> (UNK, UNK)
  short prefix: "the" -> (UNK, the)
  long prefix: "the new approach method" -> (new, approach)
- you should add a special token, EOS, at the end of each sequence
  "... with deep neural networks ." -> (..., with, deep, neural, networks, ., EOS)
  count the probability of this token just like all others.
"""
from collections import defaultdict, Counter

UNK, EOS = '_UNK_', '_EOS_'

def count_ngrams(lines, n=3):
  tokens = tokenize(lines)

  counts = defaultdict(Counter)
  for tok_line in tokens:
    tokens = [UNK] * (n-1) + tok_line + [EOS]
    for i in range(n-1, len(tokens)):
      keys = tuple(tokens[i-n+1:i])
      counts[keys][tokens[i]] += 1

  return counts

In [226]:
w = count_ngrams(lines, 10)
w

defaultdict(collections.Counter,
            {('_UNK_',
              '_UNK_',
              '_UNK_',
              '_UNK_',
              '_UNK_',
              '_UNK_',
              '_UNK_',
              '_UNK_',
              '_UNK_'): Counter({'do': 5,
                      'averting': 1,
                      'simplicity': 1,
                      'greening': 1,
                      'the': 461,
                      'why': 128,
                      'letting': 1,
                      'behind': 3,
                      'let': 30,
                      'a': 233,
                      'my': 65,
                      'one': 9,
                      'an': 26,
                      'improvising': 1,
                      'simple': 2,
                      'organic': 2,
                      'nerdcore': 1,
                      'meet': 18,
                      'happiness': 3,
                      'chemical': 1,
                      'choice': 1,
                      'how': 305,
  

In [227]:
type(w)

collections.defaultdict

In [228]:
def get_possible_next_tokens(prefix, n=3):
    """
    :param prefix: string with space - separated prefix tokens
    :returns: a dictionary {token : it's probability} for all tokens with positive probabilities
    """
    tokenizer = WordPunctTokenizer()
    prefix = tokenizer.tokenize(prefix.lower())
    prefix = prefix[len(prefix)-n+1:len(prefix)]
    probs = w[tuple(prefix)]
    total_prob = sum(probs.values())
    print(total_prob)

In [229]:
d=get_possible_next_tokens("to end")

0


In [230]:
class NGramModel:
  def __init__(self, input, n=3):
        """
        :param n: computes probability of next token given (n - 1) previous words
        :param input: an iterable of strings with space-separated tokens
        """
        assert n > 0
        self.n = n
        self.ngrams = count_ngrams(input, self.n)
        self.probs = defaultdict(Counter)

  def get_possible_next_tokens(self, prefix):
          """
          :param prefix: string with space - separated prefix tokens
          :returns: a dictionary {token : it's probability} for all tokens with positive probabilities
          """
          tokenizer = WordPunctTokenizer()
          prefix = tokenizer.tokenize(prefix.lower())

          prefix = prefix[max(0, len(prefix) - self.n + 1):]
          prefix = [ UNK ] * (self.n - 1 - len(prefix)) + prefix
          probs = self.ngrams[tuple(prefix)]
          all_probs = sum(probs.values())
          probs = {key: probs[key]/all_probs for key in probs}
          return probs

  def get_next_token_prob(self, prefix, next_token):
          """
          :param prefix: string with space-separated prefix tokens
          :param next_token: the next token to predict probability for
          :returns: P(next_token|prefix) a single number, 0 <= P <= 1
          """
          return self.get_possible_next_tokens(prefix).get(next_token, 0)

In [231]:
lm = NGramModel(lines, 10)

In [232]:
print(lm.get_possible_next_tokens("end of the"))

{}


In [233]:
def get_next_token(lm, prefix, temperature=1.0):
    """
    return next token after prefix;
    :param temperature: samples proportionally to lm probabilities ^ (1 / temperature)
        if temperature == 0, always takes most likely token. Break ties arbitrarily.
    """
    prefix_probabilities = lm.get_possible_next_tokens(prefix)
    if temperature == 0.0:
      next_token = max(prefix_probabilities, key=prefix_probabilities.get)
      return next_token
    softened_probabilities = {word: prob ** (1 / temperature) for word, prob in prefix_probabilities.items()}
    normalization_term = sum(softened_probabilities.values())
    normalized_probabilities = {word: prob / normalization_term for word, prob in softened_probabilities.items()}
    next_token = np.random.choice(list(normalized_probabilities.keys()), p=list(normalized_probabilities.values()))

    return next_token
    # return max(prob, key=prob.get)

In [234]:
x = get_next_token(lm, "end of the", 1)
x

ValueError: ignored

In [None]:
prefix = 'science'

def generate(prefix):
  for i in range(100):
      prefix += ' ' + get_next_token(lm, prefix, 0.7)
      if prefix.endswith(EOS) or len(lm.get_possible_next_tokens(prefix)) == 0:
          break
  text = prefix
  return text[:len(text)-5]

print(generate(prefix))

In [238]:
#@title Fully defined class

class NGramModel:
  def __init__(self, input, n=3):
        """
        :param n: computes probability of next token given (n - 1) previous words
        :param input: an iterable of strings with space-separated tokens
        """
        assert n > 0
        self.n = n
        self.ngrams = self.count_ngrams_inclass(input)
        self.probs = defaultdict(Counter)

  def count_ngrams_inclass(self, input):
        tokens = tokenize(input)

        counts = defaultdict(Counter)
        for tok_line in tokens:
          tokens = [UNK] * (self.n-1) + tok_line + [EOS]
          for i in range(self.n-1, len(tokens)):
            keys = tuple(tokens[i-self.n+1:i])
            counts[keys][tokens[i]] += 1

        return counts

  def get_possible_next_tokens(self, prefix):
          """
          :param prefix: string with space - separated prefix tokens
          :returns: a dictionary {token : it's probability} for all tokens with positive probabilities
          """
          tokenizer = WordPunctTokenizer()
          prefix = tokenizer.tokenize(prefix.lower())

          prefix = prefix[max(0, len(prefix) - self.n + 1):]
          prefix = [ UNK ] * (self.n - 1 - len(prefix)) + prefix
          probs = self.ngrams[tuple(prefix)]
          all_probs = sum(probs.values())
          probs = {key: probs[key]/all_probs for key in probs}
          return probs

  def get_next_token_prob(self, prefix, next_token):
          """
          :param prefix: string with space-separated prefix tokens
          :param next_token: the next token to predict probability for
          :returns: P(next_token|prefix) a single number, 0 <= P <= 1
          """
          return self.get_possible_next_tokens(prefix).get(next_token, 0)

  def get_next_token(self, prefix, temperature=1.0):
          """
          return next token after prefix;
          :param temperature: samples proportionally to lm probabilities ^ (1 / temperature)
              if temperature == 0, always takes most likely token. Break ties arbitrarily.
          """
          prefix_probabilities = self.get_possible_next_tokens(prefix)
          if temperature == 0.0:
            next_token = max(prefix_probabilities, key=prefix_probabilities.get)
            return next_token

          softened_probabilities = {word: prob ** (1 / temperature) for word, prob in prefix_probabilities.items()}
          normalization_term = sum(softened_probabilities.values())
          normalized_probabilities = {word: prob / normalization_term for word, prob in softened_probabilities.items()}
          # next_token = np.random.choice(list(normalized_probabilities.keys()), p=list(normalized_probabilities.values()))

          # return next_token

  def generate(self, prefix):
          for i in range(100):
              prefix += ' ' + self.get_next_token(lm, prefix, 0.7)
              if prefix.endswith(EOS) or len(self.get_possible_next_tokens(prefix)) == 0:
                  break
          text = prefix
          return text[:len(text)-5]


In [239]:
model = NGramModel(lines, 3)

In [240]:
x = model.get_next_token("end of the", 1)

In [241]:
x = generate("end of the")
print(x)

ValueError: ignored