# Language model

In [1]:
!wget -O shakes.txt https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt

--2024-02-09 14:51:49--  https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt
Resolving ocw.mit.edu (ocw.mit.edu)... 151.101.2.133, 151.101.130.133, 151.101.66.133, ...
Connecting to ocw.mit.edu (ocw.mit.edu)|151.101.2.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5458199 (5.2M) [text/plain]
Saving to: ‘shakes.txt’


2024-02-09 14:51:49 (55.0 MB/s) - ‘shakes.txt’ saved [5458199/5458199]



In [2]:
with open('shakes.txt','r') as f:
    corpus = f.read()

len(corpus)

5458199

In [8]:
# we use sklearn
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,3), token_pattern=r'[a-zA-Z]+', stop_words=None, lowercase=False)
analyzer = vectorizer.build_analyzer()


ngrams = analyzer(corpus)

In [9]:
len(ngrams)

2784033

In [11]:
ngrams[2000000]

'fortune Which was'

In [12]:
UnigramsWithRepetitions = [x for x in ngrams if len(x.split()) ==1]
BigramsWithRepetitions = [x for x in ngrams if len(x.split()) ==2]
TrigramsWithRepetitions = [x for x in ngrams if len(x.split()) ==3]

In [13]:
Unigrams = set(UnigramsWithRepetitions)
Bigrams = set(BigramsWithRepetitions)
Trigrams = set(TrigramsWithRepetitions)

In [14]:
len(BigramsWithRepetitions), len(Bigrams)

(928011, 392911)

In [15]:
import math
log = lambda n: math.log(n) if n>0 else -math.inf
exp = lambda n: math.exp(n) if n!= -math.inf else 0


def lP(w):
    if w in Trigrams:
        return log(ngrams.count(w)) - log(len(TrigramsWithRepetitions))
    elif w in Bigrams:
        return log(ngrams.count(w)) - log(len(BigramsWithRepetitions))
    else:
        return log(ngrams.count(w)) - log( len(UnigramsWithRepetitions))


def lP_cond(w2,w1):
    """ P(w2|w1) """
    return lP("{} {}".format(w1,w2)) - lP(w1)


In [16]:
lP('I am'), lP('am'), lP('supercalifragili'),lP_cond('am','I')

(-6.216237638861002, -6.086830762240441, -inf, -2.484410756523892)

In [17]:
exp(lP('I am')), exp(lP('am')), exp(lP('supercalifragili')), exp(lP_cond('am','I'))

(0.0019967435730826474, 0.0022725999232768527, 0, 0.08337466801995831)

## laplace smoothing

In [18]:
def lP_cond_laplace_book(w2,w1,alpha=1):
    """ P(w2|w1) according to the slides """
    return log(ngrams.count("{} {}".format(w1,w2))+alpha) - log(ngrams.count(w1) +alpha*len(Unigrams)**2)

def P_cond_laplace_book(w2,w1,alpha=1):
    return  exp(lP_cond_laplace_book(w2,w1,alpha))


In [19]:
exp(lP('I')), exp(lP('am')), exp(lP_cond('am','I')), exp(lP_cond('supercali','I')), P_cond_laplace_book('am','I'), P_cond_laplace_book('supercali','I')

(0.023949043762365143,
 0.0022725999232768527,
 0.08337466801995831,
 0,
 2.1370271465647872e-06,
 1.1526575763564106e-09)

In [20]:
P_cond_laplace_book('am','I',alpha=0.00001), P_cond_laplace_book('supercali','I',alpha=0.00001)

(0.05996689815589311, 3.236206035200897e-10)

## Generate new sentences

Let's use  language model to generate random sentences

=> Starting with "I"

In [22]:
candidates = [b.split() for b in Bigrams]   # e.g. candidate pair ['I','am'] represents
                                            # the candidate for adding "am" after "I"

from collections import Counter
ngrams_counter = Counter(ngrams)

def P_cond2(w2,w1):
    """ P(w2|w1) for bigrams (approximated) """
    return ngrams_counter["{} {}".format(w1,w2)] / ngrams_counter[w1]



def generate_sentence(starting_word='I',n=10):
    """ generate a sentence of n words using a bigram model starting with a given word  """
    s = [starting_word]
    for i in range(n-1):
        w = next_word(s[-1])
        s.append(w)
    return ' '.join(s)

def next_word(word):
    """ choose with weighted probabilities the next token after the given word """
    from numpy.random import choice
    word_candidates = [c[1] for c in candidates if c[0]==word]
    probability_distribution = [P_cond2(w_next,word) for w_next in word_candidates]
    draw = choice(word_candidates, 1, p=probability_distribution)
    return draw[0]



In [25]:
next_word('I')

'would'

In [26]:
generate_sentence()

'I kill with thy mood and go Come from possibility'

In [None]:
generate_sentence()

'I cannot find Though slackly guarded POSTHUMUS with great oneyers'

In [27]:
generate_sentence('you',n=20)

'you do And even now By my mother what is only mark me that I have a flea s roaring'

exercize: improve the code and generate sentences using a trigram language model