# Lab 2: Language Modelling

## 1. Getting Started

In [137]:
import os, random, math, operator
TRAINING_DIR = "./sentence-completion/Holmes_Training_Data"  # this needs to be the parent directory for the training corpus

In [2]:
def get_training_testing(training_dir=TRAINING_DIR, split=0.5):
    filenames = os.listdir(training_dir)
    n = len(filenames)
    print("There are {} files in the training directory: {}".format(n, training_dir))
    random.seed(53) #if you want the same random split every time
    random.shuffle(filenames)
    index = int(n * split)
    return filenames[:index], filenames[index:]

In [4]:
trainingfiles, heldoutfiles = get_training_testing()

There are 522 files in the training directory: ./sentence-completion/Holmes_Training_Data


## 2. A Unigram Model

In [50]:
from nltk import word_tokenize as tokenize

class language_model:

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        self.unigram = {}
        self.bigram = {}
        self._processfiles()
        self._convert_to_probs()

    def _processline(self,line):
        tokens=["_START"]+tokenize(line)+["_END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1

    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _convert_to_probs(self):
        self.unigram = {k: v / sum(self.unigram.values()) for (k, v) in self.unigram.items()}

    def get_prob(self, token, method="unigram"):
        if method == "unigram":
            return self.unigram.get(token, 0)
        else:
            print("Not implemented: {}".format(method))
        return 0

    # 2.2 generation
    def generate_simple_sentence(self, k=5, limit=15):
        top_k = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)[:k]
        sentence = []
        token = ''
        while token!='_END' and len(sentence) < limit:
            top_k = [(tok, prob) for (tok, prob) in top_k if tok != '_START']
            token = random.choice(top_k)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

    def generate_sentence_extension(self, limit=15):
        """
        Using the cumulative probability distribution (with random.choices())
        :return:
        """
        sentence = []
        token = ''
        keys = list(self.unigram.keys())
        values = list(self.unigram.values())
        while token != '.' and len(sentence) < limit:
            token = random.choices(keys, values)
            if token[0] != '_START' and token[0] != '_END':
                sentence.append(token)
        return ' '.join(token[0] for token in sentence)

### 2.1. Training

In [51]:
MAX_FILES=5
mylm = language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [52]:
mylm.get_prob('tales')

6.751525844840934e-06

In [53]:
mylm.get_prob('case')

0.00012377797382208378

### 2.2. Generation

In [54]:
sorted_unigram_tuples = sorted(mylm.unigram.items(), key=lambda item: item[1], reverse=True)
sorted_unigram = {k: v for k, v in sorted_unigram_tuples}

In [55]:
def generate_simple_sentence():
    top_10 = sorted_unigram_tuples[:10]
    sentence = []
    token = ''
    while token != '_END' and len(sentence) < 15:
        top_10 = [(tok, prob) for (tok, prob) in top_10 if tok != '_START']
        token = random.choice(top_10)[0]
        sentence.append(token)
    return ' '.join(sentence[:-1])

In [56]:
simple_sentence = generate_simple_sentence()
simple_sentence

'the , the `` `` the'

In [63]:
mylm.generate_simple_sentence()

', the'

In [66]:
mylm.generate_simple_sentence(k=20)

"`` the the that his you that the and ''"

In [67]:
print(random.choices(list(mylm.unigram.keys()), list(mylm.unigram.values())))

['departure']


In [68]:
def generate_sentence_extension():
    sentence = []
    token = ''
    while token != '.' and len(sentence) < 15:
        token = random.choices(list(mylm.unigram.keys()), list(mylm.unigram.values()))
        if token[0] != '_START' and token[0] != '_END':
            sentence.append(token)
    return ' '.join(token[0] for token in sentence)

In [69]:
generate_sentence_extension()

'; I him back was the light , up . manage each his , to'

In [72]:
mylm.generate_sentence_extension()

"of . making `` you '' ask Cross breezily to serious coming . he and"

## 3. Adding Bigrams

In [86]:
class language_model:

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        self.unigram = {}
        self.bigram = {}
        self._processfiles()
        self._convert_to_probs()

    def _processline(self, line):
        tokens = ["_START"] + tokenize(line) + ["_END"]
        for i, token in enumerate(tokens):
            self.unigram[token] = self.unigram.get(token, 0) + 1
            if i > 0:
                previous = tokens[i-1]
                current = self.bigram.get(previous, {})
                current[token] = current.get(token, 0) + 1
                self.bigram[previous] = current

    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _convert_to_probs(self):
        self.unigram = {k: v / sum(self.unigram.values()) for (k, v) in self.unigram.items()}
        for (k, v) in self.bigram.items():
            self.bigram[k] = {in_k: in_v / sum(v.values()) for (in_k, in_v) in v.items()}

    def get_prob(self, token, previous_token="", method="unigram"):
        if method == "unigram":
            return self.unigram.get(token, 0)
        elif method == 'bigram':
            return self.bigram.get(previous_token, {}).get(token, 0)
        else:
            print("Not implemented: {}".format(method))
        return 0

    # 2.2 generation
    def generate_simple_sentence(self, k=5, limit=15):
        top_k = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)[:k]
        sentence = []
        token = ''
        while token!='_END' and len(sentence) < limit:
            top_k = [(tok, prob) for (tok, prob) in top_k if tok != '_START']
            token = random.choice(top_k)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

    def generate_sentence_extension(self, limit=15):
        """
        Using the cumulative probability distribution (with random.choices())
        :return:
        """
        sentence = []
        token = ''
        keys = list(self.unigram.keys())
        values = list(self.unigram.values())
        while token != '.' and len(sentence) < limit:
            token = random.choices(keys, values)
            if token[0] != '_START' and token[0] != '_END':
                sentence.append(token)
        return ' '.join(token[0] for token in sentence)

    # 3.2 bigram generation
    def generate_sentence_from_bigram(self, k=5, limit=15):
        sentence = []
        token = '_START'
        while token != '_END' and len(sentence) < limit: # and token != '.'
            possibilities = self.bigram.get(token, {})
            top = sorted(possibilities.items(), key=lambda item: item[1], reverse=True)[:k]
            token = random.choice(top)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

### 3.1. Training

In [87]:
MAX_FILES=5
mylm = language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [88]:
mylm.get_prob('The', '_START', method='bigram')

0.012881010075245504

In [89]:
mylm.get_prob('The', method='unigram')

0.0026893577948616386

### 3.2. Generation

In [97]:
mylm.generate_sentence_from_bigram(k=10)

'the wind , as he had the wind came from her eyes of that'

## 4 Perplexity

In [117]:
class language_model:

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        self.unigram = {}
        self.bigram = {}
        self._processfiles()
        self._convert_to_probs()

    def _processline(self, line):
        tokens = ["_START"] + tokenize(line) + ["_END"]
        for i, token in enumerate(tokens):
            self.unigram[token] = self.unigram.get(token, 0) + 1
            if i > 0:
                previous = tokens[i-1]
                current = self.bigram.get(previous, {})
                current[token] = current.get(token, 0) + 1
                self.bigram[previous] = current

    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _convert_to_probs(self):
        self.unigram = {k: v / sum(self.unigram.values()) for (k, v) in self.unigram.items()}
        for (k, v) in self.bigram.items():
            self.bigram[k] = {in_k: in_v / sum(v.values()) for (in_k, in_v) in v.items()}

    def get_prob(self, token, previous_token="", method="unigram"):
        if method == "unigram":
            return self.unigram.get(token, 0)
        elif method == 'bigram':
            return self.bigram.get(previous_token, {}).get(token, 0)
        else:
            print("Not implemented: {}".format(method))
        return 0

    # 2.2 generation
    def generate_simple_sentence(self, k=5, limit=15):
        top_k = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)[:k]
        sentence = []
        token = ''
        while token!='_END' and len(sentence) < limit:
            top_k = [(tok, prob) for (tok, prob) in top_k if tok != '_START']
            token = random.choice(top_k)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

    def generate_sentence_extension(self, limit=15):
        """
        Using the cumulative probability distribution (with random.choices())
        :return:
        """
        sentence = []
        token = ''
        keys = list(self.unigram.keys())
        values = list(self.unigram.values())
        while token != '.' and len(sentence) < limit:
            token = random.choices(keys, values)
            if token[0] != '_START' and token[0] != '_END':
                sentence.append(token)
        return ' '.join(token[0] for token in sentence)

    # 3.2 bigram generation
    def generate_sentence_from_bigram(self, k=5, limit=15):
        sentence = []
        token = '_START'
        while token != '_END' and len(sentence) < limit: # and token != '.'
            possibilities = self.bigram.get(token, {})
            top = sorted(possibilities.items(), key=lambda item: item[1], reverse=True)[:k]
            token = random.choice(top)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

    # 4 perplexity
    # def get_log_probability(self, file, method='unigram'):
    #     log_probs = 0
    #     try:
    #         with open(os.path.join(self.training_dir, file)) as instream:
    #             if method == 'unigram':
    #                 for line in instream:
    #                     line = line.rstrip()
    #                     if len(line) > 0:
    #                         tokens = ["_START"] + tokenize(line) + ["_END"]
    #                         log_probs += sum([math.log(self.get_prob(token, method='unigram')) for token in tokens])
    #             elif method == 'bigram':
    #                 for line in instream:
    #                     line = line.rstrip()
    #                     if len(line) > 0:
    #                         tokens = ["_START"] + tokenize(line) + ["_END"]
    #                         # log_probs += sum([math.log(self.get_prob(tokens[i+1], previous_token=token, method='bigram'))
    #                         #                   for i, token in enumerate(tokens)])
    #                         for i, token in enumerate(tokens):
    #                             if i > 0:
    #                                 log_probs += math.log(self.get_prob(token, previous_token=tokens[i-1], method='bigram'))
    #
    #     except UnicodeDecodeError:
    #         print("UnicodeDecodeError processing {}: ignoring file".format(file))
    #     return log_probs
    #
    # def get_perplexity(self, file, method='unigram'):
    #     text = open(os.path.join(self.training_dir, file)).read()
    #     num_of_words = len(text.split())
    #     log_probability = self.get_log_probability(file, method=method)
    #     return math.exp(-log_probability/num_of_words)

    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens

        tokens=["_START"]+tokenize(line)+["_END"]
        acc=0
        for i,token in enumerate(tokens):
            if i > 0:
                acc+=math.log(self.get_prob(token,previous_token=tokens[i-1],method=method))
        return acc,len(tokens[1:])


    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files

        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N

    def compute_perplexity(self,filenames=[],method="unigram"):

        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data

        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp

In [118]:
MAX_FILES=5
mylm = language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [100]:
mylm.get_log_probability('DYNMT10.TXT', method='unigram')

-666538.3915608346

In [101]:
mylm.get_perplexity('DYNMT10.TXT', method='unigram')

5433.144986357501

In [102]:
mylm.get_log_probability('DYNMT10.TXT', method='bigram')


-409464.2359516801

In [103]:
mylm.get_perplexity('DYNMT10.TXT', method='bigram')



197.01367717606865

In [105]:
mylm.get_prob('The', '_START', method='bigram')


0.012881010075245504

In [106]:
mylm.get_perplexity('2DFRE10.TXT', method='bigram')

ValueError: math domain error

In [119]:
mylm.compute_perplexity()

Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT


487.82830040293857

In [120]:
mylm.compute_perplexity(filenames=['DYNMT10.TXT'])

Processing file 0:DYNMT10.TXT


624.1897972356056

In [121]:
mylm.compute_perplexity(filenames=['DYNMT10.TXT'], method='bigram')

Processing file 0:DYNMT10.TXT


59.133163959841994

## 5. Dealing with Unseen Data


In [122]:
mylm.compute_perplexity(filenames=heldoutfiles[:MAX_FILES])

Processing file 0:GGIRL10.TXT


ValueError: math domain error

### 5.1. Unknown Words

In [130]:
# My class, not working properly
class language_model:

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        self.unigram = {}
        self.bigram = {}
        self._processfiles()
        self.make_unknowns()
        self._convert_to_probs()

    def _processline(self, line):
        tokens = ["_START"] + tokenize(line) + ["_END"]
        for i, token in enumerate(tokens):
            self.unigram[token] = self.unigram.get(token, 0) + 1
            if i > 0:
                previous = tokens[i-1]
                current = self.bigram.get(previous, {})
                current[token] = current.get(token, 0) + 1
                self.bigram[previous] = current

    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _convert_to_probs(self):
        self.unigram = {k: v / sum(self.unigram.values()) for (k, v) in self.unigram.items()}
        for (k, v) in self.bigram.items():
            self.bigram[k] = {in_k: in_v / sum(v.values()) for (in_k, in_v) in v.items()}

    def get_prob(self, token, previous_token="", method="unigram"):
        if method == "unigram":
            return self.unigram.get(token, 0)
        elif method == 'bigram':
            bigram = self.bigram.get(previous_token, self.bigram.get("_UNK",{}))
            return bigram.get(token, bigram.get("__UNK",0))
        else:
            print("Not implemented: {}".format(method))
        return 0

    # 2.2 generation
    def generate_simple_sentence(self, k=5, limit=15):
        top_k = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)[:k]
        sentence = []
        token = ''
        while token!='_END' and len(sentence) < limit:
            top_k = [(tok, prob) for (tok, prob) in top_k if tok != '_START' and tok != '_UNK']
            token = random.choice(top_k)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

    def generate_sentence_extension(self, limit=15):
        """
        Using the cumulative probability distribution (with random.choices())
        :return:
        """
        sentence = []
        token = ''
        keys = list(self.unigram.keys())
        values = list(self.unigram.values())
        while token != '.' and len(sentence) < limit:
            token = random.choices(keys, values)
            if token[0] != '_START' and token[0] != '_END' and token[0] != '_UNK':
                sentence.append(token)
        return ' '.join(token[0] for token in sentence)

    # 3.2 bigram generation
    def generate_sentence_from_bigram(self, k=5, limit=15):
        sentence = []
        token = '_START'
        while token != '_END' and len(sentence) < limit: # and token != '.'
            possibilities = self.bigram.get(token, {})
            top = sorted(possibilities.items(), key=lambda item: item[1], reverse=True)[:k]
            token = random.choice(top)[0]
            sentence.append(token)
        return ' '.join(sentence[:-1])

    # 4 perplexity
    # def get_log_probability(self, file, method='unigram'):
    #     log_probs = 0
    #     try:
    #         with open(os.path.join(self.training_dir, file)) as instream:
    #             if method == 'unigram':
    #                 for line in instream:
    #                     line = line.rstrip()
    #                     if len(line) > 0:
    #                         tokens = ["_START"] + tokenize(line) + ["_END"]
    #                         log_probs += sum([math.log(self.get_prob(token, method='unigram')) for token in tokens])
    #             elif method == 'bigram':
    #                 for line in instream:
    #                     line = line.rstrip()
    #                     if len(line) > 0:
    #                         tokens = ["_START"] + tokenize(line) + ["_END"]
    #                         # log_probs += sum([math.log(self.get_prob(tokens[i+1], previous_token=token, method='bigram'))
    #                         #                   for i, token in enumerate(tokens)])
    #                         for i, token in enumerate(tokens):
    #                             if i > 0:
    #                                 log_probs += math.log(self.get_prob(token, previous_token=tokens[i-1], method='bigram'))
    #
    #     except UnicodeDecodeError:
    #         print("UnicodeDecodeError processing {}: ignoring file".format(file))
    #     return log_probs
    #
    # def get_perplexity(self, file, method='unigram'):
    #     text = open(os.path.join(self.training_dir, file)).read()
    #     num_of_words = len(text.split())
    #     log_probability = self.get_log_probability(file, method=method)
    #     return math.exp(-log_probability/num_of_words)

    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens

        tokens=["_START"]+tokenize(line)+["_END"]
        acc=0
        for i,token in enumerate(tokens):
            if i > 0:
                acc+=math.log(self.get_prob(token,previous_token=tokens[i-1],method=method))
        return acc,len(tokens[1:])


    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files

        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N

    def compute_perplexity(self,filenames=[],method="unigram"):

        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data

        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp

    def make_unknowns(self, known=2):
        for (k, v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["_UNK"]=self.unigram.get("_UNK",0)+v
        for (k, adict) in list(self.bigram.items()):
            for (kk, v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    adict["_UNK"]=adict.get("_UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigram[k]
                current=self.bigram.get("_UNK",{})
                current.update(adict)
                self.bigram["_UNK"]=current

            else:
                self.bigram[k]=adict

In [132]:
class language_model():

    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        self.train()

    def train(self):
        self.unigram={}
        self.bigram={}

        self.processfiles()
        self.make_unknowns()
        self.convert_to_probs()


    def processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token


    def processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self.processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))


    def convert_to_probs(self):

        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}


    def get_prob(self,token,context="",method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        elif method=="bigram":
            bigram=self.bigram.get(context[-1],self.bigram.get("__UNK",{}))
            return bigram.get(token,bigram.get("__UNK",0))


    def nextlikely(self,k=1,current="",method="unigram"):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        blacklist=["__START","__UNK"]

        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,self.bigram.get("__UNK",{}))

        #sort the tokens by unigram probability
        mostlikely=sorted(list(dist.items()),key=operator.itemgetter(1),reverse=True)
        #filter out any undesirable tokens
        filtered=[w for (w,p) in mostlikely if w not in blacklist]
        #choose one randomly from the top k
        res=random.choice(filtered[:k])
        return res

    def generate(self,k=1,end="__END",limit=20,method="bigram"):
        current="__START"
        tokens=[]
        while current!="__END" and len(tokens)<limit:
            current=self.nextlikely(k=k,current=current,method=method)
            tokens.append(current)
        return " ".join(tokens)


    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens


        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])


    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files

        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N

    def compute_perplexity(self,filenames=[],method="unigram"):

        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data

        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp

    def make_unknowns(self,known=2):
        unknown=0
        for (k,v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+v
        for (k,adict) in list(self.bigram.items()):
            for (kk,v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    adict["__UNK"]=adict.get("__UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigram[k]
                current=self.bigram.get("__UNK",{})
                current.update(adict)
                self.bigram["__UNK"]=current

            else:
                self.bigram[k]=adict

In [133]:
mylm=language_model(files=trainingfiles[:MAX_FILES])
p=mylm.compute_perplexity()
print("Training data unigram perplexity: {}".format(p))
p=mylm.compute_perplexity(filenames=heldoutfiles[:MAX_FILES])
print("Testing data unigram perplexity: {}".format(p))

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT
Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT
Training data unigram perplexity: 410.6950375016966
Processing file 0:GGIRL10.TXT
Processing file 1:SBRUN10.TXT
Processing file 2:TARZ510.TXT
Processing file 3:ASPRN10.TXT
Processing file 4:TBTAS10.TXT
Testing data unigram perplexity: 361.47935014681156


### 5.2. Discounting for unseen combinations

In [134]:
class language_model():

    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        self.train()

    def train(self):
        self.unigram={}
        self.bigram={}

        self._processfiles()
        self._make_unknowns()
        self._discount()
        self._convert_to_probs()


    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token


    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))


    def _convert_to_probs(self):

        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}


    def get_prob(self,token,context="",method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        elif method=="bigram":
            bigram=self.bigram.get(context[-1],self.bigram.get("__UNK",{}))
            big_p=bigram.get(token,bigram.get("__UNK",0))
            lmbda=bigram["__DISCOUNT"]
            uni_p=self.unigram.get(token,self.unigram.get("__UNK",0))
            #print(big_p,lmbda,uni_p)
            p=big_p+lmbda*uni_p
            return p


    def nextlikely(self,k=1,current="",method="unigram"):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        blacklist=["__START","__UNK","__DISCOUNT"]

        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,self.bigram.get("__UNK",{}))

        #sort the tokens by unigram probability
        mostlikely=sorted(list(dist.items()),key=operator.itemgetter(1),reverse=True)
        #filter out any undesirable tokens
        filtered=[w for (w,p) in mostlikely if w not in blacklist]
        #choose one randomly from the top k
        res=random.choice(filtered[:k])
        return res

    def generate(self,k=1,end="__END",limit=20,method="bigram"):
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(k=k,current=current,method=method)
            tokens.append(current)
        return " ".join(tokens)


    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens

        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])

    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files

        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N

    def compute_perplexity(self,filenames=[],method="unigram"):

        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data

        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp

    def _make_unknowns(self,known=2):
        unknown=0
        for (k,v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+v
        for (k,adict) in list(self.bigram.items()):
            for (kk,v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    adict["__UNK"]=adict.get("__UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigram[k]
                current=self.bigram.get("__UNK",{})
                current.update(adict)
                self.bigram["__UNK"]=current

            else:
                self.bigram[k]=adict

    def _discount(self,discount=0.75):
        #discount each bigram count by a small fixed amount
        self.bigram={k:{kk:value-discount for (kk,value) in adict.items()}for (k,adict) in self.bigram.items()}

        #for each word, store the total amount of the discount so that the total is the same
        #i.e., so we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb=len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"]=lamb*discount

In [135]:

MAX_FILES=5

filesets={"training":trainingfiles[:MAX_FILES],"testing":heldoutfiles[:MAX_FILES]}


mylm=language_model(files=filesets["training"])
methods=["unigram","bigram"]
#methods=["bigram"]

for f,names in list(filesets.items()):
    for m in methods:

        p=mylm.compute_perplexity(filenames=names,method=m)

        print("Perplexity on {} with {} method is {}".format(f,m,p))

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT
Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT
Perplexity on training with unigram method is 410.6950375016966
Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT
Perplexity on training with bigram method is 60.94723288295493
Processing file 0:GGIRL10.TXT
Processing file 1:SBRUN10.TXT
Processing file 2:TARZ510.TXT
Processing file 3:ASPRN10.TXT
Processing file 4:TBTAS10.TXT
Perplexity on testing with unigram method is 361.47935014681156
Processing file 0:GGIRL10.TXT
Processing file 1:SBRUN10.TXT
Processing file 2:TARZ510.TXT
Processing file 3:ASPRN10.TXT
Processing file 4:TBTAS10.TXT
Perplexity on testing with bigram method is 72.71931865316098


In [148]:
mylm.generate(k=10,method="bigram")

"of them ? ' I have to a man , but a long it would be __END"

## 6. Extensions
### 6.1. Kneser-Ney Backoff

In [149]:
class language_model():

    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        self.train()

    def train(self):
        self.unigram={}
        self.bigram={}

        self._processfiles()
        self._make_unknowns()
        self._discount()
        self._convert_to_probs()


    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token


    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))


    def _convert_to_probs(self):

        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
        self.kn={k:v/sum(self.kn.values()) for (k,v) in self.kn.items()}

    def get_prob(self,token,context="",methodparams={}):
        if methodparams.get("method","unigram")=="unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        else:
            if methodparams.get("smoothing","kneser-ney")=="kneser-ney":
                unidist=self.kn
            else:
                unidist=self.unigram
            bigram=self.bigram.get(context[-1],self.bigram.get("__UNK",{}))
            big_p=bigram.get(token,bigram.get("__UNK",0))
            lmbda=bigram["__DISCOUNT"]
            uni_p=unidist.get(token,unidist.get("__UNK",0))
            #print(big_p,lmbda,uni_p)
            p=big_p+lmbda*uni_p
            return p


    def nextlikely(self,k=1,current="",method="unigram"):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        blacklist=["__START","__UNK","__DISCOUNT"]

        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,self.bigram.get("__UNK",{}))

        #sort the tokens by unigram probability
        mostlikely=sorted(list(dist.items()),key=operator.itemgetter(1),reverse=True)
        #filter out any undesirable tokens
        filtered=[w for (w,p) in mostlikely if w not in blacklist]
        #choose one randomly from the top k
        res=random.choice(filtered[:k])
        return res

    def generate(self,k=1,end="__END",limit=20,method="bigram",methodparams={}):
        if method=="":
            method=methodparams.get("method","bigram")
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(k=k,current=current,method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])


    def compute_prob_line(self,line,methodparams={}):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens

        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],methodparams))
        return acc,len(tokens[1:])

    def compute_probability(self,filenames=[],methodparams={}):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files

        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,methodparams=methodparams)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N

    def compute_perplexity(self,filenames=[],methodparams={"method":"bigram","smoothing":"kneser-ney"}):

        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data

        p,N=self.compute_probability(filenames=filenames,methodparams=methodparams)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp

    def _make_unknowns(self,known=2):
        unknown=0
        for (k,v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+v
        for (k,adict) in list(self.bigram.items()):
            for (kk,v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    adict["__UNK"]=adict.get("__UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigram[k]
                current=self.bigram.get("__UNK",{})
                current.update(adict)
                self.bigram["__UNK"]=current

            else:
                self.bigram[k]=adict

    def _discount(self,discount=0.75):
        #discount each bigram count by a small fixed amount
        self.bigram={k:{kk:value-discount for (kk,value) in adict.items()}for (k,adict) in self.bigram.items()}

        #for each word, store the total amount of the discount so that the total is the same
        #i.e., so we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb=len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"]=lamb*discount

        #work out kneser-ney unigram probabilities
        #count the number of contexts each word has been seen in
        self.kn={}
        for (k,adict) in self.bigram.items():
            for kk in adict.keys():
                self.kn[kk]=self.kn.get(kk,0)+1

In [150]:
MAX_FILES=10

filesets={"training":trainingfiles[:MAX_FILES],"testing":heldoutfiles[:MAX_FILES]}


mylm=language_model(files=filesets["training"])
methods=[{"method":"unigram"},{"method":"bigram","smoothing":"katz"},{"method":"bigram","smoothing":"kneser-ney"}]
#methods=["bigram"]

for f,names in list(filesets.items()):
    for m in methods:

        p=mylm.compute_perplexity(filenames=names,methodparams=m)

        print("Perplexity on {} with <{},{}> method is {}".format(f,m["method"],m.get("smoothing","no smoothing"),p))

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT
Processing COTRT10.TXT
Processing POISN10.TXT
Processing RNDBY10.TXT
Processing LAMEP10.TXT
Processing OAKDA10.TXT
Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT
Processing file 5:COTRT10.TXT
Processing file 6:POISN10.TXT
Processing file 7:RNDBY10.TXT
Processing file 8:LAMEP10.TXT
Processing file 9:OAKDA10.TXT
Perplexity on training with <unigram,no smoothing> method is 467.7230096346662
Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT
Processing file 5:COTRT10.TXT
Processing file 6:POISN10.TXT
Processing file 7:RNDBY10.TXT
Processing file 8:LAMEP10.TXT
Processing file 9:OAKDA10.TXT
Perplexity on training with <bigram,katz> method is 69.64855700160476
Processing file 0:DYN

In [156]:
mylm.generate(k=5,method="bigram")

"`` I have n't know that the"