In [1]:
import numpy as np
import collections
from math import log
import sys
import random
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import feature_extraction
% matplotlib inline

In [2]:
# Python 3 backwards compatibility tricks
if sys.version_info.major > 2:

    def xrange(*args, **kwargs):
        return iter(range(*args, **kwargs))

    def unicode(*args, **kwargs):
        return str(*args, **kwargs)

class LangModel:
    def fit_corpus(self, corpus):
        """Learn the language model for the whole corpus.

        The corpus consists of a list of sentences."""
        for s in corpus:
            self.fit_sentence(s)
        self.norm()

    def perplexity(self, corpus):
        """Computes the perplexity of the corpus by the model.

        Assumes the model uses an EOS symbol at the end of each sentence.
        """
        return pow(2.0, self.entropy(corpus))

    def entropy(self, corpus):
        num_words = 0.0
        sum_logprob = 0.0
        for s in corpus:
            num_words += len(s) + 1 # for Linespace
            sum_logprob += self.logprob_sentence(['_LINESPACE']+s)
        return -(1.0/num_words)*(sum_logprob)

    def logprob_sentence(self, sentence):
        p = 0.0
        for i in range(1,len(sentence)):
            p += self.cond_logprob(sentence[i], sentence[:i])
        p += self.cond_logprob('_LINESPACE', sentence)
        return p

    # required, update the model when a sentence is observed
    def fit_sentence(self, sentence): pass
    # optional, if there are any post-training steps (such as normalizing probabilities)
    def norm(self): pass
    # required, return the log2 of the conditional prob of word, given previous words
    def cond_logprob(self, word, previous): pass
    # required, the list of words the language model supports (including EOS)
    def vocab(self): pass

In [3]:
class Unigram(LangModel):
    def __init__(self, backoff = 0.000001):
        self.model = dict()
        self.lbackoff = log(backoff, 2)
        
    def inc_word(self, w):
        if w in self.model:
            self.model[w] += 1.0
        else:
            self.model[w] = 1.0

    def fit_sentence(self, sentence):
        for w in sentence:
            self.inc_word(w)
        self.inc_word('_LINESPACE')

    def norm(self):
        """Normalize and convert to log2-probs."""
        tot = 0.0
        for word in self.model:
            tot += self.model[word]
        ltot = log(tot, 2)
        for word in self.model:
            self.model[word] = log(self.model[word], 2) - ltot

    def cond_logprob(self, word, previous):
        if word in self.model:
            return self.model[word]
        else:
            return self.lbackoff

    def vocab(self):
        return self.model.keys()

In [4]:
class Bigram(LangModel):
    def __init__(self, backoff = 0.000001):
        self.model = dict()
        self.lbackoff = log(backoff, 2)
        self.vocabulary = dict()
        
    def inc_word(self, w):
        if w in self.model:
            self.model[w] += 1.0
        else:
            self.model[w] = 1.0

    def fit_sentence(self, sentence):
        s = ['_LINESPACE'] + sentence + ['_LINESPACE']
        for w in s:
            self.inc_word(w)
            self.vocabulary[w] = 1
        #self.inc_word('_LINESPACE')
        
        for i in range(2,3):
            for j in range(len(s)-i+1):
                self.inc_word(tuple(s[j:j+i]))

    def cond_logprob(self, word, previous):
        if (len(previous)>=1 and tuple([previous[-1]] + [word]) in self.model):
            return log(self.model[tuple([previous[-1]] + [word])]/self.model[previous[-1]],2)
        elif word in self.model:
            return log(self.model[word]/len(self.vocabulary),2)
        else:
            return self.lbackoff

    def vocab(self):
        return self.vocabulary.keys()

In [5]:
class Trigram(LangModel):
    def __init__(self, backoff = 0.000001):
        self.model = dict()
        self.lbackoff = log(backoff, 2)
        self.vocabulary = dict()
        
    def inc_word(self, w):
        if w in self.model:
            self.model[w] += 1.0
        else:
            self.model[w] = 1.0

    def fit_sentence(self, sentence):
        s = ['_LINESPACE'] + sentence + ['_LINESPACE']
        for w in s:
            self.inc_word(w)
            self.vocabulary[w] = 1
        #self.inc_word('_LINESPACE')
        
        for i in range(2,4):
            for j in range(len(s)-i+1):
                self.inc_word(tuple(s[j:j+i]))

    def cond_logprob(self, word, previous):
        if (len(previous)>=2 and tuple(previous[-2:] + [word]) in self.model):
            return log(self.model[tuple(previous[-2:] + [word])]/self.model[tuple(previous[-2:])],2)
        elif (len(previous)>=1 and tuple([previous[-1]] + [word]) in self.model):
            return log(self.model[tuple([previous[-1]] + [word])]/self.model[previous[-1]],2)
        elif word in self.model:
            return log(self.model[word]/len(self.vocabulary),2)
        else:
            return self.lbackoff

    def vocab(self):
        return self.vocabulary.keys()

In [6]:
class Sampler:
    def __init__(self, lm, temp = 1.0):
        """Sampler for a given language model.

        Supports the use of temperature, i.e. how peaky we want to treat the
        distribution as. Temperature of 1 means no change, temperature <1 means
        less randomness (samples high probability words even more), and temp>1
        means more randomness (samples low prob words more than otherwise). See
        simulated annealing for what this means.
        """
        self.lm = lm
        self.rnd = random.Random()
        self.temp = temp

    def sample_sentence(self, prefix = [], max_length = 20):
        """Sample a random sentence (list of words) from the language model.

        Samples words till either EOS symbol is sampled or max_length is reached.
        Does not make any assumptions about the length of the context.
        """
        i = 0
        sent = prefix
        word = self.sample_next(sent, False)
        while i <= max_length and word != "_LINESPACE":
            sent.append(word)
            word = self.sample_next(sent)
            i += 1
        return sent

    def sample_next(self, prev, incl_eos = True):
        """Samples a single word from context.

        Can be useful to debug the model, for example if you have a bigram model,
        and know the probability of X-Y should be really high, you can run
        sample_next([Y]) to see how often X get generated.

        incl_eos determines whether the space of words should include EOS or not.
        """
        wps = []
        tot = -np.inf # this is the log (total mass)
        for w in self.lm.vocab():
            if not incl_eos and w == "_LINESPACE":
                continue
                
            lp = self.lm.cond_logprob(w, prev)
            #wps.append([w, lp/self.temp])
            wps.append([w, lp])
            tot = np.logaddexp2(lp, tot)
        p = self.rnd.random()
        word = self.rnd.choice(wps)[0]
        s = -np.inf # running mass
        for w,lp in wps:
            s = np.logaddexp2(s, lp)
            if p < pow(2, s-tot):
                word = w
                break
        return word

In [7]:
def textToTokens(text):
    """Converts input string to a corpus of tokenized sentences.

    Assumes that the sentences are divided by newlines (but will ignore empty sentences).
    You can use this to try out your own datasets, but is not needed for reading the homework data.
    """
    corpus = []
    sents = text.split("\n")
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    count_vect.fit(sents)
    tokenizer = count_vect.build_tokenizer()
    for s in sents:
        toks = tokenizer(s)
        if len(toks) > 0:
            corpus.append(toks)
    return corpus

def file_splitter(filename, seed = 0, train_prop = 0.7, dev_prop = 0.15,
    test_prop = 0.15):
    """Splits the lines of a file into 3 output files."""
    import random
    rnd = random.Random(seed)
    basename = filename[:-4]
    train_file = open(basename + ".train.txt", "w")
    test_file = open(basename + ".test.txt", "w")
    dev_file = open(basename + ".dev.txt", "w")
    with open(filename, 'r') as f:
        for l in f.readlines():
            p = rnd.random()
            if p < train_prop:
                train_file.write(l)
            elif p < train_prop + dev_prop:
                dev_file.write(l)
            else:
                test_file.write(l)
    train_file.close()
    test_file.close()
    dev_file.close()

def read_texts(tarfname, dname):
    """Read the data from the homework data file.

    Given the location of the data archive file and the name of the
    dataset (one of brown, reuters, or gutenberg), this returns a
    data object containing train, test, and dev data. Each is a list
    of sentences, where each sentence is a sequence of tokens.
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz", errors = 'replace')
    train_mem = tar.getmember(dname + ".train.txt")
    train_txt = unicode(tar.extractfile(train_mem).read(), errors='replace')
    test_mem = tar.getmember(dname + ".test.txt")
    test_txt = unicode(tar.extractfile(test_mem).read(), errors='replace')
    dev_mem = tar.getmember(dname + ".dev.txt")
    dev_txt = unicode(tar.extractfile(dev_mem).read(), errors='replace')

    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    count_vect.fit(train_txt.split("\n"))
    tokenizer = count_vect.build_tokenizer()
    class Data: pass
    data = Data()
    data.train = []
    for s in train_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.train.append(toks)
    data.test = []
    for s in test_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.test.append(toks)
    data.dev = []
    for s in dev_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.dev.append(toks)
    print(dname," read.", "train:", len(data.train), "dev:", len(data.dev), "test:", len(data.test))
    return data

def learn_unigram(data):
    """Learns a unigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with generating
    some sample sentences from the model.
    """
    unigram = Unigram()
    unigram.fit_corpus(data.train)
    print("vocab:", len(unigram.vocab()))
    # evaluate on train, test, and dev
#     print("train:", unigram.perplexity(data.train))
#     print("dev  :", unigram.perplexity(data.dev))
#     print("test :", unigram.perplexity(data.test))
#     sampler = Sampler(unigram)
#     print("sample: ", " ".join(str(x) for x in sampler.sample_sentence([])))
#     print("sample: ", " ".join(str(x) for x in sampler.sample_sentence([])))
#     print("sample: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    return unigram

def print_table(table, row_names, col_names, latex_file = None):
    """Pretty prints the table given the table, and row and col names.

    If a latex_file is provided (and tabulate is installed), it also writes a
    file containing the LaTeX source of the table (which you can \input into your report)
    """
    try:
        from tabulate import tabulate
        rows = map(lambda r,t: [r] + t, row_names,table.tolist())
        print(tabulate(rows, headers = [""] + col_names))
        if latex_file is not None:
            latex_str = tabulate(rows, headers = [""] + col_names, tablefmt="latex")
            with open(latex_file, 'w') as f:
                f.write(latex_str)
                f.close()
    except ImportError as e:
        row_format ="{:>15} " * (len(col_names) + 1)
        print(row_format.format("", *col_names))
        for row_name, row in zip(row_names, table):
            print(row_format.format(row_name, *row))

In [9]:
dnames = ["brown", "reuters", "gutenberg"]
datas = []
# Learn the models for each of the domains, and evaluate it
for dname in dnames:
    print("-----------------------")
    print(dname)
    data = read_texts("/Users/adityajoshi/UCI/Stats NLP/Language modeling/hw2.gz", dname)
    datas.append(data)

-----------------------
brown
('brown', ' read.', 'train:', 39802, 'dev:', 8437, 'test:', 8533)
-----------------------
reuters
('reuters', ' read.', 'train:', 38183, 'dev:', 8083, 'test:', 8199)
-----------------------
gutenberg
('gutenberg', ' read.', 'train:', 68767, 'dev:', 14667, 'test:', 14861)


In [11]:
unigram = Unigram()
corpus = [
    [ "Madam", "I", "am", "your", "only", "adam" ]
]
unigram.fit_corpus(corpus)
print(unigram.model)
sampler = Sampler(unigram)
for i in xrange(10):
    print(i, ":", " ".join(str(x) for x in sampler.sample_sentence([])))

{'Madam': -2.807354922057604, '_LINESPACE': -2.807354922057604, 'I': -2.807354922057604, 'am': -2.807354922057604, 'only': -2.807354922057604, 'adam': -2.807354922057604, 'your': -2.807354922057604}
(0, ':', 'adam only am your only am I')
(1, ':', 'your only only only')
(2, ':', 'adam your')
(3, ':', 'adam adam Madam I I I am')
(4, ':', 'Madam I Madam only I adam only Madam am')
(5, ':', 'am your am adam I your adam I adam your am am am am am your adam am only Madam')
(6, ':', 'I I adam am your only am Madam Madam I')
(7, ':', 'your adam adam only I')
(8, ':', 'I I am Madam adam')
(9, ':', 'I only only')


In [13]:
dnames = ["brown", "reuters", "gutenberg"]
unigram_models = []
i = 0
# Learn the models for each of the domains, and evaluate it
for dname in dnames:
    print("-----------------------")
    print(dname)
    unigram_model = learn_unigram(datas[i])
    unigram_models.append(unigram_model)
    sampler = Sampler(unigram_model)
    print("sample: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    i=i+1

-----------------------
brown
('vocab:', 41746)
('sample: ', 'is of public of Illinois Stanley the to tragic and her crewcut through he fathers selectivity it to shoot You come')
-----------------------
reuters
('vocab:', 35989)
('sample: ', 'each reasonable The pain standards by the by period proposal on said recent White be it dlrs equipment by tax DIVIDEND')
-----------------------
gutenberg
('vocab:', 43736)
('sample: ', 'having As Then the 11 here as barn to the think the he glory his bough some time praying at Judah')


In [15]:
unigram_train = np.zeros((3,))
unigram_dev = np.zeros((3,))
unigram_test = np.zeros((3,))

for i,dname in enumerate(dnames):
    unigram_train[i] = unigram_models[i].perplexity(datas[i].train)
    unigram_dev[i] = unigram_models[i].perplexity(datas[i].dev)
    unigram_test[i] = unigram_models[i].perplexity(datas[i].test)

In [16]:
unigram = Unigram()
unigram.fit_corpus(data.train)
print("vocab:", len(unigram.vocab()))
# evaluate on train, test, and dev
print("train:", unigram.perplexity(data.train))
print("dev  :", unigram.perplexity(data.dev))
print("test :", unigram.perplexity(data.test))
sampler = Sampler(unigram)
print("sample: ", " ".join(str(x) for x in sampler.sample_sentence([])))

('vocab:', 43736)
('train:', 981.368830109398)
('dev  :', 1012.4294581112321)
('test :', 990.082497294399)
('sample: ', 'is Doubtfull and their of shall my meanwhile in at 13 upon THAT your isolated which or securing would found in')


In [17]:
print(unigram_test)

[ 1604.19822047  1501.45588743   990.08249729]
