In [10]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [3]:
input_files = ['edgar_allan_poe.txt','robert_frost.txt']

In [2]:
lines = []
labels = []

In [13]:
for label,f in enumerate(input_files):
    for line in open(f):
        line = line.rstrip().lower()
        if line:
          # remove punctuation
          line = line.translate(str.maketrans('', '', string.punctuation))
    
          lines.append(line)
          labels.append(label)

In [18]:
X_train,X_test,y_train,y_test = train_test_split(lines,labels)

In [20]:
word2idx = {'<unk>':0}
idx = 1

In [21]:
for text in X_train:
    tokens = text.split()
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = idx
            idx+=1
        

In [22]:
word2idx

{'<unk>': 0,
 'in': 1,
 'every': 2,
 'lifelike': 3,
 'posture': 4,
 'of': 5,
 'the': 6,
 'swarm': 7,
 'this': 8,
 'eden': 9,
 'day': 10,
 'is': 11,
 'done': 12,
 'at': 13,
 'two': 14,
 'oclock': 15,
 'was': 16,
 'only': 17,
 'adding': 18,
 'frost': 19,
 'to': 20,
 'snow': 21,
 'that': 22,
 'had': 23,
 'me': 24,
 'by': 25,
 'coat': 26,
 'as': 27,
 'good': 28,
 'seated': 29,
 'but': 30,
 'you': 31,
 'meddle': 32,
 'with': 33,
 'my': 34,
 'fate': 35,
 'and': 36,
 'tell': 37,
 'where': 38,
 'youre': 39,
 'off': 40,
 'forâ€”montreal': 41,
 'ive': 42,
 'kept': 43,
 'brown': 44,
 'standing': 45,
 'cold': 46,
 'a': 47,
 'leak': 48,
 'emptied': 49,
 'then': 50,
 'forty': 51,
 'years': 52,
 'not': 53,
 'even': 54,
 'one': 55,
 'lonely': 56,
 'rose': 57,
 'these': 58,
 'cheeks': 59,
 'worm': 60,
 'never': 61,
 'dies': 62,
 'cased': 63,
 'world': 64,
 'he': 65,
 'gone': 66,
 'smiled': 67,
 'died': 68,
 'parterre': 69,
 'enchanted': 70,
 'call': 71,
 'it': 72,
 'ill': 73,
 'what': 74,
 'show': 75,


In [26]:
# convert data into integer format
train_text_int = []
test_text_int = []

for text in X_train:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in X_test:
  tokens = text.split()
  line_as_int = [word2idx.get(token, 0) for token in tokens]
  test_text_int.append(line_as_int)

In [29]:
V = len(word2idx)

#state transition matrix ->The A matrix represents the probabilities of transitioning from one state to another in a Markov process.
A0 = np.ones((V,V))
pi0 = np.ones(V)

A1 = np.ones((V,V))
pi1 = np.ones(V)

In [32]:
# we are calculating the state transition matrix and and initial distribution matrix for each sample class. 
def compute_counts(text_as_int,A,pi):
    for tokens in text_as_int:
        last_idx = None
        for idx in tokens:
            # its the first word in a sentence
            if last_idx is not None:
                pi[idx]+=1
            else:
                A[last_idx,idx]+=1
        last_idx = 1

compute_counts([t for t, y in zip(train_text_int, y_train) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, y_train) if y == 1], A1, pi1)

In [33]:
# normalize A and pi so they are valid probability matrices (note that we've initialized the matrices to 1)
# taking sum row wise
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [34]:
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [37]:
count0 = sum(y == 0 for y in y_train)
count1 = sum(y == 1 for y in y_train)
total = len(y_train)

p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.335747392815759, 0.6642526071842411)

In [39]:
# build a classifier
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions

In [40]:
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [42]:
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == y_train)}")

Train acc: 0.6642526071842411


In [44]:
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == y_test)}")

Test acc: 0.6701388888888888
