In [3]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [4]:
# One plain-text file per author. The order matters: the loading loop
# enumerates this list, so a file's position is used as its class label.
input_files = ['edgar_allan_poe.txt', 'robert_frost.txt']

In [5]:
# Parallel lists: input_texts[i] is one cleaned line of poetry and
# labels[i] is the index (in input_files) of the file it came from.
input_texts = []
labels = []

# Build the punctuation-removal table once; it does not depend on the line.
strip_punctuation = str.maketrans('', '', string.punctuation)

for label, f in enumerate(input_files):
    print(f"{f} correspond to label {label}")
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(f) as fh:
        for line in fh:
            # Drop trailing whitespace/newline and normalise case.
            line = line.rstrip().lower()
            # Skip blank lines (stanza separators and the like).
            if line:
                line = line.translate(strip_punctuation)

                # NOTE(review): a line made only of punctuation becomes ''
                # here and is still appended; kept to preserve the original
                # dataset size.
                input_texts.append(line)
                labels.append(label)

edgar_allan_poe.txt correspond to label 0
robert_frost.txt correspond to label 1


In [6]:
# Hold out part of the corpus for evaluation (train_test_split's default
# test fraction is used). The seed is pinned so that Restart & Run All
# reproduces the same split, vocabulary and metrics on every run.
# NOTE(review): outputs captured before the seed was added came from an
# unseeded split and will differ slightly after re-execution.
train_text, test_text, Ytrain, Ytest = train_test_split(
    input_texts, labels, random_state=42
)

In [7]:
# Number of lines in the training split and in the test split.
len(Ytrain), len(Ytest)

(1618, 540)

In [8]:
# Peek at the first five training lines (already lower-cased, punctuation removed).
train_text[:5]

['yet does perhaps for all three',
 'half looking for the orchid calypso',
 'its hands of gold',
 'i wanted to put out the light and see',
 'has always been too much for me it has']

In [9]:
# Class labels of the first five training lines (same order as train_text).
Ytrain[:5]

[1, 1, 1, 1, 1]

In [10]:
# Index 0 is reserved for unknown (out-of-vocabulary) tokens, so real words
# are numbered starting from the current size of the mapping, i.e. 1.
word2idx = {'<unk>': 0}
idx = len(word2idx)

In [11]:
# Build the vocabulary from the training split only: each previously unseen
# token gets the next free integer index, in order of first appearance.
for text in train_text:
    tokens = text.split()
    for token in tokens:
        if token in word2idx:
            continue  # already has an index
        word2idx[token] = idx
        idx += 1

In [12]:
# Display the token -> index mapping built from the training lines.
# NOTE(review): this dumps every vocabulary entry; consider showing only a slice.
word2idx

{'<unk>': 0,
 'yet': 1,
 'does': 2,
 'perhaps': 3,
 'for': 4,
 'all': 5,
 'three': 6,
 'half': 7,
 'looking': 8,
 'the': 9,
 'orchid': 10,
 'calypso': 11,
 'its': 12,
 'hands': 13,
 'of': 14,
 'gold': 15,
 'i': 16,
 'wanted': 17,
 'to': 18,
 'put': 19,
 'out': 20,
 'light': 21,
 'and': 22,
 'see': 23,
 'has': 24,
 'always': 25,
 'been': 26,
 'too': 27,
 'much': 28,
 'me': 29,
 'it': 30,
 'she': 31,
 'rolls': 32,
 'through': 33,
 'an': 34,
 'ether': 35,
 'sighs': 36,
 'he': 37,
 'thinks': 38,
 'dark': 39,
 'flooded': 40,
 'with': 41,
 'daylight': 42,
 'how': 43,
 'fairylike': 44,
 'a': 45,
 'melody': 46,
 'there': 47,
 'floats': 48,
 'but': 49,
 'please': 50,
 'in': 51,
 'kitchen': 52,
 'chimney': 53,
 'sos': 54,
 'take': 55,
 'any': 56,
 'comfort': 57,
 'can': 58,
 'sew': 59,
 'mountain': 60,
 'may': 61,
 'have': 62,
 'shifted': 63,
 'since': 64,
 'saw': 65,
 'come': 66,
 'past': 67,
 'stars': 68,
 'lion': 69,
 'kept': 70,
 'hidden': 71,
 'instep': 72,
 'arch': 73,
 'or': 74,
 'like': 

In [13]:
# Vocabulary size, counting the reserved '<unk>' entry.
len(word2idx)

2506

In [14]:
# Turn every line into a list of vocabulary indices.
# Training tokens are all in word2idx by construction, so a plain lookup is used.
train_text_int = [
    [word2idx[token] for token in text.split()]
    for text in train_text
]

# Test lines may contain words never seen in training; those map to 0 ('<unk>').
test_text_int = [
    [word2idx.get(token, 0) for token in text.split()]
    for text in test_text
]

In [15]:
# Spot-check a handful of integer-encoded training lines.
train_text_int[100:105]

[[394, 22, 395, 22, 396, 397, 398],
 [150, 14, 9, 399, 400, 401, 18, 91, 402],
 [360, 84, 61, 403, 18, 45, 404],
 [405, 406, 100, 9, 60, 407, 257],
 [408, 409, 188, 410, 51, 9, 411]]

In [16]:
V = len(word2idx)

# One (transition matrix, initial-token vector) pair per class.
# Counts start at one rather than zero so that, once normalised, transitions
# and first tokens never observed in training keep a non-zero probability.
A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [17]:
def compute_counts(text_as_int, A, pi):
    """Accumulate first-token and token-to-token counts in place.

    Parameters
    ----------
    text_as_int : iterable of sequences of int
        Each sequence is one line encoded as vocabulary indices.
    A : 2-D array
        A[i, j] is incremented for every occurrence of index j directly
        after index i within a sequence.
    pi : 1-D array
        pi[i] is incremented for every sequence whose first index is i.

    Returns
    -------
    None
        ``A`` and ``pi`` are modified in place.
    """
    for sentence in text_as_int:
        if len(sentence) == 0:
            continue  # nothing to count for an empty line
        # The first token only contributes to the initial-state counts.
        pi[sentence[0]] += 1
        # Every adjacent (previous, current) pair contributes one transition.
        for previous, current in zip(sentence, sentence[1:]):
            A[previous, current] += 1
# Split the encoded training lines by class label, then accumulate each
# class's counts into its own transition matrix / initial-token vector.
class0_lines = [t for t, y in zip(train_text_int, Ytrain) if y == 0]
class1_lines = [t for t, y in zip(train_text_int, Ytrain) if y == 1]

compute_counts(class0_lines, A0, pi0)
compute_counts(class1_lines, A1, pi1)

In [18]:
# Convert counts to probabilities in place: each row of a transition matrix
# is divided by its row sum, each initial-token vector by its total.
for transition_counts in (A0, A1):
    transition_counts /= transition_counts.sum(axis=1, keepdims=True)
for initial_counts in (pi0, pi1):
    initial_counts /= initial_counts.sum()

# Work in log space so that sequence probabilities become sums of logs.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)

In [19]:
# Class priors: the fraction of training lines carrying each label.
total = len(Ytrain)
count0 = sum(1 for y in Ytrain if y == 0)
count1 = sum(1 for y in Ytrain if y == 1)

p0, p1 = count0 / total, count1 / total
logp0, logp1 = np.log(p0), np.log(p1)
p0, p1

(0.3362175525339926, 0.6637824474660075)

In [26]:
class Classifier:
    """Pick the class whose Markov model best explains a token sequence.

    Each class ``c`` is described by ``logAs[c]`` (log transition matrix),
    ``logpis[c]`` (log initial-token vector) and ``logpriors[c]`` (log prior).
    """

    def __init__(self, logAs, logpis, logpriors):
        """Store the per-class log-parameters; K is the number of classes."""
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors)

    def _compute_log_likelihood(self, input_, class_):
        """Return the log-likelihood of ``input_`` under class ``class_``.

        The first index is scored with the class's initial-token log
        probabilities and every later index with the log transition
        probability from its predecessor. An empty sequence scores 0.
        """
        log_transitions = self.logAs[class_]
        log_initial = self.logpis[class_]

        score = 0
        previous = None
        for current in input_:
            # Only the very first token has no predecessor.
            score += (
                log_initial[current]
                if previous is None
                else log_transitions[previous, current]
            )
            previous = current

        return score

    def predict(self, inputs):
        """Return an array with the predicted class index for each sequence.

        For every sequence the log-likelihood plus log-prior is computed for
        each class and the arg-max is taken. The result is a float array of
        length ``len(inputs)``, as produced by ``np.zeros``.
        """
        predictions = np.zeros(len(inputs))
        for position, sequence in enumerate(inputs):
            class_scores = [
                self._compute_log_likelihood(sequence, c) + self.logpriors[c]
                for c in range(self.K)
            ]
            predictions[position] = np.argmax(class_scores)
        return predictions

In [27]:
# List position is the class index: entry 0 holds the label-0 parameters,
# entry 1 the label-1 parameters.
clf = Classifier(
    logAs=[logA0, logA1],
    logpis=[logpi0, logpi1],
    logpriors=[logp0, logp1],
)

In [28]:
# Predict on the training lines and report the fraction predicted correctly.
Ptrain = clf.predict(train_text_int)
# Ptrain is a NumPy array, so the comparison with the Ytrain list is element-wise.
print(f"Train acc: {np.mean(Ptrain==Ytrain)}")

Train acc: 0.992583436341162


In [30]:
# Predict on the held-out lines and report the fraction predicted correctly.
Ptest = clf.predict(test_text_int)
# Ptest is a NumPy array, so the comparison with the Ytest list is element-wise.
print(f"Test acc: {np.mean(Ptest==Ytest)}")

Test acc: 0.8407407407407408


In [35]:
from sklearn.metrics import confusion_matrix, f1_score

In [37]:
# Confusion matrix on the training split.
# Per scikit-learn's convention, rows are true labels and columns are predictions.
cm = confusion_matrix(Ytrain, Ptrain)
cm

array([[ 532,   12],
       [   0, 1074]], dtype=int64)

In [38]:
# Confusion matrix on the test split (rows: true labels, columns: predictions).
# NOTE(review): this rebinds `cm`, replacing the training-split matrix.
cm = confusion_matrix(Ytest, Ptest)
cm

array([[103,  75],
       [ 11, 351]], dtype=int64)

In [39]:
# F1 score on the training split; with f1_score's defaults, label 1 is the positive class.
f1_score(Ytrain, Ptrain)

0.9944444444444445

In [40]:
# F1 score on the test split; with f1_score's defaults, label 1 is the positive class.
f1_score(Ytest, Ptest)

0.8908629441624365