## Markov-chain-based classifier

In [None]:
import string
import numpy as np
import pandas as pd
import itertools

### Markov model implementation

In [1]:
class Model():
    def __init__(self, name, string, alphabet, state_len):
        self.name = name
        self.string = string
        self.alphabet = np.asarray(list(alphabet))
        self.size = len(self.alphabet)
        self.Pi = []
        self.permutations = []
        permutations = np.asarray(list(itertools.permutations(self.alphabet, 2)), dtype='str')
        for i in permutations:
            self.permutations.append(str(i[0])+str(i[1]))
        self.permutations = np.asarray(self.permutations)
        self.A, self.pi = self.freq(state_len)
        
    def freq(self, pi_len):
        A = np.zeros((self.size, self.size))
        if pi_len == 1:
            pi = np.zeros(self.size)
            for ind in range(len(self.string[:-1])):
                if ind == 0:
                    k = np.where(self.alphabet == self.string[ind])[0]
                    pi[k] += 1
                elif self.string[ind] == " ":
                    k = np.where(self.alphabet == self.string[ind+1])[0]
                    pi[k] += 1
                else:
                    if self.string[ind+1] != " ":
                        i = np.where(self.alphabet == self.string[ind])[0]
                        j = np.where(self.alphabet == self.string[ind+1])[0]
                        A[i, j] += 1
            sums = [sum(i) for i in A]
            A = np.log(self.laplace_correction((A.T / sums).T))
            pi = np.log(self.laplace_correction(pi / sum(pi)))
            return A, pi
        else:
            pi = np.zeros(self.size**2)
            for ind in range(len(self.string[:-2])):
                if ind == 0:
                    if self.string[ind] != " " and self.string[ind+1] != " ":
                        permutation = "".join([self.string[ind], self.string[ind+1]])
                        k = np.where(self.permutations == permutation)[0]
                        pi[k] += 1
                elif self.string[ind] == " ":
                    if self.string[ind+1] != " " and self.string[ind+2] != " ":
                        permutation = "".join([self.string[ind+1], self.string[ind+2]])
                        k = np.where(self.permutations == permutation)[0]
                        pi[k] += 1
                else:
                    if self.string[ind+1] != " ":
                        i = np.where(self.alphabet == self.string[ind])[0]
                        j = np.where(self.alphabet == self.string[ind+1])[0]
                        A[i, j] += 1
            sums = [sum(i) for i in A]
            A = np.log(self.laplace_correction((A.T / sums).T))
            pi = np.log(self.laplace_correction(pi / sum(pi)))
            return A, pi

    def laplace_correction(self, matrix):
        matrix = matrix + 0.0001
        norm_matrix = matrix / np.linalg.norm(matrix)
        return norm_matrix
    
    def probability_of_word(self, word, state_len):
        P = 0
        for ind in range(len(word[:-1])):
            if ind == 0:
                if state_len == 1:
                    k = np.where(self.alphabet == word[ind])[0]
                    P += self.pi[k]
                else:
                    if self.string[ind] != " " and self.string[ind+1] != " ":
                        permutation = "".join([self.string[ind], self.string[ind+1]])
                        k = np.where(self.permutations == permutation)[0]
                        P += self.pi[k]
            else:
                i = np.where(self.alphabet == word[ind])[0]
                j = np.where(self.alphabet == word[ind+1])[0]
                P += self.A[i, j]
        return P
    
    def probability_of_sequence(self, sequence, state_len):
        for word in sequence.split(" "):
            self.Pi.append(self.probability_of_word(word, state_len))
        return np.sum(self.Pi)

### Classification based on markov model

In [2]:
class MarkovClassifier():
    def __init__(self, alphabet, state_len):
        self.alphabet = alphabet
        self.models_names = None
        self.models = []
        self.idx_pred = None
        self.state_len = state_len
        
    def fit(self, titles, train_datasets):
        self.models_names = titles
        for title, X_train in zip(titles, train_datasets):
            model = Model(title, X_train, self.alphabet, self.state_len)
            self.models.append(model)
            
    def predict(self, X_test):
        P = []
        for mi in self.models:
            P.append(mi.probability_of_sequence(X_test, self.state_len))
        self.idx_pred = np.argmax(P)
        return self.models_names[self.idx_pred]
    
    def score(self):
        M = [mi.Pi for mi in self.models]
        M = np.asarray(M, dtype="float").T
        min_idx = np.argmax(M, axis=1)
        unique, counts = np.unique(min_idx, return_counts=True)
        idx = np.where(unique == self.idx_pred)[0]
        if len(counts) != len(self.models_names):
            counts_ = []
            for i in range(len(self.models_names)):
                if i in unique:
                    counts_.append(int(counts[np.where(i == unique)]))
                else:
                    counts_.append(0)
            return (counts[idx] / np.sum(counts))[0], counts_
        return (counts[idx] / np.sum(counts))[0], counts      

### Loading datasets

In [3]:
alphabet = "abcdefghijklmnopqrstuvwxyz"      
titles = ["King_Henry_IV", "Oliver_Twist", "Poor_folk", "Nicholas_Nickleby"]
train_datasets = []
test_datasets = []
for title in titles:
    sample = ""
    with open(title+".txt") as f:
        line = f.readline()
        while line:
            line = f.readline()
            line = line.replace("\n", " ")
            sample += line.translate(str.maketrans("", "", string.punctuation)).lower()
    idx = int(len(sample) * 0.7)
    X_train = sample[:idx]
    X_test = sample[idx:]
    train_datasets.append(X_train)
    test_datasets.append(X_test)

### Applying the classifier
(state treated as a single character)

In [8]:
clf = MarkovClassifier(alphabet, 1)
clf.fit(titles, train_datasets)
score = []
confusion_matrix = []
for i, title in enumerate(titles):
    clf.predict(test_datasets[i])
    score_i, counts = clf.score()
    confusion_matrix.append(counts)
    score.append(int(score_i*100))
    print(f"TEST DATA: {title}, SCORE: {score[i]}%")
print(f"MAIN SCORE: {np.mean(score)}%" )

TEST DATA: King_Henry_IV, SCORE: 63%
TEST DATA: Oliver_Twist, SCORE: 51%
TEST DATA: Poor_folk, SCORE: 48%
TEST DATA: Nicholas_Nickleby, SCORE: 46%
MAIN SCORE: 52.0%


### Confusion matrix

In [5]:
import pandas as pd

df = pd.DataFrame(data=np.asarray(confusion_matrix), columns = titles, index=titles)
df

Unnamed: 0,King_Henry_IV,Oliver_Twist,Poor_folk,Nicholas_Nickleby
King_Henry_IV,2068,363,484,350
Oliver_Twist,2599,912,808,698
Poor_folk,2982,1128,1180,899
Nicholas_Nickleby,3235,1293,1348,1054


(state treated as a pair of characters)

In [7]:
clf = MarkovClassifier(alphabet, 2)
clf.fit(titles, train_datasets)
score = []
confusion_matrix = []
for i, title in enumerate(titles):
    clf.predict(test_datasets[i])
    score_i, counts = clf.score()
    confusion_matrix.append(counts)
    score.append(int(score_i*100))
    print(f"TEST DATA: {title}, SCORE: {score[i]}%")
print(f"MAIN SCORE: {np.mean(score)}%" )

TEST DATA: King_Henry_IV, SCORE: 83%
TEST DATA: Oliver_Twist, SCORE: 73%
TEST DATA: Poor_folk, SCORE: 72%
TEST DATA: Nicholas_Nickleby, SCORE: 71%
MAIN SCORE: 74.75%


### Confusion matrix

In [9]:
df = pd.DataFrame(data=np.asarray(confusion_matrix), columns = titles, index=titles)
df

Unnamed: 0,King_Henry_IV,Oliver_Twist,Poor_folk,Nicholas_Nickleby
King_Henry_IV,2068,363,484,350
Oliver_Twist,2599,912,808,698
Poor_folk,2982,1128,1180,899
Nicholas_Nickleby,3235,1293,1348,1054


In [11]:
sample = ""
with open("Antony_and_Cleopatra.txt") as f:
    line = f.readline()
    while line:
        line = f.readline()
        line = line.replace("\n", " ")
        sample += line.translate(str.maketrans("", "", string.punctuation)).lower()
idx = int(len(sample) * 0.7)
clf.predict(sample)
print(f"TEST DATA: {title}, SCORE: {int(clf.score()[0]*100)}%")
# Christmas_Carol.txt
# Antony_and_Cleopatra.txt

TEST DATA: Nicholas_Nickleby, SCORE: 46%
