In [69]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import nltk
# Fetch the NLTK data packages this notebook requests. The captured output
# below shows both were already installed when the notebook was last run.
# NOTE(review): presumably 'punkt' backs word_tokenize — confirm against the
# installed NLTK version; 'perluniprops' is not referenced by any visible cell.
nltk.download('punkt')
nltk.download('perluniprops')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


In [70]:
# Read the stemmed documents. Per the displayed frame, the text lives in the
# column named '0', next to two leftover 'Unnamed' index columns.
data = pd.read_csv(
    "X_data_stemmed.csv",
    encoding='utf-8-sig',
)
data

Unnamed: 0.1,Unnamed: 0,0
0,0,آرتمیس دازده ساله با کمک محافظ شخص فق ماهر بات...
1,1,کرده بد در مسیر ماجخط ساحل تاالب میراند جاییکه...
2,2,به رابطه دستان که آدامسن دا شر میکرد دامن نزد ...
3,3,بد زن پنجاه پنج کیل اگر چیز را پنجبار لمس می...
4,4,مال چند سال پیشه فل با تیک صب یک از سم را زمین...
...,...,...
1376,1376,نارنیا پاسخ داده خاهد شد انتقا تیسراک بس سخ...
1377,1377,خندیدن کردند البته نم تانستند جل خد را بگیرند ...
1378,1378,تجیز کرد فراه ساز همه گنه سایل راحت که ض کنن ا...
1379,1379,اسـبپـسـرکا زندگ پیشتاز خدا خدا م کردند دباره ...


In [71]:
# Read the label table. Per the displayed frame it carries an 'Author' name
# and a numeric 'Author_ID' for every row.
y_data = pd.read_csv(
    "y_data.csv",
    encoding='utf-8-sig',
)
y_data

Unnamed: 0.1,Unnamed: 0,Author,Author_ID
0,0,Artemis Fowl,1
1,1,Artemis Fowl,1
2,2,Artemis Fowl,1
3,3,Artemis Fowl,1
4,4,Artemis Fowl,1
...,...,...,...
1376,1376,c.s.lewis,10
1377,1377,c.s.lewis,10
1378,1378,c.s.lewis,10
1379,1379,c.s.lewis,10


In [72]:
# Input documents, taken from the column named '0'.
texts = [document for document in data['0'].values]
# Class labels: Author_ID minus one.
labels = [author_id for author_id in (y_data['Author_ID'] - 1).values]

In [73]:
# Hold out 20% of the documents for evaluation, stratified on Author_ID,
# with a fixed random_state so the split is repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=y_data['Author_ID'],
    random_state=42,
)

In [74]:
# Number of distinct label values present in the data.
n_classes = np.unique(labels).size

In [75]:
class TrieNode:
    """One node of a character trie; a term is spelled by the path from the root."""

    def __init__(self):
        # Child nodes keyed by the next character of the term.
        self.children = {}
        # True once some inserted term ends exactly at this node.
        self.is_end_of_term = False
        # How many times the term ending at this node has been inserted.
        self.tf = 0
        # Document IDs that were passed to Trie.insert for this term.
        self.postinglist = set()


class Trie:
    """Character trie that stores, per term, an insertion count and a posting list."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, term, docID):
        """Record one occurrence of ``term`` in document ``docID``.

        Missing nodes along the term's path are created on demand. Each call
        increments the term's count by one and adds ``docID`` to its posting
        list.
        """
        node = self.root
        for char in term:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_term = True
        node.tf += 1
        node.postinglist.add(docID)

    def search(self, term):
        """Look up ``term`` and return ``(count, posting_list)``.

        Returns the stored count and posting-list set when ``term`` was
        inserted as a complete term. Returns ``(0, set())`` both when the path
        does not exist and when ``term`` is only a prefix of stored terms, so
        the second element is always a set.
        """
        node = self.root
        for char in term:
            node = node.children.get(char)
            if node is None:
                # The path breaks off: the term was never inserted.
                return 0, set()
        if node.is_end_of_term:
            return node.tf, node.postinglist
        # The path exists only as a prefix of longer terms.
        return 0, set()

In [76]:
# One independent trie per class, indexed by the 0-based class label.
class_inverted_indexes = [Trie() for _ in range(n_classes)]

In [77]:
# Running total of training tokens seen per class, indexed by class label.
class_TC = [0] * n_classes


In [78]:
# Set of distinct tokens; filled by the training loop in a later cell.
terms = set()



In [79]:
# Build the per-class statistics from the training split.
# NOTE: this cell mutates class_inverted_indexes, class_TC and terms in place,
# so re-create those three objects before re-running it, otherwise every count
# is doubled.
for doc_id, (text, label) in enumerate(zip(train_texts, train_labels)):
    for token in word_tokenize(text):
        # Record the document's position in the training split as its ID, so
        # each posting list identifies the documents containing the term.
        class_inverted_indexes[label].insert(token, doc_id)
        class_TC[label] += 1
        terms.add(token)
    


In [80]:

# Vocabulary size: the number of distinct tokens collected in `terms`.
V = len(terms)

In [81]:
def predict(document, class_inverted_indexes, class_TC, n_classes, V):
    """Return the index of the class with the highest log-score for ``document``.

    The document is tokenized with ``word_tokenize``. For every token and every
    class ``c`` in ``range(n_classes)``, the class score is increased by
    ``log2((count + 1) / (class_TC[c] + V))``, where ``count`` is the first
    value returned by ``class_inverted_indexes[c].search(token)``. No
    class-prior term is added.

    Parameters
    ----------
    document : text to classify.
    class_inverted_indexes : indexable collection of per-class objects
        exposing ``search(token)``.
    class_TC : indexable collection of per-class totals used in the
        denominator.
    n_classes : number of classes to score.
    V : value added to each class total in the denominator.

    Returns
    -------
    The result of ``np.argmax`` over the per-class scores.
    """
    doc_tokens = word_tokenize(document)
    scores = np.zeros(n_classes)
    for word in doc_tokens:
        for class_id in range(n_classes):
            # Only the count is needed here; the posting list is ignored.
            term_count = class_inverted_indexes[class_id].search(word)[0]
            smoothed_ratio = (term_count + 1) / (class_TC[class_id] + V)
            scores[class_id] += np.log2(smoothed_ratio)
    return np.argmax(scores)

In [82]:
# Classify every held-out document once, keeping the prediction and printing
# it next to the true label.
predictions = []
for text, true_label in zip(test_texts, test_labels):
    predicted_label = predict(text, class_inverted_indexes, class_TC, n_classes, V)
    predictions.append(predicted_label)
    print(predicted_label, true_label)


5 5
9 9
2 2
1 1
4 4
7 7
0 0
6 6
6 6
9 9
2 2
5 5
4 4
3 3
0 0
7 7
5 5
1 1
1 1
4 4
2 2
9 9
9 9
2 2
2 2
4 8
1 1
5 5
7 7
9 9
4 4
3 3
2 2
7 7
4 4
3 3
2 2
0 0
4 4
5 5
5 5
5 5
4 4
3 3
3 3
0 0
5 5
3 3
2 2
6 6
1 1
9 9
2 2
1 1
5 5
1 1
5 5
4 4
2 2
7 7
7 7
6 6
1 1
7 7
4 8
2 2
5 5
5 5
2 2
4 4
2 2
7 7
2 2
3 3
2 2
0 0
4 4
4 4
7 7
9 9
9 9
2 2
9 9
2 2
5 5
3 3
2 2
2 2
9 9
4 4
1 1
2 2
2 2
3 3
9 9
8 8
2 2
0 0
3 3
2 2
1 1
0 0
2 2
0 0
7 7
9 9
4 4
0 0
5 5
9 9
2 2
2 2
2 2
4 4
3 3
0 0
2 2
0 0
1 1
1 1
0 0
5 5
2 8
5 5
1 1
2 2
2 2
4 4
5 5
5 5
8 8
2 2
5 5
5 5
5 5
2 2
4 4
2 2
5 5
3 3
4 4
0 0
5 5
3 3
2 2
6 6
2 2
6 6
9 9
3 3
0 0
8 8
3 3
5 5
0 0
0 0
0 0
8 8
0 0
2 2
9 9
1 1
2 2
5 5
6 6
1 1
0 0
5 5
8 8
6 6
0 0
2 2
7 7
2 2
4 4
2 2
4 4
4 4
8 8
1 1
8 8
7 7
5 5
4 4
5 5
0 0
5 5
0 0
1 1
9 9
5 5
7 7
9 9
3 3
3 3
5 5
5 5
6 6
5 5
4 4
0 0
7 7
6 6
7 7
2 2
8 8
5 5
5 5
4 4
5 5
9 9
0 0
7 7
0 0
3 3
7 7
0 0
4 4
2 2
4 4
5 5
0 0
3 3
0 0
2 2
4 4
4 4
4 4
0 0
3 3
5 5
3 3
3 3
6 6
5 5
2 2
2 2
2 2
3 3
1 1
1 1
5 5
5 5
5 5
5 5
2 2
4 4
2 2
4 4
4 6


In [83]:
# Per-class precision / recall / F1 of the predictions on the held-out split.
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        22
           2       0.98      1.00      0.99        52
           3       1.00      1.00      1.00        26
           4       0.92      1.00      0.96        35
           5       1.00      1.00      1.00        46
           6       1.00      0.93      0.97        15
           7       1.00      1.00      1.00        18
           8       1.00      0.73      0.84        11
           9       1.00      1.00      1.00        20

    accuracy                           0.99       277
   macro avg       0.99      0.97      0.98       277
weighted avg       0.99      0.99      0.98       277

