In [None]:
import pandas as pd;
import hazm as hz;
import math

In [None]:
# Book genre name (as it appears in the data's `categories` column) mapped to
# the index of that genre's word-count dictionary inside `bow`.
categories = {
    genre: index
    for index, genre in enumerate([
        "جامعه‌شناسی",
        "مدیریت و کسب و کار",
        "رمان",
        "کلیات اسلام",
        "داستان کودک و نوجوانان",
        "داستان کوتاه",
    ])
}

In [None]:
# Bag-of-words store: slots 0-5 each hold one {word: count} dict (one per
# category index); slot 6 is the running number of unique words across all of them.
bow = [dict() for _ in range(6)] + [0]

In [None]:
# hazm text-processing helpers shared by the training and classification code:
# the lemmatizer and stemmer are applied to every token, the normalizer to
# every raw description.
lemmatizer = hz.Lemmatizer()
stemmer = hz.Stemmer()
normalizer = hz.Normalizer()
# Additive smoothing constant used by guess_the_category_of_book when scoring
# words that have no count in a category.
alpha = 1

In [None]:
# Load the training books; later cells read its 'categories' and 'description' columns.
train_data = pd.read_csv('books_train.csv')

In [None]:
# Quick look at the summary statistics of the training frame.
train_data.describe()

## Preprocessing functions

In [None]:
def useless_words() -> list:
    """Build the list of tokens that the bag-of-words code ignores.

    Returns:
        A new list made of, in this order: punctuation marks, hazm's stop-word
        list, and the ten Persian digits. The list is rebuilt on every call.
    """
    punctuation = [',', '.', ')', '(', ':', '«', '،', '»' , '؟' , '،' , '؛' , '؟' , 'ـ' , '٪', '!']
    persian_digits = ['۱','۲','۳','۴','۵','۶','۷','۸','۹','۰' ]

    return [*punctuation, *hz.stopwords_list(), *persian_digits]

def create_bow(bow ,category, description):
    """Add one book's description to the word counts of its category.

    Args:
        bow: the bag-of-words store (per-category {word: count} dicts plus the
            unique-word counter); it is mutated in place.
        category: category name of the book; must be a key of `categories`.
        description: raw description text of the book.

    Returns:
        None. The function works purely through its effect on `bow`.

    Raises:
        KeyError: if `category` is not a key of `categories`.
    """
    # normalize() returns the cleaned text rather than modifying its argument
    # (Python strings are immutable), so the result must be kept.
    description = normalizer.normalize(description)
    category_index = categories[category]
    words_list = hz.word_tokenize(description)
    filter_words_of_sentence(bow, words_list, category_index, useless_words())
    
def filter_words_of_sentence(bow:list, words_list: list, category_index: int,extraThings: list):
    """Count the tokens of one text into a category's word-count dictionary.

    Each token is lemmatized and then stemmed; the processed form is the key
    that gets counted. A token is skipped when either its raw form or its
    processed form is in `extraThings`.

    Args:
        bow: the bag-of-words store; `bow[category_index]` receives the counts
            and `bow[6]` is incremented for every word not yet present in any
            category. Mutated in place.
        words_list: tokens of the text.
        category_index: index of the category dictionary inside `bow`.
        extraThings: tokens to ignore (stop words, punctuation, digits).
    """
    # A set gives constant-time membership tests instead of scanning the whole
    # ignore list once per token.
    ignored = set(extraThings)
    category_counts = bow[category_index]

    for raw_word in words_list:
        word = stemmer.stem(lemmatizer.lemmatize(raw_word))

        # guess_the_category_of_book drops tokens whose raw form is in the
        # ignore list, so the raw form is tested here too; otherwise training
        # would count tokens that classification never looks at.
        if raw_word in ignored or word in ignored:
            continue

        if word in category_counts:
            category_counts[word] += 1
            continue

        # First occurrence in this category; it only grows the global
        # vocabulary if no other category has seen it either.
        if is_new_word(word, bow):
            bow[6] += 1
        category_counts[word] = 1

def is_new_word(word: str, bow)->bool:
    """Tell whether `word` is missing from every per-category dictionary.

    Only slots 0-5 of `bow` are inspected.

    Returns:
        True when none of those six dictionaries contains `word`, else False.
    """
    return not any(word in bow[slot] for slot in range(6))
        
def sum_values(dict_cat: dict)->int:
    """Return the sum of all values in `dict_cat` (0 for an empty dict)."""
    # Use the builtin directly instead of a local accumulator that shadowed it.
    return sum(dict_cat.values())

## Classifying function

In [None]:
def guess_the_category_of_book(bow: list, book_words: list, number_of_all_words: int) -> int:
    """Pick the most likely category for a tokenized book description.

    Every category is scored with a multinomial Naive Bayes log-likelihood

        sum over tokens of log10((count + alpha) / (total + alpha * V))

    where `count` is the token's count in that category, `total` is the sum of
    all counts in that category and `V` is `number_of_all_words`. The same
    additive smoothing is applied to seen and unseen words so that the word
    probabilities of a category are comparable. Class priors are not included.

    Args:
        bow: bag-of-words store; `bow[i]` is the {word: count} dict of the
            category with index `i`.
        book_words: tokens of the description to classify.
        number_of_all_words: number of unique words across all categories.

    Returns:
        The index of the best-scoring category. Ties go to the category that
        comes first in `categories`; 0 is returned when no category can be
        scored.
    """
    stop_words = set(useless_words())

    # Lemmatizing/stemming does not depend on the category, so do it once per
    # token here instead of once per (token, category) pair.
    tokens = []
    for raw_word in book_words:
        if raw_word in stop_words:
            continue
        token = stemmer.stem(lemmatizer.lemmatize(raw_word))
        # filter_words_of_sentence never stores a word whose processed form is
        # in the stop list, so scoring it would only add a smoothing penalty.
        if token not in stop_words:
            tokens.append(token)

    guessed_category = 0  # default value
    best_score = float('-inf')

    for category in categories.values():
        word_counts = bow[category]
        denominator = sum_values(word_counts) + alpha * number_of_all_words
        if denominator <= 0:
            # No counts and an empty vocabulary: no probability can be formed
            # for this category, so leave it out of the comparison.
            continue

        score = 0.0
        for token in tokens:
            score += math.log10((word_counts.get(token, 0) + alpha) / denominator)

        if best_score < score:
            best_score = score
            guessed_category = category

    return guessed_category

### Evaluate model

In [None]:
def judge(bow: list, description, category, number_of_all_words: int):
    """Classify one description and report whether the guess was correct.

    Args:
        bow: the trained bag-of-words store.
        description: raw description text of the book.
        category: the book's labelled category name; must be a key of
            `categories`.
        number_of_all_words: number of unique words across all categories.

    Returns:
        1 if the guessed category index equals the labelled one, otherwise 0.

    Raises:
        KeyError: if `category` is not a key of `categories`.
    """
    # normalize() returns the cleaned text rather than modifying its argument,
    # so the result must be kept for the tokenizer to see it.
    description = normalizer.normalize(description)
    book_words = hz.word_tokenize(description)
    guessed_category = guess_the_category_of_book(bow, book_words, number_of_all_words)
    book_category = categories[category]
    return 1 if guessed_category == book_category else 0



def calculate_accuracy(test_data):
    """Return the fraction of rows whose `guess_status` equals 1.

    Args:
        test_data: a frame with a `guess_status` column.

    Returns:
        Number of rows with `guess_status == 1` divided by the total number of
        rows in `test_data`.
    """
    hits = sum(1 for status in test_data['guess_status'] if status == 1)
    return hits / len(test_data.index)

In [None]:
# Train: fold every (category, description) pair of the training set into `bow`.
# create_bow returns None and works only through its effect on `bow`, so nothing
# is assigned back and `train_data` stays the loaded DataFrame.
# A plain loop is used because the call is made for its side effect, not for a
# per-row result. Note that re-running this cell adds the same counts again.
for book_category, book_description in zip(train_data['categories'], train_data['description']):
    create_bow(bow, book_category, book_description)

In [None]:
# Load the test books and mark each row with 1 when the classifier's guess
# matches the labelled category and 0 otherwise; bow[6] is the unique-word count.
test_data = pd.read_csv('books_test.csv')
test_data['guess_status'] = test_data.apply(
    lambda book: judge(bow, book['description'], book['categories'], bow[6]),
    axis=1,
)

In [None]:
# Report the share of test rows whose guess_status is 1.
print(calculate_accuracy(test_data))