<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl><font face="B Mitra" size=6 color="#2b580c" >
توابع مشترک
</font></div>

In [1]:
from nltk import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from string import punctuation
from math import log10


def tokenize(docs):
    tokenized_docs = []
    for doc in docs:
        tokenized_doc = {'category': doc['category'] if 'category' in doc.keys() else 0}
        all_tokens = []
        title_tokens = doc['title'].lower().translate(str.maketrans(' ', ' ', punctuation)).split(' ')
        body_tokens = doc['body'].lower().translate(str.maketrans(' ', ' ', punctuation)).split(' ')
        for token in title_tokens:
            if token != '':
                all_tokens.append(token)
        for token in body_tokens:
            if token != '':
                all_tokens.append(token)
        tokenized_doc['tokens'] = all_tokens
        tokenized_docs.append(tokenized_doc)
    return tokenized_docs


def stemming(docs):
    stemmer = PorterStemmer()
    stemmed_docs = []
    for doc in docs:
        stemmed_doc = {'category': doc['category']}
        all_tokens = []
        for token in doc['tokens']:
            all_tokens.append(stemmer.stem(token))
        stemmed_doc['tokens'] = all_tokens
        stemmed_docs.append(stemmed_doc)
    return stemmed_docs


def lemmatization(docs):
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for doc in docs:
        lemmatized_doc = {'category': doc['category']}
        all_tokens = []
        for token in doc['tokens']:
            all_tokens.append(lemmatizer.lemmatize(token))
        lemmatized_doc['tokens'] = all_tokens
        lemmatized_docs.append(lemmatized_doc)
    return lemmatized_docs


def stop_words_removal(docs):
    stop_words = set(stopwords.words('english'))
    new_docs = []
    for doc in docs:
        new_doc = {'category': doc['category']}
        all_tokens = []
        for token in doc['tokens']:
            if token not in stop_words:
                all_tokens.append(token)
        new_doc['tokens'] = all_tokens
        new_docs.append(new_doc)
    return new_docs


def tf_idf(docs):
    vocabulary = {}
    tf = [{} for _ in range(len(docs))]
    idf = {}
    number_of_terms = 0
    for i in range(len(docs)):
        for term in docs[i]['tokens']:
            if term not in tf[i].keys():
                tf[i][term] = 0
            tf[i][term] += 1
            if term not in vocabulary.keys():
                vocabulary[term] = number_of_terms
                number_of_terms += 1
    for term in vocabulary.keys():
        df = 0
        for i in range(len(docs)):
            if term in tf[i].keys():
                df += 1
        idf[term] = log10(len(docs) / df) if df != 0 else 0
    docs_tf_idf_matrix = [[0 for __ in range(len(vocabulary.keys()))] for _ in range(len(docs))]
    for i in range(len(docs)):
        for term in tf[i].keys():
            docs_tf_idf_matrix[i][vocabulary[term]] = tf[i][term] * idf[term]
    return vocabulary, idf, docs_tf_idf_matrix


def evaluation(results):
    precision = [0, 0, 0, 0, 0]
    recall = [0, 0, 0, 0, 0]
    fi = [0, 0, 0, 0, 0]
    confusion_matrix = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
    for result in results:
        confusion_matrix[result[0]][result[1]] += 1
    for i in range(1, 5):
        tp, fp, fn = 0, 0, 0
        for result in results:
            if result[0] == i and result[1] != i:
                fn += 1
            if result[0] == i and result[1] == i:
                tp += 1
            if result[0] != i and result[1] == i:
                fp += 1
        precision[i] = tp / (tp + fp) if tp + fp != 0 else 0
        recall[i] = tp / (tp + fn) if tp + fn != 0 else 0
        fi[i] = 2 * precision[i] * recall[i] / (precision[i] + recall[i]) if precision[i] + recall[i] != 0 else 0
    for i in range(1, 5):
        print('\tCategory {}: \n\t\tPrecision: {:.5f}\n\t\tRecall: {:.5f}'.format(i, precision[i], recall[i]))
    print('\tAccuracy: {:.5f}'.format((confusion_matrix[1][1] + confusion_matrix[2][2] + confusion_matrix[3][3] + confusion_matrix[4][4]) / len(results)))
    print('\tConfusion Matrix: ', end='')
    for i in range(1, 5):
        print('\n\t\t', end='')
        for j in range(1, 5):
            print('{: =3}  '.format(confusion_matrix[i][j]), end="")
    print('\n\tMacro Averaged FI: {:.5f}'.format(sum(fi) / 4))


def apply_text_operation(docs, text_operation):
    if text_operation == 'stemming':
        new_docs = stemming(tokenize(docs))
    elif text_operation == 'lemmatization':
        new_docs = lemmatization(tokenize(docs))
    elif text_operation == 'stop words removal':
        new_docs = stop_words_removal(tokenize(docs))
    else:
        new_docs = tokenize(docs)
    return new_docs


<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl><font face="B Mitra" size=6 color="#2b580c" >
لود داده‌ها
</font></div>

In [2]:
import json

training_data_path = 'train.json'
test_data_path = 'validation.json'

with open(training_data_path) as training_data_file:
    training_data = json.loads(training_data_file.read())

with open(test_data_path) as test_data_file:
    test_data = json.loads(test_data_file.read())

<div style="direction:ltr;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=6 color="#2b580c" >
        <br/>
KNN
            </font>
</div>

In [3]:
import numpy
from copy import deepcopy


vocabulary = {}
train_matrix = []
category_of_docs = []
idf = {}
x2 = numpy.asarray([])


def KNN_train(docs, text_operation):
    new_docs = apply_text_operation(docs, text_operation)
    for i in range(len(docs)):
        category_of_docs.append(docs[i]['category'])
    output = tf_idf(new_docs)
    global vocabulary
    vocabulary = output[0]
    global idf
    idf = output[1]
    global train_matrix
    train_matrix = numpy.asarray(output[2])
    global x2
    x2 = [row @ row for row in train_matrix]


def KNN_classify(doc, text_operation, distance_metric):
    tokens = apply_text_operation([doc], text_operation)[0]['tokens']
    result = []
    test_matrix = [0 for _ in range(len(vocabulary))]
    for term in tokens:
        if term in vocabulary:
            test_matrix[vocabulary[term]] += 1
    test_matrix = numpy.asarray([test_matrix[vocabulary[term]] * idf[term] for term in vocabulary.keys()])
    if distance_metric == 'euclidean distance':
        xy = train_matrix @ test_matrix
        y2 = test_matrix @ test_matrix
        scores = list(x2 - 2 * xy + y2)
        for i in range(5):
            result.append(category_of_docs[scores.index(min(scores))])
            scores[scores.index(min(scores))] = float('inf')
    else:   # distance_metric == 'cosine similarity'
        scores = list(numpy.asarray(train_matrix) @ numpy.asarray(test_matrix))
        for i in range(5):
            result.append(category_of_docs[scores.index(max(scores))])
            scores[scores.index(max(scores))] = float('-inf')
    return result


def KNN_test(docs, mode, distance_metric):
    new_data = deepcopy(docs)
    results = []
    for doc in new_data:
        category = doc.pop('category')
        first_k_categories = KNN_classify(doc, mode, distance_metric)
        results.append((category, first_k_categories))
    return results


def KNN_print_log(k, distance_metric, all_results, text_operation):
    result = []
    for i in range(len(all_results)):
        result.append((all_results[i][0], numpy.bincount(all_results[i][1][:k]).argmax()))
    print('K-Nearest Neighbours:')
    print('\tK: ' + str(k))
    print('\tText Operation: ' + text_operation)
    print('\tDistance Metric: ' + distance_metric)
    evaluation(result)


def KNN_find_best_hyper_parameter(train_docs, test_docs, parameters, distance_metric):
    print("Finding best K for K-Nearest Neighbours:")
    KNN_train(train_docs, 'none')
    all_results = KNN_test(test_docs, 'none', distance_metric)
    for parameter in parameters:
        KNN_print_log(parameter, distance_metric, all_results, 'none')


def KNN_check_text_operations_effects(train_docs, test_docs, distance_metric):
    print("Checking effects of text operations on K-Nearest Neighbours:")
    KNN_train(train_docs, 'stemming')
    all_results = KNN_test(test_docs, 'stemming', distance_metric)
    KNN_print_log(1, distance_metric, all_results, 'stemming')
    KNN_train(train_docs, 'lemmatization')
    all_results = KNN_test(test_docs, 'lemmatization', distance_metric)
    KNN_print_log(1, distance_metric, all_results, 'lemmatization')
    KNN_train(train_docs, 'stop words removal')
    all_results = KNN_test(test_docs, 'stop words removal', distance_metric)
    KNN_print_log(1, distance_metric, all_results, 'stop words removal')


def KNN_run(train_docs, test_docs, k, distance_metric, text_operation):
    KNN_train(train_docs, 'none')
    all_results = KNN_test(test_docs, text_operation, distance_metric)
    KNN_print_log(k, distance_metric, all_results, text_operation)


<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- یافتن بهترین مقدار k به ازای معیار فاصله cosine similarity
    </font>
    <br/>
    <font face="B Mitra" size=1>
* از نصف داده‌ها استفاده‌شده‌است.
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بین مقادیر 1، 3 و 5 برای K، بهترین مقدار 5 است.
    </font>
</div>

In [4]:
KNN_find_best_hyper_parameter(training_data[:int(len(training_data) / 2)], test_data[:int(len(test_data) / 2)], [1, 3, 5], 'cosine similarity')

Finding best K for K-Nearest Neighbours:
K-Nearest Neighbours:
	K: 1
	Text Operation: none
	Distance Metric: cosine similarity
	Category 1: 
		Precision: 0.78030
		Recall: 0.80679
	Category 2: 
		Precision: 0.90423
		Recall: 0.85829
	Category 3: 
		Precision: 0.81143
		Recall: 0.74934
	Category 4: 
		Precision: 0.72180
		Recall: 0.79121
	Accuracy: 0.80133
	Confusion Matrix: 
		309   18   21   35  
		 28  321    3   22  
		 33    8  284   54  
		 26    8   42  288  
	Macro Averaged FI: 0.80201
K-Nearest Neighbours:
	K: 3
	Text Operation: none
	Distance Metric: cosine similarity
	Category 1: 
		Precision: 0.79717
		Recall: 0.88251
	Category 2: 
		Precision: 0.90984
		Recall: 0.89037
	Category 3: 
		Precision: 0.86280
		Recall: 0.74670
	Category 4: 
		Precision: 0.78010
		Recall: 0.81868
	Accuracy: 0.83467
	Confusion Matrix: 
		338   11   13   21  
		 25  333    4   12  
		 33   12  283   51  
		 28   10   28  298  
	Macro Averaged FI: 0.83429
K-Nearest Neighbours:
	K: 5
	Text Operation: 

<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- یافتن بهترین مقدار k به ازای معیار فاصله euclidean distance
    </font>
    <br/>
    <font face="B Mitra" size=1>
* از نصف داده‌ها استفاده‌شده‌است.
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بین مقادیر 1، 3 و 5 برای K، بهترین مقدار 5 است.
    </font>
</div>

In [5]:
KNN_find_best_hyper_parameter(training_data[:int(len(training_data) / 2)], test_data[:int(len(test_data) / 2)], [1, 3, 5], 'euclidean distance')

Finding best K for K-Nearest Neighbours:
K-Nearest Neighbours:
	K: 1
	Text Operation: none
	Distance Metric: euclidean distance
	Category 1: 
		Precision: 0.82609
		Recall: 0.34726
	Category 2: 
		Precision: 0.96154
		Recall: 0.26738
	Category 3: 
		Precision: 0.31873
		Recall: 0.90237
	Category 4: 
		Precision: 0.81481
		Recall: 0.36264
	Accuracy: 0.47133
	Confusion Matrix: 
		133    4  243    3  
		 14  100  259    1  
		 11    0  342   26  
		  3    0  229  132  
	Macro Averaged FI: 0.47009
K-Nearest Neighbours:
	K: 3
	Text Operation: none
	Distance Metric: euclidean distance
	Category 1: 
		Precision: 0.43750
		Recall: 0.85901
	Category 2: 
		Precision: 0.88136
		Recall: 0.41711
	Category 3: 
		Precision: 0.75182
		Recall: 0.54354
	Category 4: 
		Precision: 0.65657
		Recall: 0.53571
	Accuracy: 0.59067
	Confusion Matrix: 
		329    6   19   29  
		183  156   10   25  
		120    5  206   48  
		120   10   39  195  
	Macro Averaged FI: 0.59173
K-Nearest Neighbours:
	K: 5
	Text Operation

<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- بررسی تأثیر روش‌های پردازش متن بر روی دسته‌بندی به ازای معیار فاصله cosine similarity
    </font>
    <br/>
    <font face="B Mitra" size=1>
* از نصف داده‌ها استفاده‌شده‌است.
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بهترین اثر را stop words removal و بدترین اثر را stemming دارد.    </font>
</div>

In [6]:
KNN_check_text_operations_effects(training_data[:int(len(training_data) / 2)], test_data[:int(len(test_data) / 2)], 'cosine similarity')

Checking effects of text operations on K-Nearest Neighbours:
K-Nearest Neighbours:
	K: 1
	Text Operation: stemming
	Distance Metric: cosine similarity
	Category 1: 
		Precision: 0.78880
		Recall: 0.80940
	Category 2: 
		Precision: 0.92486
		Recall: 0.85561
	Category 3: 
		Precision: 0.79887
		Recall: 0.74406
	Category 4: 
		Precision: 0.71569
		Recall: 0.80220
	Accuracy: 0.80267
	Confusion Matrix: 
		310   12   23   38  
		 30  320    6   18  
		 29    8  282   60  
		 24    6   42  292  
	Macro Averaged FI: 0.80371
K-Nearest Neighbours:
	K: 1
	Text Operation: lemmatization
	Distance Metric: cosine similarity
	Category 1: 
		Precision: 0.78987
		Recall: 0.81462
	Category 2: 
		Precision: 0.92090
		Recall: 0.87166
	Category 3: 
		Precision: 0.80000
		Recall: 0.73879
	Category 4: 
		Precision: 0.71820
		Recall: 0.79121
	Accuracy: 0.80400
	Confusion Matrix: 
		312   12   22   37  
		 26  326    4   18  
		 32    9  280   58  
		 25    7   44  288  
	Macro Averaged FI: 0.80469
K-Nearest Ne

<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- بررسی تأثیر روش‌های پردازش متن بر روی دسته‌بندی به ازای معیار فاصله euclidean distance
    </font>
    <br/>
    <font face="B Mitra" size=1>
* از نصف داده‌ها استفاده‌شده‌است.
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بدترین اثر را stop words removal و بهترین اثر را stemming دارد.
</font>
</div>

In [7]:
KNN_check_text_operations_effects(training_data[:int(len(training_data) / 2)], test_data[:int(len(test_data) / 2)], 'euclidean distance')

Checking effects of text operations on K-Nearest Neighbours:
K-Nearest Neighbours:
	K: 1
	Text Operation: stemming
	Distance Metric: euclidean distance
	Category 1: 
		Precision: 0.84167
		Recall: 0.52742
	Category 2: 
		Precision: 0.92903
		Recall: 0.38503
	Category 3: 
		Precision: 0.38318
		Recall: 0.86544
	Category 4: 
		Precision: 0.77912
		Recall: 0.53297
	Accuracy: 0.57867
	Confusion Matrix: 
		202    8  162   11  
		 16  144  206    8  
		 14    1  328   36  
		  8    2  160  194  
	Macro Averaged FI: 0.58926
K-Nearest Neighbours:
	K: 1
	Text Operation: lemmatization
	Distance Metric: euclidean distance
	Category 1: 
		Precision: 0.83920
		Recall: 0.43603
	Category 2: 
		Precision: 0.96581
		Recall: 0.30214
	Category 3: 
		Precision: 0.34295
		Recall: 0.87863
	Category 4: 
		Precision: 0.77934
		Recall: 0.45604
	Accuracy: 0.51933
	Confusion Matrix: 
		167    4  200   12  
		 16  113  243    2  
		 13    0  333   33  
		  3    0  195  166  
	Macro Averaged FI: 0.52572
K-Nearest 

<div style="direction:ltr;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=6 color="#2b580c" >
        <br/>
Naïve Bayes
            </font>
</div>

In [8]:
from math import log10
from copy import deepcopy


vocabulary = set()
probability_of_categories = {1: 0, 2: 0, 3: 0, 4: 0}
probability_of_terms = {}


def NB_preprocess(docs, text_operation):
    new_docs = apply_text_operation(docs, text_operation)
    terms_of_category = {1: {}, 2: {}, 3: {}, 4: {}}
    for doc in new_docs:
        for word in doc['tokens']:
            if word not in terms_of_category[doc['category']]:
                terms_of_category[doc['category']][word] = 0
            terms_of_category[doc['category']][word] += 1
            vocabulary.add(word)
    return terms_of_category


def NB_train(docs, alpha, text_operation):
    terms_of_category = NB_preprocess(docs, text_operation)
    number_of_terms = {}
    for doc in docs:
        probability_of_categories[doc['category']] += 1
    for category in probability_of_categories.keys():
        probability_of_categories[category] /= len(docs)
    for term in vocabulary:
        number_of_terms[term] = {category: 0 for category in probability_of_categories.keys()}
        for category in probability_of_categories.keys():
            if term in terms_of_category[category].keys():
                number_of_terms[term][category] = terms_of_category[category][term]
    for term in vocabulary:
        probability_of_terms[term] = {category: 0.0 for category in probability_of_categories.keys()}
        for category in probability_of_categories.keys():
            probability_of_terms[term][category] = (number_of_terms[term][category] + alpha) / (len(terms_of_category[category]) + alpha * len(vocabulary))


def NB_classify(doc, text_operation):
    tokens = apply_text_operation([doc], text_operation)[0]['tokens']
    scores = {}
    for category in probability_of_categories.keys():
        scores[category] = log10(probability_of_categories[category])
        for term in tokens:
            if term in vocabulary:
                scores[category] += log10(probability_of_terms[term][category])
    for category in scores.keys():
        if scores[category] == max(scores.values()):
            return category


def NB_test(docs, alpha, text_operation):
    new_data = deepcopy(docs)
    results = []
    for doc in new_data:
        category = doc.pop('category')
        predicted_category = NB_classify(doc, text_operation)
        results.append((category, predicted_category))
    print('Naive Bayes:')
    print('\tAlpha: ' + str(alpha))
    print('\tText Operation: ' + text_operation)
    evaluation(results)


def NB_find_best_hyper_parameter(train_docs, test_docs, parameters, text_operation):
    print("Finding best alpha for Naive Bayes:")
    NB_train(train_docs, parameters[0], text_operation)
    NB_test(test_docs, parameters[0], text_operation)
    NB_train(train_docs, parameters[1], text_operation)
    NB_test(test_docs, parameters[1], text_operation)
    NB_train(train_docs, parameters[2], text_operation)
    NB_test(test_docs, parameters[2], text_operation)


def NB_check_text_operations_effects(train_docs, test_docs, alpha):
    print("Checking effects of text operations on Naive Bayes:")
    NB_train(train_docs, alpha, 'stemming')
    NB_test(test_docs, alpha, 'stemming')
    NB_train(train_docs, alpha, 'lemmatization')
    NB_test(test_docs, alpha, 'lemmatization')
    NB_train(train_docs, alpha, 'stop words removal')
    NB_test(test_docs, alpha, 'stop words removal')


def NB_run(train_docs, test_docs, alpha, text_operation):
    NB_train(train_docs, alpha, text_operation)
    NB_test(test_docs, alpha, text_operation)


<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- یافتن بهترین مقدار $\alpha$
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بین مقادیر 0.25، 1 و 4 برای $\alpha$، بهترین مقدار 1 است.
    </font>
</div>

In [9]:
NB_find_best_hyper_parameter(training_data, test_data, [0.25, 1, 4], 'none')

Finding best alpha for Naive Bayes:
Naive Bayes:
	Alpha: 0.25
	Text Operation: none
	Category 1: 
		Precision: 0.88816
		Recall: 0.90000
	Category 2: 
		Precision: 0.94080
		Recall: 0.97467
	Category 3: 
		Precision: 0.85229
		Recall: 0.86933
	Category 4: 
		Precision: 0.89112
		Recall: 0.82933
	Accuracy: 0.89333
	Confusion Matrix: 
		675   27   32   16  
		 13  731    3    3  
		 33    8  652   57  
		 39   11   78  622  
	Macro Averaged FI: 0.89283
Naive Bayes:
	Alpha: 1
	Text Operation: none
	Category 1: 
		Precision: 0.89376
		Recall: 0.89733
	Category 2: 
		Precision: 0.93967
		Recall: 0.97600
	Category 3: 
		Precision: 0.86479
		Recall: 0.86133
	Category 4: 
		Precision: 0.87656
		Recall: 0.84267
	Accuracy: 0.89433
	Confusion Matrix: 
		673   29   29   19  
		 11  732    2    5  
		 31    8  646   65  
		 38   10   70  632  
	Macro Averaged FI: 0.89384
Naive Bayes:
	Alpha: 4
	Text Operation: none
	Category 1: 
		Precision: 0.88992
		Recall: 0.89467
	Category 2: 
		Precision: 0.93

<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- بررسی تأثیر روش‌های پردازش متن بر روی دسته‌بندی
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بهترین اثر را stop words removal و بدترین اثر را lemmatization دارد. 
</font>
</div>

In [10]:
NB_check_text_operations_effects(training_data, test_data, 1)

Checking effects of text operations on Naive Bayes:
Naive Bayes:
	Alpha: 1
	Text Operation: stemming
	Category 1: 
		Precision: 0.89093
		Recall: 0.90400
	Category 2: 
		Precision: 0.94201
		Recall: 0.97467
	Category 3: 
		Precision: 0.86821
		Recall: 0.85200
	Category 4: 
		Precision: 0.87345
		Recall: 0.84667
	Accuracy: 0.89433
	Confusion Matrix: 
		678   26   28   18  
		 11  731    3    5  
		 32   10  639   69  
		 40    9   66  635  
	Macro Averaged FI: 0.89384
Naive Bayes:
	Alpha: 1
	Text Operation: lemmatization
	Category 1: 
		Precision: 0.89196
		Recall: 0.90267
	Category 2: 
		Precision: 0.93726
		Recall: 0.97600
	Category 3: 
		Precision: 0.86957
		Recall: 0.85333
	Category 4: 
		Precision: 0.87569
		Recall: 0.84533
	Accuracy: 0.89433
	Confusion Matrix: 
		677   27   29   17  
		 12  732    1    5  
		 32   10  640   68  
		 38   12   66  634  
	Macro Averaged FI: 0.89378
Naive Bayes:
	Alpha: 1
	Text Operation: stop words removal
	Category 1: 
		Precision: 0.89933
		Recall:

<div style="direction:ltr;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=6 color="#2b580c" >
        <br/>
SVM
            </font>
</div>

In [11]:
from sklearn import svm
from copy import deepcopy
import scipy.sparse


vocabulary = {}
SVM = None
idf = {}


def SVM_train(docs, c):
    new_docs = tokenize(docs)
    output = tf_idf(new_docs)
    global vocabulary
    vocabulary = output[0]
    global idf
    idf = output[1]
    train_x = scipy.sparse.csr_matrix(output[2])
    train_y = [new_docs[i]['category'] for i in range(len(new_docs))]
    global SVM
    SVM = svm.SVC(C=c, kernel='linear', degree=3, gamma='auto', max_iter=12000)
    SVM.fit(train_x, train_y)


def SVM_classify(doc):
    tokens = tokenize([doc])[0]['tokens']
    test_x = [0 for _ in range(len(vocabulary))]
    for term in tokens:
        if term in vocabulary:
            test_x[vocabulary[term]] += 1
    test_x = scipy.sparse.csr_matrix([test_x[vocabulary[term]] * idf[term] for term in vocabulary.keys()])
    return SVM.predict(test_x)[0]


def SVM_test(docs, c):
    new_data = deepcopy(docs)
    results = []
    for doc in new_data:
        category = doc.pop('category')
        predicted_category = SVM_classify(doc)
        results.append((category, predicted_category))
    print('SVM:')
    print('\tC: ' + str(c))
    evaluation(results)


def SVM_find_best_hyper_parameter(train_docs, test_docs, parameters):
    print("Finding best C for SVM:")
    SVM_train(train_docs, parameters[0])
    SVM_test(test_docs, parameters[0])
    SVM_train(train_docs, parameters[1])
    SVM_test(test_docs, parameters[1])
    SVM_train(train_docs, parameters[2])
    SVM_test(test_docs, parameters[2])


def SVM_run(train_docs, test_docs, c):
    SVM_train(train_docs, c)
    SVM_test(test_docs, c)


<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- یافتن بهترین مقدار C
    </font>
        <br/>
    <font face="B Mitra" size=1>
* از نصف داده‌ها استفاده‌شده‌است.
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بین مقادیر 0.001، 0.01 و 0.1 برای C، بهترین مقدار 0.01 است.
    </font>
</div>

In [12]:
SVM_find_best_hyper_parameter(training_data[:int(len(training_data) / 2)], test_data[:int(len(test_data) / 2)], [0.001, 0.01, 0.1])

Finding best C for SVM:
SVM:
	C: 0.001
	Category 1: 
		Precision: 0.92898
		Recall: 0.85379
	Category 2: 
		Precision: 0.92658
		Recall: 0.97861
	Category 3: 
		Precision: 0.87714
		Recall: 0.81003
	Category 4: 
		Precision: 0.78908
		Recall: 0.87363
	Accuracy: 0.87867
	Confusion Matrix: 
		327   15   16   25  
		  3  366    1    4  
		  9    7  307   56  
		 13    7   26  318  
	Macro Averaged FI: 0.87828
SVM:
	C: 0.01
	Category 1: 
		Precision: 0.89946
		Recall: 0.86423
	Category 2: 
		Precision: 0.94010
		Recall: 0.96524
	Category 3: 
		Precision: 0.87397
		Recall: 0.84169
	Category 4: 
		Precision: 0.84073
		Recall: 0.88462
	Accuracy: 0.88867
	Confusion Matrix: 
		331   14   19   19  
		  7  361    2    4  
		 15    7  319   38  
		 15    2   25  322  
	Macro Averaged FI: 0.88841
SVM:
	C: 0.1
	Category 1: 
		Precision: 0.87500
		Recall: 0.85901
	Category 2: 
		Precision: 0.93211
		Recall: 0.95455
	Category 3: 
		Precision: 0.86464
		Recall: 0.82586
	Category 4: 
		Precision: 0.8311

<div style="direction:ltr;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=6 color="#2b580c" >
        <br/>
Random Forest
            </font>
</div>

In [13]:
from sklearn.ensemble import RandomForestClassifier
from copy import deepcopy
import scipy.sparse


vocabulary = {}
random_forest_classifier = None
idf = {}


def RF_train(docs, number_of_trees, max_depth):
    output = tf_idf(tokenize(docs))
    global vocabulary
    vocabulary = output[0]
    global idf
    idf = output[1]
    train_x = scipy.sparse.csc_matrix(output[2])
    train_y = [docs[i]['category'] for i in range(len(docs))]
    global random_forest_classifier
    random_forest_classifier = RandomForestClassifier(n_estimators=number_of_trees, max_depth=max_depth)
    random_forest_classifier.fit(train_x, train_y)


def RF_classify(doc):
    tokens = tokenize([doc])[0]['tokens']
    test_x = [0 for _ in range(len(vocabulary))]
    for term in tokens:
        if term in vocabulary:
            test_x[vocabulary[term]] += 1
    test_x = scipy.sparse.csc_matrix([test_x[vocabulary[term]] * idf[term] for term in vocabulary.keys()])
    return random_forest_classifier.predict(test_x)[0]


def RF_test(docs, number_of_trees, max_depth):
    new_data = deepcopy(docs)
    results = []
    for doc in new_data:
        category = doc.pop('category')
        predicted_category = RF_classify(doc)
        results.append((category, predicted_category))
    print('Random Forest:')
    print('\tNumber of trees: ' + str(number_of_trees))
    print('\tMaximum depth: ' + str(max_depth))
    evaluation(results)


def RF_find_best_hyper_parameter(train_docs, test_docs, parameters):
    print("Finding best number of trees & depth for Random Forest:")
    RF_train(train_docs, parameters['n_trees'][0], parameters['max_depth'][0])
    RF_test(test_docs, parameters['n_trees'][0], parameters['max_depth'][0])
    RF_train(train_docs, parameters['n_trees'][1], parameters['max_depth'][1])
    RF_test(test_docs, parameters['n_trees'][1], parameters['max_depth'][1])
    RF_train(train_docs, parameters['n_trees'][2], parameters['max_depth'][2])
    RF_test(test_docs, parameters['n_trees'][2], parameters['max_depth'][2])


def RF_run(train_docs, test_docs, number_of_trees, max_depth):
    RF_train(train_docs, number_of_trees, max_depth)
    RF_test(test_docs, number_of_trees, max_depth)


<div style="direction:rtl;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=4 color="#639a67" >
- یافتن بهترین مقادیر برای تعداد درخت‌ها و عمق‌ها
    </font>
        <br/>
    <font face="B Mitra" size=1>
* از نصف داده‌ها استفاده‌شده‌است.
    </font>
    <br/>
    <font face="B Mitra" size=3>
با توجه به نتایج زیر، بین مقادیر 25، 50 و 100 برای تعداد درخت‌ها، بهترین مقدار 100 و بین مقادیر 5000، 10000 و 20000 برای عمق درخت‌ها، بهترین مقدار 20000 است.
    </font>
</div>

In [15]:
RF_find_best_hyper_parameter(training_data[:int(len(training_data) / 2)], test_data[:int(len(test_data) / 2)],
                                       {'n_trees': [25, 50, 100], 'max_depth': [5000, 10000, 20000]})

Finding best number of trees & depth for Random Forest:
Random Forest:
	Number of trees: 25
	Maximum depth: 5000
	Category 1: 
		Precision: 0.85154
		Recall: 0.79373
	Category 2: 
		Precision: 0.81412
		Recall: 0.92513
	Category 3: 
		Precision: 0.80290
		Recall: 0.73087
	Category 4: 
		Precision: 0.74263
		Recall: 0.76099
	Accuracy: 0.80267
	Confusion Matrix: 
		304   29   23   27  
		 10  346    5   13  
		 23   23  277   56  
		 20   27   40  277  
	Macro Averaged FI: 0.80115
Random Forest:
	Number of trees: 50
	Maximum depth: 10000
	Category 1: 
		Precision: 0.86501
		Recall: 0.81984
	Category 2: 
		Precision: 0.84524
		Recall: 0.94920
	Category 3: 
		Precision: 0.83237
		Recall: 0.75989
	Category 4: 
		Precision: 0.74933
		Recall: 0.76374
	Accuracy: 0.82333
	Confusion Matrix: 
		314   24   15   30  
		  5  355    3   11  
		 22   17  288   52  
		 22   24   40  278  
	Macro Averaged FI: 0.82174
Random Forest:
	Number of trees: 100
	Maximum depth: 20000
	Category 1: 
		Precision: 0

<div style="direction:ltr;line-height:200%;text-align:justify" dir=rtl>
    <font face="B Mitra" size=6 color="#2b580c" >
        <br/>
        KMeans
            </font>
</div>

In [17]:
import numpy


def kmeans_run(docs):
    new_docs = tokenize(docs)
    vocabulary, idf, train_matrix = tf_idf(new_docs)
    categories = [new_docs[i]['category'] for i in range(len(new_docs))]
    centroids = [numpy.random.random(len(vocabulary)) for _ in range(4)]
    clusters = [[] for _ in range(4)]
    train_matrix = numpy.asarray(train_matrix)
    temp = [row @ row for row in train_matrix]
    x2 = numpy.asarray([[temp[i] for _ in range(4)] for i in range(len(docs))])
    for i in range(100):
        for j in range(4):
            clusters[j].clear()
        xy = train_matrix @ numpy.transpose(centroids)
        temp = [row @ row for row in numpy.asarray(centroids)]
        y2 = numpy.asarray([temp for _ in range(len(docs))])
        distance = list(x2 - 2 * xy + y2)
        for j in range(len(new_docs)):
            clusters[list(distance[j]).index(min(list(distance[j])))].append(j)
        for j in range(4):
            sum_of_vectors = [0 for _ in range(len(vocabulary))]
            for k in clusters[j]:
                sum_of_vectors = numpy.add(sum_of_vectors, train_matrix[k])
            centroids[j] = numpy.asarray(sum_of_vectors / len(clusters[j]) if len(clusters[j]) > 0 else sum_of_vectors)


In [None]:
kmeans_run(training_data[:int(len(training_data) / 2)])