# KNN
In this notebook we have two news datasets: one with labeled categories and one without labels. We use Elasticsearch to index the labeled dataset and then implement a K-Nearest Neighbors algorithm to classify and label the unlabeled dataset.

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import json
from tqdm import tqdm
from gensim.models import Word2Vec
import numpy as np
import random
import pandas as pd

## Preprocessing and Indexing

First we define a `mapping` that specifies the fields and their types so we can index the documents accordingly.

In [4]:
# Index schema: one entry per field stored for every labeled article.
mapping = {
    "mappings": {
        "properties": {
            # Preprocessed article text.
            "content": {"type": "text"},
            # News category label of the article.
            "category": {"type": "keyword"},
            # Document embedding; 200 dims matches `vec_size = 200` passed to
            # DocToVec, and this is the field the KNN search queries later on.
            "vec": {
                "type": "dense_vector",
                "dims": 200,
                "index": True,
                "similarity": "cosine",
            },
        }
    }
}

In [5]:
# Client for the Elasticsearch node at localhost:9200; every index, bulk and
# search call in this notebook goes through it.
es = Elasticsearch("http://localhost:9200")

In [6]:
# Name of the index that stores the labeled articles together with their vectors.
index_name = 'ir_knn'

In [7]:
# Start from a clean slate: drop any index left over from a previous run so
# the create call below does not collide with an existing index.
index_exists = es.indices.exists(index=index_name)
if index_exists:
    es.indices.delete(index=index_name)

# Create the index with the field mapping; as the last expression of the cell,
# the API response is displayed as the cell output.
es.indices.create(index=index_name, body=mapping)

  es.indices.create(index=index_name, body=mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ir_knn'})

Here we preprocess our labeled news articles and come up with a list of tokens for each one of them:

In [8]:
from preprocess import Preprocess

# Labeled articles; the `content` and `category` columns are used below.
df = pd.read_excel('./data/IR01_3_test_4k.xlsx')

preprocess = Preprocess()

# Work on a copy: `df_processed = df` would only alias the raw frame, so the
# assignments below would overwrite `df['content']` as well, and re-running
# this cell would feed already-processed text through the pipeline again.
df_processed = df.copy()

# Apply the preprocessing steps in order, each one consuming the previous
# step's output: normalize -> tokenize -> stem -> redact_stops.
for step in (
    preprocess.normalize,
    preprocess.tokenize,
    preprocess.stem,
    preprocess.redact_stops,
):
    df_processed['content'] = df_processed['content'].apply(step)

Using the `embeding` module, we create a numerical vector representation of each news article (document) and store these vectors in `doc_vectors`.

In [9]:
from embeding import DocToVec

# One slot per labeled document, filled with that document's token list.
dataset = [[] for _ in range(len(df))]

# The preprocessed content is a space-separated string; split it into tokens
# and store it at the row's index label.
for label, joined_tokens in df_processed['content'].items():
    dataset[label] = joined_tokens.split(" ")

# Embedding helper built from the tokenized corpus; the vector size (200)
# matches the `dims` of the `vec` field in the index mapping.
doc2vec = DocToVec(dataset, vec_size=200, model_path='./data/word2vec.model')

There is no pre-trained model. Going to train a model ...




In [10]:
# Embed every labeled document; doc_vectors[i] is the vector of dataset[i].
doc_vectors = [doc2vec.embed(tokens) for tokens in dataset]

We define a function that preprocesses and filters every new article we want to label later on.

In [12]:
def filter_doc(content):
    """Run raw article text through the preprocessing pipeline.

    Applies the `preprocess` steps normalize, tokenize, stem and
    redact_stops in that order, then splits the result on single spaces.

    Args:
        content: Raw text of one article.

    Returns:
        list: The tokens obtained by splitting the processed text on " ".
    """
    pipeline = (
        preprocess.normalize,
        preprocess.tokenize,
        preprocess.stem,
        preprocess.redact_stops,
    )
    for step in pipeline:
        content = step(content)
    return content.split(" ")

In [13]:
# Build one Elasticsearch source document per labeled article:
# processed text, embedding (as a plain list) and category label.
data_tmp = [
    {
        'content': row['content'],
        'vec': list(doc_vectors[label]),
        'category': row['category'],
    }
    for label, row in df_processed.iterrows()
]

In [14]:
# Wrap every source document in a bulk action; ids are 1-based positions.
data_bulk = [
    {"_index": index_name, "_id": doc_number, "_source": source}
    for doc_number, source in enumerate(data_tmp, start=1)
]

# Send all actions in one bulk request and show the helper's return value.
resp = helpers.bulk(es, data_bulk, index=index_name)
print(resp)

(4022, [])


## Labeling
Here we load our test dataset and iterate through its documents. For each document we build its vector representation and then perform a **KNN search** for its 10 nearest neighbors in our index. We then take the most common news category among these neighbors and use it as the label for the unlabeled document.

In [15]:
# Articles to be labeled; a `category` column is filled in by the labeling loop below.
test_dataset = pd.read_excel("./data/IR01_3_46k.xlsx")

In [16]:
def doc_to_vec(content):
    """Turn raw article text into an embedding usable as a KNN query vector.

    Args:
        content: Raw text of one article.

    Returns:
        list: The result of `doc2vec.embed` on the filtered tokens,
        converted to a plain Python list.
    """
    tokens = filter_doc(content)
    embedding = doc2vec.embed(tokens)
    return list(embedding)
    

In [17]:
# Placeholder label; every row is overwritten in the loop below.
test_dataset['category'] = 'NaN'

for index, row in test_dataset.iterrows():
    vec = doc_to_vec(row['content'])
    try:
        # Fetch the 10 nearest labeled neighbors of this document's vector.
        resp = es.knn_search(
            index=index_name,
            knn={
                "field": "vec",
                "query_vector": vec,
                "k": 10,
                "num_candidates": 100,
            },
            source=['content', 'category'],
        )

        # Categories of the hits, in the order Elasticsearch returned them.
        res = [hit['_source']['category'] for hit in resp['hits']['hits']]

        # Majority vote. Iterating over the list (not a set) makes ties
        # resolve to the earliest hit, so the outcome no longer depends on
        # set iteration order, which varies between runs for strings.
        category = max(res, key=res.count)

    except Exception:
        # Best-effort fallback when the search fails or returns no hits
        # (max() of an empty list raises ValueError). `Exception` instead of a
        # bare `except` lets KeyboardInterrupt/SystemExit propagate, so the
        # long-running loop can actually be interrupted.
        category = 'Others'

    test_dataset.at[index, 'category'] = category

        

  vec /= sum_weights


In [18]:
# Drop the leftover 'Unnamed: 0' column read from the Excel file.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError like `del` would.
test_dataset = test_dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [160]:
# Persist the labeled dataset. The context manager saves and closes the file on
# exit, replacing `writer.save()`, which newer pandas releases no longer provide.
with pd.ExcelWriter('./data/IR01_3_46k_classified_3.xlsx') as writer:
    test_dataset.to_excel(writer)

## Indexing Labeled Documents
We index our newly labeled documents and then perform a boolean query search to make sure that we have the right categories.

In [19]:
# Name of the index that receives the newly labeled documents.
classified_index_name = 'classified_index'

In [20]:
# Remove a stale index from a previous run, if there is one.
if es.indices.exists(index=classified_index_name):
    es.indices.delete(index=classified_index_name)

# Always (re)create the index. The create call used to sit inside the `if`
# branch, so on a fresh cluster the index was never explicitly created.
es.indices.create(index=classified_index_name)

In [21]:
from elasticsearch.helpers import bulk

def bulk_sync():
    """Index every row of `test_dataset` into `classified_index_name`.

    Each row becomes one bulk action whose `_id` is the row's index label
    and whose `_source` is the row converted to a dict. All actions are
    sent through `bulk` using the module-level `es` client.
    """
    actions = []
    for doc_id, doc in test_dataset.iterrows():
        action = {
            '_index': classified_index_name,
            '_id': doc_id,
            '_source': doc.to_dict(),
        }
        actions.append(action)
    bulk(es, actions)

In [22]:
# Push the labeled rows of `test_dataset` into the classified index.
bulk_sync()

In [23]:
def get_query(text, category):
    """Build a boolean search body that requires both a category and a text match.

    Args:
        text: Query text matched against the `content` field.
        category: Category value matched against the `category` field.

    Returns:
        dict: A request body with a `bool` query whose `must` list holds the
        category match followed by the content match.
    """
    category_clause = {"match": {"category": category}}
    content_clause = {"match": {"content": text}}
    return {
        "query": {
            "bool": {
                "must": [category_clause, content_clause],
            }
        }
    }

In [24]:
# Sanity-check queries as (query text, expected category) pairs; each pair is
# passed to get_query(text, category) when searching the classified index.
queries = [
    ("نتایج مسابقات لیگ برتر ایران", "sport"),
    ("تحریم های آمریکا", "economy"),
    ("ویروس کرونا", "health")
]

In [25]:
# Run every sanity-check query against the classified index and keep the raw
# responses (as plain dicts), in the same order as `queries`.
query_results = []
for query_text, query_category in queries:
    search_body = get_query(query_text, query_category)
    res_search = es.search(index=classified_index_name, body=search_body, explain=True)
    query_results.append(dict(res_search))

  res_search = es.search(index=classified_index_name, body=get_query(q[0], q[1]), explain=True)


In [26]:
# For each query, print the query tuple followed by the URL and assigned
# category of every returned hit.
for q, res in zip(queries, query_results):
    print(q)
    for hit in res['hits']['hits']:
        source = hit['_source']
        print(f"{source['url']}\t{source['category']}")
        print('-----')
    print("----------------------------")

('نتایج مسابقات لیگ برتر ایران', 'sport')
https://www.farsnews.ir/news/13991127001103/زمان-برگزاری-مسابقات-روز-آخر-لیگ-هندبال-تغییر-کرد	sport
-----
https://www.farsnews.ir/news/13991203000693/پورمحمد-نباید-صدر-جدول-را-به-راحتی-از-دست-بدهیم-وضعیت-برگزاری-و-داوری	sport
-----
https://www.farsnews.ir/news/13991215000268/قهرمانی-لیگ-برتر-کاراته-به-مس-رفسنجان-رسید	sport
-----
https://www.farsnews.ir/news/13991113000405/ولیان-هدف‌مان-پشتوانه‌سازی-برای-هندبال-است	sport
-----
https://www.farsnews.ir/news/13991113001111/لیگ-برتر-والیبال|-نتایج-کامل-هفته-بیست‌وپنجم-تثبیت-جایگاه-بالانشین‌ها	sport
-----
https://www.farsnews.ir/news/13991105000678/گروه‌بندی-مرحله-سوم-لیگ-برتر-هندبال-مشخص-شد	sport
-----
https://www.farsnews.ir/news/13991126000645/لیگ‌برتر-تنیس-روی-میز|-صعود-یاران-عالمیان-و-شهرداری-کرج-به-نیمه-نهایی	sport
-----
https://www.farsnews.ir/news/13991221000389/دورنهایی-فوتبال-بانوان|-شکست-بم-در-نصف-جهان-سیرجان-به-صدر-صعود-کرد	sport
-----
https://www.isna.ir/news/98030401704/کولاکوویچ-می-توا