# Installing required packages

In [6]:
!pip3 install nltk
!pip3 install numpy

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Defaulting to user installation because normal site-packages is not writeable


## Downloading nltk packages

In [54]:
import nltk
nltk.download('wordnet', 'stopwords', 'punkt')

True

# Importing packages

In [79]:
import json
from math import sqrt, log
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from math import log
from nltk.corpus import stopwords as sw

# Reading data

In [74]:
def get_data(path):    
    '''Reading the json files from passed path'''
    with open(path) as d:
        json_data = json.load(d)
    return json_data

# Data Operations

In [51]:
def get_doc_terms(doc_info):
    '''concatenate all terms (title + body) of the passed doc'''
    body_terms = doc_info['body'].split()
    title_terms = doc_info['title'].split()
    all_terms = body_terms + title_terms
    return all_terms

## Count term frequency over collection

In [80]:
stopwords = sw.words('english')

In [81]:
def get_document_frequency(json_data):
    vectors = {}
    
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    
    for doc_id, doc in enumerate(json_data):
        terms = get_doc_terms(doc)
        for term in terms:
            if term not in stopwords:
                term = lemmatizer.lemmatize(term)
                if term not in vectors:
                    vectors[term] = []
                if doc_id not in vectors[term]:
                    vectors[term].append(doc_id)
    
    vectors_list = list(vectors.items())
    term_map_index = {}
    
    for i in range(len(vectors_list)):
        term = vectors_list[i][0]
        term_map_index[term] = i
    
    frequencies = []
    for word, freq_list in vectors_list:
        frequencies.append(len(freq_list))
    
    return term_map_index, frequencies

In [60]:
def get_vector(json_data, term_map_index, frequencies, train_docs_no = 24000):
    term_no = len(term_map_index)
    vector = np.zeros((train_docs_no, term_no))
        
    for doc_id, doc in enumerate(json_data):
        terms = get_doc_terms(doc)
        doc_term_freq = {}
        
        for term in terms:
            if term not in stopwords:
                if term not in doc_term_freq:
                    doc_term_freq[term] = 0
                doc_term_freq[term] += 1
                
        for term in doc_term_freq:
            if term in term_map_index:
                index = term_map_index[term]
                doc_frequency = frequencies[index]
                tf = doc_term_freq[term]
                idf = log(train_docs_no/doc_frequency)
                vector[doc_id, index] = tf * idf
    
    return vector

In [None]:
def get_category(data):
    '''return list of categories for train and test docs'''
    categories = []
    
    for doc in data:
        category = doc['category']
        categories.append(category)
        
    return categories

## Assigning variables

In [83]:
train_data = get_data('./data/train.json')
test_data = get_data('./data/validation.json')
# print(list(train_data)[:10])

train_categories = get_category(train_data)
test_categories = get_category(test_data)
# print(train_categories)[:10]

N = len(train_data)
term_map_index, frequencies = get_document_frequency(train_data)

N = N // 10

#randomly pick N data from train
train_data = train_data[:N]
test_data = test_data[:300]

#train and test vectors 
train_vector = get_vector(train_data, term_map_index, frequencies, N)
test_vector = get_vector(test_data, term_map_index, frequencies, N)

# kNN (k Nearest Neighbor)

There are two methods used for measuring nearest neighbors: 
1. Cosine Similarity
2. Euclidean Distance

The Following sections would be the implementation of each method. 

## Cosine Similarity

Cosine Similarity = train.test^t / |train|.|test^t|

Since |test^t| is constant, we can ignore its value and don't calculate it. 

In [72]:
def get_cosine_similarity(train, test):
    '''consine_similarity = train.test^t / |train|.|test|'''
    test_transpose = test.T
    dot_product = np.dot(train, test_transpose)
    normalization = np.linalg.norm(train, axis=1)
    similarity = dot_product / normalization
    return similarity

## Euclidean Distance

In [73]:
def get_euclidean_distance(train, test):
    '''Euclidean Distance'''
    x_2 = np.sum(test * test, axis=1)
    test_transpose = test.T
    xy = np.dot(train, test_transpose, axis=1)
    y_2 = np.sum(train * train, axis = 1)
    
    #make the test 1-column for evaluation
    x_2 = np.reshape(x_2, shape(-1, 1))
    result = x_2 + y_2 - 2 * xy
    euclidean_distance = np.sqrt(result)
    return euclidean_distance

## kNN 

In [None]:
def knn(doc_id, scores, category, method, k):
    
    
valid_k = [1, 3, 5]



# Naive Bayes



In [69]:
def get_category_count(category_no = 4):
    category_count = {}
    
    for i in range(1, category_no + 1):
        category_count[str(i)] = 0
    
    for doc_id in vectors:
        category_id = str(vectors[doc_id]['category'])
        category_count[category_id] += 1
    
    return category_count

def naive_bayes_training(category_no=4):
    N = len(json_data)
    category_count = get_category_count()
    