# ASSIGNMENT 3 - VECTOR SPACE MODEL

# Install nltk package
* Required NLTK resource: models/**punkt** (tokenizer models used by `word_tokenize`)

In [9]:
import nltk

# Download only the Punkt tokenizer models that word_tokenize relies on.
# Calling nltk.download() without an argument opens the interactive
# downloader, which stalls an unattended "Restart Kernel -> Run All".
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Input
* **data**: dictionary
    * ex: {'0': { 'title': ..., 'link': ..., 'content': ..., 'summary': ... }, '1': {...}, ... }
* stored in **orig_data**

In [1]:
# import library
import json
import string
from nltk.tokenize import word_tokenize
import copy
import sys
import numpy as np

# Path of the JSON dump produced by the crawler
filename = 'crawler.txt'

# Read the whole dump (UTF-8) and parse it into orig_data
with open(filename, mode='r', encoding='utf8') as json_file:
    orig_data = json.loads(json_file.read())

# Load stopwords
* open Vietnamese stopwords file (stopwords.txt)

In [2]:
def load_stopwords(filename='stopwords.txt'):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        filename: path of the stop-word file.

    Returns:
        A list with one entry per line of the file, each stripped of
        leading and trailing whitespace (blank lines become '').
    """
    with open(filename, mode='r', encoding='utf8') as handle:
        return [line.strip() for line in handle]

# Module-level stop-word list, loaded from the default 'stopwords.txt';
# preprocessing() reads it as a global when filtering tokens.
stop_words = load_stopwords()

# Preprocessing data
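* **Description**: concatenate the title, content and summary fields of each document, then
    * remove **punctuation**
    * replace **newlines** with spaces
    * replace **non-breaking spaces** (`\xa0`, generated by BeautifulSoup) with spaces
    * convert to **lowercase**
    * strip **leading and trailing spaces**
    * **tokenize** the text and drop **stop words**
* **Input**: orig_data
* **Output**: dictionary mapping each document ID to its list of tokens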

In [3]:
def preprocessing():
    """Turn every crawled document into a list of lowercase tokens.

    Reads the module-level ``orig_data`` and ``stop_words``. For each
    document the 'title', 'content' and 'summary' fields are concatenated
    (each followed by a space), the characters in ``string.punctuation`` are
    deleted, newlines and non-breaking spaces are replaced by spaces, and the
    text is lowercased, stripped and tokenized with ``word_tokenize``.
    Tokens found in the stop-word list are dropped.

    Returns:
        dict mapping each key of ``orig_data`` to its list of tokens.

    Raises:
        KeyError: if a document lacks one of the three fields.
    """
    keys = ['title', 'content', 'summary']
    translator = str.maketrans('', '', string.punctuation)

    # Build the lookup set once: testing every token against the stop-word
    # *list* is a linear scan per token, a set makes it a constant-time check.
    stop_set = set(stop_words)

    data = {}
    for doc_id in orig_data:
        doc = orig_data[doc_id]
        text = ''.join(doc[key] + ' ' for key in keys)
        text = (text.translate(translator)
                    .replace('\n', ' ')
                    .replace('\xa0', ' ')  # non-breaking space
                    .lower()
                    .strip())
        data[doc_id] = [w for w in word_tokenize(text) if w not in stop_set]
    return data

# Generate terms from given documents
* **Input**: preprocessed data
* **Output**: list of unique terms

In [5]:
def get_terms(data):
    """Collect the vocabulary of a tokenized document collection.

    Args:
        data: dict mapping a document ID to its list of tokens.

    Returns:
        Sorted list of the distinct tokens found across all documents.
        Sorting makes the term -> row mapping reproducible: the iteration
        order of a set of strings can change from one interpreter session to
        the next, which would reshuffle the rows of every table built from
        this list.
    """
    return sorted(set().union(*data.values()))

# Create tf table
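* **Input**: preprocessed data and list of terms
* **Output**: tf table of shape (terms × documents) holding `1 + log10(count)`, or 0 where the term does not occur in the document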

In [6]:
def create_tf_table(docs, terms):
    """Build the log-scaled term-frequency matrix.

    Args:
        docs: dict mapping a document ID to its list of tokens. Each ID is
            converted with ``int()`` and used directly as the column index,
            so IDs must be integer strings within ``range(len(docs))``.
        terms: list of terms; the position of a term is its row index.

    Returns:
        Array of shape ``(len(terms), len(docs))`` holding
        ``1 + log10(count)`` where the term occurs in the document and 0
        where it does not.
    """
    tf_table = np.zeros((len(terms), len(docs)))

    # doc_id is a string since it is a JSON key
    for doc_id, tokens in docs.items():
        column = int(doc_id)

        # Count every token of the document once instead of rescanning the
        # whole token list with list.count() for each term.
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        for term_id, term in enumerate(terms):
            tf_table[term_id, column] = counts.get(term, 0)

    # The masked log10 masks the zero counts, so they come out as 0 rather
    # than -inf once the mask is filled.
    return (1 + np.ma.log10(tf_table)).filled(0)

# Create idf table
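* **Input**: preprocessed data and list of terms
* **Output**: idf table – a column vector holding `1 + log10(N / (df + 1))` per term, where N is the number of documents and df is the number of documents containing the term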

In [7]:
def create_idf_table(docs, terms):
    """Build the inverse-document-frequency column vector.

    Args:
        docs: dict mapping a document ID to its list of tokens.
        terms: list of terms; the position of a term is its row index.

    Returns:
        Array of shape ``(len(terms), 1)`` holding
        ``1 + log10(N / (df + 1))``, where ``N`` is ``len(docs)`` and ``df``
        is the number of documents that contain the term.
    """
    doc_freq = np.zeros(len(terms))
    for tokens in docs.values():
        # Set membership replaces a linear scan of the token list per term.
        vocabulary = set(tokens)
        for term_id, term in enumerate(terms):
            if term in vocabulary:
                doc_freq[term_id] += 1

    # The ratio is only non-positive (and therefore masked, then filled with
    # 0) when docs is empty.
    ratio = len(docs) / (doc_freq + 1)
    return (1 + np.ma.log10(ratio)).filled(0).reshape(-1, 1)

# Create tf-idf table
* **Input**: preprocessed data and list of terms
* **Output**: tf-idf table (the tf table with each term's row multiplied by that term's idf)

In [8]:
def create_tf_idf_table(docs, terms):
    """Build the tf-idf matrix for a tokenized document collection.

    Args:
        docs: dict mapping a document ID to its list of tokens.
        terms: list of terms; the position of a term is its row index.

    Returns:
        The tf table multiplied element-wise by the idf column, which
        broadcasts across the document columns.
    """
    return create_tf_table(docs, terms) * create_idf_table(docs, terms)

# Search
* **Input**: query, number of documents that you want to show, tf-idf table (all docs), terms
* **Output**: list of (document index, score) tuples sorted by descending score; documents with a zero score are omitted

In [9]:
def sort_by_val(item):
    """Sort key that picks the second element of a pair."""
    second = item[1]
    return second

def search(query, rank, table, terms):
    """Rank documents against a free-text query.

    The query is lowercased and split on whitespace, then weighted with
    ``create_tf_idf_table`` as if it were a one-document collection. Each
    document's score is the dot product of its tf-idf column with the query
    vector.

    Args:
        query: query string.
        rank: maximum number of results to return; a negative value is
            treated as 0.
        table: tf-idf table of shape (terms, documents).
        terms: list of terms matching the rows of ``table``.

    Returns:
        List of ``(document index, score)`` tuples sorted by descending
        score, without zero-score documents, at most ``rank`` long.
    """
    # split() without an argument discards leading/trailing whitespace and
    # treats any run of spaces, tabs or newlines as a single separator, so
    # repeated spaces no longer produce empty tokens.
    tokens = query.lower().split()

    # tf-idf weights of the query, flattened to a vector
    query_weights = create_tf_idf_table({"0": tokens}, terms).reshape(-1)

    # one score per document column
    scores = table.T.dot(query_weights)

    # keep non-zero scores only, best first
    ranked = sorted(
        ((idx, val) for idx, val in enumerate(scores) if val),
        key=sort_by_val,
        reverse=True,
    )

    # A negative bound would slice from the end of the list and silently drop
    # the lowest-ranked hits instead of limiting the result count.
    if rank is not None and rank < 0:
        rank = 0
    return ranked[:rank]

# Export result
* **Input**: list of documents' index and score
* Default **filename** is output.txt

In [10]:
def export(result, filename='output.txt'):
    """Write the search hits and their full documents to a UTF-8 text file.

    Args:
        result: list of (document index, score) tuples.
        filename: path of the output file; it is overwritten.

    The file starts with a 'Total' line. Each hit then contributes its tuple,
    every field of the matching ``orig_data`` entry, and a blank line.
    """
    with open(filename, mode='w', encoding='utf8') as out:
        out.write('Total: ' + str(len(result)) + '\n')
        for hit in result:
            out.write('Document ID - Score: ' + str(hit) + '\n')
            # orig_data is keyed by the string form of the document index
            document = orig_data[str(hit[0])]
            for field in document:
                out.write(field + ':\n' + document[field] + '\n')
            out.write('\n')

# Print result
* Print document ID and score

In [11]:
def print_result(result):
    """Print a header line followed by one 'index - score' line per hit.

    Args:
        result: iterable of sequences whose first two elements are the
            document index and the score.
    """
    print('Document ID - Score')
    for hit in result:
        doc_index, score = hit[0], hit[1]
        print(doc_index, '-', score)

# Main
* preprocessing data
* generate set of terms
* create tf-idf table from data and terms
* input query
* input the number of results that you want to show
* search query in documents (calculate tf-idf of query)
* export result
* print on screen

In [13]:
def main():
    """Run the interactive pipeline: index, query, rank, print, export."""
    # Build the index: tokens -> vocabulary -> tf-idf table
    docs = preprocessing()
    vocabulary = get_terms(docs)
    tf_idf = create_tf_idf_table(docs, vocabulary)

    # Ask for the query (Ex: đoàn tàu đánh cá) and how many hits to list
    query = input('Input query: ')
    limit = int(input('The number of results you want to show: '))

    # Rank the documents, then show the hits on screen and write them to file
    hits = search(query, limit, tf_idf, vocabulary)
    print_result(hits)
    export(hits)


if __name__ == '__main__':
    main()

Input query: đoàn tàu đánh cá
The number of results you want to show: 10
Document ID - Score
1 - 6.177126557806552
0 - 5.113319555842285
20 - 3.5230749795061937
3 - 3.26213764425216
16 - 3.2103411340860672
26 - 2.418853632128971
25 - 2.233520818524947
13 - 1.7103760357302962
8 - 1.63106882212608
10 - 1.3146322847517478
