# ASSIGNMENT 2 - BOOLEAN SPACE MODEL

# Install nltk package
* models/**punkt**

In [9]:
import nltk

# Download only the Punkt tokenizer models listed above; calling
# nltk.download() with no argument starts the interactive downloader instead.
# NOTE(review): recent NLTK releases may require 'punkt_tab' as well — confirm
# against the installed version.
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Input
* **data**: dictionary
    * ex: {'0': { 'title': ..., 'link': ..., 'content': ..., 'summary': ... }, '1': {...}, ... }
* stored in **orig_data**

In [33]:
# import library
import copy
import json
import re
import string
import sys

from nltk.tokenize import word_tokenize

# Path of the JSON dump produced by the crawler
filename = 'crawler.txt'

# Read the dump as UTF-8 text and decode it into orig_data
with open(filename, mode='r', encoding='utf8') as json_file:
    orig_data = json.loads(json_file.read())

# Load stopwords
* open Vietnamese stopwords file (stopwords.txt)

In [18]:
def load_stopwords(filename='stopwords.txt'):
    """Return the entries of a stop-word file, one list item per line.

    The file is read as UTF-8 and surrounding whitespace (including the
    line terminator) is stripped from every line.
    """
    with open(filename, mode='r', encoding='utf8') as handle:
        return [line.strip() for line in handle]

# Load the default stop-word file once; preprocessing() reads this global.
stop_words = load_stopwords()

# Preprocessing data
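* **Description**: concatenate the `title`, `content` and `summary` fields of each document, then
    * remove **punctuation**
    * replace **newline** characters with spaces
    * replace **non-breaking spaces** (`\xa0`, generated by BeautifulSoup) with spaces
    * convert to **lowercase**
    * strip **leading and trailing spaces**
    * **tokenize** into words and drop **stop words**
* **Input**: orig_data
* **Output**: dictionary mapping each document key to its list of tokens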

In [29]:
def preprocessing():
    """Turn every document in ``orig_data`` into a list of filtered tokens.

    For each document the ``title``, ``content`` and ``summary`` fields are
    joined with spaces; punctuation from ``string.punctuation`` is removed;
    newlines and ``\\xa0`` characters become plain spaces; the text is
    lower-cased, stripped and tokenized with ``word_tokenize``. Tokens found
    in the module-level ``stop_words`` are dropped.

    Returns:
        dict: document key -> list of remaining tokens, in document order.
    """
    keys = ['title', 'content', 'summary']
    translator = str.maketrans('', '', string.punctuation)
    # Set lookup avoids rescanning the whole stop-word list for every token.
    stop_set = set(stop_words)
    # Local result dict, so the function does not depend on a pre-existing
    # global `data` (which would be undefined on a fresh kernel).
    data = {}
    for item in orig_data:
        text = ' '.join(orig_data[item][key] for key in keys) + ' '
        cleaned = (
            text.translate(translator)
            .replace('\n', ' ')
            .replace('\xa0', ' ')
            .lower()
            .strip()
        )
        data[item] = [w for w in word_tokenize(cleaned) if w not in stop_set]
    return data

{'0': ['mắt', 'đoàn', 'tàu', 'đầu', 'tiên', 'tuyến', 'metro', 'nhổn', 'ga', 'hà', 'nội', 'đoàn', 'tàu', 'thiết', 'kế', 'công', 'năng', 'hiện', 'đại', 'sơn', 'màu', 'sắc', 'long', '4', 'khoang', '94', 'ghế', '2910', 'ban', 'quản', 'lý', 'đường', 'sắt', 'đô', 'thị', 'hà', 'nội', 'mrb', 'ubnd', 'thành', 'phố', 'tập', 'đoàn', 'alstom', 'pháp', 'thầu', 'sản', 'xuất', 'tàu', 'tuyến', 'đường', 'sắt', 'đô', 'thị', 'nhổn', 'ga', 'hà', 'nội', 'ký', 'biên', 'ghi', 'hợp', 'đồng', 'hiện', 'mắt', 'đoàn', 'tàu', 'đầu', 'tiên', 'tuyến', 'metro', 'chủ', 'tịch', 'ubnd', 'tp', 'hà', 'nội', 'nguyễn', 'đức', 'ký', 'biên', 'ghi', 'đối', 'tác', 'alstom', 'ảnh', 'mrb', 'đại', 'diện', 'alstom', 'đoàn', 'tàu', 'thiết', 'kế', 'công', 'năng', 'hiện', 'đại', 'sơn', 'màu', 'sắc', 'long', 'vốn', 'biểu', 'tượng', 'việt', 'nam', 'thiết', 'kế', 'mặt', 'khoang', 'tàu', 'thiết', 'kế', 'hiện', 'đại', 'giám', 'đốc', 'dự', 'án', 'alstom', 'đoàn', 'tàu', 'tuyến', 'metro', '3', 'nhổnga', 'hà', 'nội', '4', 'khoang', 'tổng', 'c

# Generate terms from given documents
* **Input**: preprocessed data
* **Output**: set of terms

In [108]:
def get_terms(data):
    """Collect the vocabulary: every distinct token found in any document."""
    return {token for tokens in data.values() for token in tokens}

# Create boolean table
* **Input**: preprocessed data and set of terms
* **Output**: boolean table mapping each term to the list of documents that contain it

In [103]:
def create_boolean_table(data, terms):
    """Build the term -> documents table used by the boolean search.

    Args:
        data: mapping document key -> list of tokens.
        terms: iterable of terms to index.

    Returns:
        dict: every term in ``terms`` mapped to the list of document keys
        (in ``data`` order, each at most once) whose tokens contain it.
        Terms that occur in no document map to an empty list.
    """
    boolean_table = {term: [] for term in terms}
    # One pass over the documents instead of scanning every token list once
    # per term.
    for item, tokens in data.items():
        # set() so a document is recorded once per term even if the token
        # repeats inside it.
        for token in set(tokens):
            postings = boolean_table.get(token)
            if postings is not None:  # ignore tokens outside `terms`
                postings.append(item)
    return boolean_table

# Search
* **Input**: query (terms joined by `AND`), boolean table, number of documents
* **Output**: set of indexes of the documents that contain every term of the query

In [104]:
def search(query, table, num_docs):
    """Return the documents that contain every term of an ``AND`` query.

    Args:
        query: terms separated by the upper-case keyword ``AND``, e.g.
            ``"mắt AND đoàn"``. Single quotes are removed from the terms and
            terms are matched in lower case.
        table: mapping term -> iterable of document indexes.
        num_docs: number of documents; the candidate indexes are the strings
            ``'0'`` .. ``str(num_docs - 1)``.

    Returns:
        set: indexes of the matching documents. A query without any term
        returns every candidate index.
    """
    # Split on AND only when it is a standalone word, so terms that merely
    # contain those letters (e.g. "ANDROID") are kept intact.
    parts = re.split(r'\bAND\b', query)

    # Normalize first, then drop empties, so a term that was only quotes
    # (e.g. "''") is ignored rather than matched as an empty string.
    terms = [part.replace("'", "").lower().strip() for part in parts]
    terms = [term for term in terms if term]

    # Start from all candidate documents and narrow down term by term.
    result = {str(x) for x in range(num_docs)}
    for term in terms:
        result &= set(table.get(term, ()))
        if not result:
            break  # nothing left to intersect
    return result

# Export result
* **Input**: collection of document indexes (e.g. the result of the search step)
* Default **filename** is output.txt

In [105]:
def export(result, filename='output.txt'):
    """Write the matching documents from ``orig_data`` to a UTF-8 text file.

    The first line is the number of results; each document follows as
    ``field:`` / value line pairs and is terminated by a blank line.
    """
    with open(filename, mode='w', encoding='utf8') as out:
        out.write('Total: ' + str(len(result)) + '\n')
        for doc_id in result:
            for field, value in orig_data[doc_id].items():
                out.write(field + ':\n' + value + '\n')
            # blank line between documents
            out.write('\n')

# Main
* preprocessing data
* generate set of terms
* create boolean table from data and terms
* input query
* search query in documents
* export result

In [106]:
def main():
    """Run the whole pipeline: preprocess, index, query and export."""
    # Tokenized documents built from orig_data
    documents = preprocessing()

    # Vocabulary and term -> documents table
    vocabulary = get_terms(documents)
    index = create_boolean_table(documents, vocabulary)

    # Ask the user for a query. Ex: mắt AND đoàn
    user_query = input('Input query: ')

    # Look up the matching documents and write them to the output file
    matches = search(user_query, index, len(documents))
    export(matches)


if __name__ == '__main__':
    main()

Input query: mắt AND đoàn
