# ASSIGNMENT 2 - BOOLEAN SPACE MODEL

# Install nltk package
* models/**punkt**

In [9]:
import nltk

# Fetch only the Punkt tokenizer models that word_tokenize relies on. Calling
# nltk.download() with no argument opens the interactive downloader, which
# blocks an unattended "Restart & Run All".
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Input
* **data**: dictionary
    * ex: {'0': { 'title': ..., 'link': ..., 'content': ..., 'summary': ... }, '1': {...}, ... }
* stored in **orig_data**

In [1]:
# import library
import json
import string
from nltk.tokenize import word_tokenize
import copy
import sys

# crawler output file; its content is parsed as JSON below
filename = 'crawler.txt'

# read the whole file and parse it into orig_data
with open(filename, 'r', encoding='utf8') as json_file:
    orig_data = json.loads(json_file.read())

# Load stopwords
* open Vietnamese stopwords file (stopwords.txt)

In [2]:
def load_stopwords(filename='stopwords.txt'):
    """Read a stop-word file and return its entries as a list.

    Args:
        filename: path to a UTF-8 text file; every line is one entry.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace (a blank line comes back as '').
    """
    with open(filename, mode='r', encoding='utf8') as handle:
        return [line.strip() for line in handle]

# Load the default stop-word file once at module level; preprocessing() reads
# this global when it filters tokens.
stop_words = load_stopwords()

# Preprocessing data
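* **Description**: concatenate the **title**, **content** and **summary** fields of each document, then
    * remove **punctuation**
    * replace **newlines** with spaces
    * replace **non-breaking spaces** (`\xa0`, generated by BeautifulSoup) with regular spaces
    * convert to **lowercase**
    * strip **leading and trailing spaces**
    * **tokenize** the text and drop **stop words**
* **Input**: orig_data
* **Output**: dictionary that maps each document key to its list of tokens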

In [3]:
def preprocessing():
    """Turn every crawled document into a list of lowercase tokens.

    For each document in the global ``orig_data`` the 'title', 'content' and
    'summary' fields are joined with spaces, the characters in
    ``string.punctuation`` are deleted, newlines and non-breaking spaces
    become regular spaces, and the text is lowercased, stripped and tokenized
    with ``word_tokenize``. Tokens found in the global ``stop_words`` are
    dropped.

    Returns:
        dict mapping each key of ``orig_data`` to that document's token list.
    """
    global stop_words, orig_data
    keys = ['title', 'content', 'summary']
    translator = str.maketrans('', '', string.punctuation)
    # Hoisted once: set membership is O(1), whereas testing each token against
    # the raw list rescans the whole stop-word list every time.
    stop_set = set(stop_words)

    data = {}
    for item in orig_data:
        # A missing or null field contributes an empty string instead of
        # raising KeyError / TypeError half-way through the corpus.
        text = ' '.join((orig_data[item].get(key) or '') for key in keys)
        text = text.translate(translator).replace('\n', ' ').replace('\xa0', ' ')
        tokens = word_tokenize(text.lower().strip())
        data[item] = [w for w in tokens if w not in stop_set]
    return data

# Generate terms from given documents
* **Input**: preprocessed data
* **Output**: set of terms

In [4]:
def get_terms(data):
    """Collect the vocabulary of a tokenized corpus.

    Args:
        data: mapping of document key -> iterable of tokens.

    Returns:
        A set holding every distinct token found across all documents.
    """
    return {token for doc_id in data for token in data[doc_id]}

# Create boolean table
* **Input**: preprocessed data and set of terms
* **Output**: boolean table - a dictionary that maps each term to the list of documents containing it

In [5]:
def create_boolean_table(data, terms):
    """Build the term -> documents table.

    Args:
        data: mapping of document key -> list of tokens.
        terms: iterable of terms to index.

    Returns:
        dict mapping every term in ``terms`` to the list of document keys
        whose token list contains it. Keys appear in the iteration order of
        ``data``; a term that occurs in no document maps to an empty list.
    """
    # One entry per requested term, so terms without any document are kept.
    boolean_table = {term: [] for term in terms}
    # Single pass over the documents instead of scanning every document's
    # token list once per term.
    for item in data:
        # set() so a term repeated inside one document is recorded only once.
        for term in set(data[item]):
            postings = boolean_table.get(term)
            if postings is not None:
                postings.append(item)
    return boolean_table

# Preprocess query
* **Input**: query in boolean format
* **Output**: list of strings in which each element is a term or an operator
    * Ex: query = 'đoàn AND NOT tàu' => result = ['đoàn', 'AND', 'NOT', 'tàu']

In [6]:
def preprocessing_query(query):
    """Split a boolean query string into terms and operators.

    Tokens that are exactly 'AND', 'OR' or 'NOT' are kept as operators.
    Every other token is a search term: single and double quotes are removed
    and the term is lowercased and stripped.

    Args:
        query: query string, e.g. 'đoàn AND NOT tàu'.

    Returns:
        list of strings in query order,
        e.g. ['đoàn', 'AND', 'NOT', 'tàu'].
    """
    operators = ('AND', 'OR', 'NOT')
    tokens = []
    # split() without an argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings.
    for w in query.split():
        # Exact comparison: a substring test would treat terms such as
        # 'NOTEBOOK' as operators and leave them un-normalized.
        if w in operators:
            tokens.append(w)
        else:
            tokens.append(w.replace("'", "").replace('"', '').lower().strip())
    return tokens

# Search
* **Input**: query, boolean table, number of documents
* **Output**: set of indexes of the documents that match the given query

In [7]:
# init operators
def init_operators(_and=False, _or=False, _not=False):
    """Build a fresh operator-flag mapping.

    Args:
        _and: value stored under 'AND'.
        _or: value stored under 'OR'.
        _not: value stored under 'NOT'.

    Returns:
        A new dict with the keys 'AND', 'OR' and 'NOT', in that order.
    """
    return dict(AND=_and, OR=_or, NOT=_not)

In [8]:
def search(query, table, num_docs):
    """Evaluate a boolean query against the term table.

    The query is tokenized with ``preprocessing_query`` and evaluated strictly
    left to right; there is no operator precedence and no parentheses. The
    running result starts as the full document set, and each term is combined
    into it using the operators that were seen since the previous term.

    Args:
        query: raw query string, e.g. 'đoàn AND NOT tàu'.
        table: mapping term -> iterable of document ids.
        num_docs: number of documents; the universe of ids is the strings
            '0' .. str(num_docs - 1).

    Returns:
        set of document ids (strings) matching the query. A query without any
        term returns the full id set.
    """
    # preprocess query
    query = preprocessing_query(query)

    # all document indices; also the starting value of the result
    all_docs = {str(x) for x in range(num_docs)}
    result = set(all_docs)

    # name operators
    bool_names = ['AND', 'OR', 'NOT']

    # the first term is AND-ed with the full set, i.e. it becomes the result
    operators = init_operators(True)

    for w in query:
        if w in bool_names:
            operators[w] = True
            continue

        # documents containing the term (none if the term is unknown)
        tmp = set(table[w]) if w in table else set()
        if operators['NOT']:
            tmp = all_docs - tmp
        # A term that is not preceded by AND/OR (e.g. 'đoàn tàu' or
        # 'a NOT b') is combined with an implicit AND instead of being
        # silently discarded.
        if operators['AND'] or not operators['OR']:
            result &= tmp
        if operators['OR']:
            result |= tmp
        # operators only apply to the term that follows them
        operators = init_operators()

    return result

# Export result
* **Input**: indexes of the documents to export
* Default **filename** is output.txt

In [9]:
def export(result, filename='output.txt'):
    """Write the matching documents to a text file.

    The file starts with a 'Total: N' line. Then, for every id in ``result``,
    each field of ``orig_data[id]`` is written as '<key>:' followed by its
    value on the next line, with a blank line after each document.

    Args:
        result: collection of document ids that are keys of the global
            ``orig_data``.
        filename: path of the output file (overwritten if it exists).
    """
    global orig_data

    def _order(doc_id):
        # Numeric ids first in numeric order, any other id afterwards in
        # alphabetical order.
        text = str(doc_id)
        return (0, int(text), '') if text.isdecimal() else (1, 0, text)

    with open(filename, 'w', encoding='utf8') as file:
        file.write('Total: ' + str(len(result)) + '\n')
        # result is usually a set; sorting makes the file identical from one
        # run to the next.
        for i in sorted(result, key=_order):
            for key in orig_data[i]:
                value = orig_data[i][key]
                # A null or non-string field must not abort the export and
                # leave a half-written file.
                file.write(key + ':\n' + ('' if value is None else str(value)) + '\n')
            file.write('\n')

# Main
* preprocessing data
* generate set of terms
* create boolean table from data and terms
* input query
* search query in documents
* export result

In [12]:
def main():
    """Run the pipeline once: index the corpus, read a query, export the hits."""
    # tokenized documents built from orig_data
    documents = preprocessing()

    # term table over the full vocabulary of the corpus
    index = create_boolean_table(documents, get_terms(documents))

    # input query. Ex: mắt AND đoàn
    user_query = input('Input query: ')

    # evaluate the query and write the matches to export()'s default file
    export(search(user_query, index, len(documents)))


if __name__ == '__main__':
    main()

Input query: đoàn AND NOT tàu
