In [9]:
import json
from preprocessing import preprocess_document

def load_data(dataset_name='med'):
    """
    Loads the corpus, the queries and the ground truth of a dataset.

    Three JSON files are read from the 'data/datasets/' folder, which is
    resolved relative to the current working directory:
    '<dataset_name>_dataset.json', '<dataset_name>_queries.json' and
    '<dataset_name>_groundtruth.json'.
    Each file must be a JSON object whose payload is stored under the
    top-level key 'dataset', 'queries' and 'groundtruth' respectively.

    :param dataset_name: str, prefix shared by the three JSON files
    :return: tuple (dataset (corpus), queries unlabelled, queries labels)
    :raises FileNotFoundError: if one of the three files does not exist
    :raises KeyError: if a file lacks its expected top-level key
    """
    path_prefix = 'data/datasets/' + dataset_name

    loaded = []
    for suffix in ('_dataset.json', '_queries.json', '_groundtruth.json'):
        # JSON text is UTF-8 by specification; being explicit keeps the
        # result independent of the platform's default locale encoding.
        with open(path_prefix + suffix, encoding='utf-8') as json_file:
            loaded.append(json.load(json_file))

    dataset, queries, ground_truth = loaded
    return dataset['dataset'], queries['queries'], ground_truth['groundtruth']

def descripteur_ensembliste_document(document):
    """
    Builds the set-based ("ensembliste") descriptor of a document.
    Frequencies are ignored: only the words themselves are kept.
    :param document: bag of words dict {"word" : frequency}
    :return: list of the words of the document, each one appearing once,
             in the iteration order of the dict
    """
    vocabulary = list(document.keys())
    return vocabulary

def index_construction(corpus_tokenized):
    """
    Builds a linear index: one set-based descriptor per document.
    :param corpus_tokenized: dict {document_id : document}, where each
                             document is a bag of words
    :return: dict {document_id : descripteur_ensembliste_document(document)}
    """
    return {
        doc_id: descripteur_ensembliste_document(corpus_tokenized[doc_id])
        for doc_id in corpus_tokenized
    }

In [2]:
# Load the 'cacm' collection. load_data returns (corpus, queries, ground truth),
# so the third name, `bagwords`, actually receives the ground-truth labels.
dataset, queries, bagwords = load_data('cacm')

In [4]:
def preprocess_corpus(corpus, stop_list, stop_words=True, stemm=True, bag_words=True):
    """
    Applies preprocess_document to the text of every document of a corpus.
    :param corpus: iterable of documents; each document is a dict holding at
                   least the keys "id" and "text"
    :param stop_list: array of stop words, forwarded to preprocess_document
    :param stop_words: boolean, True to remove stop words, False otherwise
    :param stemm: boolean, True to stem words, False otherwise
    :param bag_words: boolean, True to turn documents into bags of words,
                      False otherwise
    :return: dictionary {id: preprocessed document}
    """
    def clean(text):
        # The same options are applied to every document of the corpus.
        return preprocess_document(text, stop_list, stop_words, stemm, bag_words)

    preprocessed_by_id = {}
    for entry in corpus:
        # Documents sharing an id overwrite each other: the last one wins.
        preprocessed_by_id[entry['id']] = clean(entry['text'])
    return preprocessed_by_id

In [7]:
import numpy as np
# Read the English stop list from disk as a NumPy array of strings.
stop_list = np.genfromtxt('data/stoplist/stoplist-english.txt', dtype='str')

# Preprocess every document of the loaded corpus with the default options
# (stop_words, stemm and bag_words are all left at True).
corpus = preprocess_corpus(dataset, stop_list)

In [10]:
# Build the linear index: document id -> list of the keys (words) of that
# document's bag of words.
index = index_construction(corpus)