In [1]:
import itertools
import logging
import pickle
import shutil
from multiprocessing import Pool
from subprocess import call

import metapy
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse  # make sure the `sp.sparse` submodule is actually loaded
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Number of worker processes; default for the `num_processes` argument of generate_queries_multiprocess.
NUM_PROCESSES=8

Set up per-process debug logging (one log file per worker)

In [3]:
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

# modified from https://stackoverflow.com/questions/11232230/logging-to-two-files-with-different-settings
def generate_logger(name):
    """Return a DEBUG-level logger named `name` that writes to '<name>.log'.

    The log file is opened in 'w' mode, so it is truncated on every call.
    Calling this function again with the same name (for example when the
    notebook cell is re-run) replaces the previous file handler instead of
    stacking a second one, so each record is written exactly once.

    name: logger name; also the log file path without the '.log' suffix.
    Raises: whatever logging.FileHandler raises when the file cannot be opened
        (e.g. the parent directory does not exist).
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger hands back the same object for the same name, so file
    # handlers attached by an earlier call are still there: detach and close them
    for stale_handler in list(logger.handlers):
        if isinstance(stale_handler, logging.FileHandler):
            logger.removeHandler(stale_handler)
            stale_handler.close()

    handler = logging.FileHandler('{}.log'.format(name), mode='w')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    return logger

# One logger (and log file) per worker process. Sized from NUM_PROCESSES so that
# generate_queries can look up loggers[p_index - 1] for every worker index.
loggers = [
    generate_logger('search/wiki_{}/wiki_{}'.format(i, i))
    for i in range(1, NUM_PROCESSES + 1)
]

Load in our vectorizer

In [4]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# vectorizer file that was produced by a trusted source.
# The `with` block closes the file even if unpickling fails.
with open("out/vectorizer.pickle", "rb") as pickle_in:
    vectorizer = pickle.load(pickle_in)

vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
def transform_query(raw_query):
    """Join the sentences of a raw query and transform them with the TF-IDF vectorizer.

    raw_query: iterable of sentence strings.
    Returns the result of `vectorizer.transform` for the one-element list
    holding the joined text.
    """
    # sentences are separated by ". " and the text is closed with a final "."
    joined_text = "{}.".format(". ".join(raw_query))
    return vectorizer.transform([joined_text])

Extract the queries using MeTaPy

In [6]:
def get_sentences(paragraph):
    """Split `paragraph` on "." and strip surrounding whitespace from every piece.

    Empty pieces (for instance the one after a trailing period) are kept in
    the returned list.
    """
    pieces = paragraph.split(".")
    return list(map(str.strip, pieces))

def write_doc(document, filename, p_index):
    """Write `document` as one sentence per line, plus a matching metadata file.

    document: text of the document; split on newlines, then on "." via get_sentences.
    filename: path of the sentence file to (over)write.
    p_index: worker index; the metadata goes to 'search/wiki_<p_index>/metadata.dat'.

    Both files are written as UTF-8 explicitly. Without an explicit encoding
    `open` falls back to the locale's preferred encoding, which can fail on
    (or mis-encode) non-ASCII text.
    """
    # split every line into sentences and flatten the per-line lists
    sentences = itertools.chain.from_iterable(
        get_sentences(paragraph) for paragraph in document.split("\n")
    )

    # get rid of empty strings: https://stackoverflow.com/questions/3845423/remove-empty-strings-from-a-list-of-strings
    sentences = [sentence for sentence in sentences if sentence]

    # write document to file, one sentence per line
    with open(filename, 'w', encoding='utf-8') as doc_file:
        doc_file.writelines("{}\n".format(sentence) for sentence in sentences)

    # write metadata: one "SEN<i>" line per sentence line written above
    with open('search/wiki_{}/metadata.dat'.format(p_index), 'w', encoding='utf-8') as meta_file:
        meta_file.writelines("SEN{}\n".format(i) for i in range(len(sentences)))

def remove_old_idx(p_index):
    """Delete the directory 'search/idx_<p_index>' and everything below it.

    Best effort: a missing directory (e.g. on the very first run) or any other
    removal error is ignored, and nothing is returned.
    """
    # shutil.rmtree replaces the former `rm -r` subprocess: no external binary,
    # works on every platform, and no error output when the directory is absent
    shutil.rmtree("search/idx_{}".format(p_index), ignore_errors=True)
    
def get_stringified_list(idx, search_results):
    """Look up the 'content' metadata field of every search hit.

    idx: object whose `metadata(doc_id)` result offers a `get` method.
    search_results: iterable of (doc_id, score) pairs; the score is ignored.
    Returns a list with one entry per hit, in the order of `search_results`.
    """
    contents = []
    for doc_id, _score in search_results:
        contents.append(idx.metadata(doc_id).get('content'))
    return contents

def search(document, summary, p_index, num_results=5):
    """Rank the sentences of `document` against `summary` with Okapi BM25.

    document: text to search in; written sentence-per-line by write_doc.
    summary: query text.
    p_index: worker index; selects the 'search/wiki_<p_index>' files, the
        'search/idx_<p_index>' directory and 'search/config_<p_index>.toml'.
    num_results: maximum number of hits requested from the ranker (default 5,
        the value that used to be hard-coded).
    Returns the 'content' metadata of the hits (see get_stringified_list).
    """
    corpus_path = 'search/wiki_{}/wiki_{}.dat'.format(p_index, p_index)
    write_doc(document, corpus_path, p_index)

    # remove the index directory left over from the previous document before
    # asking metapy for an index again
    # NOTE(review): presumably the config points at that directory - confirm in the .toml
    remove_old_idx(p_index)

    idx = metapy.index.make_inverted_index('search/config_{}.toml'.format(p_index))
    ranker = metapy.index.OkapiBM25()

    query = metapy.index.Document()
    query.content(summary)

    search_results = ranker.score(idx, query, num_results=num_results)
    return get_stringified_list(idx, search_results)

Generate raw and transformed queries

In [7]:
def generate_queries(args):
    """Build one query per summary sentence for every row of a dataframe partition.

    args: tuple `(p_df, p_index)`.
        p_df: DataFrame with the columns 'title', 'summary' and 'document'.
        p_index: 1-based worker index; selects the logger and is passed to search.
    Returns `(df, matrix)`: a DataFrame with the columns 'title', 'raw_query'
    and 'sentence_summary' (one row per query) and a sparse matrix with the
    transformed queries stacked in the same order. When no query was produced
    the matrix has 0 rows and one column per vectorizer vocabulary entry.

    Rows whose document or summary is missing (NaN) or blank are skipped.
    """
    p_df, p_index = args

    logger = loggers[p_index - 1]

    data = {"title": [], "raw_query": [], "sentence_summary": []}
    queries = []

    documents_generated = 0
    queries_generated = 0
    total_documents = p_df.shape[0]

    for _, row in p_df.iterrows():
        title = row['title']
        summary = row['summary']
        document = row['document']

        logger.debug(title)

        # pandas.read_csv turns empty cells into NaN (a float), which has no
        # .strip(); treat anything that is not a string as a missing value
        if not isinstance(document, str) or not isinstance(summary, str):
            continue
        if (not document.strip()) or (not summary.strip()):
            continue

        sentences = [sentence for sentence in get_sentences(summary) if sentence]

        for sentence in sentences:
            # extract query
            raw_query = search(document, sentence, p_index)
            query = transform_query(raw_query)

            # add query info to data
            data["raw_query"].append(raw_query)
            data["sentence_summary"].append(sentence)
            data["title"].append(title)

            queries.append(query)
            queries_generated += 1

        documents_generated += 1

        if documents_generated % 400 == 0:
            print("Process {}: Generated {} queries for {} documents, {:.4f}% complete".format(p_index, queries_generated, documents_generated, documents_generated/total_documents * 100))

    print("Process {}: Finished generating queries".format(p_index))

    if not queries:
        # scipy.sparse.vstack rejects an empty list (empty partition or every
        # row skipped); return an explicit 0-row matrix of the right width
        empty_queries = sp.sparse.csr_matrix((0, len(vectorizer.vocabulary_)))
        return pd.DataFrame(data=data), empty_queries

    return pd.DataFrame(data=data), sp.sparse.vstack(queries)

In [8]:
def store_queries_data(df_queries, queries, index):
    """Persist one batch of queries as a CSV file and a sparse-matrix .npz file.

    df_queries: DataFrame holding at least the columns 'title', 'raw_query'
        and 'sentence_summary'.
    queries: scipy sparse matrix with the transformed queries.
    index: batch number used in both output file names.

    Writes 'out/queries_matrix_<index>.npz' and 'out/wiki_queries_<index>.csv';
    the CSV's row index is stored in a column named 'query_index'.
    """
    # fix the column order and name the row index; rename_axis only labels the
    # index, whereas the former reindex() round trip failed on duplicate labels
    df_final = df_queries[['title', 'raw_query', 'sentence_summary']].rename_axis('query_index')
    df_final.index = df_final.index.astype(int)

    # store queries matrix
    sp.sparse.save_npz("out/queries_matrix_{}.npz".format(index), queries)

    # store queries csv
    df_final.to_csv("out/wiki_queries_{}.csv".format(index))

In [9]:
def generate_queries_multiprocess(df, batch_index, num_processes=NUM_PROCESSES):
    """Run generate_queries over `df` in `num_processes` worker processes.

    df: DataFrame of one batch; split into `num_processes` partitions with
        np.array_split (partitions are empty when df has fewer rows than that).
    batch_index: accepted for the caller's convenience; not used here.
    num_processes: number of partitions and of pool workers. Partition i is
        handed to generate_queries with the 1-based worker index i + 1.
    Returns `(df_queries, queries)`: the per-partition DataFrames concatenated
    with a fresh index, and the per-partition sparse matrices stacked in the
    same order.
    """
    p_dfs = np.array_split(df, num_processes)
    args_by_process = [(p_df, i + 1) for i, p_df in enumerate(p_dfs)]

    pool = Pool(processes=num_processes)
    try:
        results = pool.map(generate_queries, args_by_process)
    finally:
        # release the workers on success and on failure alike; join() waits for
        # them to exit so worker processes do not pile up across batches
        pool.close()
        pool.join()

    df_queries = pd.concat([part_df for part_df, _ in results], ignore_index=True)
    queries = sp.sparse.vstack([part_matrix for _, part_matrix in results])

    return df_queries, queries

def create_queries_batched(num_batches=8):
    """Generate and store queries for the batches 1..num_batches.

    For every batch number i this reads 'data/wiki_summaries_<i>.csv', runs
    generate_queries_multiprocess on it and hands the result to
    store_queries_data.

    num_batches: number of batch files to process (default 8). The loop used
        to be hard-coded to 8 batches regardless of this argument.
    """
    for i in range(1, num_batches + 1):
        batch_df = pd.read_csv("data/wiki_summaries_{}.csv".format(i))

        print("Batch {}: Started generating queries".format(i))

        df_queries, queries = generate_queries_multiprocess(batch_df, i)

        store_queries_data(df_queries, queries, i)

        print("Batch {}: Finished generating queries".format(i))

# Run the batched query generation; progress is printed per batch and per worker process.
create_queries_batched()

Batch 1: Started generating queries
Process 4: Finished generating queries
Process 3: Finished generating queries
Process 8: Finished generating queries
Process 5: Finished generating queries
Process 1: Finished generating queries
Process 2: Finished generating queries
Process 6: Finished generating queries
Process 7: Finished generating queries
Batch 1: Finished generating queries
Batch 2: Started generating queries
Process 5: Finished generating queries
Process 8: Finished generating queries
Process 7: Finished generating queries
Process 6: Finished generating queries
Process 3: Finished generating queries
Process 1: Finished generating queries
Process 4: Finished generating queries
Process 2: Finished generating queries
Batch 2: Finished generating queries
Batch 3: Started generating queries
Process 6: Finished generating queries
Process 5: Finished generating queries
Process 8: Finished generating queries
Process 4: Finished generating queries
Process 7: Finished generating queries