In [1]:
import itertools
import logging
import os
import pickle
import shutil
from multiprocessing import Pool
from subprocess import call

import metapy
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Default number of worker processes (and input partitions) used by
# generate_queries_multiprocess.
NUM_PROCESSES = 8

In [None]:
# Load the input table. Later cells read its 'title', 'summary' and 'document'
# columns (see generate_queries) and fit the vectorizer on 'document'.
df = pd.read_csv('wiki_old_input.csv')

Set up logging (one DEBUG-level log file per worker process)

In [18]:
# Shared line format for every log file: timestamp, level, message.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

# modified from https://stackoverflow.com/questions/11232230/logging-to-two-files-with-different-settings
def generate_logger(name):
    """Return a DEBUG-level logger that writes to the file '<name>.log'.

    `name` is used both as the logger name and as the log file path without
    its extension. The file is opened in 'w' mode, so it is truncated on
    every call.

    The function is safe to call repeatedly with the same name (for example
    when the notebook cell is re-run): handlers attached by an earlier call
    are closed and removed first, so each message is written once and no
    file handles are leaked. Missing parent directories of the log file are
    created.

    Parameters
    ----------
    name : str
        Logger name / log file path without the '.log' suffix.

    Returns
    -------
    logging.Logger
        The configured logger with exactly one file handler.
    """
    log_path = '{}.log'.format(name)

    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so drop
    # whatever an earlier call attached before adding the new handler.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler = logging.FileHandler(log_path, mode='w')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    return logger

# One logger per worker process. generate_queries looks its logger up as
# loggers[p_index - 1] with p_index in 1..NUM_PROCESSES, so the list is sized
# from NUM_PROCESSES instead of a separate hard-coded count.
loggers = [generate_logger('search/wiki_{}/wiki_{}'.format(i, i)) for i in range(1, NUM_PROCESSES + 1)]

Train vectorizer

In [5]:
# All entries of the 'document' column as a plain list; this is the corpus the
# TF-IDF vectorizer is fitted on. The trailing expression displays its size.
corpus = df['document'].tolist()
len(corpus)

8650

In [6]:
# Fit the TF-IDF vectorizer on the whole corpus. Only the fitted `vectorizer`
# is reused later (transform_query, pickling); the matrix returned by
# fit_transform is not stored and is merely shown as the cell output.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

<8650x1001117 sparse matrix of type '<class 'numpy.float64'>'
	with 18827210 stored elements in Compressed Sparse Row format>

In [39]:
def transform_query(raw_query):
    """Return the TF-IDF representation of a single raw query string.

    Uses the module-level fitted `vectorizer`, so the result lives in the
    same feature space as the corpus it was fitted on.
    """
    # transform() expects an iterable of documents, hence the one-element list
    return vectorizer.transform([raw_query])

Extract the queries using metapy (the Python bindings for the MeTA toolkit)

In [32]:
def get_sentences(summary):
    """Split `summary` on '.' and return the pieces with surrounding whitespace removed.

    Empty pieces (e.g. the one after a trailing period) are kept; callers
    filter them out themselves.
    """
    return list(map(str.strip, summary.split(".")))

def write_doc(document, filename, p_index):
    """Write `document` as a one-sentence-per-line file plus a parallel metadata file.

    The document is split on '.', blank pieces are dropped, and each
    remaining sentence is written on its own line of `filename`. A label
    'SEN<i>' is written for the i-th sentence to
    'search/wiki_<p_index>/metadata.dat'.

    Whitespace inside a sentence (including line breaks) is collapsed to a
    single space. Without this, a sentence containing a newline would span
    several lines of the document file while receiving only one metadata
    label, and the two files would no longer line up.

    Both files are written as UTF-8 so the result does not depend on the
    platform's default encoding.

    Parameters
    ----------
    document : str
        Full text to split into sentences.
    filename : str
        Path of the sentence file to (over)write.
    p_index : int
        Worker index selecting the 'search/wiki_<p_index>' directory for the
        metadata file. The directory must already exist.
    """
    # " ".join(x.split()) both strips the sentence and folds internal
    # whitespace/newlines, guaranteeing exactly one output line per sentence.
    sentences = [" ".join(sentence.split()) for sentence in document.split(".")]

    # get rid of empty strings
    sentences = [sentence for sentence in sentences if sentence]

    # write document to file
    with open(filename, 'w', encoding='utf-8') as doc_file:
        doc_file.writelines("{}\n".format(sentence) for sentence in sentences)

    # write metadata
    with open('search/wiki_{}/metadata.dat'.format(p_index), 'w', encoding='utf-8') as meta_file:
        meta_file.writelines("SEN{}\n".format(i) for i in range(len(sentences)))

def remove_old_idx(p_index):
    """Delete the directory 'search/idx_<p_index>' and everything below it.

    Best-effort, like the `rm -r` call it replaces: a missing directory (for
    example on the very first run) is not an error. Done in-process with
    shutil instead of spawning an external `rm`, which keeps it portable and
    avoids a subprocess launch on every search.
    """
    shutil.rmtree("search/idx_{}".format(p_index), ignore_errors=True)
    
def get_stringified_list(idx, search_results):
    """Map ranked search results to the 'content' metadata of each hit.

    `search_results` is an iterable of (doc_id, score) pairs; the scores are
    ignored and the order of the results is preserved.
    """
    contents = []
    for doc_id, _score in search_results:
        doc_metadata = idx.metadata(doc_id)
        contents.append(doc_metadata.get('content'))
    return contents

def search(document, summary, p_index, num_results=5):
    """Return the sentences of `document` that best match `summary`.

    The document is written sentence-per-line into the per-process directory
    'search/wiki_<p_index>', an inverted index is created from
    'search/config_<p_index>.toml', and `summary` is scored against it with
    metapy's OkapiBM25 ranker.

    Parameters
    ----------
    document : str
        Text whose sentences form the searchable collection.
    summary : str
        Query text.
    p_index : int
        Worker index selecting the per-process files, so that parallel
        workers do not overwrite each other's documents and indexes.
    num_results : int, optional
        Maximum number of hits to return (default 5, the previously
        hard-coded value).

    Returns
    -------
    list
        The 'content' metadata of each hit, best match first.
    """
    write_doc(document, 'search/wiki_{}/wiki_{}.dat'.format(p_index, p_index), p_index)

    # Drop the previous index directory first - presumably so that
    # make_inverted_index rebuilds from the freshly written file instead of
    # reusing a stale on-disk index (TODO confirm against the .toml config).
    remove_old_idx(p_index)

    idx = metapy.index.make_inverted_index('search/config_{}.toml'.format(p_index))

    ranker = metapy.index.OkapiBM25()

    query = metapy.index.Document()
    query.content(summary)

    search_results = ranker.score(idx, query, num_results=num_results)
    return get_stringified_list(idx, search_results)

Generate raw and transformed queries

In [58]:
def generate_queries(args):
    """Generate raw and TF-IDF-transformed queries for one partition of the input.

    For every row, up to the first three non-empty sentences of the summary
    are each used as a search query against the row's document; every hit
    becomes one raw query, which is then vectorized with `transform_query`.

    Parameters
    ----------
    args : tuple
        ``(p_df, p_index)``. `p_df` is the partition's DataFrame (its
        'title', 'summary' and 'document' columns are read). `p_index` is the
        1-based worker index selecting the logger (``loggers[p_index - 1]``)
        and the per-process search files. Packed into one tuple because
        ``Pool.map`` passes a single argument.

    Returns
    -------
    tuple
        ``(queries_df, query_matrix)``. `queries_df` has the columns 'title',
        'raw_query' and 'sentence_summary'; row i of `query_matrix` is the
        transformed form of the raw query in row i of `queries_df`. When no
        query was produced, the matrix has zero rows.
    """
    p_df, p_index = args

    logger = loggers[p_index - 1]

    data = {"title": [], "raw_query": [], "sentence_summary": []}
    queries = []

    documents_generated = 0
    queries_generated = 0

    total_documents = p_df.shape[0]

    for _, row in p_df.iterrows():
        title = row['title']
        summary = row['summary']
        document = row['document']

        logger.debug(title)

        # pandas reads empty CSV cells as NaN (a float), which has no
        # .strip(); check the type first so missing and blank texts are both
        # skipped instead of raising AttributeError.
        if not (isinstance(document, str) and document.strip()):
            continue
        if not (isinstance(summary, str) and summary.strip()):
            continue

        # only take first 3 non-empty sentences
        sentences = [sentence for sentence in get_sentences(summary) if sentence][:3]

        for sentence in sentences:

            # extract query
            raw_queries = search(document, sentence, p_index)

            for raw_query in raw_queries:
                query = transform_query(raw_query)

                # add query info to data
                data["raw_query"].append(raw_query)
                data["sentence_summary"].append(sentence)
                data["title"].append(title)

                queries.append(query)

                queries_generated += 1

        documents_generated += 1

        if documents_generated % 20 == 0:
            print("Process {}: Generated {} queries for {} documents, {:.4f}% complete".format(p_index, queries_generated, documents_generated, documents_generated/total_documents * 100))

    print("Process {}: Finished generating queries".format(p_index))

    if queries:
        query_matrix = sp.sparse.vstack(queries)
    else:
        # vstack raises on an empty list. Slice a transformed empty query down
        # to zero rows to get an empty matrix with the right number of
        # feature columns, so callers can still stack the partitions.
        query_matrix = transform_query("")[:0]

    return pd.DataFrame(data=data), query_matrix

In [59]:
def store_queries_data(df_queries, queries):
    """Persist the generated queries to the 'out' directory.

    Writes the sparse query matrix to 'out/queries_matrix.npz' and the query
    table to 'out/wiki_queries.csv'. The CSV has the columns 'title',
    'raw_query' and 'sentence_summary' (in that order) and an integer index
    column named 'query_index'.

    The 'out' directory is created if it does not exist, so a long run is
    not lost to a FileNotFoundError at the very last step.

    Parameters
    ----------
    df_queries : pandas.DataFrame
        Query table containing at least the three columns listed above.
    queries : scipy sparse matrix
        Transformed queries, one row per row of `df_queries`.
    """
    # reorganize dataframe: fixed column order, integer index named 'query_index'.
    # .copy() keeps the caller's frame untouched when the index is replaced.
    df_final = df_queries[['title', 'raw_query', 'sentence_summary']].copy()
    df_final.index = df_final.index.astype(int).rename('query_index')

    os.makedirs("out", exist_ok=True)

    # store queries matrix
    sp.sparse.save_npz("out/queries_matrix.npz", queries)

    # store queries csv
    df_final.to_csv("out/wiki_queries.csv")

In [60]:
def generate_queries_multiprocess(df, num_processes=NUM_PROCESSES):
    """Run `generate_queries` over `df` in parallel and merge the results.

    `df` is split into `num_processes` row-wise partitions; partition i is
    handled with worker index i + 1, which selects that worker's logger and
    search directory. Empty partitions (possible when `df` has fewer rows
    than `num_processes`) are not dispatched.

    The pool is used as a context manager so its worker processes are always
    cleaned up, including when a worker raises or the run is interrupted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table with the columns read by `generate_queries`.
    num_processes : int, optional
        Number of partitions and pool workers (default NUM_PROCESSES).

    Returns
    -------
    tuple
        ``(df_queries, queries)``: the concatenated query table with a fresh
        0..n-1 index and the row-stacked sparse query matrix, in the same
        row order.

    Raises
    ------
    ValueError
        If `df` has no rows at all, since there is nothing to concatenate.
    """
    p_dfs = np.array_split(df, num_processes)

    args_by_process = [(p_df, i + 1) for i, p_df in enumerate(p_dfs) if len(p_df) > 0]

    # Leaving the with-block terminates the workers; by then map() has
    # already collected every result, so nothing is lost.
    with Pool(processes=num_processes) as pool:
        results = pool.map(generate_queries, args_by_process)

    df_queries = pd.concat([result[0] for result in results], ignore_index=True)
    queries = sp.sparse.vstack([result[1] for result in results])

    return df_queries, queries

def create_queries():
    """Generate queries for the global `df` and write them to disk.

    Delegates the parallel generation to generate_queries_multiprocess and
    the persistence to store_queries_data, printing a line before and after.
    """
    print("Started generating queries")

    # generate_queries_multiprocess returns (df_queries, queries), which is
    # exactly the positional argument order of store_queries_data.
    store_queries_data(*generate_queries_multiprocess(df))

    print("Finished generating queries")

# Run the full pipeline. This is the long-running step: it generates the
# queries in parallel and, via store_queries_data, writes
# out/queries_matrix.npz and out/wiki_queries.csv.
create_queries()

Started generating queries
Process 1: Generated 274 queries for 20 documents, 1.8484% complete
Process 2: Generated 265 queries for 20 documents, 1.8484% complete
Process 3: Generated 282 queries for 20 documents, 1.8501% complete
Process 4: Generated 275 queries for 20 documents, 1.8501% complete
Process 1: Generated 564 queries for 40 documents, 3.6969% complete
Process 2: Generated 565 queries for 40 documents, 3.6969% complete
Process 5: Generated 288 queries for 20 documents, 1.8501% complete
Process 3: Generated 562 queries for 40 documents, 3.7003% complete
Process 6: Generated 280 queries for 20 documents, 1.8501% complete
Process 4: Generated 560 queries for 40 documents, 3.7003% complete
Process 1: Generated 834 queries for 60 documents, 5.5453% complete
Process 7: Generated 290 queries for 20 documents, 1.8501% complete
Process 8: Generated 295 queries for 20 documents, 1.8501% complete
Process 2: Generated 832 queries for 60 documents, 5.5453% complete
Process 3: Generated 

Process ForkPoolWorker-203:
Process ForkPoolWorker-204:
Traceback (most recent call last):
Process ForkPoolWorker-206:
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-202:
Process ForkPoolWorker-205:
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Pyth

  File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "<ipython-input-39-6efda619f4da>", line 3, in transform_query
    sparse_query = vectorizer.transform([raw_query])
  File "/usr/local/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 1410, in transform
    return self._tfidf.transform(X, copy=False)
  File "/usr/local/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 1111, in transform
    X = X * self._idf_diag
  File "<ipython-input-58-cd785e07ce8b>", line 34, in generate_queries
    raw_queries = search(document, sentence, p_index)
  File "/usr/local/lib/python3.6/site-packages/scipy/sparse/base.py", line 369, in __mul__
    return self._mul_sparse_matrix(other)
  File "/usr/local/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 1111, in transform
    X = X * self._idf_diag
  File "/usr

KeyboardInterrupt: 

In [None]:
# store vectorizer
# The with-block closes the file even if pickling fails part-way.
# NOTE: only load this file back from a trusted location - unpickling can
# execute arbitrary code.
os.makedirs("out", exist_ok=True)
with open("out/vectorizer.pickle", "wb") as pickle_out:
    pickle.dump(vectorizer, pickle_out)