In [10]:
import itertools
import logging
import os
import pickle
import shutil
from multiprocessing import Pool
from subprocess import call

import metapy
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Default number of worker processes used by generate_queries_multiprocess.
NUM_PROCESSES=8

In [12]:
# Load the articles table; later cells read its 'title', 'summary',
# 'document', 'headers' and 'sidebar' columns.
df = pd.read_csv('wiki_old_updated.csv')

Set up per-process debug logging

In [13]:
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

# modified from https://stackoverflow.com/questions/11232230/logging-to-two-files-with-different-settings
def generate_logger(name):
    """Return a DEBUG-level logger that writes to ``<name>.log``.

    The log file is opened in 'w' mode, so it is truncated on every call.
    Calling this again with the same ``name`` (for example when the cell is
    re-run) replaces the previously attached handlers instead of stacking a
    new one, so each message is written exactly once.

    Args:
        name: Logger name; also used as the log file path without the
            ``.log`` suffix. Missing parent directories are created.

    Returns:
        The configured ``logging.Logger``.
    """
    log_path = '{}.log'.format(name)

    # FileHandler does not create parent directories on its own.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so drop
    # (and close) handlers left over from an earlier call before adding ours.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler = logging.FileHandler(log_path, mode='w')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    return logger

# One logger per worker process; generate_queries looks them up as
# loggers[p_index - 1], so the count has to follow NUM_PROCESSES.
loggers = [generate_logger('search/wiki_{}/wiki_{}'.format(i, i)) for i in range(1, NUM_PROCESSES + 1)]

Train vectorizer

In [14]:
# Text of every article's 'document' column, as a plain list; used to fit
# the TF-IDF vectorizer.
corpus = df['document'].tolist()
len(corpus)

8650

In [15]:
# Fit the TF-IDF vectorizer on the whole corpus; transform_query reuses this
# fitted vectorizer. The transformed matrix is only displayed, not stored.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

<8650x994889 sparse matrix of type '<class 'numpy.float64'>'
	with 18586133 stored elements in Compressed Sparse Row format>

In [16]:
# vectorize a raw query string with the fitted tf-idf model
def transform_query(raw_query):
    """Return the TF-IDF representation of one raw query string.

    The string is wrapped in a one-element list before being handed to the
    module-level ``vectorizer``.
    """
    return vectorizer.transform([raw_query])

Extract the queries using MeTaPy

In [162]:
def get_sentences(summary):
    """Split ``summary`` on '.' and strip whitespace from each piece.

    Empty pieces (such as the one after a trailing period) are kept.
    """
    return list(map(str.strip, summary.split(".")))

def write_doc(sentences, filename, p_index):
    """Write the sentence corpus and its metadata file for one worker.

    Args:
        sentences: List of sentences; each is written on its own line.
        filename: Path of the corpus file to (re)create.
        p_index: Worker index; selects ``search/wiki_<p_index>/metadata.dat``.
    """
    # corpus file: one sentence per line
    corpus_lines = ["{}\n".format(sentence) for sentence in sentences]
    with open(filename, 'w+') as doc_file:
        doc_file.writelines(corpus_lines)

    # metadata file: one "SEN<i>" label per sentence, in the same order
    metadata_path = 'search/wiki_{}/metadata.dat'.format(p_index)
    metadata_lines = ["SEN{}\n".format(i) for i in range(len(sentences))]
    with open(metadata_path, 'w+') as meta_file:
        meta_file.writelines(metadata_lines)

def remove_old_idx(p_index):
    """Delete the index directory ``search/idx_<p_index>`` if it exists.

    Uses ``shutil.rmtree`` instead of shelling out to ``rm -r`` so it works
    without a POSIX ``rm`` binary and stays silent when the directory is not
    there yet. Removal is best-effort: errors are ignored.
    """
    shutil.rmtree("search/idx_{}".format(p_index), ignore_errors=True)
    
def get_stringified_list(idx, search_results):
    """Look up the 'content' metadata field for every search hit.

    Args:
        idx: Index object exposing ``metadata(doc_id)``.
        search_results: Iterable of ``(doc_id, score)`` pairs.

    Returns:
        List of 'content' values, in the order of ``search_results``.
    """
    contents = []
    for doc_id, _score in search_results:
        contents.append(idx.metadata(doc_id).get('content'))
    return contents

# normalized = score / (num_words_in_query * log10(num_sentences))
def normalize_scores(search_results, sentences, summary):
    """Scale raw ranker scores by query length and document size.

    Args:
        search_results: Iterable of ``(doc_id, score)`` pairs.
        sentences: The indexed sentences; only their count is used.
        summary: The query text; its word count is taken by splitting on
            single spaces.

    Returns:
        List of normalized scores, in the order of ``search_results``.
    """
    num_words_in_query = len(summary.split(" "))
    num_sentences = len(sentences)

    # log10(1) is 0 (and log10(0) is -inf), which would turn every score into
    # inf/nan or -0.0. With at most one sentence there is nothing to
    # normalize for, so only the query length is applied.
    size_factor = np.log10(num_sentences) if num_sentences > 1 else 1.0
    denominator = num_words_in_query * size_factor

    return [score / denominator for (doc_id, score) in search_results]

def search(document, summary, p_index, num_results=5):
    """Rank the sentences of ``document`` against the query ``summary``.

    The document is split on '.', written to this worker's corpus file, and
    indexed from ``search/config_<p_index>.toml`` after removing the old
    index. The sentences are then scored with Okapi BM25.

    Args:
        document: Text whose sentences form the searchable corpus.
        summary: Query text.
        p_index: Worker index selecting the per-process files under ``search/``.
        num_results: Maximum number of hits to return.

    Returns:
        List of ``(sentence_content, normalized_score)`` pairs; empty when
        the document contains no non-empty sentence.
    """
    sentences = [sentence.strip() for sentence in document.split(".")]

    # get rid of empty strings: https://stackoverflow.com/questions/3845423/remove-empty-strings-from-a-list-of-strings
    sentences = list(filter(None, sentences))

    # Nothing to index means nothing can match; skip building an index over
    # an empty corpus file.
    if not sentences:
        return []

    write_doc(sentences, 'search/wiki_{}/wiki_{}.dat'.format(p_index, p_index), p_index)
    remove_old_idx(p_index)

    idx = metapy.index.make_inverted_index('search/config_{}.toml'.format(p_index))

    ranker = metapy.index.OkapiBM25()

    query = metapy.index.Document()
    query.content(summary)

    search_results = ranker.score(idx, query, num_results=num_results)

    normalized_scores = normalize_scores(search_results, sentences, summary)

    stringified_results = get_stringified_list(idx, search_results)

    return list(zip(stringified_results, normalized_scores))

Generate raw and transformed queries

In [181]:
def _is_blank(text):
    """Return True when ``text`` is not a string (e.g. NaN/None) or is only whitespace."""
    return not isinstance(text, str) or not text.strip()


def _record_query(data, queries, title, document, raw_query, sentence_summary, score):
    """Vectorize ``raw_query`` and append one row to ``data`` and ``queries``."""
    vector = transform_query(raw_query)

    data["raw_query"].append(raw_query)
    data["sentence_summary"].append(sentence_summary)
    data["title"].append(title)
    data["document"].append(document)
    data["normalized_score"].append(score)

    queries.append(vector)


def _add_field_queries(field_value, summary, title, document, p_index, data, queries):
    """Use each ' --- '-separated entry of ``field_value`` as a query against ``summary``.

    The best matching summary sentence (if any) is recorded with the entry.
    Returns the number of queries that were added.
    """
    if pd.isnull(field_value):
        return 0

    added = 0
    for entry in field_value.split(" --- "):
        best_match = search(summary, entry, p_index, num_results=1)
        if len(best_match) == 0:
            continue

        matched_sentence, score = best_match[0]
        _record_query(data, queries, title, document, entry, matched_sentence, score)
        added += 1

    return added


def generate_queries(args):
    """Build the query table and query matrix for one partition of articles.

    For every row, each of the first three summary sentences is searched
    against the document and every hit becomes a query (stored with its score
    negated, see the TODO below). Each header and each sidebar entry is then
    used as a query and paired with its best matching summary sentence.

    Rows whose document or summary is missing (NaN/None) or blank are skipped.

    Args:
        args: Tuple ``(p_df, p_index)`` - the partition DataFrame and the
            1-based worker index. Packed in one tuple because the function is
            called through ``Pool.map``.

    Returns:
        Tuple ``(DataFrame, sparse matrix)``: one row per generated query and
        the vertically stacked transformed queries.
    """
    p_df, p_index = args

    logger = loggers[p_index - 1]

    data = {"title": [], "raw_query": [], "sentence_summary": [], "document": [], "normalized_score": []}

    queries = []

    documents_generated = 0
    queries_generated = 0

    total_documents = p_df.shape[0]

    for _, row in p_df.iterrows():
        title = row['title']
        summary = row['summary']
        document = row['document']
        headers = row['headers']
        sidebar = row['sidebar']

        logger.debug(title)

        # Empty CSV cells are read back as NaN (a float), so test for
        # non-strings before calling .strip().
        if _is_blank(document) or _is_blank(summary):
            continue

        # only take first 3 non-empty sentences
        sentences = list(filter(None, get_sentences(summary)))[:3]

        for sentence in sentences:
            # extract query
            for raw_query, normalized_score in search(document, sentence, p_index):
                # TODO: remove -1. This is just to differentiate
                _record_query(data, queries, title, document, raw_query, sentence, normalized_score * (-1))
                queries_generated += 1

        # more experimental queries
        for field_value in (headers, sidebar):
            queries_generated += _add_field_queries(field_value, summary, title, document, p_index, data, queries)

        documents_generated += 1

        if documents_generated % 20 == 0:
            print("Process {}: Generated {} queries for {} documents, {:.4f}% complete".format(p_index, queries_generated, documents_generated, documents_generated/total_documents * 100))

    print("Process {}: Finished generating queries".format(p_index))

    # NOTE(review): if no query was generated, `queries` is empty here -
    # confirm how vstack should behave for an empty partition.
    return pd.DataFrame(data=data), sp.sparse.vstack(queries)

In [182]:
def store_queries_data(df_queries, queries):
    """Persist the generated queries under ``out/``.

    Writes the sparse query matrix to ``out/queries_matrix.npz`` and the
    query table to ``out/wiki_queries.csv``, with the integer index written
    as a ``query_index`` column.

    Args:
        df_queries: DataFrame with at least the columns 'title', 'raw_query',
            'normalized_score', 'sentence_summary' and 'document'.
        queries: Sparse matrix of transformed queries.
    """
    # reorganize dataframe: fixed column order, named integer index
    df_final = df_queries[['title', 'raw_query', 'normalized_score', 'sentence_summary', 'document']]
    df_final = df_final.rename_axis('query_index')
    df_final.index = df_final.index.astype(int)

    # Create the output directory up front so a long run does not fail at
    # the very last step.
    os.makedirs("out", exist_ok=True)

    # store queries matrix
    sp.sparse.save_npz("out/queries_matrix.npz", queries)

    # store queries csv
    df_final.to_csv("out/wiki_queries.csv")

In [183]:
def generate_queries_multiprocess(df, num_processes=NUM_PROCESSES):
    """Run ``generate_queries`` over ``df`` in parallel and merge the results.

    The rows are split into ``num_processes`` contiguous partitions; partition
    ``i`` (0-based) is handled with worker index ``i + 1``.

    Args:
        df: DataFrame of articles.
        num_processes: Number of partitions and pool workers.

    Returns:
        Tuple ``(df_queries, queries)``: the concatenated query table (with a
        fresh integer index) and the vertically stacked query matrix.
    """
    # Split row positions and slice with iloc, so the partitioning does not
    # rely on NumPy being able to split a DataFrame object directly.
    row_chunks = np.array_split(np.arange(len(df)), num_processes)
    args_by_process = [(df.iloc[rows], i + 1) for i, rows in enumerate(row_chunks)]

    # The context manager tears the pool down even when a worker raises.
    with Pool(processes=num_processes) as pool:
        results = pool.map(generate_queries, args_by_process)

    df_queries = pd.concat([result[0] for result in results], ignore_index=True)
    queries = sp.sparse.vstack([result[1] for result in results])

    return df_queries, queries

def create_queries():
    """Generate the queries for the module-level ``df`` and store them on disk."""
    print("Started generating queries")

    # generate_queries_multiprocess returns (df_queries, queries), which is
    # exactly the argument order store_queries_data expects
    store_queries_data(*generate_queries_multiprocess(df))

    print("Finished generating queries")

# Run the full pipeline: generate the queries in parallel and store them.
create_queries()

Started generating queries
Process 7: Finished generating queries
Process 6: Finished generating queries
Process 4: Finished generating queries
Process 8: Finished generating queries
Process 3: Finished generating queries
Process 1: Finished generating queries
Process 2: Finished generating queries
Process 5: Finished generating queries
Finished generating queries


In [184]:
# store the fitted vectorizer; the with-block closes the file even if
# pickling fails
with open("out/vectorizer.pickle", "wb") as pickle_out:
    pickle.dump(vectorizer, pickle_out)

In [185]:
# Read the stored queries back from disk to inspect them.
out_df = pd.read_csv('out/wiki_queries.csv')

In [186]:
# Display the reloaded queries table.
out_df

Unnamed: 0,query_index,title,raw_query,normalized_score,sentence_summary,document
0,0,James Bond.txt,"[17] Between 1953 and 1966, two years after hi...",-0.205253,The James Bond series focuses on a fictional B...,This article is about the spy series in genera...
1,1,James Bond.txt,James Bond novels and short stories,-0.179785,The James Bond series focuses on a fictional B...,This article is about the spy series in genera...
2,2,James Bond.txt,Ian Fleming created the fictional character of...,-0.178786,The James Bond series focuses on a fictional B...,This article is about the spy series in genera...
3,3,James Bond.txt,The Young Bond series of novels was started by...,-0.166493,The James Bond series focuses on a fictional B...,This article is about the spy series in genera...
4,4,James Bond.txt,Created by,-0.152363,The James Bond series focuses on a fictional B...,This article is about the spy series in genera...
5,5,James Bond.txt,"After Fleming's death a continuation novel, Co...",-0.249045,"Since Fleming's death in 1964, eight other aut...",This article is about the spy series in genera...
6,6,James Bond.txt,"[68] On 26 September 2013 Solo, written by Wil...",-0.239818,"Since Fleming's death in 1964, eight other aut...",This article is about the spy series in genera...
7,7,James Bond.txt,"[54] By the time he moved on to other, non-Bon...",-0.227171,"Since Fleming's death in 1964, eight other aut...",This article is about the spy series in genera...
8,8,James Bond.txt,"Benson, Raymond (1988)",-0.226701,"Since Fleming's death in 1964, eight other aut...",This article is about the spy series in genera...
9,9,James Bond.txt,[35] Although novelizations of two of the Eon ...,-0.199662,"Since Fleming's death in 1964, eight other aut...",This article is about the spy series in genera...


In [192]:
def print_info(print_df):
    """Print each row's query (with its score) followed by its summary sentence.

    Args:
        print_df: DataFrame with 'raw_query', 'normalized_score' and
            'sentence_summary' columns.
    """
    for _, record in print_df.iterrows():
        query_text = record['raw_query']
        query_score = record['normalized_score']

        print("----- QUERY -----")
        print(query_text, query_score)
        print("\n")

        print("----- SUMMARY -----")
        print(record['sentence_summary'])
        print("\n--------------------\n")

# Spot-check the rows at positions 15 through 45 of the reloaded queries.
print_info(out_df[15:46])

----- QUERY -----
Creation and inspiration 0.2131679046037428


----- SUMMARY -----
The films are the longest continually running film series of all time and have grossed over $7

--------------------

----- QUERY -----
Novels and related works 0.4334063409124895


----- SUMMARY -----
Additionally Charlie Higson wrote a series on a young James Bond, and Kate Westbrook wrote three novels based on the diaries of a recurring series character, Moneypenny

--------------------

----- QUERY -----
Ian Fleming novels 1.5619319870454311


----- SUMMARY -----
The James Bond series focuses on a fictional British Secret Service agent created in 1953 by writer Ian Fleming, who featured him in twelve novels and two short-story collections

--------------------

----- QUERY -----
Post-Fleming novels 1.869902456976239


----- SUMMARY -----
The James Bond series focuses on a fictional British Secret Service agent created in 1953 by writer Ian Fleming, who featured him in twelve novels and two short-sto

In [39]:
# print_info(out_df.loc[out_df.raw_query.str.len() < .5 * out_df.sentence_summary.str.len(), ['raw_query', 'sentence_summary']])

----- QUERY -----
James Bond novels and short stories


----- SUMMARY -----
The James Bond series focuses on a fictional British Secret Service agent created in 1953 by writer Ian Fleming, who featured him in twelve novels and two short-story collections

--------------------

----- QUERY -----
Created by


----- SUMMARY -----
The James Bond series focuses on a fictional British Secret Service agent created in 1953 by writer Ian Fleming, who featured him in twelve novels and two short-story collections

--------------------

----- QUERY -----
[68] On 26 September 2013 Solo, written by William Boyd, was published, set in 1969


----- SUMMARY -----
Since Fleming's death in 1964, eight other authors have written authorised Bond novels or novelizations: Kingsley Amis, Christopher Wood, John Gardner, Raymond Benson, Sebastian Faulks, Jeffery Deaver, William Boyd and Anthony Horowitz

--------------------

----- QUERY -----
Benson, Raymond (1988)


----- SUMMARY -----
Since Fleming's death i

In [40]:
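# IDEAS:
# - pull stuff from the right-hand side as queries (ex/ "Created by") -- implemented as the sidebar queries in generate_queries
# - pull content headers as queries (ex/ "origins") -- implemented as the header queries in generate_queries
# - use the title as another query (not implemented yet)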