In [1]:
import os
import pickle
import shutil
from subprocess import call

import metapy
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer

Read in our original data

In [2]:
# Load the full article texts. Later cells read the 'Titles' and 'Documents'
# columns of this frame.
df_docs = pd.read_csv("data/wiki_documents.csv")
# Preview the first rows as the cell output.
df_docs.head()

Unnamed: 0.1,Unnamed: 0,Titles,Documents,Tokenized Documents
0,0,James Bond.txt,This article is about the spy series in genera...,['This article is about the spy series in gene...
1,1,Speed.txt,This article is about the property of moving b...,['This article is about the property of moving...
2,2,Official language.txt,An official language is a language that is giv...,['An official language is a language that is g...
3,3,Federal Register.txt,Federal Register. Cover. Type. Daily official ...,"['Federal Register', 'Cover', 'Type', 'Daily o..."
4,4,Flash memory.txt,For the neuropsychological concept related to ...,['For the neuropsychological concept related t...


In [3]:
# Load the article summaries. Later cells read the 'Titles' and 'Summaries'
# columns of this frame.
df_summs = pd.read_csv("data/wiki_summaries.csv")
# Preview the first rows as the cell output.
df_summs.head()

Unnamed: 0.1,Unnamed: 0,Titles,Summaries,Tokenized Summaries
0,0,James Bond.txt,The James Bond series focuses on a fictional B...,"[""The James Bond series focuses on a fictional..."
1,1,Speed.txt,"In everyday use and in kinematics, the speed o...","['In everyday use and in kinematics, the speed..."
2,2,Official language.txt,An official language is a language that is giv...,['An official language is a language that is g...
3,3,Federal Register.txt,The Federal Register (FR or sometimes Fed. Reg...,['The Federal Register (FR or sometimes Fed. R...
4,4,Flash memory.txt,Flash memory is an electronic (solid-state) no...,['Flash memory is an electronic (solid-state) ...


In [4]:
# Sanity check: (rows, columns) of the summaries frame.
df_summs.shape

(8650, 4)

Fit our TF-IDF vectorizer to the corpus

In [5]:
# Use every document text as the TF-IDF training corpus.
corpus = df_docs['Documents'].tolist()

# Fit the vectorizer on the corpus; the fitted `vectorizer` is reused later by
# transform_query. The matrix returned by fit_transform is not stored -- it is
# only shown as the cell output.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

<8650x1001117 sparse matrix of type '<class 'numpy.float64'>'
	with 18827210 stored elements in Compressed Sparse Row format>

In [6]:
# Transform raw queries using TF-IDF.
def transform_query(raw_query):
    """Vectorize a raw query with the notebook-level fitted ``vectorizer``.

    Args:
        raw_query: iterable of sentence strings. The sentences are joined with
            ". " and a final "." is appended, giving one text to vectorize.

    Returns:
        Whatever ``vectorizer.transform`` returns for a one-element list
        holding that text (a single-row matrix).
    """
    joined_sentences = ". ".join(raw_query)
    return vectorizer.transform([joined_sentences + "."])

Extract the queries using metapy (BM25 ranking of each document's sentences against its summary)

In [7]:
def write_doc(document, filename, metadata_filename='wiki/metadata.dat'):
    """Write ``document`` as a one-sentence-per-line file plus a metadata file.

    The text is split on "." and empty pieces are dropped. Each remaining
    sentence is written on its own line of ``filename``, and a matching
    ``SEN<i>`` line is written to ``metadata_filename``.

    NOTE: splitting on "." is a rough sentence boundary. It also cuts decimal
    numbers and abbreviations (e.g. "$7.040" or "Dr. No" become two pieces).

    Args:
        document: full document text (str).
        filename: path of the sentence file to (over)write.
        metadata_filename: path of the metadata file to (over)write. Defaults
            to the location this notebook has always used.
    """
    # Collapse every whitespace run, including embedded line breaks, to a single
    # space. Without this, a sentence containing a newline would occupy several
    # lines in the sentence file while producing only one metadata line, so the
    # two files would fall out of step. str.split() with no argument also drops
    # leading/trailing whitespace, which covers the original strip().
    sentences = [" ".join(piece.split()) for piece in document.split(".")]

    # get rid of empty strings: https://stackoverflow.com/questions/3845423/remove-empty-strings-from-a-list-of-strings
    sentences = [sentence for sentence in sentences if sentence]

    # write document to file; explicit UTF-8 so non-ASCII text does not depend
    # on the platform's default encoding
    with open(filename, 'w', encoding='utf-8') as doc_file:
        for sentence in sentences:
            doc_file.write("{}\n".format(sentence))

    # write metadata: one "SEN<i>" line per sentence written above
    with open(metadata_filename, 'w', encoding='utf-8') as meta_file:
        for i in range(len(sentences)):
            meta_file.write("SEN{}\n".format(i))

def remove_old_idx():
    """Delete the ``idx`` directory (relative to the working directory), if any.

    Uses ``shutil.rmtree`` instead of shelling out to ``rm -r`` so it works on
    any platform and does not print an error when there is nothing to delete.
    A missing directory is fine; any other failure is raised rather than
    ignored, because carrying on with a leftover ``idx`` directory would mean
    the next search runs against stale data.
    """
    try:
        shutil.rmtree("idx")
    except FileNotFoundError:
        # Nothing to remove (e.g. the very first run).
        pass
    
def get_stringified_list(idx, search_results):
    """Look up the 'content' metadata field for every hit, in result order.

    Args:
        idx: object exposing ``metadata(doc_id)``, whose result has ``.get``.
        search_results: iterable of ``(doc_id, score)`` pairs; the score is
            not used.

    Returns:
        list with one ``'content'`` value per pair in ``search_results``.
    """
    contents = []
    for hit in search_results:
        doc_id, _score = hit
        contents.append(idx.metadata(doc_id).get('content'))
    return contents

def search(document, summary, num_results=5):
    """Return the document sentences that rank highest against ``summary``.

    Steps, in order:
      1. write the document out sentence-by-sentence to 'wiki/wiki.dat'
         (via ``write_doc``),
      2. delete any previous ``idx`` directory (via ``remove_old_idx``),
      3. build an inverted index from 'config.toml'
         (presumably configured to read the files written in step 1 --
         confirm against config.toml),
      4. score the summary as a query with Okapi BM25.

    Args:
        document: full document text.
        summary: summary text used as the query.
        num_results: maximum number of results requested from the ranker.
            Defaults to 5, the value this notebook previously hard-coded.

    Returns:
        list of the 'content' metadata values of the returned hits, in the
        order the ranker returned them.
    """
    write_doc(document, 'wiki/wiki.dat')
    remove_old_idx()

    idx = metapy.index.make_inverted_index('config.toml')

    ranker = metapy.index.OkapiBM25()

    query = metapy.index.Document()
    query.content(summary)

    search_results = ranker.score(idx, query, num_results=num_results)
    return get_stringified_list(idx, search_results)

# Try the pipeline on a single article: pull its full text and its summary
# by title (first matching row of each frame).
document = df_docs.loc[df_docs['Titles'] == 'James Bond.txt', 'Documents'].iloc[0]
summary = df_summs.loc[df_summs['Titles'] == 'James Bond.txt', 'Summaries'].iloc[0]

# Show the summary, then (as the cell output) the sentences `search` returns for it.
print(summary)
search(document, summary)

The James Bond series focuses on a fictional British Secret Service agent created in 1953 by writer Ian Fleming, who featured him in twelve novels and two short-story collections. Since Fleming's death in 1964, eight other authors have written authorised Bond novels or novelizations: Kingsley Amis, Christopher Wood, John Gardner, Raymond Benson, Sebastian Faulks, Jeffery Deaver, William Boyd and Anthony Horowitz. The latest novel is Trigger Mortis by Anthony Horowitz, published in September 2015. Additionally Charlie Higson wrote a series on a young James Bond, and Kate Westbrook wrote three novels based on the diaries of a recurring series character, Moneypenny.. The character has also been adapted for television, radio, comic strip, video games and film. The films are the longest continually running film series of all time and have grossed over $7.040 billion in total, making it the fourth-highest-grossing film series to date, which started in 1962 with Dr. No, starring Sean Connery 

['As of 2018, there have been twenty-four films in the Eon Productions series',
 'The most recent Bond film, Spectre (2015), stars Daniel Craig in his fourth portrayal of Bond; he is the sixth actor to play Bond in the Eon series',
 '040 billion in total, making it the fourth-highest-grossing film series to date, which started in 1962 with Dr',
 'The Bond films are renowned for a number of features, including the musical accompaniment, with the theme songs having received Academy Award nominations on several occasions, and two wins',
 'Additionally Charlie Higson wrote a series on a young James Bond, and Kate Westbrook wrote three novels based on the diaries of a recurring series character, Moneypenny']

Generate the raw queries and their TF-IDF-transformed vectors

In [8]:
def _first_text_by_title(frame, text_column):
    """Map each value of ``frame['Titles']`` to ``text_column`` of its first row."""
    lookup = {}
    for title, text in zip(frame['Titles'], frame[text_column]):
        # setdefault keeps the first occurrence, matching what `.iloc[0]` on a
        # title-filtered frame would have selected.
        lookup.setdefault(title, text)
    return lookup


def generate_queries():
    """Build one raw query and one TF-IDF query vector per row of ``df_summs``.

    For every title in ``df_summs['Titles']`` (in row order) the first summary
    and first document carrying that title are passed to ``search``; the
    returned sentences are the raw query, and ``transform_query`` turns them
    into a vector. Progress is printed every 400 queries.

    The title lookups are built once up front instead of re-scanning both
    frames on every iteration.

    Returns:
        (DataFrame, matrix): a frame with columns "Titles", "Raw Query" and
        "Query Index", and the vertical stack of all query vectors. Row ``i``
        of the stack belongs to the frame row whose "Query Index" is ``i``.

    Raises:
        KeyError: if a title in ``df_summs`` has no row in ``df_docs``.
    """
    summaries_by_title = _first_text_by_title(df_summs, 'Summaries')
    documents_by_title = _first_text_by_title(df_docs, 'Documents')

    data = {"Titles": [], "Raw Query": [], "Query Index": []}
    queries = []
    total_queries = df_summs.shape[0]

    for query_index, title in enumerate(df_summs['Titles']):
        summary = summaries_by_title[title]
        document = documents_by_title[title]

        # extract query
        raw_query = search(document, summary)
        query = transform_query(raw_query)

        # add query info to data
        data["Titles"].append(title)
        data["Raw Query"].append(raw_query)
        data["Query Index"].append(query_index)
        queries.append(query)

        queries_generated = query_index + 1
        if queries_generated % 400 == 0:
            print("Generated {} queries, {:.4f}% complete".format(queries_generated, queries_generated/total_queries * 100))

    print("Finished generating queries")

    return pd.DataFrame(data=data), sp.sparse.vstack(queries)

# Run the pipeline over every summary. NOTE: each `search` call rewrites the
# corpus files and rebuilds the index, so that work is repeated once per row.
df_queries, queries = generate_queries()

# Print the shape of the stacked query matrix, then display the query frame
# as the cell output.
print(queries.shape)
df_queries

Generated 400 queries, 0.0462% complete
Generated 800 queries, 0.0925% complete
Generated 1200 queries, 0.1387% complete
Generated 1600 queries, 0.1850% complete
Generated 2000 queries, 0.2312% complete
Generated 2400 queries, 0.2775% complete
Generated 2800 queries, 0.3237% complete
Generated 3200 queries, 0.3699% complete
Generated 3600 queries, 0.4162% complete
Generated 4000 queries, 0.4624% complete
Generated 4400 queries, 0.5087% complete
Generated 4800 queries, 0.5549% complete
Generated 5200 queries, 0.6012% complete
Generated 5600 queries, 0.6474% complete
Generated 6000 queries, 0.6936% complete
Generated 6400 queries, 0.7399% complete
Generated 6800 queries, 0.7861% complete
Generated 7200 queries, 0.8324% complete
Generated 7600 queries, 0.8786% complete
Generated 8000 queries, 0.9249% complete
Generated 8400 queries, 0.9711% complete
Finished generating queries
(8650, 1001117)


Unnamed: 0,Query Index,Raw Query,Titles
0,0,"[As of 2018, there have been twenty-four films...",James Bond.txt
1,1,[[1] The average speed of an object in an inte...,Speed.txt
2,2,"[[1] Since ""the means of expression of a peopl...",Official language.txt
3,3,[The Federal Register is compiled by the Offic...,Federal Register.txt
4,4,[Although flash memory is technically a type o...,Flash memory.txt
5,5,[The aims of the county administrative board i...,Counties of Sweden.txt
6,6,"[The Twentieth United States Census, conducted...",1980 United States Census.txt
7,7,"[On the other hand, keyboard instruments, such...",Percussion instrument.txt
8,8,[South Australia shares borders with all of th...,South Australia.txt
9,9,"[Colorado is bordered by Wyoming to the north,...",Colorado.txt


Merge all the dataframes together

In [12]:
# Keep only the text column of each source frame for the side-by-side merge.
df_summs_short = df_summs.filter(items=['Summaries'], axis=1)
df_docs_short = df_docs.filter(items=['Documents'], axis=1)

# Column-wise concat aligns the three frames on their row index; the result is
# then put into a fixed column order and keyed by "Query Index".
df_final = (
    pd.concat([df_queries, df_summs_short, df_docs_short], axis=1)
    .loc[:, ['Titles', 'Query Index', 'Raw Query', 'Summaries', 'Documents']]
    .set_index("Query Index")
)
df_final.index = df_final.index.astype(int)

df_final.head()

Unnamed: 0_level_0,Titles,Raw Query,Summaries,Documents
Query Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,James Bond.txt,"[As of 2018, there have been twenty-four films...",The James Bond series focuses on a fictional B...,This article is about the spy series in genera...
1,Speed.txt,[[1] The average speed of an object in an inte...,"In everyday use and in kinematics, the speed o...",This article is about the property of moving b...
2,Official language.txt,"[[1] Since ""the means of expression of a peopl...",An official language is a language that is giv...,An official language is a language that is giv...
3,Federal Register.txt,[The Federal Register is compiled by the Offic...,The Federal Register (FR or sometimes Fed. Reg...,Federal Register. Cover. Type. Daily official ...
4,Flash memory.txt,[Although flash memory is technically a type o...,Flash memory is an electronic (solid-state) no...,For the neuropsychological concept related to ...


Store all the data

In [None]:
# Make sure the output directory exists, so the results of the long-running
# query generation are not lost to a missing folder at save time.
os.makedirs("out", exist_ok=True)

# store vectorizer; the `with` block closes the file even if pickling fails
with open("out/vectorizer.pickle", "wb") as pickle_out:
    pickle.dump(vectorizer, pickle_out)

# store queries matrix
sp.sparse.save_npz("out/queries_matrix.npz", queries)

# store queries csv
# NOTE(review): "Raw Query" holds Python lists; presumably they are written as
# their string form and need parsing when the CSV is read back -- confirm.
df_final.to_csv("out/wiki_queries.csv")