In [2]:
from subprocess import call
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import itertools

import metapy
from sklearn.feature_extraction.text import TfidfVectorizer

Read in our original data

In [3]:
# Load the Wikipedia table used throughout the notebook. The preview below shows
# title / summary / document columns plus an "Unnamed: 0" column
# (presumably an index that was saved along with the CSV -- TODO confirm).
df = pd.read_csv("data/wiki_summaries.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,summary,document
0,0,Anarchism,Anarchism is a political philosophy that advoc...,"While opposition to the state is central, anar..."
1,1,Autism,Autism is a developmental disorder characteriz...,Autism is caused by a combination of genetic a...
2,2,Albedo,"Albedo () (, meaning ""whiteness"") is the measu...",Surface albedo is defined as the ratio of irra...
3,3,A,"A (named , plural ""As"", ""A's"", ""a""s, ""a's"" or ...","The earliest certain ancestor of ""A"" is aleph ..."
4,4,Alabama,Alabama is a state in the southeastern region ...,"Alabama is nicknamed the ""Yellowhammer State"",..."


In [4]:
# Sanity check: (number of rows, number of columns) of the loaded table.
df.shape

(9102, 4)

Fit a TF-IDF vectorizer on the document corpus

In [5]:
# The corpus is the full text of every row's `document` column.
corpus = df['document'].tolist()

# fit_transform is called for its side effect of fitting `vectorizer`, which is
# reused later to transform queries; the returned document-term matrix is only
# displayed as the cell output, not stored.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

<9102x17845 sparse matrix of type '<class 'numpy.float64'>'
	with 10934208 stored elements in Compressed Sparse Row format>

In [6]:
def transform_query(raw_query):
    """Vectorize a raw query with the module-level TF-IDF ``vectorizer``.

    The strings in ``raw_query`` are joined with ". " and terminated with a
    final ".", and that single text is handed to ``vectorizer.transform`` as a
    one-element list.

    Returns:
        Whatever ``vectorizer.transform`` returns for that one-element list.
    """
    query_text = "{}.".format(". ".join(raw_query))
    return vectorizer.transform([query_text])

Extract the queries using MeTaPy

In [22]:
def get_sentences(paragraph):
    """Split ``paragraph`` on "." and strip surrounding whitespace from each piece.

    Empty pieces (for example the one after a trailing period) are kept, so
    callers that do not want them must filter them out.
    """
    pieces = paragraph.split(".")
    return list(map(str.strip, pieces))

def write_doc(document, filename, metadata_filename='wiki/metadata.dat'):
    """Write a document as a one-sentence-per-line file plus a metadata file.

    ``document`` is split into paragraphs on newlines and each paragraph into
    sentences with ``get_sentences``; empty pieces are discarded.

    Args:
        document: Full document text.
        filename: Path of the corpus file; receives one sentence per line.
        metadata_filename: Path of the metadata file; receives one ``SEN<i>``
            line per sentence, in the same order as the corpus file. Defaults
            to the path that used to be hard-coded.
    """
    sentences = [
        sentence
        for paragraph in document.split("\n")
        for sentence in get_sentences(paragraph)
        if sentence  # drop empty strings left by blank lines / trailing periods
    ]

    # Write UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    # NOTE(review): assumes the corpus is read back as UTF-8 -- confirm
    # against config.toml.
    with open(filename, 'w', encoding='utf-8') as doc_file:
        doc_file.writelines("{}\n".format(sentence) for sentence in sentences)

    # One metadata line per sentence keeps both files aligned line-for-line.
    with open(metadata_filename, 'w', encoding='utf-8') as meta_file:
        meta_file.writelines("SEN{}\n".format(i) for i in range(len(sentences)))

def remove_old_idx(index_dir="idx"):
    """Delete the index directory left on disk by a previous run, if any.

    Uses ``shutil.rmtree`` instead of shelling out to ``rm -r`` so it also
    works where no ``rm`` binary exists and stays silent when the directory
    is missing.

    Args:
        index_dir: Directory to remove; defaults to ``"idx"``.
    """
    import shutil  # local import so this block does not rely on a new top-level import

    shutil.rmtree(index_dir, ignore_errors=True)
    
def get_stringified_list(idx, search_results):
    """Look up the stored ``content`` metadata for every search result.

    Args:
        idx: Index object exposing ``metadata(doc_id)``.
        search_results: Iterable of ``(doc_id, score)`` pairs; the score is
            ignored.

    Returns:
        A list with one ``content`` value per result, in the given order.
    """
    contents = []
    for doc_id, _score in search_results:
        doc_metadata = idx.metadata(doc_id)
        contents.append(doc_metadata.get('content'))
    return contents

def search(document, summary, num_results=5):
    """Return the sentences of ``document`` that best match ``summary``.

    The document is written to ``wiki/wiki.dat`` (one sentence per line), any
    existing ``idx`` directory is removed, an inverted index is created from
    ``config.toml`` and ``summary`` is scored against it with Okapi BM25.

    Args:
        document: Full document text to search in.
        summary: Query text.
        num_results: Maximum number of results requested from the ranker
            (previously hard-coded to 5).

    Returns:
        A list with the ``content`` metadata of each result, in the order the
        ranker returned them.
    """
    write_doc(document, 'wiki/wiki.dat')

    # Remove the previous index first -- presumably an existing ``idx`` would
    # otherwise be reused instead of rebuilt for the new document (TODO confirm).
    # NOTE(review): the index is rebuilt on every call.
    remove_old_idx()

    # NOTE(review): config.toml is assumed to point at the wiki/ corpus written above.
    idx = metapy.index.make_inverted_index('config.toml')

    ranker = metapy.index.OkapiBM25()

    query = metapy.index.Document()
    query.content(summary)

    search_results = ranker.score(idx, query, num_results=num_results)
    return get_stringified_list(idx, search_results)

# document = df_docs.loc[df_docs['Titles'] == 'James Bond.txt', 'Documents'].iloc[0]
# summary = df_summs.loc[df_summs['Titles'] == 'James Bond.txt', 'Summaries'].iloc[0]

# print(summary)
# search(document, summary)

Generate the raw queries and transform them into TF-IDF vectors

In [20]:
def generate_queries(progress_every=400):
    """Build one retrieval query per summary sentence of every row in ``df``.

    For each row of the module-level ``df``, every non-empty sentence of the
    ``summary`` is passed to ``search`` together with the row's ``document``.
    The list returned by ``search`` is the raw query, which
    ``transform_query`` turns into a sparse row vector.

    Args:
        progress_every: Print a progress line after every this-many processed
            documents. ``0`` or ``None`` disables progress output.

    Returns:
        A tuple ``(queries_df, queries_matrix)``. ``queries_df`` has one row
        per generated query with the columns ``title``, ``raw_query``,
        ``query_index`` and ``sentence_summary``. Row ``i`` of the sparse
        ``queries_matrix`` is the transformed query whose ``query_index`` is
        ``i``.
    """
    data = {"title": [], "raw_query": [], "query_index": [], "sentence_summary": []}
    queries = []

    total_documents = df.shape[0]

    for documents_generated, (_, row) in enumerate(df.iterrows(), start=1):
        title = row['title']
        document = row['document']

        # get_sentences keeps empty pieces (e.g. after a trailing "."); skip them
        sentences = [sentence for sentence in get_sentences(row['summary']) if sentence]

        for sentence in sentences:
            # extract query
            raw_query = search(document, sentence)

            # add query info to data; the index is the query's row in the matrix
            data["raw_query"].append(raw_query)
            data["query_index"].append(len(queries))
            data["sentence_summary"].append(sentence)
            data["title"].append(title)

            queries.append(transform_query(raw_query))

        # Progress must be reported inside the document loop. It is keyed on the
        # document count because the query count grows by several per document
        # and can step over any given multiple.
        if progress_every and documents_generated % progress_every == 0:
            print("Generated queries for {} documents, {:.4f}% complete".format(documents_generated, documents_generated/total_documents * 100))

    print("Finished generating queries")

    if not queries:
        # Nothing to stack (vstack needs at least one block): return an empty
        # matrix that still has one column per vocabulary term.
        empty_matrix = sp.sparse.csr_matrix((0, len(vectorizer.vocabulary_)))
        return pd.DataFrame(data=data), empty_matrix

    return pd.DataFrame(data=data), sp.sparse.vstack(queries)

# Run the query generation: df_queries holds the per-query metadata and
# queries the stacked sparse query vectors.
df_queries, queries = generate_queries()

# Show the matrix dimensions (rows x columns) and preview the metadata.
print(queries.shape)
df_queries.head()

Finished generating queries
(21, 17845)


Unnamed: 0,query_index,raw_query,sentence_summary,title
0,0,"[Susan Brown claims that ""as anarchism is a po...",Anarchism is a political philosophy that advoc...,Anarchism
1,1,"[While opposition to the state is central, ana...",These are often described as stateless societi...,Anarchism
2,2,[He thought that the spread of the use of reas...,"Anarchism holds the state to be undesirable, u...",Anarchism
3,3,[Autism is one of the five pervasive developme...,Autism is a developmental disorder characteriz...,Autism
4,4,[About half of parents of children with ASD no...,Parents usually notice signs in the first two ...,Autism


Reorganize the query dataframe: select the columns to keep and index it by `query_index`

In [21]:
# Keep only the query columns, in a fixed order, and key the rows by query_index.
df_final = (
    df_queries
    .loc[:, ['title', 'query_index', 'raw_query', 'sentence_summary']]
    .set_index("query_index")
)
# make sure the index is integer typed
df_final.index = df_final.index.astype(int)

df_final.head()

Unnamed: 0_level_0,title,raw_query,sentence_summary
query_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Anarchism,"[Susan Brown claims that ""as anarchism is a po...",Anarchism is a political philosophy that advoc...
1,Anarchism,"[While opposition to the state is central, ana...",These are often described as stateless societi...
2,Anarchism,[He thought that the spread of the use of reas...,"Anarchism holds the state to be undesirable, u..."
3,Autism,[Autism is one of the five pervasive developme...,Autism is a developmental disorder characteriz...
4,Autism,[About half of parents of children with ASD no...,Parents usually notice signs in the first two ...


Store all the data

In [10]:
# store vectorizer; the context manager closes the file even if pickling fails
with open("out/vectorizer.pickle", "wb") as pickle_out:
    pickle.dump(vectorizer, pickle_out)

# store queries matrix
sp.sparse.save_npz("out/queries_matrix.npz", queries)

# store queries csv (query_index is written as the index column)
df_final.to_csv("out/wiki_queries.csv")