In [1]:
from subprocess import call
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import itertools

import metapy
from sklearn.feature_extraction.text import TfidfVectorizer

Read in our original data

In [2]:
# Load the article table; later cells read its 'title', 'summary' and 'document' columns.
df = pd.read_csv("data/wiki_summaries.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,summary,document
0,0,Anarchism,Anarchism is a political philosophy that advoc...,"While opposition to the state is central, anar..."
1,1,Autism,Autism is a developmental disorder characteriz...,Autism is caused by a combination of genetic a...
2,2,Albedo,"Albedo () (, meaning ""whiteness"") is the measu...",Surface albedo is defined as the ratio of irra...
3,3,A,"A (named , plural ""As"", ""A's"", ""a""s, ""a's"" or ...","The earliest certain ancestor of ""A"" is aleph ..."
4,4,Alabama,Alabama is a state in the southeastern region ...,"Alabama is nicknamed the ""Yellowhammer State"",..."


In [3]:
# (rows, columns) of the loaded table.
df.shape

(9102, 4)

Fit our TF-IDF vectorizer on the corpus

In [4]:
# Every article body, in row order; this list is what the vectorizer is fitted on.
corpus = df['document'].tolist()

# fit_transform both fits `vectorizer` and returns the document-term matrix.
# The matrix is only displayed as the cell output (not stored); the fitted
# `vectorizer` is what later cells reuse.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

<9102x17845 sparse matrix of type '<class 'numpy.float64'>'
	with 10934208 stored elements in Compressed Sparse Row format>

In [5]:
# turn a list of raw query sentences into a single tf-idf row
def transform_query(raw_query):
    """Vectorize a raw query with the notebook-level ``vectorizer``.

    Args:
        raw_query: iterable of sentence strings. They are joined with ". "
            and a final "." is appended, so the vectorizer sees one string.

    Returns:
        Whatever ``vectorizer.transform`` returns for a one-element list
        containing the joined string.
    """
    joined_sentences = "{}.".format(". ".join(raw_query))
    return vectorizer.transform([joined_sentences])

Extract the queries using MeTaPy

In [6]:
def write_doc(document, filename, metadata_filename='wiki/metadata.dat'):
    """Write a document as one sentence per line, plus a matching metadata file.

    The document is split into paragraphs on newlines and into sentences on
    ".". Each sentence is stripped of surrounding whitespace and empty
    results are dropped.

    Args:
        document: full text of the document (str).
        filename: path of the sentence file to (over)write.
        metadata_filename: path of the metadata file to (over)write. It gets
            one "SEN<i>" line per sentence, in the same order as the sentence
            file. Defaults to 'wiki/metadata.dat'.

    Both files are written as UTF-8 so that non-ASCII text does not depend on
    the platform's default encoding.
    """
    # Split into paragraphs, then sentences, flattening as we go and
    # discarding fragments that are empty after stripping.
    sentences = [
        sentence
        for paragraph in document.split("\n")
        for sentence in (piece.strip() for piece in paragraph.split("."))
        if sentence
    ]

    # write document to file (write-only: nothing is read back here)
    with open(filename, 'w', encoding='utf-8') as doc_file:
        for sentence in sentences:
            doc_file.write("{}\n".format(sentence))

    # write metadata: one label per sentence line
    with open(metadata_filename, 'w', encoding='utf-8') as meta_file:
        for i in range(len(sentences)):
            meta_file.write("SEN{}\n".format(i))

def remove_old_idx():
    """Delete the ``idx`` directory (relative to the working directory).

    A missing directory is not an error. Any other failure (e.g. permissions)
    is raised instead of being silently ignored, so a stale index is never
    left in place unnoticed.
    """
    # Local import keeps this block self-contained; shutil is stdlib and
    # works on every platform, unlike shelling out to `rm`.
    import shutil

    try:
        shutil.rmtree("idx")
    except FileNotFoundError:
        # Nothing to remove (e.g. first run).
        pass
    
def get_stringified_list(idx, search_results):
    """Look up the 'content' metadata field for each search hit.

    Args:
        idx: object exposing ``metadata(doc_id)``, whose result has ``get``.
        search_results: iterable of ``(doc_id, score)`` pairs; the score is
            ignored.

    Returns:
        list with one ``'content'`` value per hit, in the given order.
    """
    contents = []
    for doc_id, _score in search_results:
        contents.append(idx.metadata(doc_id).get('content'))
    return contents

def search(document, summary, num_results=5):
    """Return the document sentences that rank highest against the summary.

    Steps:
      1. Write ``document`` to 'wiki/wiki.dat' via ``write_doc``.
      2. Remove any previous ``idx`` directory via ``remove_old_idx``.
      3. Build an inverted index from 'config.toml'
         (presumably configured to read the files written in step 1 --
         TODO confirm against config.toml).
      4. Score the index with Okapi BM25, using ``summary`` as the query.

    Args:
        document: full text of the article.
        summary: text used as the query content.
        num_results: maximum number of hits requested from the ranker.
            Defaults to 5.

    Returns:
        list of the 'content' metadata values of the hits, in ranker order.
    """
    write_doc(document, 'wiki/wiki.dat')
    remove_old_idx()

    idx = metapy.index.make_inverted_index('config.toml')

    ranker = metapy.index.OkapiBM25()

    query = metapy.index.Document()
    query.content(summary)

    search_results = ranker.score(idx, query, num_results=num_results)
    return get_stringified_list(idx, search_results)

# document = df_docs.loc[df_docs['Titles'] == 'James Bond.txt', 'Documents'].iloc[0]
# summary = df_summs.loc[df_summs['Titles'] == 'James Bond.txt', 'Summaries'].iloc[0]

# print(summary)
# search(document, summary)

Generate raw and transformed queries

In [7]:
def generate_queries():
    """Build one raw query and one vectorized query per row of ``df``.

    For every row, ``search`` is called with the row's 'document' and
    'summary', and the result is vectorized with ``transform_query``.
    A progress line is printed every 400 rows.

    Returns:
        tuple of
          - DataFrame with columns 'raw_query' and 'query_index'
            (0-based position of the row in iteration order), and
          - the vectorized queries stacked vertically in the same order.
    """
    total_queries = df.shape[0]
    raw_queries = []
    query_vectors = []

    for position, (_, record) in enumerate(df.iterrows(), start=1):
        summary, document = record['summary'], record['document']

        # extract the query sentences, then vectorize them
        sentences = search(document, summary)
        raw_queries.append(sentences)
        query_vectors.append(transform_query(sentences))

        # periodic progress report
        if position % 400 == 0:
            print("Generated {} queries, {:.4f}% complete".format(position, position/total_queries * 100))

    print("Finished generating queries")

    data = {
        "raw_query": raw_queries,
        "query_index": list(range(len(raw_queries))),
    }
    return pd.DataFrame(data=data), sp.sparse.vstack(query_vectors)

# Run the full pass over df. Each row triggers a fresh index build inside
# search(), so this cell is the slow one.
df_queries, queries = generate_queries()

# `queries` has one row per generated query.
print(queries.shape)
df_queries.head()

Generated 400 queries, 4.3946% complete
Generated 800 queries, 8.7893% complete
Generated 1200 queries, 13.1839% complete
Generated 1600 queries, 17.5786% complete
Generated 2000 queries, 21.9732% complete
Generated 2400 queries, 26.3678% complete
Generated 2800 queries, 30.7625% complete
Generated 3200 queries, 35.1571% complete
Generated 3600 queries, 39.5517% complete
Generated 4000 queries, 43.9464% complete
Generated 4400 queries, 48.3410% complete
Generated 4800 queries, 52.7357% complete
Generated 5200 queries, 57.1303% complete
Generated 5600 queries, 61.5249% complete
Generated 6000 queries, 65.9196% complete
Generated 6400 queries, 70.3142% complete
Generated 6800 queries, 74.7089% complete
Generated 7200 queries, 79.1035% complete
Generated 7600 queries, 83.4981% complete
Generated 8000 queries, 87.8928% complete
Generated 8400 queries, 92.2874% complete
Generated 8800 queries, 96.6820% complete
Finished generating queries
(9102, 17845)


Unnamed: 0,query_index,raw_query
0,0,[It is critical of formal organisations such a...
1,1,[Autism is one of the five pervasive developme...
2,2,[Unless given for a specific wavelength (spect...
3,3,[There are some other cases aside from italic ...
4,4,[Alabama is bordered by the states of Tennesse...


Merge all the dataframes together

In [9]:
# Join the articles and their queries side by side, keep only the columns we
# want (in this order), and index the result by query_index.
df_final = (
    pd.concat([df, df_queries], axis=1)
    .loc[:, ['title', 'query_index', 'raw_query', 'summary', 'document']]
    .set_index("query_index")
)

# make sure the index is integer-typed
df_final.index = df_final.index.astype(int)

df_final.head()

Unnamed: 0_level_0,title,raw_query,summary,document
query_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Anarchism,[It is critical of formal organisations such a...,Anarchism is a political philosophy that advoc...,"While opposition to the state is central, anar..."
1,Autism,[Autism is one of the five pervasive developme...,Autism is a developmental disorder characteriz...,Autism is caused by a combination of genetic a...
2,Albedo,[Unless given for a specific wavelength (spect...,"Albedo () (, meaning ""whiteness"") is the measu...",Surface albedo is defined as the ratio of irra...
3,A,[There are some other cases aside from italic ...,"A (named , plural ""As"", ""A's"", ""a""s, ""a's"" or ...","The earliest certain ancestor of ""A"" is aleph ..."
4,Alabama,[Alabama is bordered by the states of Tennesse...,Alabama is a state in the southeastern region ...,"Alabama is nicknamed the ""Yellowhammer State"",..."


Store all the data

In [10]:
# store vectorizer; the context manager closes the file even if pickling fails
with open("out/vectorizer.pickle", "wb") as pickle_out:
    pickle.dump(vectorizer, pickle_out)

# store queries matrix
sp.sparse.save_npz("out/queries_matrix.npz", queries)

# store queries csv
df_final.to_csv("out/wiki_queries.csv")