In [5]:
import pandas as pd
import numpy as np

from multiprocessing import Process

In [6]:
# Load the Wikipedia dump into the module-level frame `df`, which the batching
# function defined further down reads directly.
# NOTE(review): the preview shows two "Unnamed" columns ahead of document/id/title/url;
# these look like index columns written out by earlier to_csv calls — confirm, and
# consider dropping them or loading with index_col.
df = pd.read_csv('wiki_dump/wiki_dump.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,document,id,title,url
0,0,\nAnarchism\n\nAnarchism is a political philos...,12,Anarchism,https://en.wikipedia.org/wiki?curid=12
1,1,\nAutism\n\nAutism is a developmental disorder...,25,Autism,https://en.wikipedia.org/wiki?curid=25
2,2,"\nAlbedo\n\nAlbedo () (, meaning ""whiteness"") ...",39,Albedo,https://en.wikipedia.org/wiki?curid=39
3,3,"\nA\n\nA (named , plural ""As"", ""A's"", ""a""s, ""a...",290,A,https://en.wikipedia.org/wiki?curid=290
4,4,\nAlabama\n\nAlabama is a state in the southea...,303,Alabama,https://en.wikipedia.org/wiki?curid=303


In [7]:
def extract_summary_data(document, title):
    """Split a raw article into title, summary line and remaining body.

    The text is split on newlines and empty lines are discarded. When the
    first remaining line equals ``title`` it is dropped. The next line is
    taken as the summary and all following lines, re-joined with newlines,
    form the body.

    Parameters
    ----------
    document : str
        Raw article text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as missing.
    title : str
        Title expected on the first non-empty line of ``document``.

    Returns
    -------
    pandas.Series
        ``[title, summary, body]`` on success. Three NaN values when the
        document is missing, has fewer than two non-empty lines, or has no
        body left after the summary line.
    """
    # A missing cell is a float NaN and has no .split(); comparing a string to
    # NaN with == is always False, so the check has to happen on the raw value.
    if not isinstance(document, str):
        return pd.Series([np.nan, np.nan, np.nan])

    sentences = [line for line in document.split("\n") if line]

    if len(sentences) < 2:
        return pd.Series([np.nan, np.nan, np.nan])

    if sentences[0] == title:
        sentences = sentences[1:]
    else:
        # Report the title of the row actually being processed; the first
        # line is kept and ends up being used as the summary.
        print('err', sentences[0], title)

    summary = sentences[0]
    new_document = "\n".join(sentences[1:])

    if len(new_document) == 0:
        return pd.Series([np.nan, np.nan, np.nan])

    return pd.Series([title, summary, new_document])

# Smoke-test the extractor on the first row of the dump; the cell output is the
# resulting three-element Series (title, summary, remaining document).
extract_summary_data(df['document'][0], df['title'][0])

0                                            Anarchism
1    Anarchism is a political philosophy that advoc...
2    While opposition to the state is central, anar...
dtype: object

In [21]:
def create_summaries_csv(p_df, index, output_dir='data'):
    """Extract summaries for one chunk of articles and write them to a CSV file.

    Every row of ``p_df`` is passed through ``extract_summary_data``. Rows for
    which extraction produced NaN values are dropped, a progress line is
    printed, and the result is written to
    ``<output_dir>/wiki_summaries_<index>.csv``.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Chunk of articles; must provide ``document`` and ``title`` columns.
    index : int
        Batch number, used in the progress message and the output file name.
    output_dir : str, optional
        Directory that receives the CSV file. It is created when it does not
        exist. Defaults to ``'data'``.
    """
    import os  # local import: the notebook's import cell does not provide os

    column_names = ['title', 'summary', 'document']

    if p_df.empty:
        # apply() on an empty frame does not yield the three-column result,
        # so assigning three column names to it would raise. Build the empty
        # result explicitly instead.
        summaries_df = pd.DataFrame(columns=column_names)
    else:
        summaries_df = p_df.apply(lambda x: extract_summary_data(x['document'], x['title']), axis=1)
        summaries_df.columns = column_names

    summaries_df = summaries_df.dropna()

    print("finished summaries csv {} containing {} documents".format(index, summaries_df.shape[0]))

    # to_csv does not create missing directories.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    summaries_df.to_csv(os.path.join(output_dir, 'wiki_summaries_{}.csv'.format(index)))

def create_summaries_batched(num_batches=8):
    """Write the summaries of the module-level ``df`` as ``num_batches`` CSV files.

    The rows of ``df`` are cut into consecutive slices whose sizes differ by
    at most one row: when the row count is not divisible by ``num_batches``,
    the leading batches each take one extra row. Each slice is passed to
    ``create_summaries_csv`` together with its 1-based batch number.

    Parameters
    ----------
    num_batches : int, optional
        Number of slices (and output files) to produce. Defaults to 8.
    """
    total_rows = df.shape[0]
    base_size, remainder = divmod(total_rows, num_batches)

    offset = 0
    for batch_number in range(1, num_batches + 1):
        # The first `remainder` batches absorb the rows left over by the
        # integer division.
        size = base_size + (1 if batch_number <= remainder else 0)

        batch_df = df[offset:offset + size]
        offset += size

        create_summaries_csv(batch_df, batch_number)

# Run the export with the default of 8 batches; each batch prints one
# "finished summaries csv ..." progress line.
create_summaries_batched()

finished summaries csv 1 containing 1724 documents
finished summaries csv 2 containing 1764 documents
finished summaries csv 3 containing 1719 documents
finished summaries csv 4 containing 1782 documents
finished summaries csv 5 containing 1699 documents
finished summaries csv 6 containing 1677 documents
finished summaries csv 7 containing 1779 documents
finished summaries csv 8 containing 1744 documents
