In [4]:
import os
import pandas as pd

In [5]:
data_dir = 'wiki_data/'
out_csv_path = 'documents.csv'
urls_path = 'train_urls.txt'
titles_path = '../wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.title'

In [6]:
def parse_document(document_lines):
    start = 0
    while start < len(document_lines) and document_lines[start] != "Contents\n":
        start += 1

    doc = []
    for i in range(start, len(document_lines)):
        if document_lines[i][0] == "^":
            continue

        line_length = len(document_lines[i].split(" "))
        if "References" == document_lines[i][:len("References")] and line_length < 2:
            break

        if line_length > 10:
            doc.append(document_lines[i])

    return ' '.join(doc)

In [7]:
def get_contents_index(article_lines):
    for i in range(len(article_lines)):
        if article_lines[i] == 'Contents\n':
            return i
    
    return -1

def parse_summary(article_lines):
    index = get_contents_index(article_lines)
    
    while(len(article_lines[index].split(" "))) < 8 and index >= 0:
        index -= 1
    
    summary_indices = []
    summary_indices.append(index)
    
    summary = []
    while len(article_lines[index].split(" ")) > 8 and index >= 0:
        summary.insert(0, article_lines[index].strip())
        index -= 1
        
    summary_indices.append(index)
        
    return ' '.join(summary), summary_indices[::-1]

In [8]:
HEADERS_TO_IGNORE = ["See also", "References", "Bibliography", "External links"]

def parse_content_headers(article_lines):
    index = get_contents_index(article_lines) + 1
    
    headers = []
    
    while True:
        line = article_lines[index]
        if line[0].isnumeric():
            header = " ".join(line.split(" ")[1:]).strip()
            if header not in HEADERS_TO_IGNORE:
                headers.append(header)
        elif line[0] != "\n":
            break
        
        index += 1
    
    return headers

In [9]:
def get_sidebar_index(article_lines, file):
    title = file.strip('.txt')
    for i in range(len(article_lines)):
        if article_lines[i].lower().strip() == (title.strip()):
            return i
    
    return -1

def parse_right_sidebar(article_lines, file, end_index):
    index = get_sidebar_index(article_lines, file)
    if index == -1:
        return []
    
    sidebar = []
    
    for i in range(index, end_index):
        line = article_lines[i]
        if line != "\n" and len(line.split(" ")) <= 3:
            sidebar.append(line.strip())
    
    return sidebar

In [19]:
def parse_dataset_for_documents(csv_path, data_dir, urls_path, titles_path, num_docs):
    df = pd.DataFrame(columns=[
        'title', 'document number', 'document', 'summary', 'headers', 'sidebar', 'url'
    ])
    
    docs = []
    summaries = []
    numbers = []
    used_document_count = 0
    skipped_document_count = 0
    ordered_urls = []
    ordered_titles = []
    wiki_headers = []
    wiki_sidebars = []
    
    urls = open(urls_path, 'r').readlines()
    titles = open(titles_path, 'r').readlines()
    filenames = os.listdir(data_dir)
    num_docs = num_docs if num_docs > 0 else len(filenames)

    print(num_docs)
    for i in range(num_docs):
        filename = filenames[i]
        doc_num = int(filename.split(".")[0])
        extension = filename.split(".")[1]
        if extension != 'txt':
            continue
            
        title = titles[doc_num].replace('-lrb- ', '(').replace(' -rrb-', ')')
        lines = open(data_dir + filename).readlines()
        document = parse_document(lines)
        summary, summary_indices = parse_summary(lines)
        lower_bound, upper_bound = summary_indices
        
        headers = parse_content_headers(lines)
        sidebar = parse_right_sidebar(lines, title, lower_bound)
        
        if document != "" and summary != "" and len(sidebar) > 0:
            url = urls[doc_num]
            
            docs.append(document)
            summaries.append(summary)
            numbers.append(doc_num)
            ordered_titles.append(title[:len(title) - 1])
            ordered_urls.append(url[:len(url) - 1])
            wiki_headers.append(' --- '.join(headers))
            wiki_sidebars.append(' --- '.join(sidebar))
            
            used_document_count += 1
        else:
            skipped_document_count += 1
                    
        total_count = used_document_count + skipped_document_count
        if (total_count) % 10000 == 0:
            print('{0} processed. {1} used. {2} skipped.'.format(total_count, used_document_count, skipped_document_count))

    df['document'] = docs
    df['summary'] = summaries
    df['document number'] = numbers
    df['title'] = ordered_titles
    df['headers'] = wiki_headers
    df['sidebar'] = wiki_sidebars
    df['url'] = ordered_urls
    
    df.to_csv(csv_path, index=False)
    
    return used_document_count, skipped_document_count

In [20]:
used_count, skipped_count = parse_dataset_for_documents(out_csv_path, data_dir, urls_path, titles_path, -1)

202489
10000 processed. 3839 used. 6161 skipped.
20000 processed. 7592 used. 12408 skipped.
30000 processed. 11481 used. 18519 skipped.
40000 processed. 15278 used. 24722 skipped.
50000 processed. 19102 used. 30898 skipped.
60000 processed. 22923 used. 37077 skipped.
70000 processed. 26754 used. 43246 skipped.
80000 processed. 30690 used. 49310 skipped.
90000 processed. 34547 used. 55453 skipped.
100000 processed. 38444 used. 61556 skipped.
110000 processed. 42304 used. 67696 skipped.
120000 processed. 46223 used. 73777 skipped.
130000 processed. 50173 used. 79827 skipped.
140000 processed. 54035 used. 85965 skipped.
150000 processed. 57869 used. 92131 skipped.
160000 processed. 61711 used. 98289 skipped.
170000 processed. 65551 used. 104449 skipped.
180000 processed. 69412 used. 110588 skipped.
190000 processed. 73299 used. 116701 skipped.
200000 processed. 77133 used. 122867 skipped.


In [21]:
df = pd.read_csv(out_csv_path)

In [22]:
df

Unnamed: 0,title,document number,document,summary,headers,sidebar,url
0,harsha purasinghe,91809,"Harsha completed his ordinary level at Wesley,...",Harsha Purasinghe is a serial entrepreneur who...,Background --- Awards & Recognitions --- Publi...,"Harsha Purasinghe --- Born --- Colombo, Sri La...",https://en.wikipedia.org/wiki/index.php?curid=...
1,jenrry mejía,24269,Mejía was born in Azua in the southwestern Dom...,Jenrry Manuel Mejía (Spanish pronunciation: [ˈ...,Early life --- Professional career --- Minor l...,"Jenrry Mejía --- Relief pitcher --- Azua, Domi...",https://en.wikipedia.org/wiki/index.php?curid=...
2,darby hendrickson,70462,"He was drafted in the fourth round, seventy-th...","Darby Joseph Hendrickson (born August 28, 1972...",Playing career --- Career statistics --- Regul...,"Darby Hendrickson --- Born --- Richfield, MN, ...",https://en.wikipedia.org/wiki/index.php?curid=...
3,flavio boltro,31978,Flavio Boltro starts playing trumpet at 9 year...,"Flavio Boltro (born May 5, 1961) is an Italian...",Career --- The Italian years --- The French ye...,Flavio Boltro --- Flavio Boltro (2010) --- Bac...,https://en.wikipedia.org/wiki/index.php?curid=...
4,isaac aketxe,193694,"Aketxe was born in Bilbao, Biscay. Having emer...",Isaac Aketxe Barrutia (Spanish pronunciation: ...,Football career --- Personal life,Isaac Aketxe --- Personal information --- Full...,https://en.wikipedia.org/wiki/index.php?curid=...
5,melinda metz,90040,Roswell High is a young adult book series writ...,Melinda Metz is an American author of young ad...,Works --- Roswell High --- Fingerprints --- Pa...,Melinda Metz --- Born --- United States --- Oc...,https://en.wikipedia.org/wiki/index.php?curid=...
6,zeynel doğan,58431,"Voice of My Father, as Mehmet, 2012 (also cred...","Zeynel Doğan, (pronounced [zejnel doɣɑn]), *Oc...",Filmography --- Awards,Zeynel Doğan --- Born --- 1979-10-10 --- Elbis...,https://en.wikipedia.org/wiki/index.php?curid=...
7,ricardo virtuoso,163714,Virtuoso played in the youthsystems of Guarati...,"Ricardo Magno Virtuoso Guarà (born March 1, 19...",Career --- Club --- International --- Futsal -...,Ricardo Virtuoso --- Personal information --- ...,https://en.wikipedia.org/wiki/index.php?curid=...
8,alexey prokurorov,32481,Prokurorov was born in the village of Mishino ...,Alexey Alexeyevich Prokurorov (Russian: Алексе...,Career --- World Cup results --- Individual po...,Alexey Prokurorov --- Personal information ---...,https://en.wikipedia.org/wiki/index.php?curid=...
9,bethany firth,136059,Bethany Charlotte Firth was born on 14 Februar...,"Bethany Charlotte Firth, MBE (born 14 February...",Personal life --- Swimming career,Bethany Firth --- MBE --- Personal information...,https://en.wikipedia.org/wiki/index.php?curid=...


In [23]:
print(len(df['sidebar']))
df['sidebar'].isnull().sum()

78120


0