In [215]:
import os
import pandas as pd

In [216]:
data_dir = 'wiki_data/'
out_csv_path = 'documents.csv'
urls_path = 'train_urls.txt'
titles_path = '../wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.title'

In [217]:
def parse_document(document_lines):
    start = 0
    while start < len(document_lines) and document_lines[start] != "Contents\n":
        start += 1

    doc = []
    for i in range(start, len(document_lines)):
        if document_lines[i][0] == "^":
            continue

        line_length = len(document_lines[i].split(" "))
        if "References" == document_lines[i][:len("References")] and line_length < 2:
            break

        if line_length > 10:
            doc.append(document_lines[i])

    return ' '.join(doc)

In [218]:
def get_contents_index(article_lines):
    for i in range(len(article_lines)):
        if article_lines[i] == 'Contents\n':
            return i
    
    return -1

def parse_summary(article_lines):
    index = get_contents_index(article_lines)
    
    while(len(article_lines[index].split(" "))) < 8 and index >= 0:
        index -= 1
    
    summary_indices = []
    summary_indices.append(index)
    
    summary = []
    while len(article_lines[index].split(" ")) > 8 and index >= 0:
        summary.insert(0, article_lines[index].strip())
        index -= 1
        
    summary_indices.append(index)
        
    return ' '.join(summary), summary_indices[::-1]

In [219]:
HEADERS_TO_IGNORE = ["See also", "References", "Bibliography", "External links"]

def parse_content_headers(article_lines):
    index = get_contents_index(article_lines) + 1
    
    headers = []
    
    while True:
        line = article_lines[index]
        if line[0].isnumeric():
            header = " ".join(line.split(" ")[1:]).strip()
            if header not in HEADERS_TO_IGNORE:
                headers.append(header)
        elif line[0] != "\n":
            break
        
        index += 1
    
    return headers

In [220]:
def get_sidebar_index(article_lines, file):
    title = file.strip('.txt')
    for i in range(len(article_lines)):
        if article_lines[i].lower().strip() == (title.strip()):
            return i
    
    return -1

def parse_right_sidebar(article_lines, file, end_index):
    index = get_sidebar_index(article_lines, file)
    if index == -1:
        return []
    
    sidebar = []
    
    for i in range(index, end_index):
        line = article_lines[i]
        if line != "\n" and len(line.split(" ")) <= 3:
            sidebar.append(line.strip())
    
    return sidebar

In [226]:
def parse_dataset_for_documents(csv_path, data_dir, urls_path, titles_path, num_docs):
    df = pd.DataFrame(columns=[
        'title', 'document number', 'document', 'summary', 'header', 'sidebar', 'url'
    ])
    
    docs = []
    summaries = []
    numbers = []
    used_document_count = 0
    skipped_document_count = 0
    ordered_urls = []
    ordered_titles = []
    wiki_headers = []
    wiki_sidebars = []
    
    urls = open(urls_path, 'r').readlines()
    titles = open(titles_path, 'r').readlines()
    filenames = os.listdir(data_dir)
    num_docs = num_docs if num_docs > 0 else len(filenames)

    print(num_docs)
    for i in range(num_docs):
        filename = filenames[i]
        doc_num = int(filename.split(".")[0])
        extension = filename.split(".")[1]
        if extension != 'txt':
            continue
            
        title = titles[doc_num].replace('-lrb- ', '(').replace(' -rrb-', ')')
        lines = open(data_dir + filename).readlines()
        document = parse_document(lines)
        summary, summary_indices = parse_summary(lines)
        lower_bound, upper_bound = summary_indices
        
        headers = parse_content_headers(lines)
        sidebar = parse_right_sidebar(lines, title, lower_bound)
        
        if document != "" and summary != "":
            url = urls[doc_num]
            
            docs.append(document)
            summaries.append(summary)
            numbers.append(doc_num)
            ordered_titles.append(title[:len(title) - 1])
            ordered_urls.append(url[:len(url) - 1])
            wiki_headers.append(' --- '.join(headers))
            wiki_sidebars.append(' --- '.join(sidebar))
            
            used_document_count += 1
        else:
            skipped_document_count += 1
                    
        if used_document_count % 10000 == 0:
            print(str(used_document_count) + ' used.')

    df['document'] = docs
    df['summary'] = summaries
    df['document number'] = numbers
    df['title'] = ordered_titles
    df['header'] = wiki_headers
    df['sidebar'] = wiki_sidebars
    df['url'] = ordered_urls
    
    df.to_csv(csv_path, index=False)
    
    return used_document_count, skipped_document_count

In [241]:
used_count, skipped_count = parse_dataset_for_documents(out_csv_path, data_dir, urls_path, titles_path, 180000)

180000
0 used.
10000 used.
20000 used.
20000 used.
30000 used.
30000 used.
30000 used.
30000 used.
30000 used.
40000 used.
50000 used.
50000 used.
50000 used.
50000 used.
50000 used.
50000 used.
50000 used.
50000 used.
60000 used.
70000 used.
70000 used.
80000 used.
90000 used.
90000 used.
100000 used.


In [242]:
df = pd.read_csv(out_csv_path)

In [243]:
print(len(df['sidebar']))
df['sidebar'].isnull().sum()

100400


31184