In [5]:
import os
import pandas as pd

In [6]:
data_dir = 'wiki_data/'
out_csv_path = 'documents.csv'
urls_path = '../wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.url'
titles_path = '../wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.title'
sidebars_path = '../wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.box'

In [7]:
def parse_document(document_lines):
    start = 0
    while start < len(document_lines) and document_lines[start] != "Contents\n":
        start += 1

    doc = []
    for i in range(start, len(document_lines)):
        if document_lines[i][0] == "^":
            continue

        line_length = len(document_lines[i].split(" "))
        if "References" == document_lines[i][:len("References")] and line_length < 2:
            break

        if line_length > 10:
            doc.append(document_lines[i])

    return ' '.join(doc)

In [8]:
HEADERS_TO_IGNORE = ["See also", "References", "Bibliography", "External links"]

def parse_content_headers(article_lines):
    index = get_contents_index(article_lines) + 1
    
    headers = []
    
    while True:
        line = article_lines[index]
        if line[0].isnumeric():
            header = " ".join(line.split(" ")[1:]).strip()
            if header not in HEADERS_TO_IGNORE:
                headers.append(header)
        elif line[0] != "\n":
            break
        
        index += 1
    
    return headers

In [9]:
def get_contents_index(article_lines):
    for i in range(len(article_lines)):
        if article_lines[i] == 'Contents\n':
            return i
    
    return -1

def parse_summary(article_lines):
    index = get_contents_index(article_lines)
    
    while(len(article_lines[index].split(" "))) < 8 and index >= 0:
        index -= 1
    
    summary_indices = []
    summary_indices.append(index)
    
    summary = []
    while len(article_lines[index].split(" ")) > 8 and index >= 0:
        summary.insert(0, article_lines[index].strip())
        index -= 1
        
    summary_indices.append(index)
        
    return ' '.join(summary), summary_indices[::-1]

In [10]:
sidebar_for_testing = open(sidebars_path, 'r').readlines()

In [11]:
fieldnames = {}
for i in range(len(sidebar_for_testing)):
    for fieldname in ' '.join(sidebar_for_testing[i].split(':')[0].split('_')[:-1]).split('|'):
        if fieldname not in fieldnames.keys():
            fieldnames[fieldname] = 1
        else:
            fieldnames[fieldname] += 1

In [12]:
len(fieldnames.keys())

386

In [13]:
def parse_entry(sidebar_entry):
    entry_tokens = sidebar_entry.split(':')
    return entry_tokens[0], ':'.join(entry_tokens[1:])

def parse_raw_key(raw_key):
    key_tokens = raw_key.split('_')
    
    if len(key_tokens) == 1:
        return raw_key, None
    
    if key_tokens[-1].isdigit():
        return ' '.join(key_tokens[:-1]), key_tokens[-1]
    
    return ' '.join(key_tokens), None

def parse_sidebar_dict(raw_sidebar):
    parsed_sidebar = {}
    for sidebar_entry in raw_sidebar.split('\t'):
        raw_key, value = parse_entry(sidebar_entry)
        fieldname, num = parse_raw_key(raw_key)
        
        if value == '<none>':
            continue
        
        if fieldname not in parsed_sidebar:
            parsed_sidebar[fieldname] = value
        elif value == ',' or value[0] == '\'' or (parsed_sidebar[fieldname].isdigit() and value.isdigit()):
            parsed_sidebar[fieldname] += value
        else:
            parsed_sidebar[fieldname] += ' ' + value
    
    return parsed_sidebar

def clean(value):
    value = value.replace('-lrb- ', '(')
    value = value.replace(' -rrb-', ')')
    value = value.strip()
    return value

def parse_sidebar(raw_sidebar):
    parsed_sidebar_dict = parse_sidebar_dict(raw_sidebar)
    
    sidebar_values = list(parsed_sidebar_dict.keys()) + list(parsed_sidebar_dict.values())
    sidebar_values = [clean(value) for value in sidebar_values]
    
    sidebar_str = ' --- '.join(sidebar_values)
    
    return sidebar_str

parse_sidebar(sidebar_for_testing[1])

'name --- fullname --- birth date --- birth place --- height --- position --- youthyears --- youthclubs --- years --- clubs --- caps --- goals --- pcupdate --- article title --- aaron hohlbein --- aaron hohlbein --- 16 august 1985 --- middleton, wisconsin, united states --- 60 --- defender --- 2003 -- 2006 --- wisconsin badgers --- 2003 -- 2006 2006 2007 -- 2010 2010 2011 --- wisconsin badgers princeton 56ers kansas city wizards → miami fc (loan) fort lauderdale strikers --- 12431014 --- 0200 --- june 4, 2011 --- aaron hohlbein'

In [28]:
def parse_dataset_for_documents(csv_path, data_dir, urls_path, titles_path, num_docs):
    df = pd.DataFrame(columns=['document number', 'document', 'summary', 'headers', 'title', 'url', 'sidebar'])
    docs = []
    summaries = []
    numbers = []
    used_document_count = 0
    skipped_document_count = 0
    ordered_urls = []
    ordered_titles = []
    ordered_sidebars = []
    wiki_headers = []
    
    urls = open(urls_path, 'r').readlines()
    titles = open(titles_path, 'r').readlines()
    sidebars = open(sidebars_path, 'r').readlines()
    filenames = os.listdir(data_dir)
    num_docs = num_docs if num_docs > 0 else len(filenames)

    print(num_docs)
    for i in range(num_docs):
        filename = filenames[i]
        doc_num = int(filename.split(".")[0])
        extension = filename.split(".")[1]
        if extension != 'txt':
            continue
            
        title = titles[doc_num].replace('-lrb- ', '(').replace(' -rrb-', ')')
        lines = open(data_dir + filename).readlines()
        document = parse_document(lines)
        summary, summary_indices = parse_summary(lines)
        lower_bound, upper_bound = summary_indices
        
        headers = parse_content_headers(lines)
        
        if document != "" and summary != "":
            url = urls[int(doc_num)]
            sidebar = parse_sidebar(sidebars[doc_num])
            
            docs.append(document)
            summaries.append(summary)
            numbers.append(doc_num)
            ordered_titles.append(title[:len(title) - 1])
            ordered_urls.append(url[:len(url) - 1])
            ordered_sidebars.append(sidebar)
            wiki_headers.append(' --- '.join(headers))
            
            used_document_count += 1
        else:
            skipped_document_count += 1
                    
        total_count = used_document_count + skipped_document_count
        if (total_count) % 10000 == 0:
            print('{0} processed. {1} used. {2} skipped.'.format(total_count, used_document_count, skipped_document_count))
            
    df['document'] = docs
    df['summary'] = summaries
    df['document number'] = numbers
    df['title'] = ordered_titles
    df['url'] = ordered_urls
    df['sidebar'] = ordered_sidebars
    df['headers'] = wiki_headers
    
    df.to_csv(csv_path, index=False)
    
    return used_document_count, skipped_document_count

In [29]:
used_count, skipped_count = parse_dataset_for_documents(out_csv_path, data_dir, urls_path, titles_path, -1)

201320
10000 processed. 5635 used. 4365 skipped.
20000 processed. 11154 used. 8846 skipped.
30000 processed. 16772 used. 13228 skipped.
40000 processed. 22344 used. 17656 skipped.
50000 processed. 27885 used. 22115 skipped.
60000 processed. 33387 used. 26613 skipped.
70000 processed. 38954 used. 31046 skipped.
80000 processed. 44631 used. 35369 skipped.
90000 processed. 50177 used. 39823 skipped.
100000 processed. 55723 used. 44277 skipped.
110000 processed. 61353 used. 48647 skipped.
120000 processed. 66959 used. 53041 skipped.
130000 processed. 72603 used. 57397 skipped.
140000 processed. 78150 used. 61850 skipped.
150000 processed. 83710 used. 66290 skipped.
160000 processed. 89251 used. 70749 skipped.
170000 processed. 94819 used. 75181 skipped.
180000 processed. 100422 used. 79578 skipped.
190000 processed. 106038 used. 83962 skipped.
200000 processed. 111574 used. 88426 skipped.


In [30]:
df = pd.read_csv(out_csv_path)

In [31]:
df

Unnamed: 0,document number,document,summary,headers,title,url,sidebar
0,91809,"Harsha completed his ordinary level at Wesley,...",Harsha Purasinghe is a serial entrepreneur who...,Background --- Awards & Recognitions --- Publi...,harsha purasinghe,https://en.wikipedia.org/wiki/index.php?curid=...,name --- nationality --- occupation --- birth ...
1,24269,Mejía was born in Azua in the southwestern Dom...,Jenrry Manuel Mejía (Spanish pronunciation: [ˈ...,Early life --- Professional career --- Minor l...,jenrry mejía,https://en.wikipedia.org/wiki/index.php?curid=...,name --- image --- width --- caption --- team ...
2,70462,"He was drafted in the fourth round, seventy-th...","Darby Joseph Hendrickson (born August 28, 1972...",Playing career --- Career statistics --- Regul...,darby hendrickson,https://en.wikipedia.org/wiki/index.php?curid=...,position --- shoots --- height ft --- height i...
3,69510,Moore was born in Wheelock in Robertson County...,"William Tyler Moore, Sr., known as W. T. ""Bill...",Background --- Powerful legislator --- Promoti...,william t. moore (texas politician),https://en.wikipedia.org/wiki/index.php?curid=...,name --- nationality --- state senate --- dist...
4,88466,McDonald was born in Kemper County near Meridi...,"William Jesse McDonald, known as Captain Bill ...","Early years, family, education --- Law enforce...",bill mcdonald (texas ranger),https://en.wikipedia.org/wiki/index.php?curid=...,name --- image --- image size --- caption --- ...
5,31978,Flavio Boltro starts playing trumpet at 9 year...,"Flavio Boltro (born May 5, 1961) is an Italian...",Career --- The Italian years --- The French ye...,flavio boltro,https://en.wikipedia.org/wiki/index.php?curid=...,name --- image --- caption --- background --- ...
6,42093,"He was born in Chadwell Heath, Essex, the son ...",Alan Derek Piggott MBE (born 27 December 1922)...,Early years --- Royal Air Force --- Gliding ca...,derek piggott,https://en.wikipedia.org/wiki/index.php?curid=...,name --- honorific suffix --- image --- captio...
7,10326,"Born in Hong Kong, Andrew Li received his earl...",This is a Chinese name; the family name is Li....,Early life and education --- Legal career --- ...,andrew li,https://en.wikipedia.org/wiki/index.php?curid=...,name --- order --- office --- term start --- t...
8,193694,"Aketxe was born in Bilbao, Biscay. Having emer...",Isaac Aketxe Barrutia (Spanish pronunciation: ...,Football career --- Personal life,isaac aketxe,https://en.wikipedia.org/wiki/index.php?curid=...,name --- fullname --- birth date --- birth pla...
9,90040,Roswell High is a young adult book series writ...,Melinda Metz is an American author of young ad...,Works --- Roswell High --- Fingerprints --- Pa...,melinda metz,https://en.wikipedia.org/wiki/index.php?curid=...,name --- birth place --- genre --- occupation ...


In [32]:
print(len(df['sidebar']))
df['sidebar'].isnull().sum()

112321


0