In [1]:
import os 
import numpy as np
import pandas as pd
import ast

In [2]:
# One '<title>.txt' file per article. os.listdir() returns entries in arbitrary
# order and may include non-article entries (hidden files, checkpoint folders),
# so keep only the .txt files and sort them for a reproducible row order.
wiki_files = sorted(
    name for name in os.listdir('./wiki_data/data') if name.endswith('.txt')
)
# Article title = file name without its 4-character '.txt' extension.
article_titles = [title[:-4] for title in wiki_files]

In [3]:
# Parallel accumulators: position k in each list describes the same article.
# Five distinct empty lists are created (one per name), never a shared one.
wiki_documents, wiki_summaries, titles, wiki_headers, wiki_sidebars = (
    [] for _ in range(5)
)

In [4]:
def get_contents_index(article_lines):
    """Locate the table-of-contents marker in an article.

    Returns the index of the first line that is exactly the word 'Contents'
    followed by a newline, or -1 when no such line exists.
    """
    matches = (
        position
        for position, text in enumerate(article_lines)
        if text == 'Contents\n'
    )
    return next(matches, -1)

def parse_summary(article_lines):
    """Extract the lead summary: the paragraph lines directly above 'Contents'.

    A line counts as a paragraph when splitting it on single spaces yields more
    than 8 tokens. Starting at the 'Contents' line, the search walks upwards
    past every non-paragraph line, then collects the contiguous run of
    paragraph lines it finds.

    Args:
        article_lines: the article as a list of lines (as from readlines()).

    Returns:
        (summary, [lower, upper]) where
        - summary is the list of stripped paragraph lines in document order,
        - upper is the index of the last summary line,
        - lower is the index of the line just above the first summary line
          (-1 when the summary starts at line 0).
        When there is no 'Contents' line, or no paragraph above it, the result
        is ([], [-1, -1]).
    """
    def is_paragraph(line):
        return len(line.split(" ")) > 8

    index = get_contents_index(article_lines)  # -1 when there is no 'Contents' line

    # The bounds check comes first so a negative index is never used to read
    # article_lines (which would silently wrap to the last line, or raise on an
    # empty list). The skip test is the exact negation of the collect test, so
    # a line with exactly 8 tokens is skipped instead of ending the search with
    # an empty summary.
    while index >= 0 and not is_paragraph(article_lines[index]):
        index -= 1

    upper = index

    # Collected bottom-up, then reversed once (avoids repeated insert(0, ...)).
    summary = []
    while index >= 0 and is_paragraph(article_lines[index]):
        summary.append(article_lines[index].strip())
        index -= 1
    summary.reverse()

    return summary, [index, upper]

In [5]:
# Table-of-contents entries that are dropped from the parsed header list.
HEADERS_TO_IGNORE = ["See also", "References", "Bibliography", "External links"]

def parse_content_headers(article_lines):
    """Return the section headers listed in the article's table of contents.

    The table of contents is taken to be the lines following the 'Contents'
    line: an entry is a line whose first character is numeric; its header text
    is everything after the first space, stripped. Lines that start with a
    newline are skipped, and the first line that is neither ends the list.
    Headers named in HEADERS_TO_IGNORE are left out.

    Args:
        article_lines: the article as a list of lines (as from readlines()).

    Returns:
        List of header strings in document order. Empty when the article has
        no 'Contents' line.
    """
    contents_index = get_contents_index(article_lines)
    if contents_index == -1:
        # Without a 'Contents' marker there is nothing to parse; scanning from
        # the top of the file would pick up unrelated digit-leading lines.
        return []

    headers = []

    # Iterating the slice (instead of `while True` with a manual index) ends
    # cleanly when the entries run to the end of the file.
    for line in article_lines[contents_index + 1:]:
        first_char = line[:1]  # '' for an empty string, so no IndexError
        if first_char.isnumeric():
            # Drop the leading numbering token ("1", "1.1", ...).
            header = line.partition(" ")[2].strip()
            if header not in HEADERS_TO_IGNORE:
                headers.append(header)
        elif first_char != "\n":
            break

    return headers

# Spot-check the header parser on one article. The `with` block closes the
# file handle instead of leaving it to the garbage collector.
with open('./wiki_data/data/James Bond.txt') as article_file:
    a_lines = article_file.readlines()
parse_content_headers(a_lines)

['Publication history',
 'Creation and inspiration',
 'Novels and related works',
 'Ian Fleming novels',
 'Post-Fleming novels',
 'Young Bond',
 'The Moneypenny Diaries',
 'Adaptations',
 'Television',
 'Radio',
 'Comics',
 'Films',
 'The Eon Productions films',
 'Non-Eon films',
 'Music',
 'Video games',
 'Guns, vehicles and gadgets',
 'Guns',
 'Vehicles',
 'Gadgets',
 'Cultural impact',
 'Criticisms of James Bond']

In [6]:
def get_sidebar_index(article_lines, file):
    """Find the line that consists solely of the article title.

    Args:
        article_lines: the article as a list of lines (as from readlines()).
        file: the article's file name, e.g. 'James Bond.txt'; the title is the
            name without a trailing '.txt'.

    Returns:
        Index of the first line equal to the title followed by a newline, or
        -1 when no line matches.
    """
    # str.strip('.txt') treats its argument as a set of characters and removes
    # any of '.', 't', 'x' from BOTH ends (e.g. 'Matt.txt' -> 'Ma'), so the
    # extension is removed explicitly as a suffix instead.
    extension = '.txt'
    title = file[:-len(extension)] if file.endswith(extension) else file

    target = title + "\n"
    for i, line in enumerate(article_lines):
        if line == target:
            return i

    return -1

def parse_right_sidebar(article_lines, file, end_index):
    """Collect the short lines between the title line and `end_index`.

    The scan starts at the line returned by get_sidebar_index() and stops just
    before `end_index`. A line is kept (stripped) when it is not a bare newline
    and splits on single spaces into at most 3 tokens.

    Returns an empty list when the title line cannot be found.
    """
    start = get_sidebar_index(article_lines, file)
    if start == -1:
        return []

    # Index explicitly (rather than slicing) so out-of-range or negative
    # `end_index` values behave exactly as a plain range() loop would.
    candidates = (article_lines[position] for position in range(start, end_index))
    return [
        text.strip()
        for text in candidates
        if text != "\n" and len(text.split(" ")) <= 3
    ]

# Spot-check the sidebar parser on one article. The `with` block closes the
# file handle instead of leaving it to the garbage collector.
with open('./wiki_data/data/James Bond.txt') as article_file:
    a_lines = article_file.readlines()
# NOTE(review): 76 is a hand-picked end index for this article, presumably the
# lower summary bound that parse_summary() reports for it; confirm.
parse_right_sidebar(a_lines, 'James Bond.txt', 76)

['James Bond',
 'Created by',
 'Ian Fleming',
 'Original work',
 'Casino Royale (1953)',
 'Print publications',
 'Novel(s)',
 'List of novels',
 'Short stories',
 'Films and television',
 'Film(s)',
 'List of films',
 'Short film(s)',
 'Happy and Glorious',
 'Television series',
 'Miscellaneous',
 'Portrayers',
 'George Baker',
 'Pierce Brosnan',
 'Daniel Craig',
 'Sean Connery',
 'Timothy Dalton',
 'Bob Holness',
 'Michael Jayston',
 'George Lazenby',
 'Roger Moore',
 'Barry Nelson',
 'David Niven',
 'Toby Stephens']

In [7]:
# Start from empty accumulators so re-running this cell rebuilds the results
# instead of appending a second copy of every article.
wiki_documents = []
wiki_summaries = []
titles = []
wiki_headers = []
wiki_sidebars = []

for file in wiki_files:
    # `with` closes each handle promptly; the loop opens one file per article.
    with open('./wiki_data/data/' + file) as article_file:
        lines = article_file.readlines()

    summary, summary_indices = parse_summary(lines)
    if len(summary) == 0:
        # Articles without a detectable summary are skipped entirely, so there
        # is no point parsing their headers or sidebar.
        continue

    # Summary lines occupy indices lower_bound + 1 .. upper_bound.
    lower_bound, upper_bound = summary_indices

    headers = parse_content_headers(lines)
    sidebar = parse_right_sidebar(lines, file, lower_bound)

    # NOTE(review): this stores the file name including '.txt'; confirm whether
    # the bare article title was intended for the 'title' column.
    titles.append(file)

    wiki_summaries.append('. '.join(summary))

    # Document = every non-blank line that is not part of the summary. The line
    # at lower_bound sits just above the summary and is not in it, so it is
    # kept (the previous `i < lower_bound` test dropped it).
    cleaned_lines = [
        stripped
        for i, stripped in enumerate(line.strip() for line in lines)
        if stripped != '' and not (lower_bound < i <= upper_bound)
    ]
    wiki_documents.append('. '.join(cleaned_lines))

    wiki_headers.append(' --- '.join(headers))

    wiki_sidebars.append(' --- '.join(sidebar))

In [8]:
# One row per kept article; each column comes from its parallel accumulator
# list. The frame is built in a single constructor call and written to CSV
# (the default row index is written as the first, unnamed column).
column_sources = {
    'title': titles,
    'summary': wiki_summaries,
    'document': wiki_documents,
    'headers': wiki_headers,
    'sidebar': wiki_sidebars,
}
df_3 = pd.DataFrame(
    {column: pd.Series(values) for column, values in column_sources.items()},
    columns=['title', 'summary', 'document', 'headers', 'sidebar'],
)
df_3.to_csv('wiki_old_updated.csv')