In [98]:
import wikipediaapi
from tqdm.auto import tqdm
from collections import Counter
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd

In [99]:
# Client settings for the Wikipedia API wrapper, named so they are easy to tune.
# NOTE(review): the user-agent string embeds a personal e-mail address; make
# sure that is intentional before sharing this notebook.
WIKI_USER_AGENT = 'MyProjectName (bhautikpithadiya12@gmail.com)'
WIKI_LANGUAGE = 'en'
WIKI_TIMEOUT = 1000  # presumably seconds — TODO confirm against wikipediaapi docs

# Shared client used by every page fetch below.
wiki_wiki = wikipediaapi.Wikipedia(WIKI_USER_AGENT, WIKI_LANGUAGE, timeout=WIKI_TIMEOUT)

In [100]:
# Wikipedia page titles to scrape: the director's own article first, then one
# title per film.
# NOTE(review): several of these bare titles may be ambiguous on Wikipedia.
# The saved output of this notebook shows 'Oppenheimer' resolving to the
# J. Robert Oppenheimer biography rather than a film article — confirm that
# every title points at the intended page.
train_pages = [
    'Christopher_Nolan',
    'Following', 'Memento', 'Insomnia',
    'Batman Begins', 'The Prestige', 'The Dark Knight',
    'Inception', 'The Dark Knight Rises', 'Interstellar',
    'Dunkirk', 'Tenet', 'Oppenheimer',
]

In [101]:
def get_wiki_sections_text(page):
    """Collect the usable section texts of one Wikipedia page, plus its summary.

    Parameters
    ----------
    page : str
        Page title passed straight to ``wiki_wiki.page``.

    Returns
    -------
    tuple[list, list]
        ``(texts, titles)`` — two parallel lists. Each entry of
        ``wiki_page.sections`` is kept unless its title is one of the
        boilerplate headings below or its ``text`` is the empty string.
        The page summary is always appended last under the title
        ``"Summary"``.

    NOTE(review): presumably ``.sections`` yields only top-level sections and
    ``.text`` excludes subsection text, in which case a section made up purely
    of subsections is dropped here — confirm against the wikipediaapi docs.
    """
    skipped_titles = ["References", "See also", "External links", "Further reading", "Sources"]
    article = wiki_wiki.page(page)

    texts = []
    titles = []
    for section in article.sections:
        # Skip boilerplate headings and sections with no text of their own.
        if section.title in skipped_titles or section.text == "":
            continue
        texts.append(section.text)
        titles.append(section.title)

    # The lead/summary is not part of `sections`, so add it explicitly.
    texts.append(article.summary)
    titles.append("Summary")

    return texts, titles


In [102]:
def get_pages_df(pages):
    """Build a DataFrame with one row per (page, section) for the given pages.

    Parameters
    ----------
    pages : iterable of str
        Page titles; each is forwarded to ``get_wiki_sections_text``.

    Returns
    -------
    pandas.DataFrame
        Built from records with keys ``page``, ``section_title`` and ``text``.
        The number of collected records is printed before returning.
    """
    records = []
    for page in tqdm(pages):
        texts, titles = get_wiki_sections_text(page)
        # Pair each section body with its heading and tag it with the page.
        records.extend(
            {'page': page, 'section_title': title, 'text': text}
            for text, title in zip(texts, titles)
        )
    print(len(records))
    return pd.DataFrame(records)


In [103]:
# Scrape every title in `train_pages` into one row per (page, section).
train_pages_df = get_pages_df(train_pages)
# Persist the un-windowed sections.
# NOTE(review): the filename spells "orignal"; it is left untouched because it
# is a runtime path that other code may read.
train_pages_df.to_csv("orignal_train_pages.csv", index=False)
print(train_pages_df.shape)
# Last expression of the cell: rich preview of the first rows.
train_pages_df.head()


  0%|          | 0/13 [00:00<?, ?it/s]

76
(76, 3)


Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Personal life and public image,"Nolan is married to Emma Thomas, whom he met a..."
2,Christopher_Nolan,Filmmaking style,Nolan's films are largely centred in metaphysi...
3,Christopher_Nolan,Recognition,Nolan has made some of the most influential an...
4,Christopher_Nolan,Awards and honours,Nolan has been nominated for eight Academy Awa...


In [63]:
# Show the distinct page titles present in the scraped frame.
train_pages_df.page.unique()

array(['Christopher_Nolan', 'Following', 'Memento', 'Insomnia',
       'Batman Begins', 'The Prestige', 'The Dark Knight', 'Inception',
       'The Dark Knight Rises', 'Interstellar', 'Dunkirk', 'Tenet',
       'Oppenheimer'], dtype=object)

In [66]:
# Work on a copy so the scraped frame itself is not modified by later cells.
df = train_pages_df.copy()

In [64]:
# Empty frame with the three output columns; the windowing loop appends to it.
columns = ['page','section_title','text']
dataset = pd.DataFrame(columns=columns)

In [65]:
# Display the accumulator frame (only the column headers at this point).
dataset

Unnamed: 0,page,section_title,text


In [67]:
def creating_sliding_windows(df, length_of_sliding_window=256):
    """Split one section's text into consecutive, non-overlapping word windows.

    Parameters
    ----------
    df : row-like object
        A single row (e.g. ``frame.iloc[i]``) exposing ``page``,
        ``section_title`` and ``text`` attributes. Despite the name this is
        one row, not a whole DataFrame.
    length_of_sliding_window : int, default 256
        Maximum number of whitespace-separated words per window.

    Returns
    -------
    pandas.DataFrame
        Columns ``page``, ``section_title``, ``text`` with one row per window,
        in reading order, indexed ``0..k-1``. Every window except possibly the
        last holds exactly ``length_of_sliding_window`` words; the last holds
        the remainder. No empty-text rows are produced: a text whose word
        count is an exact multiple of the window length no longer gets a
        trailing empty window, and a text without any words yields an empty
        frame.

    Raises
    ------
    ValueError
        If ``length_of_sliding_window`` is smaller than 1.
    """
    if length_of_sliding_window < 1:
        raise ValueError("length_of_sliding_window must be a positive integer")

    columns = ['page', 'section_title', 'text']
    words = df.text.split()

    # Stepping by the window length visits each window start exactly once and
    # never produces a start index past the last word, so no window is empty.
    rows = [
        {
            'page': df.page,
            'section_title': df.section_title,
            'text': ' '.join(words[start:start + length_of_sliding_window]),
        }
        for start in range(0, len(words), length_of_sliding_window)
    ]

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows, columns=columns)

In [68]:
# Window every section and assemble the result in one go. `dataset` is rebuilt
# from `df` on every run, so re-executing this cell does not append duplicate
# rows, and concatenating once avoids re-copying the growing frame per section.
window_frames = [creating_sliding_windows(row) for _, row in df.iterrows()]
# Drop frames without rows so they cannot influence the concatenated result.
window_frames = [frame for frame in window_frames if not frame.empty]
dataset = (
    pd.concat(window_frames)  # per-section indices are kept; reset happens later
    if window_frames
    else pd.DataFrame(columns=['page', 'section_title', 'text'])
)

In [69]:
# Display the windowed frame; the index still restarts at 0 for each section.
dataset

Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Early life,Belic. Nolan and Roko co-directed the surreal ...
0,Christopher_Nolan,Personal life and public image,"Nolan is married to Emma Thomas, whom he met a..."
0,Christopher_Nolan,Filmmaking style,Nolan's films are largely centred in metaphysi...
1,Christopher_Nolan,Filmmaking style,"a film theorist, wrote that Nolan has been abl..."
...,...,...,...
3,Oppenheimer,Legacy,"Prometheus, Oppenheimer is portrayed by actor ..."
4,Oppenheimer,Legacy,"to the Allied effort, resulting in powerful to..."
0,Oppenheimer,Publications,"Oppenheimer, J. Robert (1954). Science and the..."
0,Oppenheimer,Summary,J. Robert Oppenheimer (born Julius Robert Oppe...


In [70]:
# Replace the per-section indices with one continuous 0..n-1 index.
dataset = dataset.reset_index(drop=True)

In [71]:
# Display the windowed frame again, now with a continuous index.
dataset

Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Early life,Belic. Nolan and Roko co-directed the surreal ...
2,Christopher_Nolan,Personal life and public image,"Nolan is married to Emma Thomas, whom he met a..."
3,Christopher_Nolan,Filmmaking style,Nolan's films are largely centred in metaphysi...
4,Christopher_Nolan,Filmmaking style,"a film theorist, wrote that Nolan has been abl..."
...,...,...,...
129,Oppenheimer,Legacy,"Prometheus, Oppenheimer is portrayed by actor ..."
130,Oppenheimer,Legacy,"to the Allied effort, resulting in powerful to..."
131,Oppenheimer,Publications,"Oppenheimer, J. Robert (1954). Science and the..."
132,Oppenheimer,Summary,J. Robert Oppenheimer (born Julius Robert Oppe...


In [73]:
# Save the non-overlapping windows; the index is not written to the file.
dataset.to_csv('train_pages.csv',index=False)

In [91]:
def creating_moving_sliding_windows(df, length_of_sliding_window=128):
    """Produce overlapping word windows that advance one word at a time.

    Parameters
    ----------
    df : row-like object
        A single row (e.g. ``frame.iloc[i]``) exposing ``page``,
        ``section_title`` and ``text`` attributes. Despite the name this is
        one row, not a whole DataFrame.
    length_of_sliding_window : int, default 128
        Number of whitespace-separated words per window.

    Returns
    -------
    pandas.DataFrame
        Columns ``page``, ``section_title``, ``text``, indexed ``0..k-1``.
        For a text of ``n`` words with ``n >= length_of_sliding_window`` there
        are ``n - length_of_sliding_window + 1`` rows, window ``i`` covering
        words ``i .. i + length_of_sliding_window - 1``. A shorter, non-empty
        text yields exactly one row containing the whole text. A text without
        any words yields an empty frame.

    Raises
    ------
    ValueError
        If ``length_of_sliding_window`` is smaller than 1.
    """
    if length_of_sliding_window < 1:
        raise ValueError("length_of_sliding_window must be a positive integer")

    columns = ['page', 'section_title', 'text']
    words = df.text.split()

    if not words:
        window_count = 0
    else:
        # Short texts still get one window holding every word. Previously the
        # last word was cut off, and a text exactly one word short of the
        # window length produced no rows at all.
        window_count = max(len(words) - length_of_sliding_window + 1, 1)

    rows = [
        {
            'page': df.page,
            'section_title': df.section_title,
            'text': ' '.join(words[start:start + length_of_sliding_window]),
        }
        for start in range(window_count)
    ]

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows, columns=columns)

In [92]:
# Empty frame with the three output columns; the moving-window loop appends to it.
columns = ['page','section_title','text']
moving_window_df = pd.DataFrame(columns=columns)

In [94]:
# Build the stride-1 windows for every section and assemble them in one go.
# `moving_window_df` is rebuilt from `df` on every run, so re-executing this
# cell does not append duplicate rows, and concatenating once avoids re-copying
# the growing frame per section.
moving_frames = [creating_moving_sliding_windows(row) for _, row in df.iterrows()]
# Drop frames without rows so they cannot influence the concatenated result.
moving_frames = [frame for frame in moving_frames if not frame.empty]
moving_window_df = (
    pd.concat(moving_frames)  # per-section indices are kept; reset happens later
    if moving_frames
    else pd.DataFrame(columns=['page', 'section_title', 'text'])
)

In [95]:
# Replace the per-section indices with one continuous 0..n-1 index.
moving_window_df = moving_window_df.reset_index(drop=True)

In [96]:
# Display the overlapping-window frame (consecutive rows shift by one word).
moving_window_df

Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Early life,"Edward Nolan was born on 30 July 1970, in West..."
2,Christopher_Nolan,Early life,"Nolan was born on 30 July 1970, in Westminster..."
3,Christopher_Nolan,Early life,"was born on 30 July 1970, in Westminster, Lond..."
4,Christopher_Nolan,Early life,"born on 30 July 1970, in Westminster, London. ..."
...,...,...,...
15434,Oppenheimer,Summary,the development of the hydrogen bomb during a ...
15435,Oppenheimer,Summary,development of the hydrogen bomb during a 1949...
15436,Oppenheimer,Summary,of the hydrogen bomb during a 1949–1950 govern...
15437,Oppenheimer,Summary,the hydrogen bomb during a 1949–1950 governmen...


In [97]:
# Save the overlapping windows; the index is not written to the file.
moving_window_df.to_csv('moving_df.csv',index=False)