In [5]:
import wikipediaapi
from tqdm.auto import tqdm
from collections import Counter
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd

In [6]:
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (bhautikpithadiya12@gmail.com)', 'en', timeout=1000)

In [7]:
train_pages = ['Christopher_Nolan',
                'Following',
                'Memento',
                'Insomnia',
                'Batman Begins',
                'The Prestige',
                'The Dark Knight',
                'Inception',
                'The Dark Knight Rises',
                'Interstellar',
                'Dunkirk',
                'Tenet',
                'Oppenheimer']

In [8]:
def get_wiki_sections_text(page):
    ignore_sections = ["References", "See also", "External links", "Further reading", "Sources"]
    wiki_page = wiki_wiki.page(page)
    
    # Get all the sections text
    page_sections = [x.text for x in wiki_page.sections if x.title not in ignore_sections and x.text != ""]
    section_titles = [x.title for x in wiki_page.sections if x.title not in ignore_sections and x.text != ""]
    
    # Add the summary page
    page_sections.append(wiki_page.summary)
    section_titles.append("Summary")

    return page_sections, section_titles


In [53]:
def df_to_txt(pages):
    dataset = ''
    for page in tqdm(pages):
        if '_' in page:
            page = page.replace("_"," ")
        sections, title = get_wiki_sections_text(page)
        for section,title in zip(sections,title):
            if '\n' in section:
                section = section.replace("\n","")
            # dataset += f"Page : {page}, Section Title : {title}, Content : {section} \n "
            with open('wiki_dataset.txt','a') as f:
                f.write(f"Page : {page}, Section Title : {title}, Content : {section}\n")
    
    

In [54]:
df_to_txt(train_pages)

  0%|          | 0/13 [00:00<?, ?it/s]

In [52]:
def get_pages_df(pages):
    page_section_texts = []
    for page in tqdm(pages):
        if '_' in page:
            page = page.replace("_"," ")
        sections, titles = get_wiki_sections_text(page)
        for section, title in zip(sections, titles):
            page_section_texts.append({
                'page': page,
                'section_title': title,
                'text': section
            })
    print(len(page_section_texts))
    return pd.DataFrame(page_section_texts)


In [43]:
train_pages_df = get_pages_df(train_pages)
train_pages_df.to_csv("orignal_train_pages.csv", index=False)
print(train_pages_df.shape)
train_pages_df.head()


  0%|          | 0/13 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
train_pages_df.page.unique()

array(['Christopher_Nolan', 'Following', 'Memento', 'Insomnia',
       'Batman Begins', 'The Prestige', 'The Dark Knight', 'Inception',
       'The Dark Knight Rises', 'Interstellar', 'Dunkirk', 'Tenet',
       'Oppenheimer'], dtype=object)

In [29]:
train_pages_df.section_title.unique()

array(['Early life', 'Personal life and public image', 'Filmmaking style',
       'Recognition', 'Awards and honours', 'Summary', 'Plot',
       'Production', 'Release', 'Film and television', 'Music', 'Other',
       'Signs and symptoms', 'Causes', 'Mechanism', 'Diagnosis',
       'Prevention', 'Management', 'Prognosis', 'Epidemiology',
       'Society and culture', 'Cast', 'Impact', 'Themes',
       'Critical reception', 'Awards and nominations',
       '2006 film adaptation', 'Sequel', 'Accolades',
       'In popular culture', 'Marketing', 'Space', 'Organizations',
       'Etymology and language use', 'Population', 'Politics',
       'Administration', 'Economy', 'Cuisine', 'Prototype metre',
       'Tourist attractions', 'Transport', 'Sports', 'Notable residents',
       'Climate', 'Media', 'Other uses', 'Private and political life',
       'Postwar activities', 'Final years', 'Death', 'Legacy',
       'Publications'], dtype=object)

In [12]:
df = train_pages_df.copy()

In [13]:
columns = ['page','section_title','text']
dataset = pd.DataFrame(columns=columns)

In [14]:
dataset

Unnamed: 0,page,section_title,text


In [15]:
def creating_sliding_windows(df):
    columns = ['page','section_title','text']
    dataset = pd.DataFrame(columns=columns)
    length_of_sliding_window = 256
    page = df.page
    section_title = df.section_title
    text = df.text
    text = text.split()
    
    total_number_of_windows = len(text)//length_of_sliding_window
    
    start = 0
    end = length_of_sliding_window
    
    for i in range(total_number_of_windows+1):
        if i==total_number_of_windows:
            sliding_window = text[start:]
        else:
            sliding_window = text[start:end]
        row = {
            'page':[page],
            'section_title' : [section_title],
            'text':[' '.join(s for s in sliding_window)]
        }
        start+=length_of_sliding_window
        end +=length_of_sliding_window
        
        data = pd.DataFrame(row)
        
        dataset = pd.concat([dataset,data],ignore_index=True)
    
    return dataset    

In [16]:
for i in range(len(df)):
    dataset = pd.concat([dataset,creating_sliding_windows(df.iloc[i])])

In [17]:
dataset

Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Early life,Belic. Nolan and Roko co-directed the surreal ...
0,Christopher_Nolan,Personal life and public image,"Nolan is married to Emma Thomas, whom he met a..."
0,Christopher_Nolan,Filmmaking style,Nolan's films are largely centred in metaphysi...
1,Christopher_Nolan,Filmmaking style,"Bordwell, a film theorist, wrote that Nolan ha..."
...,...,...,...
3,Oppenheimer,Legacy,"based on American Prometheus, Oppenheimer is p..."
4,Oppenheimer,Legacy,"numbers for technological, and organizational,..."
0,Oppenheimer,Publications,"Oppenheimer, J. Robert (1954). Science and the..."
0,Oppenheimer,Summary,J. Robert Oppenheimer (born Julius Robert Oppe...


In [18]:
dataset.reset_index(drop=True,inplace=True)

In [19]:
dataset

Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Early life,Belic. Nolan and Roko co-directed the surreal ...
2,Christopher_Nolan,Personal life and public image,"Nolan is married to Emma Thomas, whom he met a..."
3,Christopher_Nolan,Filmmaking style,Nolan's films are largely centred in metaphysi...
4,Christopher_Nolan,Filmmaking style,"Bordwell, a film theorist, wrote that Nolan ha..."
...,...,...,...
129,Oppenheimer,Legacy,"based on American Prometheus, Oppenheimer is p..."
130,Oppenheimer,Legacy,"numbers for technological, and organizational,..."
131,Oppenheimer,Publications,"Oppenheimer, J. Robert (1954). Science and the..."
132,Oppenheimer,Summary,J. Robert Oppenheimer (born Julius Robert Oppe...


In [20]:
dataset.to_csv('train_pages.csv',index=False)

In [21]:
def creating_moving_sliding_windows(df):
    columns = ['page','section_title','text']
    dataset = pd.DataFrame(columns=columns)
    length_of_sliding_window = 128
    page = df.page
    section_title = df.section_title
    text = df.text
    text = text.split()
    
    total_number_of_windows = len(text) - length_of_sliding_window + 1
    start = 0
    end = length_of_sliding_window
    
    if total_number_of_windows<0:
        sliding_window = text[:-1]
        row = {
            'page':[page],
            'section_title' : [section_title],
            'text':[' '.join(s for s in sliding_window)]
        }
        data = pd.DataFrame(row)
        
        dataset = pd.concat([dataset,data],ignore_index=True)
    else:    
        while end <= len(text):
            sliding_window = text[start:end]
            row = {
                'page':[page],
                'section_title' : [section_title],
                'text':[' '.join(s for s in sliding_window)]
            }
            start+=1
            end +=1
            
            data = pd.DataFrame(row)
            
            dataset = pd.concat([dataset,data],ignore_index=True)
    
    return dataset    

In [22]:
columns = ['page','section_title','text']
moving_window_df = pd.DataFrame(columns=columns)

In [23]:
for i in range(len(df)):
    moving_window_df = pd.concat([moving_window_df,creating_moving_sliding_windows(df.iloc[i])])    

In [24]:
moving_window_df.reset_index(inplace=True,drop=True)

In [25]:
moving_window_df

Unnamed: 0,page,section_title,text
0,Christopher_Nolan,Early life,Christopher Edward Nolan was born on 30 July 1...
1,Christopher_Nolan,Early life,"Edward Nolan was born on 30 July 1970, in West..."
2,Christopher_Nolan,Early life,"Nolan was born on 30 July 1970, in Westminster..."
3,Christopher_Nolan,Early life,"was born on 30 July 1970, in Westminster, Lond..."
4,Christopher_Nolan,Early life,"born on 30 July 1970, in Westminster, London. ..."
...,...,...,...
15472,Oppenheimer,Summary,the development of the hydrogen bomb during a ...
15473,Oppenheimer,Summary,development of the hydrogen bomb during a 1949...
15474,Oppenheimer,Summary,of the hydrogen bomb during a 1949–1950 govern...
15475,Oppenheimer,Summary,the hydrogen bomb during a 1949–1950 governmen...


In [26]:
moving_window_df.to_csv('moving_df.csv',index=False)

In [27]:
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-kt4yiry4
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-kt4yiry4
  Resolved https://github.com/huggingface/transformers to commit 73014b561d5f88d728e46a57d346f516fefe3f2d
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0.dev0)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01