In [3]:
import requests
import bz2
import re
import os
import csv
from lxml import etree

# Step 1: Download the Simple English Wikipedia dump
wiki_dump_url = "https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2"
# wiki_dump_url = "https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-meta-current.xml.bz2"
file_path = wiki_dump_url.split('/')[-1]
print(f"Downloading the Simple English Wikipedia dump into {file_path}...")

with requests.get(wiki_dump_url, stream=True) as r:
    with open(file_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)


Downloading the Simple English Wikipedia dump into simplewiki-latest-pages-articles.xml.bz2...


In [4]:
    
xml_file_path = file_path.replace('.bz2', '')

print(f"Extracting the XML file into {xml_file_path} ...")
with bz2.BZ2File(file_path, 'rb') as f_in, open(xml_file_path, 'wb') as f_out:
    for data in iter(lambda: f_in.read(100 * 1024), b''):
        f_out.write(data)

Extracting the XML file into simplewiki-latest-pages-articles.xml ...


In [5]:
import pandas as pd
from lxml import etree
import tqdm 

def extract_latest_versions(xml_file):
    tree = etree.parse(xml_file)
    ns = {
        'mw': 'http://www.mediawiki.org/xml/export-0.10/'  # Namespace
    }
    
    data = []

    # Iterate over all pages
    for page in tqdm.tqdm(tree.xpath('//mw:page', namespaces=ns)):
        title = page.find('mw:title', namespaces=ns).text
        
        # Get all the revisions
        revisions = page.findall('mw:revision', namespaces=ns)
        
        # Get the latest revision
        latest_revision = revisions[-1]
        text_elem = latest_revision.find('mw:text', namespaces=ns)
        
        # Ensure that the text element is not None and has content
        if text_elem is not None and text_elem.text:
            content = text_elem.text
            data.append({'title': title, 'content': content})

    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data)
    return df

df = extract_latest_versions(xml_file_path)
df['content_len'] = df.content.apply(lambda x: len(x.split()))

print(df.head())  # Print the first few rows of the DataFrame


100%|██████████| 449896/449896 [00:05<00:00, 88373.54it/s]


    title                                            content
0   April  {{monththisyear|4}}\n'''April''' (Apr.) is the...
1  August  {{monththisyear|8}}\n'''August''' (Aug.) is th...
2     Art  [[File:Pierre-Auguste_Renoir,_Le_Moulin_de_la_...
3       A  {{about|the first [[letter]] in the [[alphabet...
4     Air  [[File:Kawasaki-Electric Fan.jpg|thumb|A [[wik...


In [37]:
import re

def extract_wikilinks(text):
    # Find all patterns that match [[ ... ]]
    links = re.findall(r'\[\[(.*?)\]\]', text)
    links = [f"[[{link}]]" for link in links]
    refs = re.findall(r'\<ref(.*?)/ref>', text)
    refs = [f"<ref{ref}/ref>" for ref in refs]
    return links, refs


s = df.loc[df.content_len>100].sample()
title, content  = s.title.iloc[0], s.content.iloc[0]
links, refs = extract_wikilinks(content)
print(title)
print(links)
print(refs)
print(content)
len(df.loc[df.content_len>200])

Mall
['[[File:Mall of the Emirates (3679338750).jpg|right|thumb|View of the [[Mall of the Emirates]]', '[[Dubai]]', '[[File:Eaton Centre HDR style.jpg|thumb|right|300px|The central walking area of the [[Toronto Eaton Centre]]', '[[Toronto]]', '[[Canada]]', '[[building]]', '[[shop]]', '[[shop|store]]', '[[market]]', '[[bazaar]]', '[[booth]]', '[[stall]]', '[[shopping|shop]]', '[[parking lot]]', '[[cars]]', '[[food court]]', '[[movie theatre]]', '[[America]]', '[[United Kingdom]]', '[[plaza]]', '[[West Edmonton Mall]]', '[[Edmonton]]', '[[Alberta]]', '[[Canada]]', '[[Mall of America]]', '[[Bloomington, Minnesota|Bloomington]]', '[[Minnesota]]', '[[Category:Malls| ]]']
[]
[[File:Mall of the Emirates (3679338750).jpg|right|thumb|View of the [[Mall of the Emirates]], in [[Dubai]]]]
[[File:Eaton Centre HDR style.jpg|thumb|right|300px|The central walking area of the [[Toronto Eaton Centre]] in [[Toronto]], [[Canada]]]]

A '''mall''' or '''shopping center''' is a large [[building]] that is ful

124743

In [22]:
import random
link = random.choice(links)
print(link)
# print(s.content.iloc[0].replace(f"[[{link}]]", "[CITE]"))

[[ePodunk]]


In [26]:
from transformers import BertTokenizerFast, BertModel

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir="/mnt/HDD/hugging_cache/")
model = BertModel.from_pretrained('bert-base-uncased')

# Add the special tokens
special_tokens = {'additional_special_tokens': ['[CITE]', '[REF]']}
tokenizer.add_special_tokens(special_tokens)

# Resize the model's token embeddings to account for the new tokens
model.resize_token_embeddings(len(tokenizer))

# Verify that the tokens were added
print(tokenizer.tokenize('This is a sample text [CITE] and a reference [REF].'))

# Note: If you're saving the tokenizer and model for later use, remember to save both
# since the tokenizer now knows about the new special tokens, but other tokenizers won't.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['this', 'is', 'a', 'sample', 'text', '[CITE]', 'and', 'a', 'reference', '[REF]', '.']


In [34]:
tokenizer.decode(tokenizer.encode('This is a sample text [CITE] and a reference [REF].'))


'[CLS] this is a sample text [CITE] and a reference [REF]. [SEP]'

In [4]:
!pip install 
from lxml import etree
import mwparserfromhell

def extract_latest_versions(xml_file):
    tree = etree.parse(xml_file)
    ns = {
        'mw': 'http://www.mediawiki.org/xml/export-0.10/'  # Namespace
    }

    # Iterate over all pages
    for page in tree.xpath('//mw:page', namespaces=ns):
        title = page.find('mw:title', namespaces=ns).text
        
        # Get all the revisions
        revisions = page.findall('mw:revision', namespaces=ns)
        
        # Get the latest revision
        latest_revision = revisions[-1]
        text_elem = latest_revision.find('mw:text', namespaces=ns)
        
        # Ensure that the text element is not None and has content
        if text_elem is not None and text_elem.text:
            wiki_text = text_elem.text
            parsed_text = mwparserfromhell.parse(wiki_text)
            
            # Do something with the parsed text, e.g., print the first 100 characters
            print(title, ":", parsed_text[:100])
        else:
            print(title, ": No text available")

xml_file = 'simplewiki-latest-pages-articles.xml'
extract_latest_versions(xml_file)


ModuleNotFoundError: No module named 'mwparserfromhell'

In [2]:


# Step 3: Parse the XML and extract the text and links
csv_file_path = "wikipedia_articles_with_links.csv"
print("Parsing the XML and extracting text and links...")
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Title', 'Text', 'Links'])

    context = etree.iterparse(xml_file_path, tag='{http://www.mediawiki.org/xml/export-0.10/}page')

    for _, page in context:
        title = page.findtext('.//{http://www.mediawiki.org/xml/export-0.10/}title')
        text = page.findtext('.//{http://www.mediawiki.org/xml/export-0.10/}text')
        links = re.findall(r'\[\[([^\]]+)\]\]', text) if text else []
        links_str = "|".join(links)
        
        csv_writer.writerow([title, text, links_str])

        # Free up memory by clearing XML element
        page.clear()
        for ancestor in page.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        break

# Clean up the downloaded and extracted files
os.remove(file_path)
os.remove(xml_file_path)

print("Finished! The data has been saved to", csv_file_path)


Parsing the XML and extracting text and links...


NameError: name 'xml_file_path' is not defined

In [4]:
import pandas as pd
df = pd.read_csv('wikipedia_articles_with_links.csv')
df

Unnamed: 0,Title,Text,Links
0,April,{{monththisyear|4}}\n'''April''' (Apr.) is the...,month|year|Julian calendar|Julian|Gregorian ca...
1,August,{{monththisyear|8}}\n'''August''' (Aug.) is th...,month|year|Gregorian calendar|July|September|d...
2,Art,"[[File:Pierre-Auguste_Renoir,_Le_Moulin_de_la_...","File:Pierre-Auguste_Renoir,_Le_Moulin_de_la_Ga..."
3,A,{{about|the first [[letter]] in the [[alphabet...,letter|alphabet|File:A cursiva.gif|thumb|Writi...
4,Air,[[File:Kawasaki-Electric Fan.jpg|thumb|A [[wik...,File:Kawasaki-Electric Fan.jpg|thumb|A [[wikt:...
...,...,...,...
446828,Alive and Kicking (song),{{Infobox song\n| name = Alive & Kicking...,Simple Minds|Once Upon a Time (Simple Minds al...
446829,Adorations,{{Infobox song\n| name = Adorations\n| c...,Killing Joke|Brighter than a Thousand Suns (al...
446830,Category:Killing Joke songs,{{songs category}},
446831,Template:Dina Carroll,{{Navbox musical artist\n| name = Dina Carroll...,Dina Carroll|So Close (album)|So Close|Only Hu...


In [7]:
df.iloc[0]

Title                                                April
Text     {{monththisyear|4}}\n'''April''' (Apr.) is the...
Links    month|year|Julian calendar|Julian|Gregorian ca...
Name: 0, dtype: object

In [16]:
for _, page in context:
    break
print(page.)

<bound method _Element.values of <Element {http://www.mediawiki.org/xml/export-0.10/}page at 0x7fdcb000d100>>


In [94]:
import numpy as np

def beta1(f, eps = 1e-3):
    x, y, z = np.random.randn(3, 1000000)
    eps = 1e-3
    a = np.sqrt(eps) * x + np.sqrt(1-eps)*z 
    b = np.sqrt(eps) * y + np.sqrt(1-eps)*z 
    a = f(a)
    b = f(b)
    a, b = a - np.mean(a), b - np.mean(b)
    ab = np.mean(a * b)
    return (1 - np.mean(a * b) / np.std(a) / np.std(b))/eps

f = lambda x: np.sin(x)
beta1(f)

1.3146768336104175

In [71]:
x * np.heaviside(x, 0)

array([-0.        , -0.        ,  0.50767157, ..., -0.        ,
        1.02885912, -0.        ])

In [9]:
import xml.etree.ElementTree as ET
import re
import bz2
import random
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional
import json

@dataclass
class WikiLink:
    """Represents a Wikipedia link with its position and text."""
    start: int
    end: int
    target: str
    display: Optional[str] = None

class WikiTextProcessor:
    """Process wiki text to extract and mask links for training."""
    
    def __init__(self):
        # Regex for matching wiki links
        self.link_pattern = re.compile(r'\[\[([^]\|]*?)(?:\|([^]]*))?\]\]')
        self.ref_pattern = re.compile(r'<ref>.*?</ref>')
        
    def extract_links(self, text: str) -> List[WikiLink]:
        """Extract all wiki links from text with their positions."""
        links = []
        offset = 0
        
        # First remove refs to avoid interference
        text = self.ref_pattern.sub('', text)
        
        for match in self.link_pattern.finditer(text):
            start, end = match.span()
            target = match.group(1)
            display = match.group(2)
            
            # If there's no display text, use the target
            display_text = display if display else target
            
            links.append(WikiLink(
                start=start - offset,
                end=end - offset,
                target=target,
                display=display_text
            ))
            
        return links

    def create_training_example(self, text: str, mask_probability: float = 0.15) -> Tuple[str, List[Dict]]:
        """
        Create a training example by masking some links and adding them as references.
        
        Args:
            text: The wiki text
            mask_probability: Probability of masking each link
            
        Returns:
            Tuple of (masked_text, reference_list)
        """
        links = self.extract_links(text)
        masked_text = text
        references = []
        offset = 0
        
        # Remove all refs first
        masked_text = self.ref_pattern.sub('', masked_text)
        
        # Sort links by position in reverse order to preserve positions
        links.sort(key=lambda x: x.start, reverse=True)
        
        for link in links:
            if random.random() < mask_probability:
                # Calculate positions accounting for previous changes
                start = link.start - offset
                end = link.end - offset
                
                # Original link text from the wiki markup
                original = masked_text[start:end]
                
                # Replace the link with just the display text
                display_text = link.display if link.display else link.target
                masked_text = masked_text[:start] + display_text + masked_text[end:]
                
                # Update offset
                offset += len(original) - len(display_text)
                
                # Add to references
                references.append({
                    'target': link.target,
                    'display': link.display,
                    'context': display_text
                })
        
        # Add references section if there are any masked links
        if references:
            masked_text += "\n\n<Ref>:\n"
            for ref in references:
                masked_text += f"Link to [[{ref['target']}]] from context: {ref['context']}\n"
        
        return masked_text, references

class WikiLinkTrainingDataCreator:
    """Creates training data for wiki link prediction."""
    
    def __init__(self, dump_path: str):
        self.extractor = WikiDumpExtractor(dump_path)
        self.processor = WikiTextProcessor()
        
    def create_training_example_from_page(self, title: str) -> Optional[Dict]:
        """Create a training example from a specific page."""
        page_data = self.extractor.extract_page(title=title)
        if not page_data:
            return None
            
        text = page_data['text']
        masked_text, references = self.processor.create_training_example(text)
        
        return {
            'title': page_data['title'],
            'original_text': text,
            'masked_text': masked_text,
            'references': references
        }
    
    def save_training_examples(self, titles: List[str], output_path: str):
        """Create and save training examples for multiple pages."""
        examples = []
        
        for title in titles:
            example = self.create_training_example_from_page(title)
            if example:
                examples.append(example)
                
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(examples, f, indent=2)

# Example usage:
if __name__ == "__main__":
    # Create training data
    creator = WikiLinkTrainingDataCreator("./simplewiki-latest-pages-articles.xml")
    
    # Example titles
    example_titles = [
        "Python (programming language)",
        "Computer programming",
        "Machine learning"
    ]
    
    # Create and save training examples
    creator.save_training_examples(example_titles, "wiki_link_training_data.json")
    
    # Example of processing a single piece of text
    processor = WikiTextProcessor()
    sample_text = """
    '''Python''' is an [[open source]] [[programming language]]. It was made to be 
    easy-to-read-and-understand and powerful. A [[Netherlands|Dutch]] programmer named 
    [[Guido van Rossum]] made Python in 1991.
    """
    
    masked_text, refs = processor.create_training_example(sample_text)
    print("\nOriginal text:")
    print(sample_text)
    print("\nMasked text:")
    print(masked_text)
    print("\nReferences:")
    print(json.dumps(refs, indent=2))


Original text:

    '''Python''' is an [[open source]] [[programming language]]. It was made to be 
    easy-to-read-and-understand and powerful. A [[Netherlands|Dutch]] programmer named 
    [[Guido van Rossum]] made Python in 1991.
    

Masked text:

    '''Python''' isopen sourcece]] programming language. It was made to be 
    easy-to-read-and-understand and powerful. A [[Netherlands|Dutch]] programmer named 
    [[Guido van Rossum]] made Python in 1991.
    

<Ref>:
Link to [[programming language]] from context: programming language
Link to [[open source]] from context: open source


References:
[
  {
    "target": "programming language",
    "display": "programming language",
    "context": "programming language"
  },
  {
    "target": "open source",
    "display": "open source",
    "context": "open source"
  }
]


In [5]:
print(page_data['text'])

{{About|the [[programming language]]|the [[snake]]|python}}
{{Infobox programming language
| logo = Python-logo-notext.svg
| logo size = 121px
| paradigm = [[Multi-paradigm programming language|Multi-paradigm]]: [[object-oriented programming|object-oriented]],<ref>{{Cite web|title=General Python FAQ — Python 3.9.2 documentation|url=https://docs.python.org/3/faq/general.html#what-is-python|access-date=2021-03-28|website=docs.python.org|archive-date=24 October 2012|archive-url=https://web.archive.org/web/20121024164224/http://docs.python.org/faq/general.html#what-is-python|url-status=live}}</ref> [[procedural programming|procedural]] ([[imperative programming|imperative]]), [[functional programming|functional]], [[structured programming|structured]], [[reflective programming|reflective]]
| released = {{start date and age|1991|02|20|df=y}}<ref name="alt-sources-history">{{cite web |url=https://www.tuhs.org/Usenet/alt.sources/1991-February/001749.html |title=Python 0.9.1 part 01/21 |publis