In [10]:

import requests
from bs4 import BeautifulSoup
import difflib
import Levenshtein , re
import random
from tqdm import tqdm
import argparse
import multiprocessing
import pickle
from autocorrect import Speller
import pandas as pd
import nltk
import time

## WikiTypos class

In [23]:
class WikiTypos:
    """Collect candidate typo pairs from the two most recent revisions of random Wikipedia pages.

    For every sampled page the two first revision ids returned by the API are
    diffed, and every inserted/removed word pair that is purely alphabetic,
    longer than one character and within a Levenshtein distance of 2
    (case-insensitive) is recorded as a pair ``(inserted_word, removed_word)``.

    NOTE(review): with the API's default revision ordering the pairs in this
    notebook's sample output read as ``(misspelling, correction)`` -- confirm
    before relying on the orientation.

    ``args`` is any attribute container (e.g. ``argparse.Namespace``) providing:
    ``language``, ``output_dir``, ``pages_batch_size``, ``num_pages``,
    ``num_cores``, ``multiprocessing``, ``spell_check_data`` and, optionally,
    ``revisions_size``.
    """

    # Seconds to wait on the MediaWiki API before a request is abandoned;
    # without it a stalled connection would block a worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, args):
        self.language = args.language
        self.args = args

    def _api_get(self, params):
        """Send ``params`` to this language's MediaWiki API and return the decoded JSON."""
        url = 'https://' + self.language + '.wikipedia.org/w/api.php'
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        return response.json()

    # Function to get details about edits for a specific page
    def get_edit_details(self, page_title):
        """Return the revision ids of ``page_title`` in the order the API lists them.

        The number of revisions requested comes from ``args.revisions_size``
        (falling back to 250 when the attribute is absent). A response that
        carries no ``query``/``pages``/``revisions`` data yields an empty list
        instead of raising, so one bad page does not discard a whole batch.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': page_title,
            'prop': 'revisions',
            'rvprop': 'ids',
            'rvslots': 'main',
            'rvlimit': getattr(self.args, 'revisions_size', 250),
        }
        data = self._api_get(params)

        pages = data.get('query', {}).get('pages', {})
        return [
            revision['revid']
            for page_info in pages.values()
            for revision in page_info.get('revisions', [])
        ]

    # Function to compare revisions and display changes
    def display_edit_changes(self, revid1, revid2):
        """Diff two revisions and return the unique near-identical word pairs.

        Returns a list of ``(inserted_word, removed_word)`` tuples, in first-seen
        order, where both words are alphabetic, longer than one character and at
        most 2 edits apart when compared case-insensitively. Returns ``[]`` when
        the response has no ``compare`` section.
        """
        params = {
            'action': 'compare',
            'format': 'json',
            'fromrev': revid1,
            'torev': revid2,
        }
        data = self._api_get(params)

        ds_typo = []
        if 'compare' not in data:
            return ds_typo

        soup = BeautifulSoup(data['compare']['*'], 'html.parser')
        added_words = [tag.text for tag in soup.find_all('ins') if tag.text.strip()]
        removed_words = [tag.text for tag in soup.find_all('del') if tag.text.strip()]

        # Apply the cheap per-word filters once, so the edit distance is only
        # computed for candidates that can actually qualify.
        added = [(w, w.lower()) for w in added_words if len(w) > 1 and w.isalpha()]
        removed = [(rw, rw.lower()) for rw in removed_words if len(rw) > 1 and rw.isalpha()]

        seen = set()
        for w, w_lower in added:
            for rw, rw_lower in removed:
                # NOTE(review): distance 0 (identical or case-only difference)
                # is accepted, as in the original filter -- confirm intended.
                if Levenshtein.distance(w_lower, rw_lower) <= 2:
                    pair = (w, rw)
                    if pair not in seen:
                        seen.add(pair)
                        ds_typo.append(pair)

        return ds_typo

    # Function to find and display edits for a given number of random pages
    def get_random_edits(self, batch_size):
        """Sample ``batch_size`` random pages and return one non-empty pair list per page.

        Pages with fewer than two revisions, or whose diff yields no pairs,
        contribute nothing to the result.
        """
        all_typos = []
        for page in self.get_random_pages_list(batch_size):
            edit_details = self.get_edit_details(page['title'])
            if len(edit_details) < 2:
                continue
            ds_typo = self.display_edit_changes(edit_details[0], edit_details[1])
            if ds_typo:
                all_typos.append(ds_typo)
        return all_typos

    # Function to get a list of random Wikipedia pages
    def get_random_pages_list(self, batch_size):
        """Return the API's list of ``batch_size`` random main-namespace page records.

        Raises ``KeyError`` when the response lacks ``query``/``random``; the
        batch loop in ``multi_typo_function`` treats that as a failed batch.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'random',
            'rnnamespace': 0,  # Only fetch pages from the main namespace
            'rnlimit': batch_size,
        }
        data = self._api_get(params)
        return data['query']['random']

    def multi_typo_function(self, _):
        """Worker entry point: process ``num_pages`` pages in batches of ``pages_batch_size``.

        The argument is ignored; it exists only so the method can be used with
        ``Pool.map``. Returns one list per batch. A batch that raises is
        recorded as an empty list after a one-second pause.
        """
        num_pages = self.args.num_pages
        batch_size = self.args.pages_batch_size
        all_typos = []

        for _batch in tqdm(range(int(num_pages // batch_size))):
            try:
                atypos = self.get_random_edits(batch_size)
            except Exception:
                # Best effort: skip the failed batch, but let KeyboardInterrupt
                # and SystemExit propagate so the run can still be stopped.
                time.sleep(1)
                atypos = []
            all_typos.append(atypos)
        return all_typos

    def parallel_execution(self):
        """Run ``multi_typo_function`` once per worker and return the per-worker results.

        Sets ``self.num_cores`` to ``cpu_count()`` when ``args.num_cores`` is
        ``'max'``, to ``args.num_cores`` otherwise, or to 1 when
        ``args.multiprocessing`` is falsy. Each worker handles
        ``args.num_pages`` pages.
        """
        if self.args.multiprocessing:
            if self.args.num_cores == 'max':
                self.num_cores = multiprocessing.cpu_count()
            else:
                self.num_cores = self.args.num_cores
        else:
            self.num_cores = 1
        inputs = [None] * self.num_cores

        print(f'Total number of pages: {self.num_cores*self.args.num_pages}')
        # Create a Pool of workers
        with multiprocessing.Pool(self.num_cores) as pool:
            results = pool.map(self.multi_typo_function, inputs)

        return results

    def process_output(self, outputs):
        """Flatten the nested worker results.

        ``outputs`` is nested as worker -> batch -> page -> pair. Returns
        ``(unique_pairs, all_pairs)``: the first keeps only the first occurrence
        of each pair (order preserved), the second keeps every occurrence.
        """
        sorted_typos = []
        all_outputs = []
        seen = set()
        for core in outputs:
            for lists in core:
                for sublists in lists:
                    for pair in sublists:
                        all_outputs.append(pair)
                        key = tuple(pair)  # hashable key for constant-time membership
                        if key not in seen:
                            seen.add(key)
                            sorted_typos.append(pair)
        return sorted_typos, all_outputs

    def save_output(self, data):
        """Pickle ``data`` to ``<output_dir>wiki_typos_<lang>_p<num_pages>_c<num_cores>.pkl``.

        ``args.output_dir`` is prepended verbatim, so it must be empty or end
        with a path separator. Requires ``parallel_execution`` to have run,
        because that is where ``self.num_cores`` is set.
        """
        name = "wiki_typos_" + self.language + "_p" + str(self.args.num_pages) + "_c" + str(self.num_cores)
        with open(self.args.output_dir + name + '.pkl', 'wb') as handle:
            pickle.dump(data, handle)

    #supports English, Polish, Turkish, Russian, Ukrainian, Czech, Portuguese, Greek, Italian, Vietnamese, French and Spanish
    def spell_check(self, output):
        """Keep only the pairs whose first word is altered by the ``autocorrect`` speller."""
        check = Speller(lang=self.language)
        return [pair for pair in output if check(pair[0]) != pair[0]]

    def run(self):
        """Execute the collection and store the results on the instance.

        Sets ``self.output`` (unique pairs) and ``self.all_outputs`` (every
        pair); additionally sets ``self.output_spelled_check`` when
        ``args.spell_check_data`` is ``True``.
        """
        output_results = self.parallel_execution()

        self.output, self.all_outputs = self.process_output(output_results)
        if self.args.spell_check_data == True:
            self.output_spelled_check = self.spell_check(self.output)
    
        

In [24]:
# Shared run configuration; each key becomes an attribute of the argparse.Namespace
# handed to WikiTypos.
args_dict = {
    "language": "hi",
    "output_dir": "",            # prefix of the path the data is saved to
    "revisions_size": 250,       # valid range (1-500)
    "pages_batch_size": 100,     # valid range (1-500)
    "num_pages": 40000,          # pages per CPU core; should be divisible by pages_batch_size
    "num_cores": 8,              # worker count; 'max' uses every available CPU
    "multiprocessing": True,
    # Spell check supports English, Polish, Turkish, Russian, Ukrainian, Czech,
    # Portuguese, Greek, Italian, Vietnamese, French and Spanish.
    "spell_check_data": False,
}

In [176]:
# Pin the language explicitly: every collector cell shares args_dict, so without
# this override the instance would target whatever language args_dict holds.
argsen = argparse.Namespace(**{**args_dict, "language": "en"})
wten = WikiTypos(argsen)

In [162]:
# Pin the language explicitly: every collector cell shares args_dict, so without
# this override the instance would target whatever language args_dict holds.
argsde = argparse.Namespace(**{**args_dict, "language": "de"})
wtde = WikiTypos(argsde)

In [160]:
# Pin the language explicitly: every collector cell shares args_dict, so without
# this override the instance would target whatever language args_dict holds.
argses = argparse.Namespace(**{**args_dict, "language": "es"})
wtes = WikiTypos(argses)

In [172]:
# Pin the language explicitly: every collector cell shares args_dict, so without
# this override the instance would target whatever language args_dict holds.
argstr = argparse.Namespace(**{**args_dict, "language": "tr"})
wttr = WikiTypos(argstr)

In [25]:
# Pin the language explicitly so this instance stays Hindi even if the shared
# args_dict is later edited for another language.
argshi = argparse.Namespace(**{**args_dict, "language": "hi"})
wthi = WikiTypos(argshi)

In [327]:
# Pin the language explicitly: every collector cell shares args_dict, so without
# this override the instance would target whatever language args_dict holds.
argsfr = argparse.Namespace(**{**args_dict, "language": "fr"})
wtfr = WikiTypos(argsfr)

In [26]:
# Start the Hindi collection. run() fans out over a multiprocessing.Pool, and the
# progress bars recorded below show each worker taking roughly 4 h 10 min.
if __name__ == "__main__":

    wthi.run()


Total number of pages: 320000


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [4:10:11<00:00, 37.53s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [4:10:13<00:00, 37.53s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [4:10:16<00:00, 37.54s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [4:10:23<00:00, 37.56s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [4:10:25<00:00, 37.56s/it]
100%|██████████████████████████████████████████████████████████████████████

In [28]:
# Number of de-duplicated pairs produced by the run above.
len(wthi.output)

3277

In [29]:
#### Saving the collection
# Persist the de-duplicated Hindi pairs; the 'Noise Dics' directory must already exist.
with open('Noise Dics/wiki_typos_hi_p320000_l3277.pkl', 'wb') as pkl_file:
    pickle.dump(wthi.output, pkl_file)

In [39]:
# NOTE(review): `wt` is not created by any cell shown in this notebook (only
# wten/wtde/wtes/wttr/wthi/wtfr are) -- presumably a leftover kernel variable
# from an earlier run; confirm which instance is meant.
# `output_spelled_check` is only set by run() when args.spell_check_data is True.
wt.output_spelled_check
 

[('Uyezds', 'Uezds'),
 ('Bridgwood', 'Bridgewood'),
 ('itse', 'its'),
 ('Muhammmad', 'Muhammad'),
 ('kalancu', 'kalanju'),
 ('Sendin', 'Sendín'),
 ('afffecting', 'affecting'),
 ('ptofile', 'profile'),
 ('Efficent', 'Efficient'),
 ('Kākaramea', 'Kakaramea'),
 ('Ogier', 'Oger'),
 ('Commandeurs', 'Commanders'),
 ('ealy', 'early'),
 ('Societ', 'Society'),
 ('Apiti', 'Āpiti'),
 ('Yuichi', 'Yūichi'),
 ('boen', 'born'),
 ('metting', 'meeting'),
 ('Fench', 'French'),
 ('Byeon', 'Byeong'),
 ('satellote', 'satellite'),
 ('Staring', 'Starting'),
 ('GRT', 'GT'),
 ('Comuter', 'Computer'),
 ('uninhibited', 'uninhabited'),
 ('primarlly', 'primarily'),
 ('Zybin', 'Zõbin'),
 ('deactivat', 'deactivate'),
 ('FC', 'FK'),
 ('divison', 'division'),
 ('Kalkar', 'Kallar'),
 ('paarty', 'party'),
 ('Golkeeper', 'Goalkeeper'),
 ('ΕDIK', 'EDIK'),
 ('batalion', 'battalion'),
 ('Citroen', 'Citroën'),
 ('platers', 'players'),
 ('satelite', 'satellite'),
 ('Fusées', 'Fusée'),
 ('Pearce', 'Perce'),
 ('nonproft', 'nonp

### Some analysis of the collected typo pairs

In [151]:
from collections import Counter

# Tally how often each second element of a pair occurs across the collection.
# NOTE(review): `wt` is not defined in the cells shown here -- confirm the instance.
correct_words = [pair[1] for pair in wt.output]
word_counts = Counter(correct_words)

# Report only the words that occur more than once.
for word, count in word_counts.items():
    if count <= 1:
        continue
    print(f"{word}: {count} times")



https: 2 times
links: 2 times
In: 12 times
it: 9 times
It: 12 times
organisation: 3 times
description: 2 times
of: 16 times
la: 3 times
SS: 4 times
San: 3 times
MF: 2 times
FC: 8 times
helicopter: 2 times
on: 19 times
band: 3 times
and: 18 times
the: 22 times
is: 22 times
The: 17 times
to: 18 times
in: 27 times
She: 4 times
at: 14 times
Muhammad: 3 times
positive: 2 times
de: 14 times
com: 2 times
or: 8 times
Use: 2 times
muscle: 2 times
Australia: 4 times
pattern: 2 times
Films: 2 times
des: 2 times
served: 2 times
Frederick: 2 times
was: 16 times
them: 3 times
se: 2 times
ne: 3 times
sw: 2 times
Massachusetts: 2 times
References: 4 times
an: 18 times
if: 5 times
Lanka: 3 times
id: 5 times
he: 13 times
July: 2 times
as: 8 times
academy: 2 times
Insurance: 2 times
External: 3 times
tie: 2 times
US: 6 times
Sports: 2 times
Jan: 2 times
player: 3 times
Awards: 2 times
released: 3 times
loss: 2 times
Jun: 3 times
May: 10 times
Mar: 2 times
Jul: 2 times
two: 3 times
no: 6 times
by: 7 times

In [152]:
# Gather every pair whose second element is exactly 'not'.
result_tuples = list(filter(lambda pair: pair[1] == 'not', wt.output))

# Show the matching pairs one per line.
for result_tuple in result_tuples:
    print(result_tuple)

('no', 'not')
('now', 'not')
('No', 'not')
('note', 'not')
('nog', 'not')
('tot', 'not')
