In [1]:
from tqdm import tqdm
import requests
import pandas as pd
import re
from gensim.corpora.wikicorpus import extract_pages,filter_wiki
import os
from bs4 import BeautifulSoup as bs
import json

In [2]:
class WikipediaSpider:
    """Helper around the English Wikipedia API: search, page download and category crawl.

    Every request goes through :meth:`_api_get`. The only instance state is
    ``deep_search_collect_title_count``, a running page counter maintained by
    :meth:`get_category_deep`.
    """

    API_URL = "https://en.wikipedia.org/w/api.php"
    REQUEST_TIMEOUT = 30      # seconds before a stalled request is abandoned
    SEARCH_PAGE_SIZE = 500    # max search hits per request for regular users
    TITLES_PER_REQUEST = 50   # max titles sent with one content query
    PAGES_PER_FILE = 5000     # titles requested before a CSV part is written

    ##########################################  shared helpers ##########################################
    def _api_get(self, params):
        """Send one GET request to the API and return the decoded JSON body.

        Raises whatever ``requests`` raises on timeout, and an HTTP error for
        a non-success status, instead of failing later on a malformed body.
        """
        response = requests.get(self.API_URL, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _folder_name(title):
        """Turn a category title into the folder name used by the deep search."""
        return title.replace(' ', '_').replace(':', '-')

    @staticmethod
    def _int_keys_inplace(depth_dict):
        """Convert the keys of ``depth_dict`` to ``int`` in place.

        A dict that went through ``json.dump``/``json.load`` comes back with
        string keys ("0", "1", ...), which would break the depth arithmetic.
        """
        for key in list(depth_dict):
            if not isinstance(key, int):
                depth_dict[int(key)] = depth_dict.pop(key)

    ##########################################  search funtion ##########################################
    # search funtion, for each query, it will get 10000 results at most
    def search_wikipedia(self, query, max_results=9999999999):  # actual max limit is 10000
        """Run a full-text search and return up to ``max_results`` hits.

        Args:
            query: search string sent as ``srsearch``.
            max_results: upper bound on the number of hits returned.

        Returns:
            A list of ``(title_with_underscores, page_id, url)`` tuples.
        """
        results = []
        offset = 0

        while len(results) < max_results:
            # print how many results we have downloaded so far and refresh the output
            print(f"Found {len(results)} results of {query}", end="\r")
            params = {
                "action": "query",
                "list": "search",
                "srsearch": query,
                # never ask for more hits than are still wanted
                "srlimit": min(self.SEARCH_PAGE_SIZE, max_results - len(results)),
                "sroffset": offset,
                "utf8": "1",
                "format": "json",
            }
            data = self._api_get(params)
            if "error" in data:
                # Keep what was collected so far instead of dying on a KeyError.
                print(f"Wikipedia API error for {query!r}: {data['error']}")
                break

            hits = data.get("query", {}).get("search", [])
            if not hits:
                break
            for hit in hits:
                title = hit["title"].replace(' ', '_')
                results.append((title, hit["pageid"], f"https://en.wikipedia.org/wiki/{title}"))

            if "continue" not in data:
                break
            offset = data["continue"]["sroffset"]

        return results[:max_results]

    # accept a list of queries and return a list of results
    def search_multi_wikipedia(self, queries, limit=10):
        """Search every query and concatenate the hits (at most ``limit`` per query)."""
        res = []
        for query in tqdm(queries):
            res.extend(self.search_wikipedia(query, limit))
        return res

    def get_wikipedia_page_content(self, title):
        """Return the wikitext of one page, or ``None`` when it cannot be read.

        ``None`` is returned when the page is flagged ``missing`` or when the
        response carries no revision text for it.
        """
        data = self._api_get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "json",
        })
        pages = data.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), None)
        # One request is enough: the same response says whether the page exists.
        if page is None or "missing" in page:
            return None
        revisions = page.get("revisions")
        if not revisions:
            return None
        return revisions[0].get("*")

    # get multiple pages' content, max limit is 50
    def get_multi_wikipedia_pages_content(self, titles, verbose=True):
        """Download the wikitext of several pages.

        Args:
            titles: page titles; lists longer than ``TITLES_PER_REQUEST`` are
                split over several requests.
            verbose: accepted for compatibility; currently unused.

        Returns:
            A list of ``(title, content)`` tuples. Pages for which the response
            holds no revision text are skipped.
        """
        titles = list(titles)
        contents = []
        for start in range(0, len(titles), self.TITLES_PER_REQUEST):
            batch = titles[start:start + self.TITLES_PER_REQUEST]
            data = self._api_get({
                "action": "query",
                "titles": "|".join(batch),
                "prop": "revisions",
                "rvprop": "content",
                "format": "json",
            })
            for page in data.get("query", {}).get("pages", {}).values():
                revisions = page.get("revisions")
                if not revisions or "title" not in page:
                    continue
                content = revisions[0].get("*")
                if content is not None:
                    contents.append((page["title"], content))
        return contents

    # get hyper links from a page, not used
    def get_wikipedia_links(self, title, pllimit=100):
        """Return the titles (spaces replaced by ``_``) linked from one page."""
        data = self._api_get({
            "action": "query",
            "titles": title,
            "prop": "links",
            "pllimit": pllimit,
            "format": "json",
        })
        pages = data.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})
        return [link["title"].replace(" ", "_") for link in page.get("links", [])]

    # check if a page exists
    def check_wikipedia_page_exists(self, title):
        """Return ``False`` when the API flags the page as missing or returns no page."""
        data = self._api_get({
            "action": "query",
            "titles": title,
            "format": "json",
        })
        pages = data.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), None)
        return page is not None and "missing" not in page

    ##########################################  save funtion ##########################################
    # process page's content
    def wiki_replace(self, s):
        """Strip wiki markup from ``s`` and normalise its line breaks.

        The patterns are raw strings so that ``\\|``, ``\\s`` and ``\\*`` reach
        the regex engine without triggering invalid-escape warnings.
        """
        s = re.sub(r':*{\|[\s\S]*?\|}', '', s)                 # tables
        s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)      # galleries
        # rewrite {{a|b}} as [[a|b]] before handing the text to filter_wiki
        s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
        s = filter_wiki(s)
        s = re.sub(r"\* *\n|'{2,}", '', s)                     # empty bullets, quote runs
        s = re.sub(r'\n+', '\n', s)
        s = re.sub(r'\n[:;]|\n +', '\n', s)
        s = re.sub(r'\n==', '\n\n==', s)                       # blank line before headings
        # s = u'【' + d[0] + u'】\n' + s
        return s

    # get content and save to csv
    def get_content_and_save(self, title_lists, search_word, save_folder='./'):
        """Download the given pages and write them to numbered CSV parts.

        Files are named ``<search_word>_<n>.csv`` inside ``save_folder``; a new
        part is started after every ``PAGES_PER_FILE`` requested titles.
        """
        # makedirs also creates missing parent folders (e.g. './data/x')
        os.makedirs(save_folder, exist_ok=True)

        def part_path(number):
            return os.path.join(save_folder, search_word + '_' + str(number) + '.csv')

        contents = []
        requested = 0
        order = 1
        step = self.TITLES_PER_REQUEST
        for start in tqdm(range(0, len(title_lists), step)):
            contents += self.get_multi_wikipedia_pages_content(title_lists[start:start + step])
            requested += step
            if requested >= self.PAGES_PER_FILE:  # save every 5000 pages
                self._save_content(contents, part_path(order))
                contents = []
                requested = 0
                order += 1
        if contents:  # save the rest pages
            self._save_content(contents, part_path(order))

    def _save_content(self, contents, file_path):
        """Clean each page with :meth:`wiki_replace` and write ``title,content`` rows."""
        frame = pd.DataFrame(contents, columns=['title', 'content'])
        frame['content'] = frame['content'].apply(self.wiki_replace)
        frame.to_csv(file_path, index=False)

    ##########################################  category funtion ##########################################
    # sample website: https://en.wikipedia.org/wiki/Special:CategoryTree?target=Category%3AAnimals&mode=all&namespaces=
    def _get_category(self, category_title):
        """Fetch the direct members of one category.

        Returns:
            ``(category_titles, end_titles)``: sub-category titles with the
            ``Category:`` prefix removed, and the titles of all other members.
            Both lists are empty when the API returns an empty tree, so callers
            can always unpack the result.
        """
        data = self._api_get({
            "action": 'categorytree',
            "category": category_title,
            "format": "json",
            "options": json.dumps({
                "depth": 1,
                "mode": "all",
            })
        })
        html_content = data.get('categorytree', {}).get('*', '')
        category_titles = []
        end_titles = []
        if not html_content:
            return category_titles, end_titles

        prefix = 'Category:'
        soup = bs(html_content, 'html.parser')
        for anchor in soup.find_all('a', title=True):
            full_title = anchor['title']
            if full_title.startswith(prefix):
                # drop only the leading "Category:" of "Category:XXXX"
                category_titles.append(full_title[len(prefix):])
            else:
                end_titles.append(full_title)
        return category_titles, end_titles

########################################## broad search funtion ##########################################
    def _crawl_level(self, parent_titles, level, child_categories, pages):
        """Query every parent category and append its members to the two output lists."""
        bar = tqdm(parent_titles, desc='depth ' + str(level))
        for title in bar:
            category_titles, end_titles = self._get_category(title)
            child_categories.extend(category_titles)
            pages.extend(end_titles)
            bar.set_postfix_str(f" end_list depth{level}:{len(pages)}")

    def get_category_broad(self, category_title, depth):
        """Crawl a category tree level by level down to ``depth``.

        Returns:
            ``(search_list, end_list, combine_list)``: two dicts keyed by depth
            (sub-categories and non-category titles found at that depth) and
            the de-duplicated non-category titles of all depths.
        """
        search_list = {0: [category_title]}
        end_list = {0: []}
        for level in range(1, depth + 1):
            search_list[level] = []
            end_list[level] = []
            self._crawl_level(search_list[level - 1], level, search_list[level], end_list[level])
        combine_list = []
        for level in range(depth + 1):
            combine_list.extend(end_list[level])
        return search_list, end_list, self.clean_list(combine_list)

    def continue_search_category(self, search_dict, end_dict, depth):
        """Extend an earlier broad crawl down to ``depth``.

        Both dicts are updated in place (string depth keys, as produced by a
        JSON round trip, are converted to ``int`` first) and returned together
        with ``combine_list``, which holds only the titles found at the newly
        crawled depths.
        """
        self._int_keys_inplace(search_dict)
        self._int_keys_inplace(end_dict)
        pre_depth = max(search_dict.keys())
        for level in range(pre_depth + 1, depth + 1):
            search_dict[level] = []
            end_dict[level] = []
            self._crawl_level(search_dict[level - 1], level, search_dict[level], end_dict[level])
        combine_list = []
        for level in range(pre_depth + 1, depth + 1):
            combine_list.extend(end_dict[level])
        return search_dict, end_dict, self.clean_list(combine_list)

    def clean_list(self, combine_list):
        """De-duplicate titles and replace spaces with ``_`` (order is not preserved)."""
        return [title.replace(' ', '_') for title in set(combine_list)]

########################################## deep search (save) funtion ##########################################
    def get_category_deep(self, category_title, depth, current_depth=0, base_path='./', path_chain=''):
        """Crawl a category tree depth-first, saving each category's pages in its own folder.

        Args:
            category_title: category to process.
            depth: recursion stops once ``current_depth`` reaches this value.
            current_depth: depth of ``category_title`` (0 for the root call).
            base_path: folder that contains this category's folder.
            path_chain: printable chain of ancestor titles, used for progress output.
        """
        if current_depth >= depth:
            return
        # Reset the counter for a fresh crawl; also create it when the crawl is
        # resumed below the root on an instance that has never counted.
        if current_depth == 0 or not hasattr(self, 'deep_search_collect_title_count'):
            self.deep_search_collect_title_count = 0
        # The folder is needed at every depth, not only for the root call.
        current_path = os.path.join(base_path, self._folder_name(category_title))
        os.makedirs(current_path, exist_ok=True)

        new_path_chain = f"{path_chain} -> {category_title}" if path_chain else category_title
        print(f"total collect num {self.deep_search_collect_title_count}, Processing category chain: {new_path_chain}".ljust(500), end="\r")

        # One request yields both the sub-categories and the pages.
        category_titles, end_titles = self._get_category(category_title)
        self.deep_search_collect_title_count += len(end_titles)
        self._create_and_save(end_titles, current_path)

        # create a folder for every sub-category, even those beyond the depth limit
        for title in category_titles:
            os.makedirs(os.path.join(current_path, self._folder_name(title)), exist_ok=True)
        # Recurse into each subcategory
        for subcategory_title in category_titles:
            self.get_category_deep(subcategory_title, depth, current_depth + 1, current_path, new_path_chain)

    def _create_and_save(self, titles, folder_path):
        """Download ``titles`` and write them, unprocessed, to ``contents.csv`` in ``folder_path``."""
        contents = []
        for start in range(0, len(titles), self.TITLES_PER_REQUEST):
            batch = titles[start:start + self.TITLES_PER_REQUEST]
            contents += self.get_multi_wikipedia_pages_content(batch, verbose=False)
        # Save all contents in one CSV file
        if contents:
            csv_path = os.path.join(folder_path, 'contents.csv')
            pd.DataFrame(contents, columns=['title', 'content']).to_csv(csv_path, index=False)

    def continue_category_search(self, base_path, target_depth):
        """Resume a deep crawl from the folder tree under ``base_path``.

        Each folder shallower than ``target_depth`` is mapped back to a category
        title (``_`` -> space, ``-`` -> ``:``) and crawled again.
        NOTE(review): the mapping is lossy for titles that originally contained
        ``_`` or ``-`` -- confirm this is acceptable for the crawled categories.
        """
        for root, dirs, files in os.walk(base_path):
            # depth = number of path separators below base_path
            current_depth = root.count(os.sep) - base_path.count(os.sep)
            if current_depth < target_depth:
                category_title = os.path.basename(root).replace('_', ' ').replace('-', ':')
                print(category_title)
                self.get_category_deep(category_title, target_depth, current_depth, os.path.dirname(root))
# Shared spider instance used by the cells below.
ws = WikipediaSpider()

## Search by a given word list: for each word, fetch up to 10,000 pages

In [21]:
# Search terms, one download run per entry. Each term appears once: a repeated
# term would be searched and downloaded again, rewriting the same CSV files.
word_list = [
    'animals',
    'animal diseases',
    'animal anatomy',
    'animal cognition',
    'animal communication',
    'animal migration',
    'animal physiology',
    'animal sexuality',
    'animal welfare',
    'animal rights',
]

In [None]:
def save_at_select_path(word_list, prefix):
    """Search Wikipedia for every word and save the matching pages under ``data``.

    Args:
        word_list: search terms, processed one after another.
        prefix: text put in front of each term (joined with ``_``) to form the
            name passed to ``get_content_and_save``.
    """
    for keyword in word_list:
        hits = ws.search_wikipedia(keyword)
        # the first element of every hit is the page title
        titles = [hit[0] for hit in hits]
        ws.get_content_and_save(titles, prefix + '_' + keyword, save_folder='data')

## search according to category  
Sample category tree website: https://en.wikipedia.org/wiki/Special:CategoryTree?target=Category%3AAnimals&mode=all&namespaces=

`search_dict` stores the category titles, which may have child categories and pages.

`end_dict` stores the titles which are not categories.

In [None]:
# search first depth
# category_title is the root of the crawl; it is reused below to name the output folder.
category_title = 'Animals'
# Returns the per-depth category dict, the per-depth page dict and the de-duplicated page list.
search_dict, end_dict, combine_list = ws.get_category_broad(category_title,depth= 1)

In [None]:
# search 2 to 5 depth
# Each call crawls one more level and extends search_dict / end_dict.
# NOTE(review): continue_search_category builds combine_list only from the newly crawled
# depths, so after this loop combine_list holds the pages of depth 5 only; the pages of
# every depth remain available in end_dict.
for i in range(2,6):
    search_dict, end_dict, combine_list = ws.continue_search_category(search_dict, end_dict, i)

In [23]:
# if you want to continue search
# NOTE(review): combine_list is replaced by the pages found at depth 6 only.
search_dict, end_dict, combine_list = ws.continue_search_category(search_dict, end_dict, 6)

depth 6: 100%|██████████| 1387/1387 [06:27<00:00,  3.58it/s,  end_list depth6:18770]


In [24]:
# You can also crawl depths 1 to 6 with a single call, but the step-by-step approach above is recommended.
# search_dict, end_dict, combine_list = ws.get_category_broad('Animal health',6)

### save the dict and list

In [None]:
# Write the crawl state (category dict, page dict, page list) to JSON files.
# Note: JSON object keys are strings, so the integer depth keys are stored as text.
for json_path, payload in (
    ('category_dict.json', search_dict),
    ('category_end_dict.json', end_dict),
    ('category_list.json', combine_list),
):
    with open(json_path, 'w') as f:
        json.dump(payload, f)

### load the dict and list

In [None]:
# load the dict and list from json
# JSON stores the depth keys as strings ("0", "1", ...); convert them back to int so that
# continue_search_category can compute max(depth) + 1 on the reloaded dicts.
with open('category_dict.json','r') as f:
    search_dict = {int(depth): titles for depth, titles in json.load(f).items()}
with open('category_end_dict.json','r') as f:
    end_dict = {int(depth): titles for depth, titles in json.load(f).items()}
with open('category_list.json','r') as f:
    combine_list = json.load(f)

### search and save according to combine list

In [171]:
# Download every page in combine_list and write the cleaned text to CSV parts.
save_folder = './data/' + category_title + '_category'
# Create the nested folder up front so the save does not depend on './data' already existing.
os.makedirs(save_folder, exist_ok=True)
# The parameter is named search_word; passing word=... raises a TypeError.
ws.get_content_and_save(combine_list, search_word=category_title + '_category', save_folder=save_folder)

100%|██████████| 274/274 [02:54<00:00,  1.57it/s]
