In [1]:
import requests
import pandas as pd
# One HTTP session object shared by every request made in this notebook.
S = requests.Session()

# MediaWiki Action API endpoint that all Wikipedia queries are sent to.
URL = "https://en.wikipedia.org/w/api.php"

# Query parameters common to every "list the members of a category" request.
COMMON_PARAMS = dict(
    action="query",
    format="json",
    list="categorymembers",
    cmlimit=500,
)

# Titles of the pages that have already been handled.
PAGES_LIST = list()

# Scraped page records that are waiting to be written out, keyed by title.
PAGES_DICT = dict()

# True until the first CSV write of this session has taken place.
FIRST_RUN = True

# Titles of the categories that have already been visited.
CATEGORIES_LIST = list()

In [2]:
def scrape_page_text(title, timeout=30):
    """Return the plain-text extract of a single wiki page.

    The page is requested from the API at ``URL`` through the shared session
    ``S`` using ``prop=extracts`` with ``explaintext``.

    Parameters
    ----------
    title : str
        Title of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole crawl indefinitely.

    Returns
    -------
    str
        The text of the page as returned in the ``extract`` field.

    Raises
    ------
    requests.RequestException
        On a network failure, a timeout, or an HTTP error status.
    KeyError
        If the response carries no extract for ``title`` (for example because
        the page does not exist).
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": title,
    }
    response = S.get(url=URL, params=params, timeout=timeout)
    # Surface 4xx/5xx answers as HTTP errors instead of as confusing JSON or
    # key errors further down.
    response.raise_for_status()
    pages = response.json()['query']['pages']
    # A single title was requested, so only the first entry is of interest.
    page = next(iter(pages.values()), {})
    if 'extract' not in page:
        raise KeyError("no text extract returned for page %r" % (title,))
    return page['extract']

In [3]:
# Smoke test: fetch the plain-text extract of a single article.
scrape_page_text("Ain Sakhri Lovers")

'The Ain Sakhri Lovers figurine is a sculpture that was found in one of the Ain Sakhri caves near Bethlehem. The sculpture is considered to be 11,000 years old and to be the oldest known representation of two people engaged in sexual intercourse.\n\n\n== Discovery ==\nThe sculpture was identified in 1933 by René Neuville, a French consul in Jerusalem and prehistorian, when looking through random finds obtained by the French Fathers at Bethlehem. He found the stone whilst visiting a small museum with Abbé Breuil. Neuville immediately identified it as important and was able to get an introduction to the Bedouin who had made the finds at Wadi Khareitoun. He was led to a location within the Ain Sakhri caves and it is from these caves that the sculpture gets its name. Excavations of the caves revealed that the cave had been used domestically thousands of years ago and the finds were Natufian. For this reason it is thought that the figurine was used domestically and had not been left there a

In [4]:
def scrape_page_wikidata(title, timeout=30):
    """Look up the Wikidata statements attached to a wiki page.

    Two requests are made through the shared session ``S``: the first asks
    the API at ``URL`` for the page's ``wikibase_item`` page property, the
    second sends a SPARQL query for that entity to the Wikidata query
    endpoint.

    Parameters
    ----------
    title : str
        Title of the page to look up.
    timeout : float, optional
        Seconds to wait for each of the two requests.

    Returns
    -------
    dict
        Maps property label to value label, e.g. ``{'country': 'Iraq'}``.
        When a property has several statements only the last one returned by
        the query is kept. An empty dict is returned when the lookup fails
        for any reason (no linked entity, network error, unexpected payload);
        this function is deliberately best-effort.
    """
    try:
        R = S.get(
            url=URL,
            params={
                "prop": "pageprops",
                "ppprop": "wikibase_item",
                "format": "json",
                "action": "query",
                "titles": title,
            },
            timeout=timeout,
        )
        R.raise_for_status()
        pages = R.json()['query']['pages']
        page_id = next(iter(pages))
        entity = pages[page_id]["pageprops"]["wikibase_item"]

        query = "\n".join([
            'SELECT ?wdLabel ?ps_Label ?wdpqLabel ?pq_Label {',
            '  VALUES (?company) {(wd:' + entity + ')}',
            '  ?company ?p ?statement .',
            '  ?statement ?ps ?ps_ .',
            '  ?wd wikibase:claim ?p.',
            '  ?wd wikibase:statementProperty ?ps.',
            '  OPTIONAL {',
            '    ?statement ?pq ?pq_ .',
            '    ?wdpq wikibase:qualifier ?pq .',
            '  }',
            '  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }',
            '}',
            'ORDER BY ?wd ?statement ?ps_',
        ])
        R = S.get(
            url="https://query.wikidata.org/sparql",
            params={"format": "json", "query": query},
            timeout=timeout,
        )
        R.raise_for_status()

        results = {}
        for binding in R.json()['results']['bindings']:
            results[binding['wdLabel']['value']] = binding['ps_Label']['value']
        return results
    except Exception:
        # Best-effort lookup: any ordinary failure yields "no metadata".
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit through, so a long crawl can still
        # be stopped while it is inside this function.
        return {}

In [5]:
# Smoke test: look up the Wikidata statements linked to a single article.
scrape_page_wikidata("Bassetki Statue")

{'country': 'Iraq',
 'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/Bassetki%20statue.jpg',
 'material used': 'copper',
 'location of discovery': 'Bassetki',
 'instance of': 'cultural property',
 'mass': '150',
 'CDLI ID': 'P216558',
 'BabelNet ID': '02706642n',
 'location': 'National Museum of Iraq',
 'inception': '-2350-01-01T00:00:00Z',
 'Freebase ID': '/m/0h943gk'}

In [6]:
def rec_scrape_pages(category_name, cmcontinue = False):
    """Scrape every page that is a direct member of a category.

    For each member page the text and the Wikidata statements are fetched
    and a record is stored in the module-level ``PAGES_DICT`` under the page
    title. Pages whose title is already in ``PAGES_LIST`` or ``PAGES_DICT``
    are skipped. A page whose scrape raises is reported with ``print`` and
    skipped, so one bad page does not abort the category.

    Parameters
    ----------
    category_name : str
        Category title including the ``Category:`` prefix.
    cmcontinue : str or False, optional
        Continuation token to resume a listing from; ``False`` starts at the
        first batch.

    Returns
    -------
    None
    """
    while True:
        # Work on a copy: the shared COMMON_PARAMS dict must not pick up the
        # per-request keys set here.
        params = dict(COMMON_PARAMS, cmtype="page", cmtitle=category_name)
        if cmcontinue:
            # Without sending the token the API would answer with the same
            # first batch again and the listing would never terminate.
            params["cmcontinue"] = cmcontinue

        # 30 s timeout so a stalled connection cannot hang the crawl.
        R = S.get(url=URL, params=params, timeout=30)
        R.raise_for_status()
        articles = R.json()

        for page in articles['query']['categorymembers']:
            title = page['title']
            # Both containers hold titles, so compare the title (not the
            # whole member dict) to detect already-scraped pages.
            if title in PAGES_LIST or title in PAGES_DICT:
                continue
            try:
                text = scrape_page_text(title)
                wikidata = scrape_page_wikidata(title)
                PAGES_DICT[title] = {
                    "category": category_name,
                    "title": title,
                    "text": text,
                    "location": wikidata.get('location', ''),
                    "inception": wikidata.get('inception', ''),
                    "image": wikidata.get('image', ''),
                    "discovery": wikidata.get('location of discovery', ''),
                    "country": wikidata.get('country', '')
                }
            except Exception as e:
                print("An exception occurred at page ", title)
                print(e)

        # Take the token from THIS response only; a stale token from the
        # previous batch must not keep the loop alive.
        cmcontinue = articles.get('continue', {}).get('cmcontinue')
        if not cmcontinue:
            break

In [7]:
# Scrape the direct page members of one category into PAGES_DICT.
rec_scrape_pages("Category:Archaeological_discoveries_in_Asia")

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 43542390, 'ns': 0, 'title': 'List of hoards in Asia'}, {'pageid': 48773752, 'ns': 0, 'title': 'Magdala stone'}, {'pageid': 44716989, 'ns': 0, 'title': 'Prehistoric grave goods in the Philippines'}, {'pageid': 52197343, 'ns': 0, 'title': 'Tillya Tepe Buddhist coin'}]}}


In [1]:
def _flush_pages_to_csv(path="scraped.csv"):
    """Write the records buffered in PAGES_DICT to the CSV and empty the buffer."""
    global FIRST_RUN

    if not PAGES_DICT:
        return

    new = pd.DataFrame.from_dict(PAGES_DICT, orient='index')
    if FIRST_RUN:
        # The first write of a session starts the file from scratch.
        df = new
    else:
        # index_col=0 reads the saved titles back as the index; without it
        # every save would add another "Unnamed: 0" column to the file.
        old = pd.read_csv(path, index_col=0)
        df = pd.concat([old, new])
    df.to_csv(path)

    # Update the bookkeeping only after the write has succeeded.
    FIRST_RUN = False
    PAGES_LIST.extend(PAGES_DICT.keys())
    PAGES_DICT.clear()


def rec_scrape_cat(category_name, cmcontinue = False):
    """Walk the subcategory tree below a category and scrape its pages.

    For every subcategory that has not been visited yet (tracked in
    ``CATEGORIES_LIST``) the member pages are scraped with
    ``rec_scrape_pages`` and the subcategory is then walked recursively.
    Scraped records are appended to ``scraped.csv`` as the walk proceeds and
    once more before returning, so nothing is left unsaved in ``PAGES_DICT``.

    Note that the pages that are direct members of ``category_name`` itself
    are not scraped here; only those of its subcategories are.

    Parameters
    ----------
    category_name : str
        Category title including the ``Category:`` prefix.
    cmcontinue : str or False, optional
        Continuation token to resume the subcategory listing from.

    Returns
    -------
    None
    """
    while True:
        # Copy the shared parameters instead of mutating COMMON_PARAMS, and
        # pass the title as a plain string.
        params = dict(COMMON_PARAMS, cmtitle=category_name, cmtype="subcat")
        if cmcontinue:
            # The token has to be sent back, otherwise the same first batch
            # is returned forever.
            params["cmcontinue"] = cmcontinue

        # 30 s timeout so a stalled connection cannot hang the crawl.
        R = S.get(url=URL, params=params, timeout=30)
        R.raise_for_status()
        subcategories = R.json()

        for subcategory in subcategories['query']['categorymembers']:
            print("Saving, category processed ", category_name)
            _flush_pages_to_csv()
            print("Pages processed ", len(PAGES_LIST))

            title = subcategory['title']
            if title in CATEGORIES_LIST:
                continue
            CATEGORIES_LIST.append(title)
            try:
                rec_scrape_pages(title)
                rec_scrape_cat(title)
            except Exception as e:
                # Keep crawling the siblings, but say what went wrong.
                # KeyboardInterrupt is not an Exception, so Ctrl-C still
                # stops the crawl.
                print("An exception occurred at category ", title)
                print(e)

        cmcontinue = subcategories.get('continue', {}).get('cmcontinue')
        if not cmcontinue:
            break

    # Persist whatever the last subcategory left in the buffer.
    _flush_pages_to_csv()

In [None]:
# Start the crawl at the root category; this issues many network requests.
rec_scrape_cat("Category:Archaeological artifacts")

Saving, categories processed 0
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 584894, 'ns': 0, 'title': 'Artifact (archaeology)'}, {'pageid': 113276, 'ns': 0, 'title': 'Adze'}, {'pageid': 2429941, 'ns': 0, 'title': 'Andiron'}, {'pageid': 4001002, 'ns': 0, 'title': 'Ard (plough)'}, {'pageid': 51513, 'ns': 0, 'title': 'Arrow'}, {'pageid': 642882, 'ns': 0, 'title': 'Assemblage (archaeology)'}, {'pageid': 1923399, 'ns': 0, 'title': 'Astragalomancy'}, {'pageid': 21808488, 'ns': 0, 'title': 'Axe of Perun'}, {'pageid': 8486816, 'ns': 0, 'title': 'Bannerstone'}, {'pageid': 1326668, 'ns': 0, 'title': 'Bâton de commandement'}, {'pageid': 1925449, 'ns': 0, 'title': 'Beaker (archaeology)'}, {'pageid': 1776719, 'ns': 0, 'title': 'Bi (jade)'}, {'pageid': 677277, 'ns': 0, 'title': 'Blade (archaeology)'}, {'pageid': 6532888, 'ns': 0, 'title': 'Bog butter'}, {'pageid': 3443698, 'ns': 0, 'title': 'Bone tool'}, {'pageid': 1491240, 'ns': 0, 'title': 'Bout-coupé'}, {'pageid': 2073328, 'ns':

Saving, categories processed 0
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 1076007, 'ns': 0, 'title': 'Pottery of ancient Greece'}, {'pageid': 55218815, 'ns': 0, 'title': 'Ancient Greek funerary vases'}, {'pageid': 55140605, 'ns': 0, 'title': 'Arezzo 1465 vase'}, {'pageid': 53597975, 'ns': 0, 'title': 'Conservation and restoration of ancient Greek pottery'}, {'pageid': 6438977, 'ns': 0, 'title': 'Corpus vasorum antiquorum'}, {'pageid': 42212177, 'ns': 0, 'title': 'Cypriot Bichrome ware'}, {'pageid': 2881974, 'ns': 0, 'title': 'Demaratus of Corinth'}, {'pageid': 55209498, 'ns': 0, 'title': 'Drinking cup (kylix) depicting athletic combats by Onesimos'}, {'pageid': 55270051, 'ns': 0, 'title': 'Gigantomachy by the Suessula Painter'}, {'pageid': 51928965, 'ns': 10, 'title': 'Template:Greek vase painting'}, {'pageid': 8238007, 'ns': 0, 'title': 'Kalos inscription'}, {'pageid': 20156425, 'ns': 0, 'title': 'Kamares ware'}, {'pageid': 55149055, 'ns': 0, 'title': 'Kerameikos s

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 55375021, 'ns': 0, 'title': 'Athenian Band Cup by the Oakeshott Painter (MET 17.230.5)'}, {'pageid': 51947928, 'ns': 0, 'title': 'Attic vase painting'}, {'pageid': 33468207, 'ns': 0, 'title': 'Band cup'}, {'pageid': 33489739, 'ns': 0, 'title': 'Band skyphos'}, {'pageid': 33495321, 'ns': 0, 'title': 'Bilingual vase painting'}, {'pageid': 6478905, 'ns': 0, 'title': 'François Vase'}, {'pageid': 16933614, 'ns': 0, 'title': 'Ninnion Tablet'}, {'pageid': 17887923, 'ns': 0, 'title': 'Proto-Attic vase painting'}]}}
Saving, categories processed 4
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 6930447, 'ns': 0, 'title': 'Greek terracotta figurines'}, {'pageid': 36946842, 'ns': 0, 'title': 'Coroplast (artisan)'}, {'pageid': 50387961, 'ns': 0, 'title': 'Polyphemos reclining and holding a drinking bowl'}, {'pageid': 3321866, 'ns': 0, 'title': 'Psi and phi type figurine'}, {'pageid': 47666568, 'ns': 0, 'title': 'Standing 

Saving, categories processed 1
Saving, categories processed 1
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 21239195, 'ns': 0, 'title': 'Ancient Roman pottery'}, {'pageid': 20940977, 'ns': 0, 'title': 'African red slip ware'}, {'pageid': 27342056, 'ns': 0, 'title': 'Ampulla'}, {'pageid': 2697812, 'ns': 0, 'title': 'Antefix'}, {'pageid': 3557520, 'ns': 0, 'title': 'Barbotine'}, {'pageid': 24414301, 'ns': 0, 'title': 'Campana reliefs'}, {'pageid': 16609679, 'ns': 0, 'title': 'Giampietro Campana'}, {'pageid': 6438977, 'ns': 0, 'title': 'Corpus vasorum antiquorum'}, {'pageid': 646278, 'ns': 0, 'title': 'Hans Dragendorff'}, {'pageid': 1160232, 'ns': 0, 'title': 'Duenos inscription'}, {'pageid': 30506791, 'ns': 0, 'title': 'Eastern sigillata A'}, {'pageid': 30506803, 'ns': 0, 'title': 'Eastern sigillata B'}, {'pageid': 30515923, 'ns': 0, 'title': 'Eastern sigillata C'}, {'pageid': 30760402, 'ns': 0, 'title': 'Eastern sigillata D'}, {'pageid': 5657659, 'ns': 0, 'title': 'Egyp

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 733355, 'ns': 0, 'title': 'Amarna letters'}, {'pageid': 10252434, 'ns': 0, 'title': 'Amarna letters–localities and their rulers'}, {'pageid': 58411713, 'ns': 0, 'title': 'List of Amarna letters by size'}, {'pageid': 58894354, 'ns': 0, 'title': 'Glossenkeil (Amarna letters)'}, {'pageid': 57850373, 'ns': 0, 'title': 'A Goddess Travels to Egypt'}, {'pageid': 58232252, 'ns': 0, 'title': 'A brotherly quarrel'}, {'pageid': 58260326, 'ns': 0, 'title': 'A Commissioner Murdered'}, {'pageid': 59278532, 'ns': 0, 'title': 'A Hymn to the Pharaoh'}, {'pageid': 58659070, 'ns': 0, 'title': 'A Plea for Help'}, {'pageid': 58402209, 'ns': 0, 'title': 'A Reckoning Demanded'}, {'pageid': 58320527, 'ns': 0, 'title': 'A Royal Order for Glass'}, {'pageid': 58402139, 'ns': 0, 'title': 'A Throne Granted, Not Inherited'}, {'pageid': 58402160, 'ns': 0, 'title': 'A Very Serious Crime'}, {'pageid': 50338275, 'ns': 0, 'title': 'Abimilku letter 149: Neith

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 58232252, 'ns': 0, 'title': 'A brotherly quarrel'}, {'pageid': 47053915, 'ns': 0, 'title': 'Alashiya-King, to Pharaoh: The Hand of (god)-Nergal'}, {'pageid': 58586648, 'ns': 0, 'title': 'Amarna letter EA 34'}, {'pageid': 41512041, 'ns': 0, 'title': 'Amarna letter EA 35'}, {'pageid': 58231980, 'ns': 0, 'title': 'Amarna letter EA 38'}, {'pageid': 58260040, 'ns': 0, 'title': 'Amarna letter EA 39'}, {'pageid': 58260208, 'ns': 0, 'title': 'Duty-Free'}, {'pageid': 58586665, 'ns': 0, 'title': "The Pharaoh's reproach Answered"}]}}
Saving, categories processed 1
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 46970242, 'ns': 0, 'title': 'Amarna letter EA 15'}]}}
Saving, categories processed 2
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 57850373, 'ns': 0, 'title': 'A Goddess Travels to Egypt'}, {'pageid': 39990282, 'ns': 0, 'title': 'Amarna letter EA 19'}, {'pageid': 57850084, 'ns': 0, 'title': 'Amar

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 58658812, 'ns': 0, 'title': 'Amarna letter EA 299'}]}}
Saving, categories processed 10
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 47596562, 'ns': 0, 'title': 'Amarna letter EA 323'}, {'pageid': 46956809, 'ns': 0, 'title': 'Amarna letter EA 325'}]}}
Saving, categories processed 2
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 40688834, 'ns': 0, 'title': 'A (cuneiform)'}, {'pageid': 41289680, 'ns': 0, 'title': 'Ab (cuneiform)'}, {'pageid': 58627159, 'ns': 0, 'title': 'Am (cuneiform)'}, {'pageid': 46649881, 'ns': 0, 'title': 'An (cuneiform)'}, {'pageid': 41290635, 'ns': 0, 'title': 'Ap (cuneiform)'}, {'pageid': 58725355, 'ns': 0, 'title': 'Ar (cuneiform)'}, {'pageid': 46512072, 'ns': 0, 'title': 'ARAD (Sumerogram)'}, {'pageid': 40066881, 'ns': 0, 'title': 'As (cuneiform)'}, {'pageid': 40358940, 'ns': 0, 'title': 'Aš (cuneiform)'}, {'pageid': 58625298, 'ns': 0, 'title': 'Ba (cuneiform)'}, {'p

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 16043, 'ns': 0, 'title': 'Jerusalem'}, {'pageid': 724309, 'ns': 0, 'title': 'Names of Jerusalem'}, {'pageid': 60285050, 'ns': 0, 'title': 'Jerusalem Sports Quarter'}, {'pageid': 60720591, 'ns': 0, 'title': 'Jerusalem Volvo Open'}, {'pageid': 55582329, 'ns': 0, 'title': 'Jerusalem, du hochgebaute Stadt'}, {'pageid': 50079177, 'ns': 0, 'title': "L'Shana Haba'ah"}, {'pageid': 59772791, 'ns': 0, 'title': 'Mission of Saint Sergius of Jerusalem'}, {'pageid': 860235, 'ns': 0, 'title': 'New Jerusalem'}, {'pageid': 57616049, 'ns': 0, 'title': 'Paulus-Haus'}, {'pageid': 53136411, 'ns': 0, 'title': 'Reunification of Jerusalem'}, {'pageid': 37870611, 'ns': 0, 'title': 'Zivotofsky v. Clinton'}, {'pageid': 45597467, 'ns': 0, 'title': 'Zivotofsky v. Kerry'}]}}
Saving, categories processed 0
