In [212]:
import requests
import pandas as pd
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

COMMON_PARAMS = {
    "action":"query",
    "format":"json",
    "list":"categorymembers",
    "cmlimit": 500
}

PAGES_DICT = {
}

CATEGORIES_LIST = []

In [213]:
def scrape_page_text(title):
    PARAMS = {
        "action":"query",
        "format":"json",
        'prop': 'extracts',
        'explaintext': True,
        'titles': title
    }
    R = S.get(url=URL, params=PARAMS)
    article = R.json()
    pages = article['query']['pages']
    page_id = next(iter(pages))
    text = pages[page_id]['extract']
    return text

In [214]:
scrape_page_text("Ain Sakhri Lovers")

'The Ain Sakhri Lovers figurine is a sculpture that was found in one of the Ain Sakhri caves near Bethlehem. The sculpture is considered to be 11,000 years old and to be the oldest known representation of two people engaged in sexual intercourse.\n\n\n== Discovery ==\nThe sculpture was identified in 1933 by René Neuville, a French consul in Jerusalem and prehistorian, when looking through random finds obtained by the French Fathers at Bethlehem. He found the stone whilst visiting a small museum with Abbé Breuil. Neuville immediately identified it as important and was able to get an introduction to the Bedouin who had made the finds at Wadi Khareitoun. He was led to a location within the Ain Sakhri caves and it is from these caves that the sculpture gets its name. Excavations of the caves revealed that the cave had been used domestically thousands of years ago and the finds were Natufian. For this reason it is thought that the figurine was used domestically and had not been left there a

In [215]:
def scrape_page_wikidata(title):
    try:
        PARAMS = {
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "format": "json",
            "action": "query",
            "titles": title
        }

        R = S.get(url=URL, params=PARAMS)
        metadata = R.json()
        metadata = metadata['query']['pages']
        page_id = next(iter(metadata))
        entity = metadata[page_id]["pageprops"]["wikibase_item"]

        PARAMS = {
            "format": "json",
            "query": 
            '\
                SELECT ?wdLabel ?ps_Label ?wdpqLabel ?pq_Label {\
                  VALUES (?company) {(wd:' + entity + ')}\
                        ?company ?p ?statement .\
                        ?statement ?ps ?ps_ .\
                        ?wd wikibase:claim ?p.\
                        ?wd wikibase:statementProperty ?ps.\
                        OPTIONAL {\
                          ?statement ?pq ?pq_ .\
                          ?wdpq wikibase:qualifier ?pq .\
                        }\
                      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }\
                    }\
                ORDER BY ?wd ?statement ?ps_'
        }
        R = S.get(url="https://query.wikidata.org/sparql", params=PARAMS)
        wikidata_json = R.json()
        results = {}
        for obj in wikidata_json['results']['bindings']:
            key = obj['wdLabel']['value']
            val = obj['ps_Label']['value']
            results[key] = val
        return results
    except:
        return {}

In [216]:
scrape_page_wikidata("Bassetki Statue")

{'country': 'Iraq',
 'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/Bassetki%20statue.jpg',
 'material used': 'copper',
 'location of discovery': 'Bassetki',
 'instance of': 'cultural property',
 'mass': '150',
 'CDLI ID': 'P216558',
 'BabelNet ID': '02706642n',
 'location': 'National Museum of Iraq',
 'inception': '-2350-01-01T00:00:00Z',
 'Freebase ID': '/m/0h943gk'}

In [217]:
def rec_scrape_pages(category_name, cmcontinue = False):
    PARAMS = COMMON_PARAMS
    PARAMS["cmtype"] = "page"
    PARAMS["cmtitle"] = category_name
    R = S.get(url=URL, params=PARAMS)
    articles = R.json()
    if articles.get('continue', False):
        cmcontinue = articles['continue']['cmcontinue']
    payload = articles['query']['categorymembers']
    for page in payload:
        title = page['title']
        if PAGES_DICT.get(title, False):
            continue
        else:
            try:
                text = scrape_page_text(title)
                wikidata = scrape_page_wikidata(title)
                PAGES_DICT[title] = {
                    "category": category_name,
                    "title": title,
                    "text": text,
                    "location": wikidata.get('location', ''),
                    "inception": wikidata.get('inception', ''),
                    "image": wikidata.get('image', ''),
                    "discovery": wikidata.get('location of discovery', ''),
                    "country": wikidata.get('country', '')
                }
            except Exception as e:
                print("An exception occurred at page ", title)
                print(e)
    if cmcontinue:
        rec_scrape_pages(category_name, cmcontinue)
    print(articles)

In [218]:
rec_scrape_pages("Category:Archaeological_discoveries_in_Asia")

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 43542390, 'ns': 0, 'title': 'List of hoards in Asia'}, {'pageid': 48773752, 'ns': 0, 'title': 'Magdala stone'}, {'pageid': 44716989, 'ns': 0, 'title': 'Prehistoric grave goods in the Philippines'}, {'pageid': 52197343, 'ns': 0, 'title': 'Tillya Tepe Buddhist coin'}]}}


In [223]:
def rec_scrape_cat(category_name, cmcontinue = False):
    PARAMS = COMMON_PARAMS
    PARAMS["cmtitle"] = category_name,
    PARAMS["cmtype"] = "subcat"
    
    R = S.get(url=URL, params=PARAMS)
    subcategories = R.json()
    if subcategories.get('continue', False):
        cmcontinue = subcategories['continue']['cmcontinue']
    payload = subcategories['query']['categorymembers']
    for ind, subcategory in enumerate(payload):
        print("Saving, categories processed", ind)
        df = pd.DataFrame.from_dict(PAGES_DICT, orient='index')
        df.to_csv("scraped.csv")
        title = subcategory['title']
        if title in CATEGORIES_LIST:
            continue
        else:
            CATEGORIES_LIST.append(title)
            try:
                rec_scrape_pages(title)
                rec_scrape_cat(title)
            except:
                  print("An exception occurred at category ", title)   
    if cmcontinue:
        rec_scrape_pages(category_name, cmcontinue)

In [None]:
rec_scrape_cat("Category:Archaeological artifacts")

Saving, categories processed 0
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 584894, 'ns': 0, 'title': 'Artifact (archaeology)'}, {'pageid': 113276, 'ns': 0, 'title': 'Adze'}, {'pageid': 2429941, 'ns': 0, 'title': 'Andiron'}, {'pageid': 4001002, 'ns': 0, 'title': 'Ard (plough)'}, {'pageid': 51513, 'ns': 0, 'title': 'Arrow'}, {'pageid': 642882, 'ns': 0, 'title': 'Assemblage (archaeology)'}, {'pageid': 1923399, 'ns': 0, 'title': 'Astragalomancy'}, {'pageid': 21808488, 'ns': 0, 'title': 'Axe of Perun'}, {'pageid': 8486816, 'ns': 0, 'title': 'Bannerstone'}, {'pageid': 1326668, 'ns': 0, 'title': 'Bâton de commandement'}, {'pageid': 1925449, 'ns': 0, 'title': 'Beaker (archaeology)'}, {'pageid': 1776719, 'ns': 0, 'title': 'Bi (jade)'}, {'pageid': 677277, 'ns': 0, 'title': 'Blade (archaeology)'}, {'pageid': 6532888, 'ns': 0, 'title': 'Bog butter'}, {'pageid': 3443698, 'ns': 0, 'title': 'Bone tool'}, {'pageid': 1491240, 'ns': 0, 'title': 'Bout-coupé'}, {'pageid': 2073328, 'ns':

{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 1076007, 'ns': 0, 'title': 'Pottery of ancient Greece'}, {'pageid': 55218815, 'ns': 0, 'title': 'Ancient Greek funerary vases'}, {'pageid': 55140605, 'ns': 0, 'title': 'Arezzo 1465 vase'}, {'pageid': 53597975, 'ns': 0, 'title': 'Conservation and restoration of ancient Greek pottery'}, {'pageid': 6438977, 'ns': 0, 'title': 'Corpus vasorum antiquorum'}, {'pageid': 42212177, 'ns': 0, 'title': 'Cypriot Bichrome ware'}, {'pageid': 2881974, 'ns': 0, 'title': 'Demaratus of Corinth'}, {'pageid': 55209498, 'ns': 0, 'title': 'Drinking cup (kylix) depicting athletic combats by Onesimos'}, {'pageid': 55270051, 'ns': 0, 'title': 'Gigantomachy by the Suessula Painter'}, {'pageid': 51928965, 'ns': 10, 'title': 'Template:Greek vase painting'}, {'pageid': 8238007, 'ns': 0, 'title': 'Kalos inscription'}, {'pageid': 20156425, 'ns': 0, 'title': 'Kamares ware'}, {'pageid': 55149055, 'ns': 0, 'title': 'Kerameikos steles'}, {'pageid': 14754006, '

{'batchcomplete': '', 'query': {'categorymembers': []}}
Saving, categories processed 0
Saving, categories processed 1
{'batchcomplete': '', 'query': {'categorymembers': [{'pageid': 33454968, 'ns': 0, 'title': 'Amphiaraos Krater'}, {'pageid': 33496181, 'ns': 0, 'title': 'Arkesilas Cup'}, {'pageid': 14672667, 'ns': 0, 'title': 'Berlin Foundry Cup'}, {'pageid': 55139644, 'ns': 0, 'title': 'Bilingual kylix by the Andokides painter'}, {'pageid': 55961952, 'ns': 0, 'title': "Boeotian Dancer's Group Kothon, Black Figure Tripod, 6th Century B.C."}, {'pageid': 55211059, 'ns': 0, 'title': 'Boxing Siana Cup'}, {'pageid': 41478771, 'ns': 0, 'title': 'Brygos cup of Würzburg'}, {'pageid': 55151929, 'ns': 0, 'title': 'Calyx-Krater by the artist called the Painter of the Berlin Hydria depicting an Amazonomachy'}, {'pageid': 17018854, 'ns': 0, 'title': 'Chigi vase'}, {'pageid': 49388959, 'ns': 0, 'title': 'Cylix of Apollo'}, {'pageid': 43345880, 'ns': 0, 'title': 'Dinos of the Gorgon Painter'}, {'pagei

In [206]:
PAGES_DICT

{'List of hoards in Asia': {'category': 'Category:Archaeological_discoveries_in_Asia',
  'title': 'List of hoards in Asia',
  'text': "The list of hoards in Asia comprises significant archaeological hoards of coins, jewellery, precious and scrap metal objects and other valuable items discovered in Asia. It includes both hoards that were buried with the intention of retrieval at a later date (personal hoards, founder's hoards, merchant's hoards, and hoards of loot), and also hoards of votive offerings which were not intended to be recovered at a later date, but excludes grave goods and single items found in isolation.\n\nBactrian Gold\nChausa hoard\nKfar Monash Hoard\nWonoboyo hoard\nZiwiye hoard\n\n\n== See also ==\nList of hoards in Britain\nList of hoards in the Channel Islands\nLists of hoards",
  'location': '',
  'inception': '',
  'image': '',
  'discovery': '',
  'country': ''},
 'Magdala stone': {'category': 'Category:Archaeological_discoveries_in_Asia',
  'title': 'Magdala sto