In [4]:
# Unified SPARQL script: fetch base and detailed info in one query
import os
import time
import json
import requests
from typing import Dict, List
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Use specific GPUs if needed
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,..."


def create_session() -> requests.Session:
    """Create a requests session with retry logic."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def query_wikidata(session: requests.Session, query: str, headers: Dict[str, str], offset: int = 0, limit: int = 5000) -> List[Dict]:
    """Execute a SPARQL query on Wikidata with pagination and return the raw bindings."""
    url = "https://query.wikidata.org/sparql"
    paginated_query = f"{query}\n LIMIT {limit} \n OFFSET {offset}\n" + '''
  }
  OPTIONAL { ?item wdt:P17 ?country.      ?country rdfs:label ?countryLabel.      FILTER(LANG(?countryLabel)="en") }
  OPTIONAL { ?item wdt:P131 ?adminEntity. FILTER NOT EXISTS { ?adminEntity wdt:P31 wd:Q515. } ?adminEntity rdfs:label ?adminEntityLabel. FILTER(LANG(?adminEntityLabel)="en") }
  OPTIONAL { ?item wdt:P131 ?city. FILTER EXISTS { ?city wdt:P31 wd:Q515. } ?city rdfs:label ?cityLabel. FILTER(LANG(?cityLabel)="en") }
  OPTIONAL { ?item wdt:P276 ?loc.       ?loc rdfs:label ?locationLabel.       FILTER(LANG(?locationLabel)="en") }
  OPTIONAL { ?item wdt:P669 ?street.     ?street rdfs:label ?streetNameLabel.     FILTER(LANG(?streetNameLabel)="en") }
}
    '''
    try:
        resp = session.get(
            url,
            params={"query": paginated_query, "format": "json"},
            headers=headers,
            timeout=30
        )
        resp.raise_for_status()
        data = resp.json()
        return data.get('results', {}).get('bindings', [])
    except Exception as e:
        print(f"Error querying Wikidata at offset {offset}: {e}")
        return []


def save_to_jsonl(records: List[Dict], output_file: str):
    """Append a list of dicts to a JSONL file."""
    try:
        with open(output_file, 'a', encoding='utf-8') as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + '\n')
        print(f"Saved {len(records)} records to {output_file}")
    except Exception as e:
        print(f"Error saving to {output_file}: {e}")


def parse_binding(binding: Dict) -> Dict:
    """Extract simplified record from a single SPARQL binding."""
    def get_val(var: str) -> str:
        return binding.get(var, {}).get('value', '')

    uri = get_val('item')
    # Utility to extract Q-number
    qnum = lambda val: val.rsplit('/', 1)[-1] if '/' in val else val

    return {
        'itemQ': qnum(uri),
        'itemLabel': get_val('itemLabel'),
        'countryQ': qnum(get_val('country')),  
        'countryLabel': get_val('countryLabel'),
        'adminEntityQ': qnum(get_val('adminEntity')),
        'adminEntityLabel': get_val('adminEntityLabel'),
        'cityLabel': get_val('cityLabel'),
        'locationLabel': get_val('locationLabel'),
        'streetNameLabel': get_val('streetNameLabel')
    }


def main() -> List[Dict]:
    """Run unified SPARQL pipeline to fetch all info in one query."""
    # SPARQL query: union of classes plus optional detailed fields
    base_query = '''
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?item ?itemLabel ?country ?countryLabel ?adminEntity ?adminEntityLabel ?cityLabel ?locationLabel ?streetNameLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  {
    SELECT DISTINCT ?item WHERE {
      { ?item p:P31 ?s0. ?s0 (ps:P31/(wdt:P279*)) wd:Q17350442. } UNION
      { ?item p:P31 ?s1. ?s1 (ps:P31/(wdt:P279*)) wd:Q4895393. } UNION
      { ?item p:P31 ?s2. ?s2 (ps:P31/(wdt:P279*)) wd:Q464980. } UNION
      { ?item p:P31 ?s3. ?s3 (ps:P31/(wdt:P279*)) wd:Q3918. } UNION
      { ?item p:P31 ?s4. ?s4 (ps:P31/(wdt:P279*)) wd:Q33506. }
    }

'''
    headers = {
        "User-Agent": "MyLandmarkQuery/1.0 (your.email@example.com)",
        "Accept": "application/json"
    }
    batch_size = 10
    offset = 0
    output_file = 'wikidata_all_info.jsonl'
    open(output_file, 'w').close()  # Reset output file

    session = create_session()
    all_records: List[Dict] = []

    while True:
        print(f"Querying with offset {offset}...")
        
        bindings = query_wikidata(session, base_query, headers, offset, batch_size)
        if not bindings:
            print("No bindings returned; ending.")
            break

        records = [parse_binding(b) for b in bindings]
        save_to_jsonl(records, output_file)
        all_records.extend(records)

        if len(bindings) < batch_size:
            print("Final batch reached.")
            break
        offset += batch_size
        break
        time.sleep(1)

    session.close()
    print(f"Completed: {len(all_records)} total records saved to {output_file}")
    return all_records


if __name__ == "__main__":
    main()


Querying with offset 0...
Saved 11 records to wikidata_all_info.jsonl
Completed: 11 total records saved to wikidata_all_info.jsonl
