In [1]:
from rdflib import Graph
from urllib.request import urlretrieve
import zipfile
import pandas as pd
import os

g = Graph()
g.parse("https://data.wikipathways.org/current/rdf/wikipathways-rdf-void.ttl")
downloadURL_query = """
PREFIX dcat:     <http://www.w3.org/ns/dcat#>

SELECT DISTINCT ?url
WHERE {
  [] dcat:downloadURL ?url .
}
"""

qres = g.query(downloadURL_query)
for row in qres:
    url = row.url
    prefix, delim, filename = url.rpartition('/')
    print(f"{url} -> {filename}")
    urlretrieve(url, filename)
    with zipfile.ZipFile(filename, mode="r") as archive:
        for file in archive.namelist():
            if file.endswith(".ttl"):
                print(f"reading {file}")
                ttl_file = zipfile.Path(filename, file)
                g.parse(data=ttl_file.read_text())

http://data.wikipathways.org/20240710/rdf/wikipathways-20240710-rdf-wp.zip -> wikipathways-20240710-rdf-wp.zip
reading wp/WP508.ttl
reading wp/WP4206.ttl
reading wp/WP427.ttl
reading wp/WP5224.ttl
reading wp/WP85.ttl
reading wp/WP1541.ttl
reading wp/WP4944.ttl
reading wp/WP4853.ttl
reading wp/WP2860.ttl
reading wp/WP3185.ttl
reading wp/WP2944.ttl
reading wp/WP1591.ttl
reading wp/WP5043.ttl
reading wp/WP5420.ttl
reading wp/WP332.ttl
reading wp/WP2113.ttl
reading wp/WP5181.ttl
reading wp/WP5199.ttl
reading wp/WP4545.ttl
reading wp/WP2212.ttl
reading wp/WP3248.ttl
reading wp/WP5143.ttl
reading wp/WP1016.ttl
reading wp/WP2839.ttl
reading wp/WP4504.ttl
reading wp/WP5145.ttl
reading wp/WP906.ttl
reading wp/WP917.ttl
reading wp/WP2813.ttl
reading wp/WP2487.ttl
reading wp/WP5133.ttl
reading wp/WP5195.ttl
reading wp/WP4872.ttl
reading wp/WP2221.ttl
reading wp/WP979.ttl
reading wp/WP4721.ttl
reading wp/WP112.ttl
reading wp/WP3941.ttl
reading wp/WP5234.ttl
reading wp/WP451.ttl
reading wp/WP2817.t

http://rdf.wikipathways.org/User/%(^ does not look like a valid URI, trying to serialize this will break.
http://rdf.wikipathways.org/User/%(^ does not look like a valid URI, trying to serialize this will break.
https://wikipathways.org/authors/%(^ does not look like a valid URI, trying to serialize this will break.


reading authors/WP4962.ttl
reading authors/WP4963.ttl
reading authors/WP4965.ttl
reading authors/WP4966.ttl
reading authors/WP4969.ttl
reading authors/WP497.ttl
reading authors/WP4970.ttl
reading authors/WP4971.ttl
reading authors/WP498.ttl
reading authors/WP499.ttl
reading authors/WP500.ttl
reading authors/WP501.ttl
reading authors/WP5019.ttl
reading authors/WP502.ttl
reading authors/WP5023.ttl
reading authors/WP5024.ttl
reading authors/WP5025.ttl
reading authors/WP5026.ttl
reading authors/WP5027.ttl
reading authors/WP5028.ttl
reading authors/WP5029.ttl
reading authors/WP503.ttl
reading authors/WP5030.ttl
reading authors/WP5031.ttl
reading authors/WP5033.ttl
reading authors/WP5034.ttl
reading authors/WP5035.ttl
reading authors/WP5036.ttl
reading authors/WP5037.ttl
reading authors/WP5038.ttl
reading authors/WP5039.ttl
reading authors/WP504.ttl
reading authors/WP5043.ttl
reading authors/WP5044.ttl
reading authors/WP5046.ttl
reading authors/WP5049.ttl
reading authors/WP5050.ttl
reading a

SPARQL query template for querying local rdf
===============================

In [5]:
query = """
PREFIX wp: <http://vocabularies.wikipathways.org/wp#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

select distinct ?pathway (str(?label) as ?geneProduct) where {
    ?geneProduct a wp:GeneProduct . 
    ?geneProduct rdfs:label ?label .
    ?geneProduct dcterms:isPartOf ?pathway .
    ?pathway a wp:Pathway .
    ?pathway dcterms:identifier "WP716" . #Replace "WP716" with WP ID of interest
}
"""
results = g.query(query)

In [7]:
# Extract the headers from the result's vars attribute
headers = results.vars

# Extract data into a list of lists
data = []
for row in results:
    data.append([str(row[var]) for var in headers])

# Create a DataFrame from the data and headers
if not os.path.exists("data"):
    # Create the directory if it does not exist
    os.makedirs("data")
df = pd.DataFrame(data, columns=headers)
df.to_csv("data/WP716_gene_products.csv", index=False)
# Display the DataFrame
df

Unnamed: 0,pathway,geneProduct
0,https://identifiers.org/wikipathways/WP716_r12...,RXRA
1,https://identifiers.org/wikipathways/WP716_r12...,RXRa
2,https://identifiers.org/wikipathways/WP716_r12...,RXR
3,https://identifiers.org/wikipathways/WP716_r12...,MAPK1
4,https://identifiers.org/wikipathways/WP716_r12...,ERK2
...,...,...
63,https://identifiers.org/wikipathways/WP716_r12...,Aldh1a2
64,https://identifiers.org/wikipathways/WP716_r12...,Rbp2
65,https://identifiers.org/wikipathways/WP716_r12...,Lrat
66,https://identifiers.org/wikipathways/WP716_r12...,Crabp1


# Results
Download the results as csv from [here](../data/WP716_gene_products.csv)

In [8]:
from datetime import datetime
import gzip
import shutil

# Serialize to a temporary file with today's date
today_date = datetime.today().strftime('%Y-%m-%d')
temp_filename = f"../data/wikipathways_{today_date}.ttl"
g.serialize(temp_filename, format="ttl")

# Gzip the file
with open(temp_filename, 'rb') as f_in, gzip.open(f"../data/wikipathways_{today_date}.ttl.gz", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

Exception: "http://rdf.wikipathways.org/User/%(^" does not look like a valid URI, I cannot serialize this as N3/Turtle. Perhaps you wanted to urlencode it?