In [1]:
import os
from datetime import datetime

import lxml.etree as etree
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [2]:
# Root folder holding the parquet datasets. Set the SITEMAP_DATA_DIR environment
# variable to point the notebook at another location; when it is unset the
# original location is used, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    "SITEMAP_DATA_DIR",
    "/Users/ahercules/Desktop/sitemap-generator/data",
).rstrip("/")

# One parquet dataset per entity type, all located under DATA_DIR.
disease_path = DATA_DIR + "/diseases"
target_path = DATA_DIR + "/targets"
drug_path = DATA_DIR + "/molecule"

In [3]:
# Start (or reuse) a Spark session with a local master
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [4]:
# Load the three parquet datasets in the order disease, target, drug
disease_data, target_data, drug_data = (
    spark.read.parquet(dataset_path)
    for dataset_path in (disease_path, target_path, drug_path)
)

In [5]:
# disease_list = (disease_data.select("id"))
# target_list = (target_data.select("id"))
# drug_list = (drug_data.select("id"))
# print("Number of diseases: %i" % disease_list.count())
# print("Number of targets: %i" % target_list.count())
# print("Number of drugs: %i" % drug_list.count())

In [6]:
# Collect the "id" column of each dataset into a plain Python list
# (order: diseases, targets, drugs)
disease_id_list, target_id_list, drug_id_list = (
    list(frame.select("id").toPandas()["id"])
    for frame in (disease_data, target_data, drug_data)
)

In [7]:
# A sitemap may hold at most 50,000 URLs, so the target IDs are cut into two halves
middle_index = len(target_id_list) // 2
target_id_list_part_1, target_id_list_part_2 = (
    target_id_list[:middle_index],
    target_id_list[middle_index:],
)

In [8]:
# Report how many IDs each sitemap will contain (one line per list)
print(
    "\n".join(
        line_template % len(id_list)
        for line_template, id_list in (
            ("Number of diseases: %i", disease_id_list),
            ("Number of targets in part 1: %i", target_id_list_part_1),
            ("Number of targets in part 2: %i", target_id_list_part_2),
            ("Number of drugs: %i", drug_id_list),
        )
    )
)

Number of diseases: 18497
Number of targets in part 1: 30304
Number of targets in part 2: 30304
Number of drugs: 13076


In [9]:
# Today's date as a YYYY-MM-DD string; used as the <lastmod> value of every URL
today_date = format(datetime.today(), '%Y-%m-%d')

In [10]:
def create_profile_page_sitemaps(entity, list_part, list_of_entity_ids, output_dir="sitemaps"):
    """Write an XML sitemap listing the profile page of every given entity.

    One ``<url>`` entry (a ``<loc>`` plus a ``<lastmod>``) is emitted per ID and
    the document is saved as ``<output_dir>/<entity>_profile_pages_<list_part>.xml``.
    A confirmation line is printed once the file has been written.

    Args:
        entity: URL path segment and filename prefix, e.g. ``"disease"``.
        list_part: Filename suffix identifying the chunk, e.g. ``"part_1"``.
        list_of_entity_ids: Iterable of string IDs; each becomes one URL.
        output_dir: Folder the sitemap is written to. It is created when
            missing, so the function also works on a fresh checkout.

    Note:
        ``<lastmod>`` is filled from the notebook-level ``today_date`` string,
        which must be defined before this function is called.
    """

    # create <urlset> root and set attribute values
    attribute_qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation")
    namespace_mappings = {
        "xsi": "http://www.w3.org/2001/XMLSchema-instance",
        None: "http://www.sitemaps.org/schemas/sitemap/0.9"
    }
    urlset = etree.Element("urlset",
                           {attribute_qname: "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"},
                           nsmap=namespace_mappings
                          )

    # iterate through lists of IDs for targets, diseases, or drug
    for entity_id in list_of_entity_ids:

        # create <url> and <loc> elements
        url = etree.SubElement(urlset, "url")
        loc = etree.SubElement(url, "loc")

        # set <loc> element text
        loc.text = "https://platform.opentargets.org" + "/" + entity + "/" + entity_id

        # create <lastmod> element
        lastmod = etree.SubElement(url, "lastmod")
        lastmod.text = today_date

    # set XML sitemap filename
    sitemap_filename = entity + "_" + "profile_pages" + "_" + list_part + ".xml"

    # make sure the destination folder exists; open() would otherwise raise
    # FileNotFoundError when the folder has not been created by hand
    os.makedirs(output_dir, exist_ok=True)

    # create and save XML file
    xml_tree_raw = etree.ElementTree(urlset)
    with open(os.path.join(output_dir, sitemap_filename), "wb") as xml_file:
        xml_file.write(etree.tostring(xml_tree_raw, xml_declaration=True, encoding="UTF-8", pretty_print=True))

    print("Created " + sitemap_filename + " sitemap")

In [11]:
# One profile-page sitemap per entity type; targets are written in two parts
create_profile_page_sitemaps(
    entity="disease", list_part="part_1", list_of_entity_ids=disease_id_list
)
create_profile_page_sitemaps(
    entity="drug", list_part="part_1", list_of_entity_ids=drug_id_list
)
create_profile_page_sitemaps(
    entity="target", list_part="part_1", list_of_entity_ids=target_id_list_part_1
)
create_profile_page_sitemaps(
    entity="target", list_part="part_2", list_of_entity_ids=target_id_list_part_2
)

Created disease_profile_pages_part_1.xml sitemap
Created drug_profile_pages_part_1.xml sitemap
Created target_profile_pages_part_1.xml sitemap
Created target_profile_pages_part_2.xml sitemap


In [12]:
def create_association_page_sitemaps(entity, list_part, list_of_entity_ids, output_dir="sitemaps"):
    """Write an XML sitemap listing the associations page of every given entity.

    One ``<url>`` entry (a ``<loc>`` ending in ``/associations`` plus a
    ``<lastmod>``) is emitted per ID and the document is saved as
    ``<output_dir>/<entity>_association_pages_<list_part>.xml``.
    A confirmation line is printed once the file has been written.

    Args:
        entity: URL path segment and filename prefix, e.g. ``"target"``.
        list_part: Filename suffix identifying the chunk, e.g. ``"part_1"``.
        list_of_entity_ids: Iterable of string IDs; each becomes one URL.
        output_dir: Folder the sitemap is written to. It is created when
            missing, so the function also works on a fresh checkout.

    Note:
        ``<lastmod>`` is filled from the notebook-level ``today_date`` string,
        which must be defined before this function is called.
    """

    # create <urlset> root and set attribute values
    attribute_qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation")
    namespace_mappings = {
        "xsi": "http://www.w3.org/2001/XMLSchema-instance",
        None: "http://www.sitemaps.org/schemas/sitemap/0.9"
    }
    urlset = etree.Element("urlset",
                           {attribute_qname: "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"},
                           nsmap=namespace_mappings
                          )

    # iterate through lists of IDs for targets, diseases, or drug
    for entity_id in list_of_entity_ids:

        # create <url> and <loc> elements
        url = etree.SubElement(urlset, "url")
        loc = etree.SubElement(url, "loc")

        # set <loc> element text
        loc.text = "https://platform.opentargets.org" + "/" + entity + "/" + entity_id + "/" + "associations"

        # create <lastmod> element
        lastmod = etree.SubElement(url, "lastmod")
        lastmod.text = today_date

    # set XML sitemap filename
    sitemap_filename = entity + "_" + "association_pages" + "_" + list_part + ".xml"

    # make sure the destination folder exists; open() would otherwise raise
    # FileNotFoundError when the folder has not been created by hand
    os.makedirs(output_dir, exist_ok=True)

    # create and save XML file
    xml_tree_raw = etree.ElementTree(urlset)
    with open(os.path.join(output_dir, sitemap_filename), "wb") as xml_file:
        xml_file.write(etree.tostring(xml_tree_raw, xml_declaration=True, encoding="UTF-8", pretty_print=True))

    print("Created " + sitemap_filename + " sitemap")

In [13]:
# Association-page sitemaps for diseases and for both halves of the target list
create_association_page_sitemaps(
    entity="disease", list_part="part_1", list_of_entity_ids=disease_id_list
)
create_association_page_sitemaps(
    entity="target", list_part="part_1", list_of_entity_ids=target_id_list_part_1
)
create_association_page_sitemaps(
    entity="target", list_part="part_2", list_of_entity_ids=target_id_list_part_2
)

Created disease_association_pages_part_1.xml sitemap
Created target_association_pages_part_1.xml sitemap
Created target_association_pages_part_2.xml sitemap
