## XML Sitemap Generator

This notebook will generate all of the sitemaps that are stored within the Platform webapp ``/sitemap`` directory and linked to the [Platform index sitemap](https://www.targetvalidation.org/sitemaps/1804/index.xml). The index sitemap is the one that we submit to Google, Bing, Yahoo, Yandex, etc. to improve our SEO.

This notebook should be **run at least 3 times per year (every other release)** to ensure that search engines are correctly indexing the Platform. 

Links and documentation:

* [JS sitemap generator (stored in webapp repo)](https://github.com/opentargets/webapp/blob/master/sitemap-generator.js)
* [Creating XML sitemap from list](https://stackoverflow.com/questions/16681543/create-xml-file-with-python-by-iterating-over-lists)
* [ElementTree XML API documentation](https://docs.python.org/3.4/library/xml.etree.elementtree.html#building-xml-documents)
* [Sitemap XML format](https://www.sitemaps.org/protocol.html)

In [1]:
# Diseases whose Platform pages are listed in the disease sitemaps.
# Declared as (EFO id, label) pairs and expanded into one dict per disease.
diseases = [
    {"efo_id": efo_id, "efo_label": efo_label}
    for efo_id, efo_label in (
        ("EFO_0000400", "diabetes mellitus"),
        ("EFO_0000305", "breast carcinoma"),
        ("EFO_0003060", "non small cell lung carcinoma"),
        ("EFO_0003843", "pain"),
        ("EFO_0000616", "neoplasm"),
    )
]

# Targets whose Platform pages are listed in the target sitemaps.
# Declared as (Ensembl gene id, symbol) pairs and expanded into one dict per target.
targets = [
    {"ensembl_id": ensembl_id, "symbol": symbol}
    for ensembl_id, symbol in (
        ("ENSG00000113580", "NR3C1"),
        ("ENSG00000146648", "EGFR"),
        ("ENSG00000095303", "PTGS1"),
        ("ENSG00000131747", "TOP2A"),
        ("ENSG00000091831", "ESR1"),
    )
]

In [16]:
import xml.etree.ElementTree as ET
from datetime import datetime

# Current date in YYYY-MM-DD form; written as <lastmod> for every URL and
# reused by the later sitemap cells.
today_date = datetime.today().strftime('%Y-%m-%d')

# Root <urlset> node with the sitemap namespace and schema-validation attributes.
urlset = ET.Element("urlset")
urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
urlset.set("xsi:schemaLocation", "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

# One <url> entry per disease, pointing at the disease profile page.
for disease in diseases:
    url = ET.SubElement(urlset, "url")
    ET.SubElement(url, "loc").text = "https://www.targetvalidation.org/disease/" + disease["efo_id"]
    ET.SubElement(url, "lastmod").text = today_date
    ET.SubElement(url, "changefreq").text = "monthly"
    ET.SubElement(url, "priority").text = "0.6"

# ElementTree.write() defaults to US-ASCII output without an XML declaration.
# Write UTF-8 with an explicit declaration so the file states its encoding,
# as the sitemap protocol expects UTF-8 encoded files.
xml_tree = ET.ElementTree(urlset)
xml_tree.write('disease_profile_pages.xml', encoding='utf-8', xml_declaration=True)

In [18]:
# Root <urlset> node with the sitemap namespace and schema-validation attributes.
urlset = ET.Element("urlset")
urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
urlset.set("xsi:schemaLocation", "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

# One <url> entry per disease, pointing at the disease associations page.
for disease in diseases:
    url = ET.SubElement(urlset, "url")
    ET.SubElement(url, "loc").text = "https://www.targetvalidation.org/disease/" + disease["efo_id"] + "/associations"
    ET.SubElement(url, "lastmod").text = today_date
    ET.SubElement(url, "changefreq").text = "monthly"
    ET.SubElement(url, "priority").text = "0.6"

# ElementTree.write() defaults to US-ASCII output without an XML declaration.
# Write UTF-8 with an explicit declaration so the file states its encoding,
# as the sitemap protocol expects UTF-8 encoded files.
xml_tree = ET.ElementTree(urlset)
xml_tree.write('disease_association_pages.xml', encoding='utf-8', xml_declaration=True)

In [20]:
# Root <urlset> node with the sitemap namespace and schema-validation attributes.
urlset = ET.Element("urlset")
urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
urlset.set("xsi:schemaLocation", "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

# One <url> entry per target, pointing at the target associations page.
for target in targets:
    url = ET.SubElement(urlset, "url")
    ET.SubElement(url, "loc").text = "https://www.targetvalidation.org/target/" + target["ensembl_id"] + "/associations"
    ET.SubElement(url, "lastmod").text = today_date
    ET.SubElement(url, "changefreq").text = "monthly"
    ET.SubElement(url, "priority").text = "0.6"

# ElementTree.write() defaults to US-ASCII output without an XML declaration.
# Write UTF-8 with an explicit declaration so the file states its encoding,
# as the sitemap protocol expects UTF-8 encoded files.
xml_tree = ET.ElementTree(urlset)
xml_tree.write('target_association_pages.xml', encoding='utf-8', xml_declaration=True)

In [21]:
# Root <urlset> node with the sitemap namespace and schema-validation attributes.
urlset = ET.Element("urlset")
urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
urlset.set("xsi:schemaLocation", "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

# One <url> entry per target, pointing at the target profile page.
for target in targets:
    url = ET.SubElement(urlset, "url")
    ET.SubElement(url, "loc").text = "https://www.targetvalidation.org/target/" + target["ensembl_id"]
    ET.SubElement(url, "lastmod").text = today_date
    ET.SubElement(url, "changefreq").text = "monthly"
    ET.SubElement(url, "priority").text = "0.6"

# ElementTree.write() defaults to US-ASCII output without an XML declaration.
# Write UTF-8 with an explicit declaration so the file states its encoding,
# as the sitemap protocol expects UTF-8 encoded files.
xml_tree = ET.ElementTree(urlset)
xml_tree.write('target_profile_pages.xml', encoding='utf-8', xml_declaration=True)

In [23]:
# Static (non target/disease) pages listed in the static sitemap.
# Declared as (title, site-relative url, sitemap priority) triples and
# expanded into one dict per page.
static_pages = [
    {"title": title, "url": path, "priority": page_priority}
    for title, path, page_priority in (
        ("Home page", "/", "1"),
        ("About", "/about", "1"),
        ("Downloads", "/downloads", "1"),
        ("Batch search", "/batch-search", "1"),
        ("Variants", "/variants", "0.9"),
        ("Terms of use", "/terms-of-use", "0.5"),
        ("FAQ", "/faq", "1"),
        ("Scoring", "/scoring", "1"),
        ("Outreach", "/outreach", "0.8"),
    )
]
# Root <urlset> node with the sitemap namespace and schema-validation attributes.
urlset = ET.Element("urlset")
urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
urlset.set("xsi:schemaLocation", "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

# One <url> entry per static page; each page supplies its own priority.
for page in static_pages:
    url = ET.SubElement(urlset, "url")
    ET.SubElement(url, "loc").text = "https://www.targetvalidation.org" + page["url"]
    ET.SubElement(url, "lastmod").text = today_date
    ET.SubElement(url, "changefreq").text = "monthly"
    ET.SubElement(url, "priority").text = page["priority"]

# ElementTree.write() defaults to US-ASCII output without an XML declaration.
# Write UTF-8 with an explicit declaration so the file states its encoding,
# as the sitemap protocol expects UTF-8 encoded files.
xml_tree = ET.ElementTree(urlset)
xml_tree.write('static_pages.xml', encoding='utf-8', xml_declaration=True)

## Refactored code

In [27]:
import xml.etree.ElementTree as ET
from datetime import datetime

# Sitemap definitions, one per output file. Declared as
# (title, file name, is association page, entity) tuples; every definition
# shares the same default priority.
sitemaps = [
    {
        "title": title,
        "file_name": file_name,
        "default_priority": "0.8",
        "is_association_page": is_association_page,
        "entity": entity,
    }
    for title, file_name, is_association_page, entity in (
        ("target association pages", "target_association_pages.xml", True, "target"),
        ("target profile pages", "target_profile_pages.xml", False, "target"),
        ("disease association pages", "disease_association_pages.xml", True, "disease"),
        ("disease profile pages", "disease_profile_pages.xml", False, "disease"),
        ("static pages", "static_pages.xml", False, ""),
    )
]

def create_sitemap(sitemap, data_list):
    """Build the XML sitemap described by ``sitemap`` and write it to disk.

    Args:
        sitemap: dict with the keys "file_name" (output path),
            "default_priority", "is_association_page" and "entity".
            An entity of "target" or "disease" produces
            ``/<entity>/<id>`` URLs (plus ``/associations`` when
            "is_association_page" is true); any other entity value treats
            the records as static pages.
        data_list: iterable of dicts to turn into <url> entries. Target
            records must carry "ensembl_id", disease records "efo_id" and
            static page records "url". A record's own "priority" value,
            when present, is used instead of the sitemap default.

    Returns:
        The ``xml.etree.ElementTree.ElementTree`` that was written to
        ``sitemap["file_name"]``.

    Raises:
        KeyError: if the sitemap definition or a record lacks a required key.
    """
    # Keep echoing the definition so the cell output still shows which
    # sitemap is being processed.
    print(sitemap)

    site_root = "https://www.targetvalidation.org"
    # Key holding the identifier used in the URL, per entity type.
    id_key_by_entity = {"target": "ensembl_id", "disease": "efo_id"}
    entity = sitemap["entity"]
    path_suffix = "/associations" if sitemap["is_association_page"] else ""
    # The generation date, as YYYY-MM-DD, is used as <lastmod> for every URL.
    generated_on = datetime.today().strftime("%Y-%m-%d")

    # Root <urlset> node with the sitemap namespace and schema-validation attributes.
    urlset = ET.Element("urlset")
    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
    urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    urlset.set("xsi:schemaLocation", "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")

    for record in data_list:
        if entity in id_key_by_entity:
            location = site_root + "/" + entity + "/" + record[id_key_by_entity[entity]] + path_suffix
        else:
            # Static pages already carry their site-relative path.
            location = site_root + record["url"]

        url = ET.SubElement(urlset, "url")
        ET.SubElement(url, "loc").text = location
        ET.SubElement(url, "lastmod").text = generated_on
        ET.SubElement(url, "changefreq").text = "monthly"
        ET.SubElement(url, "priority").text = record.get("priority", sitemap["default_priority"])

    # Explicit UTF-8 plus an XML declaration; write() would otherwise emit
    # US-ASCII with no declaration.
    xml_tree = ET.ElementTree(urlset)
    xml_tree.write(sitemap["file_name"], encoding="utf-8", xml_declaration=True)
    return xml_tree
    

# Records to use for each entity type; any other entity value (for example
# the empty string used by the static sitemap) falls back to static_pages.
data_by_entity = {"target": targets, "disease": diseases}

# create_sitemap receives only the definition and its records, so no XML
# root is built here: the <urlset> previously created on every iteration
# was never used or passed on.
for sitemap in sitemaps:
    create_sitemap(sitemap, data_by_entity.get(sitemap["entity"], static_pages))
        


{'title': 'target association pages', 'file_name': 'target_association_pages.xml', 'default_priority': '0.8', 'is_association_page': True, 'entity': 'target'}
{'title': 'target profile pages', 'file_name': 'target_profile_pages.xml', 'default_priority': '0.8', 'is_association_page': False, 'entity': 'target'}
{'title': 'disease association pages', 'file_name': 'disease_association_pages.xml', 'default_priority': '0.8', 'is_association_page': True, 'entity': 'disease'}
{'title': 'disease profile pages', 'file_name': 'disease_profile_pages.xml', 'default_priority': '0.8', 'is_association_page': False, 'entity': 'disease'}
{'title': 'static pages', 'file_name': 'static_pages.xml', 'default_priority': '0.8', 'is_association_page': False, 'entity': ''}
