In [2]:
import argparse
from Bio import Entrez

import os 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from datetime import date
from datetime import datetime
import numpy as np
import re
from metapub import FindIt
from metapub.convert import pmid2doi
from metapub.convert import doi2pmid
from metapub import PubMedFetcher

from collections import OrderedDict
import xml.etree.ElementTree as ET
import ssl

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import argparse
from Bio import Entrez

ssl._create_default_https_context = ssl._create_unverified_context



## Query the NCBI database

In [3]:
# Search GEO DataSets ("gds") for HiChIP series.
# Alternative search queries/filters (swap one in for `filters` below):
# filters = "HiChIP[All Fields] AND \"gse\"[Filter]" # the whole notebook will take around 7 min to run
# filters = "HiChIP[All Fields] AND (\"gse\"[Filter] AND (\"Homo sapiens\"[Organism] OR \"Mus musculus\"[Organism]))"
# filters = "HiChIP[All Fields] AND (\"Homo sapiens\"[Organism] AND \"gse\"[Filter])"
# filters = "HiChIP[All Fields] AND (\"Homo sapiens\"[Organism] AND \"published last year\"[Filter]) AND \"gse\"[Filter]" # 2 min
# filters = "HiChIP[All Fields] AND (\"Mus musculus\"[Organism] AND \"gse\"[Filter])"

filters = "HiChIP[All Fields] AND (\"gse\"[Filter] AND (\"Homo sapiens\"[Organism] OR \"Mus musculus\"[Organism]))"

# contact e-mail that Biopython attaches to the Entrez requests
Entrez.email = "zjiang@lji.org"

# query the NCBI database; `result["IdList"]` is what the following cells iterate over
search_result = Entrez.esearch(db="gds", retmax=10000, term=filters)
try:
    result = Entrez.read(search_result)
finally:
    # release the HTTP response even if parsing fails
    search_result.close()

# regex used later to pull the numeric PubMed IDs out of a summary record
PubMedIds_pattern = re.compile('[0123456789]+')

In [4]:
# Fetch the GEO DataSets summary of every search hit and collect the
# summary records into one dataframe (one row per record).
dictionary_lst = []
for Id in result["IdList"]:
    handle = Entrez.esummary(db="gds", id=Id, retmode="xml") # get summary of this entry on GEO datasets
    try:
        # Entrez.parse yields records lazily, so drain it before closing the handle
        dictionary_lst.extend(Entrez.parse(handle))
    finally:
        # one response is opened per id; close each one instead of leaking it
        handle.close()
df = pd.DataFrame(dictionary_lst)

KeyboardInterrupt: 

In [None]:
# preview the first rows of the GEO summary dataframe built above
df.head()

In [None]:
# (number of rows, number of columns) of the GEO summary dataframe
df.shape

## Cleaning the Paper Names and Converting from PMID to DOI

In [None]:
# Matches a trailing GEO-style tag such as ".[HiChIP]", ". [Hi ChIP]",
# "(HiChIP)" or "[Bead Array]": optional whitespace/periods, a bracketed or
# parenthesised tag of at most two words, then optional trailing
# whitespace/periods up to the end of the title.
_TRAILING_TAG_PATTERN = re.compile(
    r'[\s.]*(?:\[[^\[\]\s.]*(?:[\s.]+[^\[\]\s.]*)?\]'
    r'|\([^()\s.]*(?:[\s.]+[^()\s.]*)?\))[\s.]*$'
)


def _clean_paper_title(paper_title):
    """Return `paper_title` without its trailing assay tag and final period.

    Only the tag at the very end of the title is removed; the rest of the
    title (including periods inside it) is left untouched, so tagged and
    untagged titles of the same paper end up identical and can be merged
    by title later on. Surrounding whitespace is stripped, one trailing
    period is dropped, and no-break spaces are replaced by plain spaces.
    """
    cleaned = _TRAILING_TAG_PATTERN.sub('', paper_title, count=1).strip()
    if cleaned.endswith('.'): # remove the last period in paper title
        cleaned = cleaned[:-1]
    return cleaned.replace("\xa0", " ") # remove no-break space


dictionary_lst = []
for Id in result["IdList"]:

    handle = Entrez.esummary(db="gds", id=Id, retmode="xml")
    try:
        # Entrez.parse is lazy: materialise the records before closing the handle
        summaries = list(Entrez.parse(handle))
    finally:
        handle.close()

    for column in summaries:
        column['title'] = _clean_paper_title(column['title'])

        # convert PMID to DOI, some have no PMID so empty string
        PubMedIds = PubMedIds_pattern.findall(str(column['PubMedIds']))
        if len(PubMedIds) > 0:
            # only the first PubMed ID is used; a missing DOI becomes ""
            column['PubMedIds'] = pmid2doi(PubMedIds[0]) or ""
        else:
            column['PubMedIds'] = ""
        dictionary_lst.append(column)

df = pd.DataFrame(dictionary_lst)

## Reformat the Columns for Google Sheet Compatibility

In [None]:
# GEO summary fields that are not needed in the Google Sheet
drop_cols = ['Item', 'Id', 'GDS', 'GPL', 'GSE', 'entryType', 'ptechType',
             'valType', 'SSInfo', 'subsetInfo', 'suppFile', 'Relations', 'ExtRelations',
             'n_samples', 'SeriesTitle', 'PlatformTitle', 'PlatformTaxa', 'SamplesTaxa',
             'Projects', 'FTPLink', 'GEO2R']

# GEO field name -> Google Sheet column name
rename_cols = {"Accession":"GEO / Data link", "PubMedIds":"DOI", "title":"Paper Title",
               "taxon":"Organism", "gdsType":"Any other information", "PDAT":"Year",
               "Samples":"Other matched data"}

# rebuild the frame from the cleaned records, then drop and rename in one chain
df = (
    pd.DataFrame(dictionary_lst)
    .drop(columns=drop_cols)
    .rename(columns=rename_cols)
)

# keep only the first four characters of the publication date (the year)
df["Year"] = df["Year"].str.slice(stop=4)

# running row number, used as a merge key
df['index'] = np.arange(len(df))

# missing values become empty strings
df = df.fillna("")

# sheet columns that are filled in by later cells (or by hand)
for empty_col in ["Journal", "Authors", "Tissue/Cell Line", "Presenter", "Potential HiChIP"]:
    df[empty_col] = ""

## Assigning Potential HiChIP Samples

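For each GEO series: if any sample title mentions HiChIP (or Hi-ChIP), keep only those samples and mark the series "Yes"; otherwise keep all of its samples and mark the series "Maybe".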

In [None]:
def _mentions_hichip(title):
    """Tell whether a sample title contains "HiChIP" or "Hi-ChIP", ignoring case."""
    folded = title.casefold()
    return ("HiChIP".casefold() in folded) or ("Hi-ChIP".casefold() in folded)


col = df["Other matched data"]
lst = []      # per series: newline-joined "accession: title" of the kept samples
GSM_IDs = []  ######## new for cell type: every GSM accession seen, kept or not

for index, row in enumerate(col):
    # all accessions are recorded, whichever branch is taken below
    GSM_IDs.extend(ele["Accession"] for ele in row)

    hichip_samples = [ele for ele in row if _mentions_hichip(ele["Title"])]
    if hichip_samples:
        # at least one HiChIP sample: keep only those and mark "Yes"
        kept, flag = hichip_samples, "Yes"
    else:
        # otherwise, keep all GSM samples and mark "Maybe"
        kept, flag = list(row), "Maybe"

    df.at[index, "Potential HiChIP"] = flag
    lst.append("\n".join(ele["Accession"] + ": " + ele["Title"] for ele in kept))

df["Other matched data"] = list(lst)

## Merge Rows with the Same Paper Title

In [None]:
def _join_lines(values):
    """Concatenate the values of one group, one value per line."""
    return '\n'.join(values)


# how each column is collapsed when several GEO series share a paper title
aggregation_functions = {
    'Paper Title': 'first',
    'DOI': 'max',
    'Journal': 'first',
    'Authors': 'first',
    'Year': 'first',
    'GEO / Data link': _join_lines,
    'Any other information': _join_lines,
    'Organism': 'first',
    'Tissue/Cell Line': 'first',
    'Potential HiChIP': 'first',
    'Other matched data': _join_lines,
    'Presenter': 'first',
}
df_grouped = df.groupby(df['Paper Title']).agg(aggregation_functions)

## Fetch Journal Name and First Author of Each Paper and Add https to DOI

In [None]:
# Look up journal and first author of every paper through its DOI.
# Both lists get exactly one entry per row of df_grouped, so they always
# line up with the dataframe when assigned below.
jounrnal_lst = []
authors_lst = []
fetch = PubMedFetcher()
for DOI in df_grouped['DOI']:
    journal, first_author = "", ""
    if DOI:  # rows without a DOI have nothing to look up
        try:
            PMID = doi2pmid(DOI)
            article = fetch.article_by_pmid(PMID)
            # resolve both values before storing either, so a failure on the
            # author cannot leave the two lists with different lengths
            journal = article.journal
            first_author = article.authors[0].split()[0] + " et al."
        except Exception:
            # best effort: any lookup failure leaves both fields empty
            # (KeyboardInterrupt is deliberately not caught)
            journal, first_author = "", ""
    jounrnal_lst.append(journal)
    authors_lst.append(first_author)
df_grouped['Journal'] = jounrnal_lst
df_grouped['Authors'] = authors_lst

In [None]:
# turn bare DOIs into https://doi.org/ links (for easy access); rows without a DOI stay empty
has_doi = df_grouped['DOI'] != ''
df_grouped.loc[has_doi, 'DOI'] = 'https://doi.org/' + df_grouped.loc[has_doi, 'DOI']

## Note the Date Added, Set Output Path, and Save the Final Output File

In [None]:
# Output directory for the result file. With the empty string, os.path.join
# leaves the file name as a plain relative path.
######## new for cell type
outdir = ""
# To write into a dedicated folder instead:
# outdir = 'results/hichip_db/'
# os.makedirs(outdir, exist_ok=True)

In [None]:
# Read the clock once and derive both the date and the time from that single
# timestamp, so the two parts of the file name cannot disagree when the cell
# runs across midnight.
now = datetime.now()
today = now.date()

# current year, month and day
date_str = today.strftime("%Y_%m_%d")

# current hour and minute
time_str = now.strftime("%H_%M")

# setting the output filename (the extension is added when the file is written)
output = os.path.join(outdir, "GEO_Query.{}_{}".format(date_str, time_str))

In [None]:
# final column order of the sheet; 'Date Added' is created in the same step
reorder = ['Paper Title', 'Journal', 'Authors', 'Year', 'DOI', 'GEO / Data link', 'Any other information',
           'Organism', 'Tissue/Cell Line', 'Potential HiChIP', 'Other matched data', 'Presenter', 'Date Added']

# note the date the rows were added (YYYY-MM-DD), then select the columns in sheet order
df_grouped = df_grouped.assign(**{'Date Added': date_str.replace('_', '-')})[reorder]

In [None]:
# write the merged table to "<output>.xlsx" without the index column
df_grouped.to_excel("{}.xlsx".format(output), index=False)

## Save the GSM IDs line by line

In [None]:
# De-duplicate the GSM accessions and sort them: a bare set has no stable
# order, so without sorting the file content (and any later slice of it)
# would differ from run to run.
GSM_IDs = sorted(set(GSM_IDs))
with open(r'gsm_list.txt', 'w') as fp:
    # write each item on a new line
    fp.writelines("%s\n" % gsm for gsm in GSM_IDs)

## Query GEO BioSample using the GSM IDs

In [17]:
# Maximum number of GSM accessions put into one BioSample search term.
# NOTE(review): accessions beyond this limit are not queried at all.
MAX_GSM_PER_QUERY = 4000

# read the GSM accessions back, one per line, dropping the line breaks and
# any blank lines so they do not end up inside the search term
with open(r'gsm_list.txt', 'r') as fp:
    GSM_IDs_from_file = [line.strip() for line in fp if line.strip()]

# every accession gets its own field tag: "GSM1[All Fields] OR GSM2[All Fields] ..."
GSM_filters = ' OR '.join(
    '{}[All Fields]'.format(gsm) for gsm in GSM_IDs_from_file[0:MAX_GSM_PER_QUERY]
)

Entrez.email = "zjiang@lji.org"
search_result = Entrez.esearch(db="biosample", retmax=10000, term=GSM_filters)
try:
    # `result` is rebound here: from now on it holds the BioSample hits
    result = Entrez.read(search_result)
finally:
    search_result.close()

### Filter query result by looping through XML

In [18]:
# name
# organism
# biomaterial
# disease
# organ
# celltype
# sex
# age
# extdb_name
# extdb_uuid

    
    
# 	GSM4027555 and more
#     <Attribute attribute_name="type">Tumor</Attribute> # no
# 		<Attribute attribute_name="relapse">No</Attribute> # no
# 		<Attribute attribute_name="gender" harmonized_name="sex" display_name="sex">male</Attribute> # added
# 		<Attribute attribute_name="age_days">2338</Attribute> or timepoint # added
# 		<Attribute attribute_name="Stage">4</Attribute> # no

# GSM6528200 and more
# <Attribute attribute_name="disease state" harmonized_name="disease" display_name="disease">HCC</Attribute> # added
# <Attribute attribute_name="tissue" harmonized_name="tissue" display_name="tissue">tumour</Attribute> # thinking

# GSM3930313 and more
# {'attribute_name': 'biological sample'}-->FHC should replace celltype # thinking

#### Dict of organs and their synonyms for classification

In [20]:
# Classification rules: {field: {label: [lower-case substrings to look for]}}.
# Order the classifications from general to specific: the code that consumes
# this dict walks it top to bottom and lets a later match replace an earlier
# one, so general labels are replaced with specific ones. A plain dict keeps
# insertion order, so no OrderedDict is needed.
# NOTE(review): the name `json` would shadow the standard-library module if
# that module were ever imported in this notebook.
json = {
    # QUESTION: what do I do with ARMS, which has RH3, 30, 4, 41 metastasized to different organs
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3713458/?
    "organ": {
        # ewing sarcoma are cells in the bones or connective tissue # plb might cause problem
        "bone marrow": ["bone", "marrow", "ewing sarcoma", "t-all", "cutll1", "myeloma", "plb", "tc71", "\">rh30", "pre-b"],
        "blood": ["vessel", "leukemia", "blood", "ramos b", "haematopoietic", "b-all", "\">sem", "jurkat"], # GSM6206927 bone or blood?
        # NOTE(review): the label "blodily fluid" looks like a typo for "bodily fluid";
        # left unchanged because it is written to the output table
        "blodily fluid": ["cutll1", "pleural", "effusion"],
        "epithelium": ["skin", "epi", "squamous", "breast", "melanoma", "erythematosus", "sle", "mda-mb-231", "keratinocyte", "melanocyte", "nasopharyngeal"], # sle might cause problem
        "endocrine gland": ["thyroid gland", "thymus", "thymic", "treg", "t-reg", "thyroid", "\">rh3\""], # GSM5680696 thyroid or epithelium
        "exocrine gland": ["lymph node", "lymphoma", "tonsil"],
        "liver": ["liver", "hcc"],
        "heart": ["heart", "cardi"],
        "embryo": ["embryo"],
        "connective tissue": ["soft tissue"],
        "musculature of body": ["muscula", "rhabdomyosarcoma", "\">sms", "a673"],
        "lung": ["lung", "\">rh4"],
        "esophagus": ["esophagus", "esophageal"], # GSM5680726 esophagus or epithelium
        # was "indestin", which can never occur in "intestine"/"intestinal"
        "intestine": ["intestin"],
        "kidney": ["kidney", "podocyte"],
        "colon": ["colon", "colorectal"],
        "spleen": ["spleen"],
        "stomach": ["stomach"],
        "placenta": ["placenta"],
        "pancreas": ["pancrea"],
        "bladder": ["bladder"],
        "reproductive organs": ["umbilical", "uterus", "penis", "ovar", "test", "endometrial", "hela"],
        "nerve": ["neuroblastoma", "nerve", "gist", "spinal cord tissue"],
        "brain": ["brain", "caudate", "chla-10"],
        "prostate": ["prostate"],
        # duplicate the classifications except the synonyms are now cell line names or cell type names?
    },
    "biomaterial": {
        "cell line": ["cellline", "cell line"],
        "primary": ["primary", "primi"],
    },
}

In [30]:
# setting the output filename (single clock read so date and time agree)
now = datetime.now()
today = now.date()
date_str = today.strftime("%Y_%m_%d")
time_str = now.strftime("%H_%M")
output = "GEO_Query_cell_type.{}_{}".format(date_str, time_str)
print("output file: ", output)

# columns of the cell-type table, in output order
CELL_TYPE_COLUMNS = ['name',
                     'organism',
                     'biomaterial',
                     'disease',
                     'organ',
                     'tissue',
                     'celltype',
                     'strain',
                     'sex',
                     'age',
                     'extdb_name',
                     'extdb_uuid',
                     'other']

# ElementTree paths of the two attributes that are consulted for every sample
SOURCE_NAME_XPATH = "./Attributes/Attribute[@attribute_name='source_name']"
TISSUE_XPATH = "./Attributes/Attribute[@attribute_name='tissue']"

# source_name keywords that make it usable as a fallback disease label
SOURCE_DISEASE_KEYWORDS = ("adenocarcinoma", "tumor", "tumour", "normal", "oma", "cancer")


def _classify_by_synonyms(sample_xml, rules):
    """Classify a sample by substring search in its SampleData XML text.

    `rules` maps a field name to {label: [lower-case synonyms]}. For each
    field the LAST label whose synonyms occur in the lower-cased text wins;
    a field with no match gets "". Returns a dict {field: label}.
    check GSM4956217 and GSM4447223
    """
    text = str(sample_xml)
    text_lower = text.lower()
    labels = {}
    for field, classes in rules.items():
        label = ""
        for classification, synonyms in classes.items():
            if field == "organ" and classification == "liver" and "breast" in text_lower:
                # sometimes hcc is actually breast, like hcc1599 and hcc19** cell lines;
                # skip the liver match so it cannot overwrite the breast label
                label = "breast"
                continue
            if any(syn in text_lower for syn in synonyms):
                label = classification
        labels[field] = label

    # cell-line edge cases: these take precedence over any synonym match
    if "\">RD" in text:
        labels["organ"] = "musculature of body"
    if "RH3" in text:
        labels["organ"] = "exocrine gland"
    return labels


def _extract_sample_fields(sample_xml, rules):
    """Turn one BioSample SampleData XML string into a row of the cell-type table.

    Returns a dict keyed by CELL_TYPE_COLUMNS. A sample without a
    `source_name` or `tissue` attribute is handled as if that attribute
    were empty.
    """
    gsm_data = dict.fromkeys(CELL_TYPE_COLUMNS, "")
    gsm_data.update(_classify_by_synonyms(sample_xml, rules))
    celltype_infos = []

    myroot = ET.fromstring(sample_xml)
    raw_text = str(sample_xml)

    # look the two special attributes up once instead of once per sub-element
    source_elem = myroot.find(SOURCE_NAME_XPATH)
    source_name = "" if source_elem is None else str(source_elem.text)
    source_lower = source_name.lower()
    tissue_elem = myroot.find(TISSUE_XPATH)
    tissue_name = "" if tissue_elem is None else str(tissue_elem.text)
    tissue_lower = tissue_name.lower()

    # sample-wide conditions; they are still applied inside the loop below so
    # that their precedence relative to the per-attribute rules is unchanged
    is_alveolar_rms = "RH4" in source_name or "RH3" in source_name
    is_embryonal_rms = "\">SMS" in raw_text
    is_all = "\">SEM" in raw_text
    tissue_has_oma = "oma" in tissue_lower
    tissue_has_tumor = "tumor" in tissue_lower
    source_hints_disease = any(word in source_lower for word in SOURCE_DISEASE_KEYWORDS)
    source_has_day = "day" in source_lower

    for elem in myroot:
        for subelem in elem:
            # rules match on the textual form of the element's attribute dict
            attrib_repr = str(subelem.attrib)

            if "db': 'GEO" in attrib_repr:
                gsm_data["name"] = subelem.text
            if "taxonomy_name" in attrib_repr:
                gsm_data["organism"] = subelem.attrib["taxonomy_name"]

            # --- disease: later rules override earlier ones ---
            if 'disease' in attrib_repr or 'cancer type' in attrib_repr:
                gsm_data["disease"] = subelem.text # many organ info are from here
            if is_alveolar_rms:
                gsm_data["disease"] = "Alveolar rhabdomyosarcoma"
            if is_embryonal_rms:
                gsm_data["disease"] = "Embryonal rhabdomyosarcoma"
            if is_all:
                gsm_data["disease"] = "Acute lymphoblastic leukaemia"
            if tissue_has_oma:
                gsm_data["disease"] = tissue_name
            if source_hints_disease and gsm_data["disease"] == "":
                # source_name is only a fallback when nothing else set a disease
                gsm_data["disease"] = source_name
            if tissue_has_tumor:
                gsm_data["disease"] = tissue_name

            if 'organ' in attrib_repr: # no organ attributes at all
                gsm_data["organ"] = subelem.text
            if 'tissue' in attrib_repr or 'brain region' in attrib_repr:
                gsm_data["tissue"] = subelem.text  # most organ info are from here # can have disease info

            # source_name could have cell line info
            if ('cell_line' in attrib_repr or
                    'biological sample' in attrib_repr or
                    'parental cell line' in attrib_repr or
                    'neuronal subtype' in attrib_repr or
                    "'cell" in attrib_repr):
                # tolerate a missing attribute name or empty text instead of raising
                celltype_infos.append(
                    str(subelem.attrib.get('attribute_name', '')) + "-->" + (subelem.text or "")
                )
            if 'cell_line' in attrib_repr:
                gsm_data["biomaterial"] = "cell line"

            if 'strain' in attrib_repr or 'mouse model' in attrib_repr:
                gsm_data["strain"] = subelem.text
            if 'gender' in attrib_repr or "'sex'" in attrib_repr:
                gsm_data["sex"] = subelem.text

            # --- age ---
            if source_has_day:
                gsm_data["age"] = source_name
            if "'age'" in attrib_repr or 'age_day' in attrib_repr:
                gsm_data["age"] = subelem.text
            if ("'time" in attrib_repr or
                    "'differentiation" in attrib_repr or
                    "differentiation'" in attrib_repr or
                    'developmental stage' in attrib_repr or
                    ('erythropoiesis' in attrib_repr and 'no' not in attrib_repr)):
                if gsm_data["age"] == "":
                    gsm_data["age"] = subelem.text

    gsm_data["celltype"] = '|'.join(celltype_infos)
    gsm_data["other"] = "{source_name}" + "-->" + source_name
    return gsm_data


# 50 s per 100 GSMs
# Rows are collected in a list and turned into a dataframe only when the table
# is saved: DataFrame.append no longer exists in pandas 2.x and re-copied the
# whole table for every sample.
gsm_records = []
cell_type_df = pd.DataFrame(columns=CELL_TYPE_COLUMNS)

num_of_gsm_done = 0

# Iterate through the BioSample hits
for Id in result["IdList"]:
    # Get and read the SampleData XML of each GSM on BioSample
    handle = Entrez.esummary(db="biosample", id=Id, retmode="xml")
    try:
        entry = Entrez.read(handle)
    finally:
        handle.close()
    data = entry["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]

    gsm_records.append(_extract_sample_fields(data, json))

    # indicate how many GSM are processed and save the table as the loop runs
    num_of_gsm_done += 1
    if (num_of_gsm_done % 100 == 0):
        if (num_of_gsm_done % 1000 == 0):
            print("finished GSM number", num_of_gsm_done, "saving the intermediate df")
            cell_type_df = pd.DataFrame(gsm_records, columns=CELL_TYPE_COLUMNS)
            cell_type_df.to_excel(output + ".xlsx", index=False)
        else:
            print("finished GSM number", num_of_gsm_done)

cell_type_df = pd.DataFrame(gsm_records, columns=CELL_TYPE_COLUMNS)
cell_type_df.to_excel(output + ".xlsx", index=False)

output file:  GEO_Query_cell_type.2022_12_17_19_20
finished GSM number 100
finished GSM number 200
finished GSM number 300
finished GSM number 400
finished GSM number 500
finished GSM number 600
finished GSM number 700
finished GSM number 800
finished GSM number 900
finished GSM number 1000 saving the intermediate df
finished GSM number 1100
finished GSM number 1200
finished GSM number 1300
finished GSM number 1400
finished GSM number 1500
finished GSM number 1600
finished GSM number 1700
finished GSM number 1800
finished GSM number 1900
finished GSM number 2000 saving the intermediate df
finished GSM number 2100
finished GSM number 2200
finished GSM number 2300
finished GSM number 2400
finished GSM number 2500
finished GSM number 2600
finished GSM number 2700
finished GSM number 2800
finished GSM number 2900
finished GSM number 3000 saving the intermediate df
finished GSM number 3100
finished GSM number 3200
finished GSM number 3300
finished GSM number 3400
finished GSM number 3500
fi

Sample ID: This is the unique identifier for each sample in the database. You can search for samples by their ID using this field.

Organism: This field allows you to search for samples from a specific organism, such as human, mouse, or yeast.

Tissue: This field allows you to search for samples based on their tissue type, such as liver, brain, or heart.

Disease state: This field allows you to search for samples based on their disease state, such as cancer, diabetes, or autoimmune disease.

Cell type: This field allows you to search for samples based on their cell type, such as fibroblasts, macrophages, or stem cells.

Developmental stage: This field allows you to search for samples based on their developmental stage, such as embryonic, fetal, or adult.

Experimental design: This field allows you to search for samples based on the type of experiment they were used in, such as microarray, RNA-seq, or ChIP-seq.

### Validate a GSM

In [27]:
Entrez.email = "your_email@example.com"  # Provide your email address

# Search BioSample for the GSM accession to inspect
handle = Entrez.esearch(db="biosample", term="GSM4053404")
records = Entrez.read(handle)

# For every hit: print the raw SampleData XML, then one line per
# second-level element in the form "attributes --> value"
for biosample_id in records["IdList"]:
    handle = Entrez.esummary(db="biosample", id=biosample_id)
    record = Entrez.read(handle)
    data = record["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]
    print(str(data), "\n") # https://codebeautify.org/xmlviewer#
    myroot = ET.fromstring(data)
    for subelem in (child for elem in myroot for child in elem):
        # the organism element carries its value in an attribute, not in its text
        if "taxonomy_name" in str(subelem.attrib):
            shown = subelem.attrib["taxonomy_name"]
        else:
            shown = subelem.text
        print(subelem.attrib, "-->", shown)

<BioSample access="public" publication_date="2021-05-23T00:00:00.000" last_update="2021-05-23T01:18:15.927" submission_date="2019-08-29T13:51:10.350" id="12658269" accession="SAMN12658269">   <Ids>     <Id db="BioSample" is_primary="1">SAMN12658269</Id>     <Id db="SRA">SRS5325210</Id>     <Id db="GEO">GSM4053404</Id>   </Ids>   <Description>     <Title>COAD-T014 RNA-seq</Title>     <Organism taxonomy_id="9606" taxonomy_name="Homo sapiens">       <OrganismName>Homo sapiens</OrganismName>     </Organism>   </Description>   <Owner>     <Name>Quantitative and Computational Biology, Baylor College of Medicine</Name>     <Contacts>       <Contact email="aayushraman09@gmail.com">         <Name>           <First>Ayush</First>           <Last>Raman</Last>         </Name>       </Contact>     </Contacts>   </Owner>   <Models>     <Model>Generic</Model>   </Models>   <Package display_name="Generic">Generic.1.0</Package>   <Attributes>     <Attribute attribute_name="source_name" harmonized_name="