This notebook merges gene lists and uses the official HGNC (HUGO Gene Nomenclature Committee) REST API to validate human gene symbols and to detect previous (deprecated) symbols or aliases.

Endpoints used:

/fetch/symbol/{symbol}: verification of the existence of an approved HGNC gene symbol.

/search/{query}: detection of previous (deprecated) symbols and aliases.

The queries require an internet connection, and the results depend on the current state of the HGNC database.

If you only need to merge gene lists (without HGNC validation), refer to the second and third cells.

In [2]:
import requests
# Base URL for exact look-ups of a gene symbol; the symbol is appended directly
# after the trailing slash.
HGNC_FETCH_SYMBOL_ENDPOINT = "https://rest.genenames.org/fetch/symbol/"
# Base URL for free-text searches, used to find the current symbol behind a
# previous (deprecated) symbol or an alias; the query is appended directly.
HGNC_SEARCH_ENDPOINT = "https://rest.genenames.org/search/"

In [15]:
#Functions

def merge_gene_lists(list_a: str, list_b: str, delimiter=", "):
    """Merge two delimited gene lists into one sorted list of unique symbols.

    Each token is trimmed of surrounding whitespace and punctuation and
    upper-cased; tokens that end up empty are dropped.

    Args:
        list_a: First gene list as a single delimited string.
        list_b: Second gene list as a single delimited string.
        delimiter: Separator between genes. Whitespace around the separator
            is treated as optional, so the default ``", "`` also splits
            ``"A,B"`` and ``"A ,B"``.

    Returns:
        Alphabetically sorted list of the unique, upper-cased symbols found
        in either input.
    """
    # Whitespace is stripped together with the punctuation; stripping them in
    # two separate passes left a stray space behind for inputs like "TP53 .".
    strip_chars = ' \t\n\r\f\v.,;:!?-_()[]{}"\'/\\'

    def clean_gene(gen):
        return gen.strip().strip(strip_chars).upper()

    # Split on the bare separator so a missing space after the comma does not
    # glue two genes together; a whitespace-only delimiter is used as given.
    separator = delimiter.strip() or delimiter

    def parse(raw):
        # Clean every token once and keep only the non-empty results.
        return {gene for token in raw.split(separator) if (gene := clean_gene(token))}

    return sorted(parse(list_a) | parse(list_b))


def _hgnc_url(endpoint, term):
    """Append *term* to *endpoint*, percent-encoding characters unsafe in a URL path."""
    from urllib.parse import quote

    return endpoint + quote(term, safe="")


def _hgnc_response(url):
    """Return the ``response`` object of an HGNC REST reply, or None on any failure."""
    try:
        reply = requests.get(url, headers={"Accept": "application/json"}, timeout=10)
        if reply.status_code != 200:
            return None
        payload = reply.json()["response"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network problems and malformed replies are treated as "no answer";
        # KeyboardInterrupt is deliberately NOT caught so the run can be stopped.
        return None
    return payload if isinstance(payload, dict) else None


def _matches_former_name(symbol, doc):
    """Tell whether *symbol* appears among the previous or alias symbols of *doc*."""
    wanted = symbol.upper()
    for field in ("prev_symbol", "alias_symbol"):
        names = doc.get(field) or []
        if isinstance(names, str):
            names = [names]
        # The query was upper-cased, so compare case-insensitively; the stored
        # names may be mixed case.
        if any(isinstance(name, str) and name.upper() == wanted for name in names):
            return True
    return False


def _resolve_current_symbol(symbol):
    """Return the symbol of the first search hit if it lists *symbol* as a former name, else None."""
    found = _hgnc_response(_hgnc_url(HGNC_SEARCH_ENDPOINT, symbol))
    docs = found.get("docs") if found else None
    if not isinstance(docs, list) or not docs or not isinstance(docs[0], dict):
        return None

    candidate = docs[0].get("symbol")  # only the first search result is considered
    if not candidate:
        return None

    # The search hit is re-fetched to obtain its previous/alias symbol lists.
    detail = _hgnc_response(_hgnc_url(HGNC_FETCH_SYMBOL_ENDPOINT, candidate))
    detail_docs = detail.get("docs") if detail else None
    if (
        isinstance(detail_docs, list)
        and detail_docs
        and isinstance(detail_docs[0], dict)
        and _matches_former_name(symbol, detail_docs[0])
    ):
        return candidate
    return None


def validate_gene_list(gene_symbols):
    """Validate gene symbols against the HGNC REST API.

    Each non-blank symbol is stripped, upper-cased and freed of inner spaces,
    then looked up with the fetch endpoint. If no record is found, the search
    endpoint is queried and the first hit is accepted only when it lists the
    symbol among its previous or alias symbols.

    Args:
        gene_symbols: Iterable of gene symbol strings; empty, blank or None
            entries are skipped.

    Returns:
        A tuple ``(approved_genes, renamed_genes, not_found_genes)``:
        ``approved_genes`` holds the accepted symbols without duplicates, in
        first-seen order (renamed inputs contribute their current symbol);
        ``renamed_genes`` holds ``[submitted, current]`` pairs; and
        ``not_found_genes`` holds the submitted symbols that could not be
        resolved, including those whose requests failed.
    """
    approved_genes = []
    renamed_genes = []
    not_found_genes = []
    seen_approved = set()

    for original_symbol in gene_symbols:
        if not original_symbol or not original_symbol.strip():
            continue

        submitted = original_symbol.strip()
        symbol = submitted.upper().replace(" ", "")

        # 1. Is it already a symbol known to the fetch endpoint?
        exact = _hgnc_response(_hgnc_url(HGNC_FETCH_SYMBOL_ENDPOINT, symbol))
        hits = exact.get("numFound", 0) if exact else 0
        if isinstance(hits, int) and hits > 0:
            current = symbol
        else:
            # 2. Otherwise look for a current symbol that used to carry this name.
            current = _resolve_current_symbol(symbol)
            if current is None:
                not_found_genes.append(submitted)
                continue
            renamed_genes.append([submitted, current])

        # Two inputs can resolve to the same current symbol (e.g. an old and a
        # new name of one gene); keep the approved list free of duplicates.
        if current not in seen_approved:
            seen_approved.add(current)
            approved_genes.append(current)

    return approved_genes, renamed_genes, not_found_genes

In [8]:
# Merge-only workflow: combine two pasted gene lists without validating the
# symbols against HGNC.
genes1 = input("Please write your first list: ")
genes2 = input("Now, please write your second list: ")
genesALL_ = merge_gene_lists(genes1, genes2)
# Single quotes are used inside the f-string: reusing the enclosing double
# quotes is a SyntaxError on Python versions before 3.12.
print(f"Your new list of {len(genesALL_)} genes is: {', '.join(genesALL_)}")

Añada su primera lista: POP1, POU1F1, PPP1CB, PPP3CA, PRKAR1A, PRMT7, PROP1, PTH1R, PTPN11, PUF60, RAD21, RAF1, RALA, RASA2, RBBP8, RIT1, RNU4ATAC, ROR2, RPS6KA3, RRAS, RTTN, SGMS2, SHOC2, SHOX, SMARCA2, SMARCA4, SMARCAL1, SMARCB1, SMARCE1, SMC1A, SMC3, SOS1, SOS2, SOX11, SOX2, SOX3, SOX9, SPRED1, SRCAP, STAT5B
Añada su segunda lista: B3GAT3, BCSTL, BLM, BMP2, BRAF, CBL, CCDC8, CDC45, CDC6, CDKN1C, CDT1, CENPJ, CEP152, CEP63, COL27A1, CREBBP, CUL7, DHCR7, DONSON, EP300
Su nueva lista de 60 genes es: B3GAT3, BCSTL, BLM, BMP2, BRAF, CBL, CCDC8, CDC45, CDC6, CDKN1C, CDT1, CENPJ, CEP152, CEP63, COL27A1, CREBBP, CUL7, DHCR7, DONSON, EP300, POP1, POU1F1, PPP1CB, PPP3CA, PRKAR1A, PRMT7, PROP1, PTH1R, PTPN11, PUF60, RAD21, RAF1, RALA, RASA2, RBBP8, RIT1, RNU4ATAC, ROR2, RPS6KA3, RRAS, RTTN, SGMS2, SHOC2, SHOX, SMARCA2, SMARCA4, SMARCAL1, SMARCB1, SMARCE1, SMC1A, SMC3, SOS1, SOS2, SOX11, SOX2, SOX3, SOX9, SPRED1, SRCAP, STAT5B


In [4]:
# Example input: two comma-separated gene lists that can be merged and validated
# without typing anything in.
# NOTE(review): "BCSTL" in genes_1 is reported as not found by the validation
# output recorded further down — possibly a typo, confirm the intended symbol.
genes_1= "ACTB, ACTG1, AMMECR1, ARCN1, ATR, B3GAT3, BCSTL, BLM, BMP2, BRAF, CBL, CCDC8, CDC45, CDC6, CDKN1C, CDT1, CENPJ, CEP152, CEP63, COL27A1, CREBBP, CUL7, DHCR7, DONSON, EP300, FGD1, FGFR3, FN1, GH1, GHR, GHRHR, GHSR, GLI2, GNAS, HDAC8, HESX1, HMGA2, HRAS, IDUA, IGF1, IGF1R, IGF2, IGFALS, INSR, IRS1, KRAS, LARP7, LFNG, LHX3, LHX4, LZTR1, MAP2K1, MAP2K2, NIPBL, NOTCH2, NRAS, OBSL1, ORC1, ORC4, ORC6, OSGEP, OTX2, PCNT, PISD, PITX2, PLAG1, POC1A, POP1, POU1F1, PPP3CA, PRMT7, PROP1, PTPN11, PUF60, RAD21, RAF1, RALA, RASA2, RBBP8, RIT1, RNU4ATAC, RRAS, RTTN, SGMS2, SHOC2, SHOX, SMARCA2, SMARCE1, SMC1A, SMC3, SOS1, SOX11, SOX2, SOX3, SRCAP, STAT5B, TALDO1, TBX19, TBX2, TBX3, TOP3A, TRIM37, TRMT10A, XRCC4"
genes_2= "ACAN, ACTB, ACTG1, ALMS1, AMMECR1, ANKRD11, ARCN1, ARID1A, ARID1B, ATR, ATRIP, B3GAT3, BLM, BMP2, BRAF, BRF1, BTK, CBL, CCDC8, CDC45, CDC6, CDT1, CENPJ, CEP152, CEP63, COL10A1, COL11A1, COL11A2, COL1A1, COL27A1, COL2A1, COL9A1, COL9A2, COL9A3, COMP, CREBBP, CRIPT, CUL7, DHCR7, DNA2, DONSON, DVL1, EP300, ERCC6, ERCC8, EVC, EVC2, FANCA, FANCC, FANCG, FBN1, FGD1, FGFR3, FN1, GH1, GHR, GHRHR, GHSR, GLI2, GLI3, GNAS, HDAC8, HESX1, HRAS, HSPG2, IDUA, IGF1, IGF1R, IGF2, IGFALS, IHH, INSR, KDM6A, KMT2D, KRAS, LARP7, LFNG, LHX3, LHX4, LIG4, LMNA, LZTR1, MAP2K1, MAP2K2, MATN3, MRAS, NBN, NF1, NIPBL, NOTCH2, NPPC, NRAS, NSMCE2, OBSL1, ORC1, ORC4, ORC6, OSGEP, OTX2, PCNT, PDE4D, PIK3R1, PISD, PLK4, POC1A, POP1, POU1F1, PPP1CB, PPP3CA, PRKAR1A, PRMT7, PROP1, PTH1R, PTPN11, PUF60, RAD21, RAF1, RALA, RASA2, RBBP8, RIT1, RNU4ATAC, ROR2, RPS6KA3, RRAS, RTTN, SGMS2, SHOC2, SHOX, SMARCA2, SMARCA4, SMARCAL1, SMARCB1, SMARCE1, SMC1A, SMC3, SOS1, SOS2, SOX11, SOX2, SOX3, SOX9, SPRED1, SRCAP, STAT5B, TALDO1, TBX2, TBX3, TOP3A, TRIM37, TRMT10A, WNT5A, XRCC4"

In [21]:
# Optional: read your own two gene lists from the keyboard. Running this cell
# replaces whatever genes_1 and genes_2 held before.
genes_1, genes_2 = map(
    input,
    ("Please enter your first list: ", "Now, please enter your second list: "),
)

Please enter your first list: ACTB, ACTG1, AMMECR1, ARCN1, ATR, B3GAT3, BCSTL, BLM, BMP2, BRAF, CBL, CCDC8, CDC45, CDC6, CDKN1C, CDT1, CENPJ, CEP152, CEP63, COL27A1, CREBBP, CUL7, DHCR7, DONS
Now, please enter your second list: AMMECR1, ANKRD11, ARCN1, ARID1A, ARID1B, ATR, ATRIP, B3GAT3, BLM, BMP2, BRAF, BRF1, BTK, CBL, CCDC8, CDC45, CDC6, CDT1, CENPJ, CEP152, CEP63, COL10A1, COL11A1}


In [22]:
# Merge both inputs into one list of unique symbols, then validate it against HGNC.
genes_unidos = merge_gene_lists(genes_1, genes_2)
print(f"Total number of unique gene symbols: {len(genes_unidos)}. Validation may take a few seconds to complete.")
# validate_gene_list returns (approved symbols, [old, new] rename pairs, unresolved symbols).
genes_aprobados, genes_renombrados, genes_no_encontrados = validate_gene_list(genes_unidos)

print("Your new list of ", len(genes_aprobados), "genes is:", ", ".join(genes_aprobados))

if genes_renombrados:
    print("\nIMPORTANT: The following gene symbols were updated according to HGNC:")
    for viejo, nuevo in genes_renombrados:
        print(f"{viejo} -> {nuevo}")

if genes_no_encontrados:
    # Single quotes are used inside the f-string: reusing the enclosing double
    # quotes is a SyntaxError on Python versions before 3.12.
    print(f"The following gene symbols were not found in the HGNC database or are invalid: {', '.join(genes_no_encontrados)}")

Total number of unique gene symbols: 32. Validation may take a few seconds to complete.
Your new list of  30 genes is: ACTB, ACTG1, AMMECR1, ANKRD11, ARCN1, ARID1A, ARID1B, ATR, ATRIP, B3GAT3, BLM, BMP2, BRAF, BRF1, BTK, CBL, CCDC8, CDC45, CDC6, CDKN1C, CDT1, CPAP, CEP152, CEP63, COL10A1, COL11A1, COL27A1, CREBBP, CUL7, DHCR7

IMPORTANT: The following gene symbols were updated according to HGNC:
CENPJ -> CPAP
The following gene symbols were not found in the HGNC database or are invalid: BCSTL, DONS
