This script downloads the latest HGNC gene data, saves it as TSV files, and imports it into SQLite database tables.

In [100]:
import pandas as pd
import numpy as np
import time
import sqlite3
from sqlalchemy import create_engine

In [106]:
# HGNC custom-download URL, written one query parameter per line so the
# requested columns and options are easy to scan. Adjacent string literals
# are concatenated by Python into a single URL.
url_csv = (
    'https://www.genenames.org/cgi-bin/download/custom'
    '?col=gd_hgnc_id'
    '&col=gd_app_sym'
    '&col=gd_app_name'
    '&col=gd_prev_sym'
    '&col=gd_aliases'
    '&col=gd_pub_chrom_map'
    '&col=gd_pub_acc_ids'
    '&col=md_ensembl_id'
    '&col=md_eg_id'
    '&col=md_prot_id'
    '&col=md_mim_id'
    '&status=Approved'
    '&hgnc_dbtag=on'
    '&order_by=gd_app_sym_sort'
    '&format=text'
    '&submit=submit'
)
# The response is parsed as tab-separated text straight from the URL.
df_hgnc = pd.read_csv(url_csv, sep='\t')

In [107]:

# Rename the HGNC download headers to identifier-style column names.
# Headers not listed in the mapping keep the name they were downloaded with.
df_hgnc.rename(
    columns={
        'HGNC ID': 'HGNC_ID',
        'Approved symbol': 'Gene_symbol',
        'Approved name': 'Approved_name',
        'Previous symbols': 'Previous_symbols',
        'Alias symbols': 'Alias_symbols',
        'Accession numbers': 'Accession_numbers',
        'Ensembl ID(supplied by Ensembl)': 'Ensembl_ID',
        'NCBI Gene ID(supplied by NCBI)': 'NCBI_ID',
        'UniProt ID(supplied by UniProt)': 'UniProt_ID',
        'OMIM ID(supplied by OMIM)': 'OMIM_ID',
    },
    inplace=True,
)

# Add new columns for Source, Source_Date, and Download_Date.
# The date (YYYYMMDD as an integer) is computed once so both date columns
# are guaranteed to agree even if the run crosses midnight.
download_date = int(time.strftime("%Y%m%d"))
df_hgnc["Source"] = "HGNC"
df_hgnc["Source_Date"] = download_date
df_hgnc["Download_Date"] = download_date

# Convert to the proper data types. "Int32" is pandas' nullable integer
# dtype, so any missing NCBI_ID values stay missing instead of forcing the
# column to float.
df_hgnc = df_hgnc.astype({"NCBI_ID": "Int32", "Source_Date": "Int32", "Download_Date": "Int32"})

# Write the file. index=False keeps the meaningless row index out of the TSV,
# matching the database table and the previous-symbols export.
df_hgnc.to_csv(r"data files/HGNC_gene_info.tsv", sep='\t', index=False)

In [108]:
# Store the gene table in the database.
# NOTE(review): this previously wrote to 'save_pandas.db' while the
# previous-symbols table goes to 'TargetLink.db'; both tables now target
# the same database file -- confirm this is the intended destination.
engine = create_engine('sqlite:///TargetLink.db', echo=False)
sqlite_table = "HGNC_gene_info"
# The with-block closes the connection even if to_sql raises.
with engine.connect() as sqlite_connection:
    # if_exists='replace' drops any existing table of this name first.
    df_hgnc.to_sql(sqlite_table, sqlite_connection, index=False, if_exists='replace')

In [105]:
# Create a file and database table for previous symbols: one row per
# (gene, previous symbol) pair, split out of the comma-separated
# "Previous_symbols" column.
data = []
delimiter = ','
# The ID column is read by its renamed label "HGNC_ID"; the pre-rename
# label "HGNC ID" no longer exists on df_hgnc and would raise KeyError.
for hgnc_id, symbol, previous_symbols in zip(
        df_hgnc["HGNC_ID"], df_hgnc["Gene_symbol"], df_hgnc["Previous_symbols"]):
    # Skip genes without previous symbols, and rows whose ID does not
    # start with 'H'.
    if pd.isna(previous_symbols) or not str(hgnc_id).startswith('H'):
        continue
    for token in str(previous_symbols).split(delimiter):
        # Strip before testing so padded '-' placeholders and empty
        # fragments left by stray commas are dropped too.
        token = token.strip()
        if token and token != '-':
            data.append({"HGNC_ID": hgnc_id, "Gene_symbol": symbol, "Previous_symbol": token})

# Name the columns explicitly so the frame keeps its schema even when no
# previous symbols were found.
df_previous_sym = pd.DataFrame(data, columns=["HGNC_ID", "Gene_symbol", "Previous_symbol"])
df_previous_sym.to_csv(r"data files/HGNC_previous_symbols.tsv", sep = '\t', index=False)

# Store it in the database
engine = create_engine('sqlite:///TargetLink.db', echo=False)
sqlite_table = "HGNC_previous_symbols"
# The with-block closes the connection even if to_sql raises.
with engine.connect() as sqlite_connection:
    df_previous_sym.to_sql(sqlite_table, sqlite_connection, index=False, if_exists='replace')