In [256]:
import pandas as pd
import numpy as np
import time
import sqlite3
from sqlalchemy import create_engine

In [263]:
# Human Protein Atlas search-download endpoint; the query string asks for an
# uncompressed TSV export restricted to the columns listed in `columns=`.
url_csv = 'https://www.proteinatlas.org/api/search_download.php?search=&columns=g,gs,eg,pc,di&compress=no&format=tsv'

# Pull the tab-separated export straight from the URL into a DataFrame.
df_hpa = pd.read_csv(filepath_or_buffer=url_csv, delimiter='\t')

In [None]:
# Reshape the raw HPA download into the layout written out further down:
# HGNC_ID first, then the HPA columns, then provenance columns.

# Add a placeholder column for HGNC_ID.
df_hpa["HGNC_ID"] = ''

# Rename the Gene column to Gene_symbol.
df_hpa = df_hpa.rename(columns={'Gene': 'Gene_symbol'})

# Rearrange the columns. reindex keeps only the columns listed here, in this
# order; a listed name that is absent from the frame becomes an all-NaN column.
df_hpa = df_hpa.reindex(
    columns=['HGNC_ID', 'Gene_symbol', 'Gene synonym', 'Ensembl', 'Protein class', 'Disease involvement']
)

# Add new columns for Source, Source_Date, and Download_Date.
# The date is computed once (local time, YYYYMMDD as an int) so the two date
# columns cannot disagree if the cell happens to run across midnight.
download_stamp = int(time.strftime("%Y%m%d"))
df_hpa["Source"] = "HPA"
df_hpa["Source_Date"] = download_stamp
df_hpa["Download_Date"] = download_stamp

In [269]:
# Fill in the HGNC_ID column from the HGNC reference tables: match on the
# current gene symbol first, then fall back to previous symbols for any gene
# that is still unmatched.
df_gene_info = pd.read_csv(r"data files/HGNC_gene_info.tsv", sep='\t')
df_prev_sym = pd.read_csv(r"data files/HGNC_previous_symbols.tsv", sep='\t')

# Build symbol -> HGNC_ID lookups with unique keys. Mapping through a lookup
# (rather than assigning the result of a left merge back by position) keeps
# exactly one value per df_hpa row: a merge against a table that repeats a key
# returns extra rows, and index alignment then puts IDs on the wrong genes.
# When a symbol is listed more than once, the first listed HGNC_ID is used.
current_symbol_to_hgnc = (
    df_gene_info.drop_duplicates(subset='Gene_symbol')
    .set_index('Gene_symbol')['HGNC_ID']
)
previous_symbol_to_hgnc = (
    df_prev_sym.drop_duplicates(subset='Previous_symbol')
    .set_index('Previous_symbol')['HGNC_ID']
)

# Pass 1: current symbols. Genes without a match get NaN.
df_hpa["HGNC_ID"] = df_hpa["Gene_symbol"].map(current_symbol_to_hgnc)

# Pass 2: previous symbols, applied only to rows that pass 1 left empty.
hgnc_missing = df_hpa["HGNC_ID"].isnull()
df_hpa.loc[hgnc_missing, "HGNC_ID"] = df_hpa.loc[hgnc_missing, "Gene_symbol"].map(previous_symbol_to_hgnc)

# Write the file (the DataFrame index is written as the first column, which is
# the pandas default).
df_hpa.to_csv(r"data files/HPA_gene_info.tsv", sep='\t')

In [270]:
# Store the HPA gene table in the local SQLite database, replacing any
# existing table of the same name.
engine = create_engine('sqlite:///TargetLink.db', echo=False)
sqlite_table = "HPA_gene_info"

# The context manager closes the connection even if to_sql raises, so a failed
# write does not leave a connection to the database file open.
with engine.connect() as sqlite_connection:
    df_hpa.to_sql(sqlite_table, sqlite_connection, index=False, if_exists='replace')