In [1]:
import pandas as pd
import numpy as np
import time
from sqlalchemy import create_engine

In [9]:
def get_df(url):
    """Load a tab-separated table and prepare it for HGNC annotation.

    Parameters
    ----------
    url
        Location of the TSV data; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with a ``Gene`` column (if present) renamed to
        ``Gene_symbol`` and an empty-string ``HGNC_ID`` column placed first.
    """
    table = pd.read_csv(url, sep='\t').rename(columns={'Gene': 'Gene_symbol'})
    # Placeholder for the identifier; position 0 makes it the leading column.
    table.insert(0, "HGNC_ID", "")
    return table

def _hgnc_ids_for(symbols, mapping, key_column):
    """Return the HGNC id for each symbol, row-aligned with ``symbols``.

    Parameters
    ----------
    symbols : pandas.Series
        Gene symbols to resolve.
    mapping : pandas.DataFrame
        Lookup table containing ``key_column`` and ``HGNC_ID``.
    key_column : str
        Column of ``mapping`` that is matched against ``symbols``.

    Returns
    -------
    pandas.Series
        Same index and length as ``symbols``; NaN where no match exists.
        When a key occurs more than once in ``mapping`` the first row wins.
    """
    lookup = (
        mapping.dropna(subset=[key_column, "HGNC_ID"])
        # Series.map needs a unique index; duplicates would also be ambiguous.
        .drop_duplicates(subset=key_column, keep="first")
        .set_index(key_column)["HGNC_ID"]
    )
    return symbols.map(lookup)


def finalize_file(df, out_filename, db_table_name):
    """Annotate ``df`` with provenance and HGNC ids, then persist it.

    The frame is modified in place: ``Source``, ``Source_Date`` and
    ``Download_Date`` columns are set and ``HGNC_ID`` is (re)computed from
    ``Gene_symbol``. The result is written to ``out_filename`` as TSV and to
    the ``db_table_name`` table of ``TargetLink.db`` (replacing the table if
    it already exists).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to finalize; must contain a ``Gene_symbol`` column.
    out_filename : str
        Path of the TSV file to write.
    db_table_name : str
        Name of the SQLite table to (re)create.

    Notes
    -----
    Reads ``data files/HGNC_gene_info.tsv`` (columns ``Gene_symbol``,
    ``HGNC_ID``) and ``data files/HGNC_previous_symbols.tsv`` (columns
    ``Previous_symbol``, ``HGNC_ID``) relative to the working directory.
    """
    # Compute the stamp once so both date columns always agree.
    today = int(time.strftime("%Y%m%d"))
    df["Source"] = "HPA"
    df["Source_Date"] = today
    df["Download_Date"] = today

    # Resolve HGNC ids: current symbols first, previous symbols as fallback.
    df_gene_info = pd.read_csv(r"data files/HGNC_gene_info.tsv", sep='\t')
    df_prev_sym = pd.read_csv(r"data files/HGNC_previous_symbols.tsv", sep='\t')
    # A lookup via Series.map stays row-aligned with df. Assigning the result
    # of a left merge does not: duplicate keys in the HGNC tables add rows and
    # shift every following id onto the wrong gene.
    current_ids = _hgnc_ids_for(df["Gene_symbol"], df_gene_info, "Gene_symbol")
    previous_ids = _hgnc_ids_for(df["Gene_symbol"], df_prev_sym, "Previous_symbol")
    df["HGNC_ID"] = current_ids.fillna(previous_ids)

    # Write the file (the DataFrame index is written as the first column,
    # unlike the database table below, which omits it).
    df.to_csv(out_filename, sep='\t')

    # Store it in the database; always release the connection and the engine,
    # even when the write fails.
    engine = create_engine('sqlite:///TargetLink.db', echo=False)
    try:
        with engine.connect() as sqlite_connection:
            df.to_sql(db_table_name, sqlite_connection, index=False, if_exists='replace')
    finally:
        engine.dispose()

# Gene info table: download, then keep exactly these columns in this order
# (reindex fills any column missing from the download with NaN).
df = get_df(
    'https://www.proteinatlas.org/api/search_download.php?search=&columns=g,gs,eg,pc,di&compress=no&format=tsv'
).reindex(
    columns=[
        'HGNC_ID',
        'Gene_symbol',
        'Gene synonym',
        'Ensembl',
        'Protein class',
        'Disease involvement',
    ]
)
finalize_file(df, r"data files/HPA_gene_info.tsv", "HPA_gene_info")

# Transmembrane table: result of the "transmembrane" search, used as downloaded.
df = get_df(
    "https://www.proteinatlas.org/api/search_download.php?search=transmembrane&columns=g&compress=no&format=tsv"
)
finalize_file(df, r"data files/HPA_transmembrane.tsv", "HPA_transmembrane")

In [20]:
# Get the biological process and molecular function
def _split_terms(frame, source_column, target_column):
    """Build one record per (gene, term) from a comma-separated column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a ``Gene_symbol`` column and ``source_column``.
    source_column : str
        Column holding comma-separated terms; missing cells are skipped.
    target_column : str
        Key under which each individual term is stored in the records.

    Returns
    -------
    list of dict
        Records with keys ``HGNC_ID`` (empty string), ``Gene_symbol`` and
        ``target_column``. Terms are stripped of surrounding whitespace and
        terms that are empty after stripping are dropped.
    """
    records = []
    for symbol, cell in zip(frame["Gene_symbol"], frame[source_column]):
        if pd.isna(cell):
            continue
        for token in str(cell).split(','):
            # Strip before the emptiness check so whitespace-only tokens
            # (e.g. from "a, ") do not become empty-string terms.
            term = token.strip()
            if term:
                records.append({"HGNC_ID": '', "Gene_symbol": symbol, target_column: term})
    return records


df = get_df("https://www.proteinatlas.org/api/search_download.php?search=&columns=g,upbp,up_mf&compress=no&format=tsv")
bp_data = _split_terms(df, "Biological process", "Biological_process")
mf_data = _split_terms(df, "Molecular function", "Molecular_function")

# Explicit columns keep the expected schema even when no terms were found.
df = pd.DataFrame(bp_data, columns=["HGNC_ID", "Gene_symbol", "Biological_process"])
finalize_file(df, r"data files/HPA_biological_process.tsv", "HPA_biological_process")
df = pd.DataFrame(mf_data, columns=["HGNC_ID", "Gene_symbol", "Molecular_function"])
finalize_file(df, r"data files/HPA_molecular_function.tsv", "HPA_molecular_function")
