In [30]:
import requests
import time
from datetime import datetime
import lxml
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [None]:
# Figure out the URL of the most recent categories.tsv file.
# The DGIdb downloads page lists the monthly TSV releases in the table with
# id "tsv_downloads"; the first cell of its last row is used as the release
# label (assumes rows are ordered oldest -> newest -- TODO confirm).
response = requests.get('https://dgidb.org/downloads', timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
table = soup.select("table#tsv_downloads")
if not table:
    raise RuntimeError("Could not find table#tsv_downloads on the DGIdb downloads page")
rows = table[0].find_all("tr")
if not rows:
    raise RuntimeError("The tsv_downloads table on the DGIdb downloads page has no rows")
last_row = rows[-1]
tds = last_row.find_all('td')
if not tds:
    raise RuntimeError("The last row of the tsv_downloads table has no <td> cells")
# Strip surrounding whitespace so the label is safe to embed in the URL and to
# parse as a date in a later cell.
source_date = tds[0].text.strip()
url = f'https://dgidb.org/data/monthly_tsvs/{source_date}/categories.tsv'

In [19]:
# Load the tab-separated categories file from the release URL and preview
# the first five rows.
df = pd.read_csv(url, delimiter='\t')
df.head(5)

Unnamed: 0,entrez_gene_symbol,gene_long_name,category_sources,category
0,IDI2,ISOPENTENYL-DIPHOSPHATE DELTA ISOMERASE 2,GuideToPharmacology,ENZYME
1,WDR11,WD REPEAT DOMAIN 11,Pharos,TRANSCRIPTION FACTOR
2,KIF20B,KINESIN FAMILY MEMBER 20B,Pharos,ENZYME
3,SDF2,STROMAL CELL DERIVED FACTOR 2,HingoraniCasas,DRUGGABLE GENOME
4,CDT1,CHROMATIN LICENSING AND DNA REPLICATION FACTOR 1,Pharos,KINASE


In [26]:
# Insert a column for HGNC_ID (empty placeholder, first position).
# DataFrame.insert raises ValueError when the column already exists, so guard
# it to keep this cell safe to re-run.
if 'HGNC_ID' not in df.columns:
    df.insert(0, 'HGNC_ID', '')

# Rename the Gene column
df = df.rename(columns={'entrez_gene_symbol': 'Gene_symbol'})

# Add new columns for Source, Source_Date, and Download_Date.
# source_date is parsed as '<year>-<abbreviated month>'; strptime fills the
# missing day with 1, so Source_Date becomes the first of that month as a
# YYYYMMDD integer. Note that %b is locale-dependent.
dt_release = datetime.strptime(source_date.strip(), '%Y-%b')
str_date = int(dt_release.strftime("%Y%m%d"))
df["Source"] = "DGIdb"
df["Source_Date"] = str_date
# Today's local date as a YYYYMMDD integer.
df["Download_Date"] = int(time.strftime("%Y%m%d"))

In [29]:
# Fill in the HGNC_ID column: look each Gene_symbol up in the HGNC gene list
# first, then fall back to the HGNC previous-symbols list for rows that are
# still unmatched.
df_gene_info = pd.read_csv(r"data files/HGNC_gene_info.tsv", sep='\t')
df_prev_sym = pd.read_csv(r"data files/HGNC_previous_symbols.tsv", sep='\t')

# Build one-to-one symbol -> HGNC_ID lookups. A left merge assigned back by
# position silently misaligns rows as soon as a key is duplicated in the
# reference table (the merge output then has more rows than df), so the
# references are de-duplicated (first occurrence wins) and applied with map,
# which always returns exactly one value per row of df.
current_lookup = (
    df_gene_info
    .dropna(subset=['Gene_symbol'])
    .drop_duplicates(subset='Gene_symbol')
    .set_index('Gene_symbol')['HGNC_ID']
)
previous_lookup = (
    df_prev_sym
    .dropna(subset=['Previous_symbol'])
    .drop_duplicates(subset='Previous_symbol')
    .set_index('Previous_symbol')['HGNC_ID']
)

df["HGNC_ID"] = df["Gene_symbol"].map(current_lookup)

# Only rows without a current-symbol match are filled from previous symbols.
unmatched = df["HGNC_ID"].isnull()
df.loc[unmatched, "HGNC_ID"] = df.loc[unmatched, "Gene_symbol"].map(previous_lookup)

# Write the file (the DataFrame index is written as the first, unnamed column).
df.to_csv(r"data files/DGIdb_categories.tsv", sep='\t')

In [31]:
# Store it in the database: replace the DGIdb_categories table in the local
# SQLite file TargetLink.db with the current DataFrame (index not written).
engine = create_engine('sqlite:///TargetLink.db', echo=False)
sqlite_table = "DGIdb_categories"
# Use the connection as a context manager so it is closed even when to_sql
# raises, instead of leaking the handle on the SQLite file.
with engine.connect() as sqlite_connection:
    df.to_sql(sqlite_table, sqlite_connection, index=False, if_exists='replace')