# Read and Process B3DB Data

First, load libraries.

In [35]:
import numpy as np
import pandas as pd

# path manipulation
from pathlib import Path

# import custom modules
import sys
sys.path.append('..')
import project_config

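Read the B3DB classification and regression tables and reshape them into the `Drug_ID`, `Drug`, `Y` column layout for inclusion in the Therapeutics Data Commons. `Drug_ID` holds the IUPAC name, `Drug` the SMILES string, and `Y` the label (0 for BBB-, 1 for BBB+ in the classification table; logBB in the regression table).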

In [30]:
# B3DB ships two tab-separated tables; their GitHub pages are
# classification: https://github.com/theochem/B3DB/blob/main/B3DB/B3DB_classification.tsv
# regression: https://github.com/theochem/B3DB/blob/main/B3DB/B3DB_regression.tsv
# The URLs below point at the raw files so pandas can read them directly.
classification_url = 'https://raw.githubusercontent.com/theochem/B3DB/main/B3DB/B3DB_classification.tsv'
regression_url = 'https://raw.githubusercontent.com/theochem/B3DB/main/B3DB/B3DB_regression.tsv'

# Classification table: keep IUPAC name, SMILES and the BBB+/BBB- label,
# rename them to Drug_ID / Drug / Y, then encode the label as BBB-: 0, BBB+: 1.
classification_df = (
    pd.read_csv(classification_url, sep='\t', header=0)
    .loc[:, ['IUPAC_name', 'SMILES', 'BBB+/BBB-']]
    .rename(columns={'IUPAC_name': 'Drug_ID', 'SMILES': 'Drug', 'BBB+/BBB-': 'Y'})
    .assign(Y=lambda frame: frame['Y'].map({'BBB-': 0, 'BBB+': 1}))
)

# Regression table: keep IUPAC name, SMILES and logBB under the same
# Drug_ID / Drug / Y names; logBB is left as read.
regression_df = (
    pd.read_csv(regression_url, sep='\t', header=0)
    .loc[:, ['IUPAC_name', 'SMILES', 'logBB']]
    .rename(columns={'IUPAC_name': 'Drug_ID', 'SMILES': 'Drug', 'logBB': 'Y'})
)

# report (rows, columns) of each table
print('Classification dimensions:', classification_df.shape)
print('Regression dimensions:', regression_df.shape)

Classification dimensions: (7807, 3)
Regression dimensions: (1058, 3)


Filter the data: drop rows with a missing `Drug_ID`, then drop rows that repeat a `Drug_ID` (the first occurrence is kept).

In [31]:
# Clean each table in one pass, keyed on Drug_ID only:
#   1. discard rows whose Drug_ID is missing,
#   2. keep the first row for every repeated Drug_ID,
#   3. renumber the index from 0 so it is contiguous again.
classification_df = (
    classification_df
    .dropna(subset=['Drug_ID'])
    .drop_duplicates(subset=['Drug_ID'])
    .reset_index(drop=True)
)
regression_df = (
    regression_df
    .dropna(subset=['Drug_ID'])
    .drop_duplicates(subset=['Drug_ID'])
    .reset_index(drop=True)
)

# report (rows, columns) after filtering
print('Classification dimensions:', classification_df.shape)
print('Regression dimensions:', regression_df.shape)

Classification dimensions: (6167, 3)
Regression dimensions: (942, 3)


Save data to file.

In [37]:
# save to file
# DataFrame.to_csv does not create missing parent folders, so make sure the
# `tdc` output directory exists first (this is a no-op when it already does).
tdc_dir = Path(project_config.DATA_DIR) / 'tdc'
tdc_dir.mkdir(parents=True, exist_ok=True)

# write both processed tables as CSV, leaving out the positional index column
classification_df.to_csv(tdc_dir / 'b3db_classification.csv', index=False)
regression_df.to_csv(tdc_dir / 'b3db_regression.csv', index=False)