In [None]:
from __future__ import division, absolute_import

import sys
import os
import numpy as np
import random

import pandas as pd
from tables import *

# Root of the repository checkout. Kept with a trailing slash: the data paths
# used further down are built by concatenating relative names onto it.
absPath = '/home/angela3/imbalance_pcm_benchmark/'
# Put the repository root first on the module search path so that the `src`
# package imported right below can be resolved.
sys.path.insert(0, absPath)

from src.imbalance_functions import labelling

# Seed both NumPy's global RNG and the standard-library `random` module.
np.random.seed(8)
random.seed(8)

For this notebook to run, the cleared dataset provided by the DeepAffinity repository (https://github.com/Shen-Lab/DeepAffinity/) must be downloaded into a subfolder of this repo called *raw_data*.
The cleared dataset can be found at https://drive.google.com/open?id=1_msEbSh_YZr0NLSR_DJ_xWE9FlqBlMV9, and its files are described in the data folder of the DeepAffinity repo.

In [None]:
# Load the measured protein-compound pairs: one TSV per measurement type
# (EC50, IC50, Kd, Ki) under raw_data/, stacked into a single frame.
measurement_types = ["EC50", "IC50", "Kd", "Ki"]
list_measurements = []
for typee in measurement_types:
    measure_pairs_file = absPath + "raw_data/" + typee + "_protein_compound_pair.tsv"
    measure_pairs_df = pd.read_csv(measure_pairs_file, sep='\t', header=0)
    list_measurements.append(measure_pairs_df)

pairs_df = pd.concat(list_measurements)

# Columns we want to collapse into one
val_cols = ['pEC50_[M]', 'pIC50_[M]', 'pKd_[M]', 'pKi_[M]']

# Back-filling along the columns moves the first non-null value of every row
# into the first column; values that cannot be parsed as numbers become NaN.
first_value = pairs_df[val_cols].bfill(axis=1).iloc[:, 0]
pairs_df['activity'] = pd.to_numeric(first_value, errors='coerce')
# Reassign instead of dropping in place so the step reads as a plain transform
pairs_df = pairs_df.drop(val_cols, axis=1)

# `labelling` comes from src.imbalance_functions and is applied row by row
pairs_df['label'] = pairs_df.apply(labelling, axis=1)

print(pairs_df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
pairs_df.info()

# Load the SMILES table for the compounds (raw_data/dcid_smi.tsv)
smiles_filename = absPath + "raw_data/dcid_smi.tsv"
smiles_df = pd.read_csv(smiles_filename, sep='\t', header=0)

print(smiles_df.head())
# info() writes its summary to stdout and returns None; call it directly
# rather than printing the None it returns.
smiles_df.info()

# Load the protein table (raw_data/dpid_seq.tsv)
prot_filename = absPath + "raw_data/dpid_seq.tsv"
prots_df = pd.read_csv(prot_filename, sep='\t', header=0)

print(prots_df.head())
# info() writes its summary to stdout and returns None; call it directly
# rather than printing the None it returns.
prots_df.info()

# Load the Pfam table (raw_data/dpid_dom.tsv), just in case; the merge that
# would consume it is currently disabled further down.
pfam_filename = absPath + "raw_data/dpid_dom.tsv"
pfam_df = pd.read_csv(pfam_filename, sep='\t', header=0)

print(pfam_df.head())
# info() writes its summary to stdout and returns None; call it directly
# rather than printing the None it returns.
pfam_df.info()

# NOTE(review): a compound dictionary used to be built here with
# creating_smiles_dictionary(), but that helper is neither defined nor imported
# in this notebook (NameError on a fresh kernel) and its result was never read
# afterwards, so the step was removed. Re-add it together with its import if
# the dictionary is needed.

# Join the activity pairs with their compound and protein records. Both are
# inner joins (the pd.merge default), so pairs without a matching compound or
# protein row are dropped.
concat_pairs_smiles = pd.merge(pairs_df, smiles_df, on="DeepAffinity Compound ID")
concat_pairs_prots = pd.merge(concat_pairs_smiles, prots_df, on="DeepAffinity Protein ID")
# Pfam signatures are too heavy to save
#concat_final = pd.merge(concat_pairs_prots, pfam_df, on="DeepAffinity Protein ID")

# Build a protein -> family lookup from the per-family files found in
# raw_data/Uniprot/ (one "<family>.txt" file per family code).
prot_families = ['NR', 'GPCR', 'CY', 'PK', 'TR', 'IC', 'OE', 'PR']
file_fams = absPath + "raw_data/Uniprot/"
fams_list = []
for prot_fam in prot_families:
    file_fam = file_fams + prot_fam + ".txt"
    # Only the fourth column (position 3) of each file is read; it is renamed
    # to 'Uniprot ID' once all files are stacked.
    df_fam = pd.read_csv(file_fam, delimiter='\t', usecols=[3])
    df_fam['family'] = prot_fam
    fams_list.append(df_fam)

families = pd.concat(fams_list, ignore_index=True)
families.columns = ['Uniprot ID', 'family']
# info() prints its own report and returns None, so it is not wrapped in
# print() (which would append a stray "None" line).
families.info()
# Attach the protein family to every activity row. The left join keeps rows
# whose 'Uniprot ID' is absent from the family lookup; those get NaN in 'family'.
# NOTE(review): a Uniprot ID listed under more than one family would duplicate
# activity rows in this merge -- confirm the family lists are disjoint.
activity_df_fams = pd.merge(concat_pairs_prots, families, on=['Uniprot ID'], how='left')
# info() prints its own report and returns None; no print() wrapper needed
activity_df_fams.info()
# Original author's note: 242,840 prots have no family assigned.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and no longer
# updates the parent frame once pandas Copy-on-Write is active.
activity_df_fams['family'] = activity_df_fams['family'].fillna("Non assigned")

# Persist the merged activity table under data/. The file is written
# tab-separated (despite the .csv extension), with the header row and the
# frame's index included.
output_filename = absPath + "data/smiles_prots_activity.csv"
activity_df_fams.to_csv(
    output_filename,
    sep="\t",
    header=True,
)