# 1. Data acquisition

In [1]:
from chembl_webresource_client.new_client import new_client
import os
import pandas as pd


## Class: DataDownloader
### Attributes:

    molecular_target: The name of the molecular target for which data is to be downloaded.
    path: Directory path where the data file will be saved. Default is an empty string.
    id: The identifier for the type of activity data to download (default is 'IC50').
    assay_type: Type of assay; hardcoded to 'F'.
    pIC50: A list intended to store pIC50 values (currently unused).
    selected_target: Stores the selected target's ChEMBL ID after the user selects a species.
    targets: DataFrame to store the results of the molecular target search.
### Methods:

    __init__(self, molecular_target, path='', id='IC50'):
        Initializes the class with the given parameters.
        Sets default values for assay_type and pIC50.

    load_data(self):
        Downloads data related to the molecular target.
        Searches for the molecular target using the new_client.target object.
        Converts the search result into a DataFrame and stores it in self.targets.
        Prompts the user to select a species by index.
        Sets self.selected_target based on the user's input.

    get_activity_values(self):
        Collects activity data for the specified molecular target.
        Filters activities by the selected target, the standard type (IC50 by default) and the assay type ('F').
        Converts the result into a DataFrame (stored in self.df_res) and saves it as a CSV file in the specified directory.

In [2]:
class DataDownloader:
    """Download activity data for a molecular target through the ChEMBL web client.

    Typical use is ``load_data()`` (search for the target and pick one of the
    hits) followed by ``get_activity_values()`` (download the activities of the
    picked target and save them as CSV).

    Attributes:
        molecular_target: Search term passed to the ChEMBL target search.
        path: Directory the CSV file is written to. An empty string means the
            current working directory.
        id: Value used for the ``standard_type`` activity filter
            (default ``'IC50'``).
        assay_type: Value used for the ``assay_type`` activity filter; fixed
            to ``'F'``.
        pIC50: Empty list, not used by this class (kept for compatibility).
        selected_target: ``target_chembl_id`` of the chosen search hit, or
            ``None`` until ``load_data()`` has run.
        targets: DataFrame with the target search results, or ``None`` until
            ``load_data()`` has run.
        df_res: DataFrame with the downloaded activities, or ``None`` until
            ``get_activity_values()`` has run.
    """

    def __init__(self, molecular_target, path='', id='IC50'):
        """Store the query parameters; no network access happens here."""
        self.molecular_target = molecular_target
        self.id = id
        self.assay_type = 'F'
        self.path = path
        self.pIC50 = []
        self.selected_target = None
        self.targets = None
        # Defined up front so the attribute exists even before a download.
        self.df_res = None

    def _select_target(self, index):
        """Set and return ``selected_target`` from row ``index`` of ``self.targets``.

        Raises:
            ValueError: If there are no search results or ``index`` is not a
                valid row position.
        """
        if self.targets is None or self.targets.empty:
            raise ValueError(
                "No targets found for {!r}; nothing to select.".format(self.molecular_target)
            )
        if not 0 <= index < len(self.targets):
            raise ValueError(
                "Target index {} is out of range (valid: 0-{}).".format(index, len(self.targets) - 1)
            )
        # Positional lookup: correct regardless of how the frame is indexed.
        self.selected_target = self.targets['target_chembl_id'].iloc[index]
        return self.selected_target

    def _output_file(self):
        """Return the CSV path for the downloaded activities.

        ``os.path.join`` is used so that an empty ``path`` resolves to the
        current directory instead of the filesystem root.
        """
        file_name = "chembl_activity_{}_{}.csv".format(self.id, self.molecular_target)
        return os.path.join(self.path, file_name)

    def load_data(self, target_index=None):
        """Search ChEMBL for the molecular target and choose one of the hits.

        Args:
            target_index: Row position of the hit to select. When ``None``
                (default) the user is prompted for it, as before. Passing a
                value makes the notebook runnable without interaction.

        Raises:
            ValueError: If the search returns nothing, the typed value is not
                an integer, or the index is out of range.
        """
        print('Downloading data for {}'.format(self.molecular_target))
        target = new_client.target
        target_query = target.search(self.molecular_target)
        self.targets = pd.DataFrame.from_dict(target_query)
        print(self.targets)

        if self.targets.empty:
            raise ValueError(
                "No targets found for {!r}; nothing to select.".format(self.molecular_target)
            )

        if target_index is None:
            target_index = int(input("Specify the species you would like to download your data for:"))
        self._select_target(target_index)
        print("Data was collected successfully for", self.selected_target)

    def get_activity_values(self):
        """Download the activities of the selected target and save them as CSV.

        The result is stored in ``self.df_res`` and written, without the
        index, to ``<path>/chembl_activity_<id>_<molecular_target>.csv``.
        The output directory is created if it does not exist.

        Raises:
            ValueError: If no target has been selected yet.
        """
        # Guard first: filtering on target_chembl_id=None would not restrict
        # the query to any target.
        if self.selected_target is None:
            raise ValueError("No target selected; call load_data() first.")

        print("Collecting activity data for the specified molecular target")
        print("Molecular target: {} | Activity type: {} ".format(self.selected_target, self.id ))

        activity = new_client.activity
        res = activity.filter(target_chembl_id=self.selected_target).filter(
            standard_type=self.id, assay_type=self.assay_type
        )
        self.df_res = pd.DataFrame.from_dict(res)

        # to_csv does not create missing directories.
        if self.path:
            os.makedirs(self.path, exist_ok=True)
        self.df_res.to_csv(self._output_file(), index=False)


### Example usage: 

In [4]:
# Folder that receives the downloaded activity table.
output_dir = os.path.join(os.getcwd(), 'QSAR_5HT2A')
# DataFrame.to_csv does not create missing directories, so make sure the
# folder exists before the download step writes into it.
os.makedirs(output_dir, exist_ok=True)

# Search ChEMBL for 'P28223', pick a hit when prompted, then download its IC50 activities.
download_data = DataDownloader(molecular_target='P28223', path=output_dir, id='IC50')
download_data.load_data()
download_data.get_activity_values()
#retain 5-HT2A table
ser_tbl = download_data.df_res

Downloading data for P28223
                                    cross_references      organism  \
0  [{'xref_id': 'P28223', 'xref_name': None, 'xre...  Homo sapiens   
1                                                 []  Homo sapiens   
2                                                 []  Homo sapiens   
3                                                 []  Homo sapiens   
4                                                 []  Homo sapiens   
5                                                 []  Homo sapiens   

                                           pref_name  score  \
0                     Serotonin 2a (5-HT2a) receptor   15.0   
1             Serotonin 2 receptors; 5-HT2a & 5-HT2c   14.0   
2  5-hydroxytryptamine receptor 2A/Metabotropic g...   13.0   
3                       Serotonin 2 (5-HT2) receptor   12.0   
4     Dopamine D2 receptor and serotonin 2a receptor   11.0   
5                          Serotonin (5-HT) receptor    6.0   

   species_group_flag target_chembl_id 

#### Maximize the number of compounds in the dataset while ensuring that their antagonist activity values are expressed in the same metric and share the same bao_format

In [5]:
# Tally the activity records per assay; value_counts lists the most frequent assay first.
assay_ids = ser_tbl.loc[:, 'assay_chembl_id']
distinct_assays = assay_ids.value_counts()
print(f'The number of compounds was evaluated with assays {distinct_assays}')

The number of compounds was evaluated with assays assay_chembl_id
CHEMBL5135478    38
CHEMBL5135453    38
CHEMBL4045106    37
CHEMBL1074221    34
CHEMBL4627009    34
CHEMBL4254803    30
CHEMBL3806802    27
CHEMBL4188000    23
CHEMBL4619930    19
CHEMBL4615083    15
CHEMBL1039801    15
CHEMBL4222628    12
CHEMBL4368231     9
CHEMBL3610647     9
CHEMBL4708968     7
CHEMBL4195146     5
CHEMBL3738119     3
CHEMBL617350      3
CHEMBL4276607     2
CHEMBL4709861     2
CHEMBL3872782     2
CHEMBL5142997     2
CHEMBL980680      2
CHEMBL1119160     2
CHEMBL4193009     1
CHEMBL1041066     1
CHEMBL5130084     1
CHEMBL4821089     1
CHEMBL1069951     1
CHEMBL2162152     1
CHEMBL861451      1
CHEMBL3243869     1
CHEMBL3295407     1
CHEMBL3387203     1
CHEMBL3414317     1
CHEMBL3636004     1
CHEMBL4219916     1
CHEMBL3737564     1
CHEMBL2328664     1
Name: count, dtype: int64


In [6]:
# Restrict the table to the two chosen assays; .copy() keeps the result
# independent of ser_tbl for any later column assignments.
selected_assays = ['CHEMBL5135453', 'CHEMBL4627009']
filtered_ser_tbl = ser_tbl[ser_tbl['assay_chembl_id'].isin(selected_assays)].copy()
print(filtered_ser_tbl['bao_format']) #ensure one bao

# Enforce the single-format requirement instead of relying on reading the
# printout; fail before anything is written to disk.
n_bao_formats = filtered_ser_tbl['bao_format'].nunique(dropna=False)
assert n_bao_formats == 1, (
    'Expected exactly one bao_format in the selected assays, found {}'.format(n_bao_formats)
)

cleaned_dir = os.path.join(os.getcwd(), 'QSAR_5HT2A')
# to_csv does not create missing directories.
os.makedirs(cleaned_dir, exist_ok=True)
# The index is written as the first column, exactly as before.
filtered_ser_tbl.to_csv(os.path.join(cleaned_dir, 'chembl_activity_IC50_cleaned.csv'))

262    BAO_0000219
263    BAO_0000219
264    BAO_0000219
265    BAO_0000219
266    BAO_0000219
          ...     
340    BAO_0000219
341    BAO_0000219
342    BAO_0000219
343    BAO_0000219
344    BAO_0000219
Name: bao_format, Length: 72, dtype: object


In [7]:
# Show the filtered activity table as the cell output (the rendered view is
# truncated by pandas, so only the first and last rows/columns appear).
filtered_ser_tbl

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
262,,,20696387,[],CHEMBL4627009,Antagonist activity at human 5HT2A receptor ex...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,11.0
263,,,20696388,[],CHEMBL4627009,Antagonist activity at human 5HT2A receptor ex...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,0.81
264,,,20696389,[],CHEMBL4627009,Antagonist activity at human 5HT2A receptor ex...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,9.3
265,,,20696390,[],CHEMBL4627009,Antagonist activity at human 5HT2A receptor ex...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,1300.0
266,,,20696391,[],CHEMBL4627009,Antagonist activity at human 5HT2A receptor ex...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,4100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,,,24804045,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5135453,Antagonist activity at 5-HT2A receptor (unknow...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,500.0
341,,,24804046,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5135453,Antagonist activity at 5-HT2A receptor (unknow...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,500.0
342,,,24804047,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5135453,Antagonist activity at 5-HT2A receptor (unknow...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,500.0
343,,,24804048,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5135453,Antagonist activity at 5-HT2A receptor (unknow...,F,,,BAO_0000190,...,Homo sapiens,Serotonin 2a (5-HT2a) receptor,9606,,,IC50,nM,UO_0000065,,500.0
