# Generation of a CSV file for downloading the IUPAC names of the TDP1 inhibitors


### Content   <a name="content"></a>

1. [Load and analyse PubChem BioAssay TDP1 inhibitors data](#1)
2. [Create a data frame with CIDs and targets](#2)
3. [Reduce the inactive compounds](#3)
4. [Generate the csv file for IUPAC names downloading](#4)

### Load and analyse PubChem BioAssay data <a name="1"></a>

In [1]:
import pandas as pd 

# Render every column when a frame is displayed instead of eliding the middle ones
pd.set_option('display.max_columns', None)

# PubChem BioAssay AID 686978 data table
# (https://pubchem.ncbi.nlm.nih.gov/bioassay/686978).
# Alternative input, previously kept here as commented-out code: the
# ';'-separated file 'AID_686978_datatable.csv'.
# NOTE: on_bad_lines='skip' drops rows the parser cannot tokenise without
# raising or warning, so the row count below may be lower than the raw file's.
df = pd.read_csv(
    'AID_686978_data.csv',
    on_bad_lines='skip',
    low_memory=False,
)

# Report the size of the frame, then preview its first rows
print('Shape of the data frame: ', df.shape)
df.head()

Shape of the data frame:  (424003, 48)


Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Phenotype,Potency,Efficacy,Analysis Comment,Activity_Score,Curve_Description,Fit_LogAC50,Fit_HillSlope,Fit_R2,Fit_InfiniteActivity,Fit_ZeroActivity,Fit_CurveClass,Excluded_Points,Max_Response,Activity at 0.0000295000 uM,Activity at 0.0000590000 uM,Activity at 0.0001503265 uM,Activity at 0.0002712146 uM,Activity at 0.0005895491 uM,Activity at 0.00117 uM,Activity at 0.00179 uM,Activity at 0.00299 uM,Activity at 0.00672 uM,Activity at 0.014 uM,Activity at 0.026 uM,Activity at 0.040 uM,Activity at 0.074 uM,Activity at 0.167 uM,Activity at 0.363 uM,Activity at 0.628 uM,Activity at 0.975 uM,Activity at 1.849 uM,Activity at 4.119 uM,Activity at 9.037 uM,Activity at 15.83 uM,Activity at 21.08 uM,Activity at 46.23 uM,Activity at 92.54 uM,Activity at 165.6 uM,Compound QC;;;;
0,1,109967258,50897788.0,CNCC1=NC2=C(C=C(C=C2)Cl)C(=N1)C3=CC=CN3,Active,42,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,14.1254,136.547,,42.0,Partial curve; high efficacy,-4.85,1.01,0.9948,-134.488,2.0597,-2.1,0 0 0 0 0,-106.311,,,,,,,,,,2.0597,,,,,0.5007,,,-17.4373,,-48.5138,,,-106.311,,,QC'd by AA Pharmaceuticals;;;
1,2,144206324,65628.0,CN1C2=C(C=C(C=C2)N(CCCl)CCCl)N=C1CCCC(=O)O,Active,41,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,18.8375,82.1468,,41.0,Partial curve; high efficacy,-4.725,1.9887,0.9829,-82.6097,-0.4629,-2.1,0 0 0 0 0 0 0 0,-70.9545,,,,,-3.2819,,,1.1641,,5.8554,,,-1.8326,,-4.9853,,,0.0798,,-16.3921,,,-70.9545,,,QC'd by ACC;;;
2,3,144206325,14708.0,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C)O)CC[C@@...,Inconclusive,10,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,21.136,39.2903,,10.0,Single point of activity,-4.675,2.7868,0.9357,-43.4549,-4.1646,-3.0,0 0 0 0 0 0 0 0,-39.227,,,,,-4.14,,,-5.7369,,0.1681,,,-8.2838,,-6.9258,,,1.0188,,-7.8647,,,-39.227,,,QC'd by ACC;;;;
3,4,144206326,3085168.0,CC1=NC=C(C(=N1)N)CN(C=O)/C(=C(\CCO)/SS/C(=C(/C...,Inactive,0,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inactive,,,,0.0,,,4.9549,0.6177,2.0768,-13.6113,4.0,0 0 0 0 0 0 0 1,-5.7844,,,,,-11.9232,,,-14.5799,,8.2225,,,4.9756,,4.8996,,,-10.1081,,3.3805,,,-5.7844,,,QC'd by ACC;;;;
4,5,144206327,2449.0,CC1=C(C(=O)C(=C(C1=O)C)C(CCCCCC(=O)O)C2=CC=CC=...,Inconclusive,10,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,26.6086,77.9682,,10.0,Single point of activity,-4.575,3.5117,0.9609,-82.1111,-4.1429,-3.0,0 0 0 0 0 0 0 0,-72.3569,,,,,-10.6429,,,-2.5661,,-11.587,,,-4.3838,,1.2149,,,1.7567,,-5.8883,,,-72.3569,,,QC'd by ACC;;;;


In [2]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424003 entries, 0 to 424002
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   PUBCHEM_RESULT_TAG             424003 non-null  int64  
 1   PUBCHEM_SID                    424003 non-null  int64  
 2   PUBCHEM_CID                    423975 non-null  float64
 3   PUBCHEM_EXT_DATASOURCE_SMILES  423975 non-null  object 
 4   PUBCHEM_ACTIVITY_OUTCOME       424003 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE         424003 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL           424003 non-null  object 
 7   PUBCHEM_ASSAYDATA_COMMENT      0 non-null       float64
 8   Phenotype                      424003 non-null  object 
 9   Potency                        175189 non-null  float64
 10  Efficacy                       175189 non-null  float64
 11  Analysis Comment               0 non-null       float64
 12  Activity_Score                

In [3]:
# Keep only the rows that carry a PubChem compound identifier (CID)
df = df.dropna(subset=['PUBCHEM_CID'])
df.shape

(423975, 48)

In [4]:
# Flag every row whose CID occurs more than once (all occurrences, not only
# the repeats), discard all of them, then renumber the remaining rows from 0
repeated_cid = df.duplicated(subset='PUBCHEM_CID', keep=False)
df = df[~repeated_cid].reset_index(drop=True)
df.shape

(410564, 48)

In [5]:
# CIDs were loaded as float64; with the missing ones removed they can be
# stored as 64-bit integers
df = df.astype({'PUBCHEM_CID': 'int64'})

In [6]:
# Keep only the identifier and outcome columns (this selects columns; it does
# not filter any rows).
# .copy() makes the narrowed frame explicitly independent of the 48-column
# frame it was selected from, so the column assignments and the in-place
# rename performed in later cells act on this frame unambiguously and cannot
# raise SettingWithCopyWarning.
df = df[['PUBCHEM_CID',
         'PUBCHEM_SID',
         'PUBCHEM_ACTIVITY_OUTCOME']].copy()
df.shape

(410564, 3)

In [7]:
# Check the dtypes and non-null counts of the three retained columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410564 entries, 0 to 410563
Data columns (total 3 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   PUBCHEM_CID               410564 non-null  int64 
 1   PUBCHEM_SID               410564 non-null  int64 
 2   PUBCHEM_ACTIVITY_OUTCOME  410564 non-null  object
dtypes: int64(2), object(1)
memory usage: 9.4+ MB


[<a href="#content">Back to top</a>]

## Create a data frame with CIDs and targets <a name="2"></a>

In [8]:
# Make sure every outcome label is a plain string, then list the distinct
# labels present in the data
outcome_labels = df['PUBCHEM_ACTIVITY_OUTCOME'].astype(str)
df['PUBCHEM_ACTIVITY_OUTCOME'] = outcome_labels
unique_values = outcome_labels.unique()
unique_values

array(['Active', 'Inconclusive', 'Inactive'], dtype=object)

In [9]:
# Opt in to pandas' future behaviour in which replace() no longer silently
# downcasts the resulting object column
pd.set_option('future.no_silent_downcasting', True)

# Shorten the PubChem column names; the activity outcome becomes the target
df.rename(
    columns=dict(
        PUBCHEM_CID='CID',
        PUBCHEM_SID='SID',
        PUBCHEM_ACTIVITY_OUTCOME='target',
    ),
    inplace=True,
)

# Numeric code for each outcome label
mapping = dict(Active=1, Inactive=0, Inconclusive=2)

# Swap each label for its code
df['target'] = df['target'].replace(mapping)

# Report the size of the frame, then preview its first rows
print('Shape of the data frame: ', df.shape)
df.head()

Shape of the data frame:  (410564, 3)


Unnamed: 0,CID,SID,target
0,50897788,109967258,1
1,65628,144206324,1
2,14708,144206325,2
3,3085168,144206326,0
4,2449,144206327,2


In [10]:
# Convert the encoded 'target' column (not the CID) to a numeric dtype
df['target'] = pd.to_numeric(df['target']) 

In [11]:
# Number of compounds per target code (1 = Active, 0 = Inactive, 2 = Inconclusive)
df['target'].value_counts()

target
0    236226
2    112867
1     61471
Name: count, dtype: int64

In [12]:
# Active compounds (target code 1)
df_1 = df.loc[df['target'].eq(1)]
df_1.shape

(61471, 3)

[<a href="#content">Back to top</a>]

## Reduce the inactive compounds <a name="3"></a>

In [13]:
# Inactive compounds (target code 0)
df_0 = df.loc[df['target'].eq(0)]

# Report how many inactive compounds there are
print('Shape of df_0: ', df_0.shape)

Shape of df_0:  (236226, 3)


In [14]:
# Put the inactive compounds in a reproducible random order (fixed seed) and
# renumber them from 0
df_0 = df_0.sample(frac=1, random_state=1).reset_index(drop=True)

# Keep rows 0, n, 2n, ... i.e. one inactive compound out of every n
n = 3
df_0 = df_0.iloc[::n]
df_0.shape

(78742, 3)

In [15]:
# Stack the down-sampled inactive (target 0) and the active (target 1) compounds.
# Rows with target 2 (Inconclusive) are in neither df_0 nor df_1, so they are
# left out of the combined frame. The original row labels are kept here.
df = pd.concat([df_0, df_1])

In [16]:
# Count the compounds in each of the two remaining classes (0 = Inactive, 1 = Active)
df['target'].value_counts()

target
0    78742
1    61471
Name: count, dtype: int64

In [17]:
# Check dtypes, non-null counts and index range of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140213 entries, 0 to 410562
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   CID     140213 non-null  int64
 1   SID     140213 non-null  int64
 2   target  140213 non-null  int64
dtypes: int64(3)
memory usage: 4.3 MB


In [18]:
# Mix the active and inactive rows in a reproducible random order (fixed
# seed), then renumber the rows from 0
shuffled = df.sample(frac=1, random_state=1)
df = shuffled.reset_index(drop=True)

# Report the size of the frame, then preview its first rows
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (140213, 3)


Unnamed: 0,CID,SID,target
0,7936857,49828105,0
1,3201532,17431515,1
2,53299818,124753804,1
3,16010787,24387343,1
4,51360167,121284805,0


[<a href="#content">Back to top</a>]

## Generate a csv file with CIDs and targets for IUPAC names downloading  <a name="4"></a>

In [20]:
# Write the CID, SID and target columns to 'CIDs_targets_TDP1.csv'.
# NOTE(review): to_csv also writes the row index by default, as an unnamed
# first column -- confirm that consumers of this file expect it; if not,
# pass index=False.
df.to_csv('CIDs_targets_TDP1.csv') 

[<a href="#content">Back to top</a>]