# Generation of a CSV file for downloading the IUPAC names related to the GPR151 protein activators

### Content   <a name="content"></a>

1. [Load and analyse PubChem BioAssay GPR151 protein activators data](#1)
2. [Create a data frame with CIDs and targets](#2)
3. [Reduce the inactive compounds](#3)
4. [Generate the csv file for IUPAC names downloading](#4)

### Load and analyse PubChem BioAssay GPR151 data <a name="1"></a>

In [1]:
import pandas as pd

# Show every column when a data frame is displayed (no "..." truncation)
pd.set_option('display.max_columns', None)

# Read the GPR151 BioAssay export (PubChem AID 1508602). The file is
# semicolon-separated; malformed rows are skipped instead of raising.
df = pd.read_csv(
    'pubchem_GPR.csv',
    sep=';',
    low_memory=False,
    on_bad_lines='skip',
)

# Report the dimensions and preview the first rows
print('Shape of the data frame: ', df.shape)
df.head()

Shape of the data frame:  (646675, 9)


Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activation at 11.2 uM
0,1,332971238,5308528.0,COC1=CC=CC=C1S(=O)(=O)CC2=CC=C(O2)C(=O)N3CCOCC3,Active,100,,,46.1972
1,2,333069819,45201490.0,C1COC2=C(CN1CC3=CC(=CC=C3)Cl)C=C(C=C2)C(C4=CN=...,Active,88,,,40.8481
2,3,333168812,2304502.0,COC1=CC=C(C=C1)C(=O)NC2=CC=CC(=C2)C(=O)N3CCCC3,Active,85,,,39.4266
3,4,333285740,1106138.0,C1=CC2=C(C=CC=C2Br)C(=C1)C(=O)NC3=CC=C(C=C3)NC...,Active,75,,,34.6718
4,5,332995771,621204.0,CC1=C(C=C(C=C1)N=CC2=CC(=C(C=C2)OC(=O)C)OC)F,Active,74,,,34.3703


In [2]:
# Summarise the column dtypes and non-null counts of the loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646675 entries, 0 to 646674
Data columns (total 9 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   PUBCHEM_RESULT_TAG             646675 non-null  int64  
 1   PUBCHEM_SID                    646675 non-null  int64  
 2   PUBCHEM_CID                    646472 non-null  float64
 3   PUBCHEM_EXT_DATASOURCE_SMILES  646472 non-null  object 
 4   PUBCHEM_ACTIVITY_OUTCOME       646675 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE         646675 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL           0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT      0 non-null       float64
 8   Activation at 11.2 uM          646675 non-null  object 
dtypes: float64(3), int64(3), object(3)
memory usage: 44.4+ MB


In [3]:
# Drop rows without a CID, then drop every CID that occurs more than once.
# keep=False discards *all* occurrences of a repeated CID, not just the extras.
# The explicit dropna keeps the later int64 cast safe even if only one row
# lacks a CID: drop_duplicates alone removes missing CIDs only when there are
# at least two of them (missing values are treated as duplicates of each other).
df = (
    df.dropna(subset=['PUBCHEM_CID'])
      .drop_duplicates(subset='PUBCHEM_CID', keep=False)
      .reset_index(drop=True)
)
df.shape

(639809, 9)

In [4]:
# Cast the CID column from float to a 64-bit integer
df = df.astype({'PUBCHEM_CID': 'int64'})

In [5]:
# Keep only the compound/substance identifiers and the activity outcome.
# .copy() yields an independent frame, so the column assignments and the
# in-place rename performed later do not act on a selection of the original
# data (avoids pandas' SettingWithCopyWarning).
df = df[['PUBCHEM_CID',
         'PUBCHEM_SID',
         'PUBCHEM_ACTIVITY_OUTCOME']].copy()
df.shape

(639809, 3)

In [6]:
# Check the dtypes and non-null counts of the three remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639809 entries, 0 to 639808
Data columns (total 3 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   PUBCHEM_CID               639809 non-null  int64 
 1   PUBCHEM_SID               639809 non-null  int64 
 2   PUBCHEM_ACTIVITY_OUTCOME  639809 non-null  object
dtypes: int64(2), object(1)
memory usage: 14.6+ MB


[<a href="#content">Back to top</a>]

## Create a data frame with CIDs and targets <a name="2"></a>

In [7]:
# Make sure the outcome labels are stored as strings
outcome = df['PUBCHEM_ACTIVITY_OUTCOME'].astype(str)
df['PUBCHEM_ACTIVITY_OUTCOME'] = outcome

# List the distinct activity outcomes present in the data
unique_values = outcome.unique()
unique_values

array(['Active', 'Inactive'], dtype=object)

In [8]:
# Shorten the PubChem column names
df = df.rename(columns={
    'PUBCHEM_CID': 'CID',
    'PUBCHEM_SID': 'SID',
    'PUBCHEM_ACTIVITY_OUTCOME': 'target',
})

# Opt in to the future pandas behaviour: replace() does not silently downcast
pd.set_option('future.no_silent_downcasting', True)

# Encode the outcome labels as a binary target: Active -> 1, Inactive -> 0
mapping = {'Active': 1, 'Inactive': 0}
df['target'] = df['target'].replace(mapping)

# Report the dimensions and preview the first rows
print('Shape of the data frame: ', df.shape)
df.head()

Shape of the data frame:  (639809, 3)


Unnamed: 0,CID,SID,target
0,5308528,332971238,1
1,45201490,333069819,1
2,2304502,333168812,1
3,1106138,333285740,1
4,621204,332995771,1


In [9]:
# Convert the encoded target column to a numeric dtype
df = df.assign(target=pd.to_numeric(df['target']))

In [10]:
# Class balance of the binary target (1 = Active, 0 = Inactive)
df['target'].value_counts()

target
0    633173
1      6636
Name: count, dtype: int64

In [11]:
# Select the active compounds (target == 1)
df_1 = df.loc[df['target'].eq(1)]
df_1.shape

(6636, 3)

[<a href="#content">Back to top</a>]

## Reduce the inactive compounds <a name="3"></a>

In [12]:
# Separate the inactive compounds (target == 0)
df_0 = df.loc[df['target'].eq(0)]

# --- Experiment log (kept for reference, not executed) ----------------------
# Alternative down-sampling that was tried: shuffle, then keep every 17th row.
# df_0 = df_0.sample(
#     frac = 1,        # Return entire dataframe
#     random_state=1   # Make result reproducible
#     ).reset_index(drop=True)
# n = 17
# df_0 = df_0[df_0.index % n == 0] # 0.75
#
# Row slices tried one at a time; the trailing number is the value recorded
# for each slice (the later exclusion step calls the 0.564 entry a
# cross-validation score).
# NOTE(review): these slices index `df`, whereas the exclusion further down
# slices `df_0` -- confirm that both refer to the same rows.
# df_0 = df.iloc[:37000]           # 0.845
# df_0 = df.iloc[37000:74000]      # 0.81
# df_0 = df.iloc[74000:111000]     # 0.813
# df_0 = df.iloc[111000:148000]    # 0.813
# df_0 = df.iloc[148000:155000]    # 0.564
# df_0 = df.iloc[155000:192000]    # 0.813
# df_0 = df.iloc[192000:229000]    # 0.817
# df_0 = df.iloc[229000:266000]    # 0.814
# df_0 = df.iloc[266000:303000]    # 0.814
# df_0 = df.iloc[303000:340000]    # 0.813
# df_0 = df.iloc[340000:377000]    # 0.816
# df_0 = df.iloc[377000:414000]    # 0.814
# df_0 = df.iloc[414000:451000]    # 0.814
# df_0 = df.iloc[451000:488000]    # 0.816
# df_0 = df.iloc[488000:525000]    # 0.814
# df_0 = df.iloc[525000:562000]    # 0.815
# df_0 = df.iloc[562000:599000]    # 0.812
# df_0 = df.iloc[599000:639809]    # 0.83

# Report the size of the inactive set
print('Shape of df_0: ', df_0.shape)

Shape of df_0:  (633173, 3)


In [13]:
# Save the inactive rows at positions 148000-154999, i.e. the slice that is
# left out of the data set in the next step.
# NOTE: run this before df_0 is reassigned below; re-running it afterwards
# would slice the down-sampled frame instead.
df_ignore = df_0.iloc[148000:155000] 
df_ignore.to_csv('unusful_for_GPR151_prediction.csv') 

In [14]:
# Rebuild the inactive set without rows 148000-154999 (the slice whose
# cross-validation score was 0.564)
df_1st_part = df_0.iloc[:148000]
df_2nd_part = df_0.iloc[155000:]
df_0 = pd.concat([df_1st_part, df_2nd_part])

# Shuffle reproducibly and renumber the rows 0..N-1
df_0 = df_0.sample(frac=1, random_state=1).reset_index(drop=True)

# Keep every n-th row of the shuffled frame (index labels 0, n, 2n, ...)
n = 16
df_0 = df_0.iloc[::n]
df_0.shape

(39136, 3)

In [15]:
# Stack the down-sampled inactive rows (target 0) on top of the active rows (target 1)
df = pd.concat((df_0, df_1), axis=0)

In [16]:
# Count the rows per binary target after down-sampling the inactive class
df['target'].value_counts()

target
0    39136
1     6636
Name: count, dtype: int64

In [17]:
# Check the dtypes and non-null counts of the combined data set
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45772 entries, 0 to 6635
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   CID     45772 non-null  int64
 1   SID     45772 non-null  int64
 2   target  45772 non-null  int64
dtypes: int64(3)
memory usage: 1.4 MB


In [18]:
# Shuffle all rows reproducibly (fixed seed), then renumber them from 0
shuffled = df.sample(frac=1, random_state=1)
df = shuffled.reset_index(drop=True)

# Report the dimensions and preview the first rows
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (45772, 3)


Unnamed: 0,CID,SID,target
0,5342432,333096538,0
1,1531854,333461953,0
2,780553,333246479,0
3,734064,333159273,0
4,5052640,333429003,0


[<a href="#content">Back to top</a>]

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Features are every column except the label; the label is the binary target
X = df.drop(columns='target')
y = df['target']

# Fix the seed so that the forest, and therefore the scores, are reproducible
model = RandomForestClassifier(random_state=1)

# 5-fold cross-validated accuracy.
# NOTE(review): the classes are imbalanced (39136 inactive vs 6636 active in
# the counts shown above), so always predicting "inactive" would already score
# about 0.855 -- read the accuracy against that baseline. The only features
# here are the CID and SID identifiers.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print("Cross-validation scores:", np.round(scores, 3))
print("Mean cross-validation score:", np.round(scores.mean(), 3))

Cross-validation scores: [0.824 0.825 0.823 0.822 0.82 ]
Mean cross-validation score: 0.823


[<a href="#content">Back to top</a>]

## Generate a csv file with CIDs and targets for IUPAC names downloading <a name="4"></a>

In [20]:
# Write the CID/SID/target table to disk; with to_csv's defaults the row
# index is written as well, as the first (unnamed) column
df.to_csv('CIDs_targets_GPR151.csv') 

[<a href="#content">Back to top</a>]