In [None]:
# Google Drive access: Colab exposes the user's Drive through a mount point
from google.colab import drive

# mount Drive so the files under this path become readable from the notebook
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the activity table from the shared-drive CSV into a DataFrame.
raw_csv_path = "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/chembl_TGF-betareceptortypeI.csv"
data = pd.read_csv(raw_csv_path)

# (rows, columns) of the raw table
data.shape



(3687, 46)

In [None]:
# Keep only the rows whose standard_type is IC50, renumbering the index from 0.
is_ic50 = data['standard_type'] == "IC50"
data_filter = data.loc[is_ic50].reset_index(drop = True)
data_filter.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,393576,[],CHEMBL814453,Inhibition of Transforming growth factor beta-...,B,,,BAO_0000190,...,Homo sapiens,TGF-beta receptor type I,9606,,,IC50,uM,UO_0000065,,30.0
1,,,411577,[],CHEMBL814454,Inhibition of Transforming growth factor beta-...,B,,,BAO_0000190,...,Homo sapiens,TGF-beta receptor type I,9606,,,IC50,uM,UO_0000065,,27.0
2,,,829600,[],CHEMBL814457,In vitro inhibitory activity against human Tra...,B,,,BAO_0000190,...,Homo sapiens,TGF-beta receptor type I,9606,,,IC50,uM,UO_0000065,,0.034
3,,,829601,[],CHEMBL728561,In vitro inhibition of transforming growth fac...,F,,,BAO_0000190,...,Homo sapiens,TGF-beta receptor type I,9606,,,IC50,uM,UO_0000065,,0.0029
4,,,829602,[],CHEMBL740149,In vitro inhibition of transforming growth fac...,F,,,BAO_0000190,...,Homo sapiens,TGF-beta receptor type I,9606,,,IC50,uM,UO_0000065,,0.0071


In [None]:
# Flag the rows with a missing standard_value and report how many there are.
standard_value_missing = data_filter["standard_value"].isnull()
null_values = standard_value_missing.sum()
print("Nulls in standard value column: {}".format(null_values))

# Build data_filter_nans: either an untouched copy, or the table without the
# rows flagged above (index renumbered from 0).
if null_values == 0:
  print("No Nans in standard_value column")
  data_filter_nans = data_filter.copy()
else:
  print("Removing nans in standard_value column")
  print("Shape before dropping the nulls: {}".format(data_filter.shape)) # (rows, columns)
  data_filter_nans = data_filter.loc[~standard_value_missing].reset_index(drop = True)
  print("Shape after dropping the nulls: {}".format(data_filter_nans.shape))

Nulls in standard value column: 56
Removing nans in standard_value column
Shape before dropping the nulls: (1730, 46)
Shape after dropping the nulls: (1674, 46)


In [None]:
# Flag every row whose canonical_smiles already appeared in an earlier row
# (the first occurrence of each value is not flagged) and report the count.
repeated_smiles = data_filter_nans.duplicated(subset = ["canonical_smiles"])
duplicates = repeated_smiles.sum()
print("Duplicates in canonical smiles column: {}".format(duplicates))

# Build data_filter_dup: either an untouched copy, or the table with only the
# first row kept for each canonical_smiles value (index renumbered from 0).
if duplicates == 0:
  print("No duplicates in canonical smiles column")
  data_filter_dup = data_filter_nans.copy()
else:
  print("Removing duplicates in canonical smiles column")
  print("Shape before dropping the duplicates: {}".format(data_filter_nans.shape)) # (rows, columns)
  data_filter_dup = data_filter_nans.loc[~repeated_smiles].reset_index(drop = True)
  print("Shape after dropping the duplicates: {}".format(data_filter_dup.shape))

Duplicates in canonical smiles column: 494
Removing duplicates in canonical smiles column
Shape before dropping the duplicates: (1674, 46)
Shape after dropping the duplicates: (1180, 46)


In [None]:
# count null values (canonical smiles)
null_smile = data_filter_dup["canonical_smiles"].isnull().sum() # sums null values
print("Nulls in canonical_smiles column: {}".format(null_smile))

# Build data_filter_smile: drop the rows with a missing canonical_smiles when
# there are any, otherwise carry the table forward as an independent copy.
if null_smile > 0:
  print("Removing nans in canonical smiles column")
  print("Shape before dropping the nulls: {}".format(data_filter_dup.shape)) # prints rows and columns
  # chained reset_index returns a new frame instead of mutating a filtered slice
  data_filter_smile = data_filter_dup[data_filter_dup.canonical_smiles.notna()].reset_index(drop = True)
  print("Shape after dropping the nulls: {}".format(data_filter_smile.shape))
else:
  # this branch reports on canonical_smiles; the message previously named
  # the standard_value column, which is not the column checked here
  print("No Nans in canonical_smiles column")
  data_filter_smile = data_filter_dup.copy()

Nulls in canonical_smiles column: 0
No Nans in standard_value column


In [None]:
# filter out needed values
# only need canonical_smiles and standard_value (IC50 value)
select_columns = ['canonical_smiles', 'standard_value']
# Take an explicit copy: df gets a new column added and a column dropped later
# on, and doing that on a bare column selection of data_filter_smile triggers
# pandas' SettingWithCopyWarning.
df = data_filter_smile[select_columns].copy()
df.head()

Unnamed: 0,canonical_smiles,standard_value
0,C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc...,30000.0
1,c1ccc(-c2n[nH]cc2-c2ccnc3ccccc23)nc1,27000.0
2,Cc1cccc(-c2[nH]ncc2-c2ccnc3ccccc23)n1,34.0
3,Cc1cccc(-c2n[nH]cc2-c2ccc(O)cc2)n1,31.0
4,Cc1cccc(-c2[nH]ncc2-c2ccc(F)c(F)c2)n1,151.0


In [None]:
# convert from IC50 to PIC50
import numpy as np

# define function for converting
def pIC50(standard_value):
  """Convert an IC50 concentration given in nM to pIC50 (-log10 of molar).

  Parameters:
    standard_value: IC50 in nanomolar. May be a scalar, a NumPy array or a
      pandas Series.

  Returns:
    The pIC50 value(s), as the same kind of object as the input. Any input
    without a finite pIC50 (zero, negative, NaN or infinite concentration)
    yields NaN, so that an ordinary null check downstream catches it rather
    than letting +/-inf through.
  """
  molar = standard_value * (10**-9) # converts nM to M
  # log10 of a zero/negative concentration only produces RuntimeWarnings and
  # inf/NaN results; silence the warnings here and normalise the results below
  with np.errstate(divide = "ignore", invalid = "ignore"):
    converted = -np.log10(molar)
  if np.ndim(converted) == 0:
    return converted if np.isfinite(converted) else np.nan
  # `converted` is a fresh object created by the negation above, so masking it
  # in place does not touch the caller's data
  converted[~np.isfinite(converted)] = np.nan
  return converted

# Add the pIC50 column and drop the raw standard_value column. assign() and
# drop() each return a new DataFrame, so nothing is written into a slice of an
# earlier frame (the in-place version raised SettingWithCopyWarning).
# NOTE(review): pIC50() treats standard_value as nM — confirm the units column
# upstream agrees for every row.
df = df.assign(pIC50 = df['standard_value'].apply(pIC50)).drop(columns = "standard_value")
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pIC50'] = df['standard_value'].apply(pIC50)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop("standard_value", axis = 1, inplace = True)


Unnamed: 0,canonical_smiles,pIC50
0,C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc...,4.522879
1,c1ccc(-c2n[nH]cc2-c2ccnc3ccccc23)nc1,4.568636
2,Cc1cccc(-c2[nH]ncc2-c2ccnc3ccccc23)n1,7.468521
3,Cc1cccc(-c2n[nH]cc2-c2ccc(O)cc2)n1,7.508638
4,Cc1cccc(-c2[nH]ncc2-c2ccc(F)c(F)c2)n1,6.821023


In [None]:
# count unusable pIC50 values
# isnull() only flags NaN, but the log10 conversion yields +/-inf for a zero
# (or infinite) concentration; np.isfinite rejects NaN and +/-inf alike so
# neither can reach the saved dataset.
usable_pic50 = np.isfinite(df["pIC50"])
new_nulls = (~usable_pic50).sum() # counts NaN and +/-inf values
print("Nulls in pIC50 column: {}".format(new_nulls))

# removes any unusable values (if any)
if new_nulls > 0:
  print("Removing nans in pIC50 column")
  print("Shape before dropping the nulls: {}".format(df.shape)) # prints rows and columns
  # chained reset_index returns a new frame instead of mutating a filtered slice
  df_no_null = df[usable_pic50].reset_index(drop = True)
  print("Shape after dropping the nulls: {}".format(df_no_null.shape))
else:
  print("No Nans in pIC50 column")
  df_no_null = df.copy()

Nulls in pIC50 column: 0
No Nans in pIC50 column


In [None]:
# Write the cleaned table to the shared drive as CSV, without the index column.
processed_csv_path = "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/smile_pIC50_processed.csv"
df_no_null.to_csv(processed_csv_path, index = False)