In [None]:
# PART 2: Step 1: Data Preparation and Feature Engineering


In [54]:
## Install dependencies
!pip install rdkit pandas scikit-learn matplotlib seaborn



In [55]:

## Load and preprocess dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw bioactivity table from the CSV in the working directory
data = pd.read_csv(filepath_or_buffer="bioactivity_smiles_subset.csv")


In [57]:
# Preview the table as loaded (pandas truncates the middle rows in the display)
data

Unnamed: 0,Bioactivity_ID,Activity,Compound_CID,SMILES
0,66455236,Inactive,665485,CN1CCN(CC1)C2=NC3=C(N2CCC4=CC=C(C=C4)OC)C(=O)N...
1,83282869,Active,6469913,CC1=CC(=C(C=C1)NC(=O)NCC2=CC=C(C=C2)OC)C
2,66347256,Inactive,4522106,CC1CCCCN1S(=O)(=O)C2=CC=CC3=C2N=CC=C3
3,83282465,Active,6407175,CC1=C(C=C(C=C1)C2=NN=C(C3=CC=CC=C32)NCC4=NC5=C...
4,66403528,Inactive,2401200,C1CN(CCC1C(=O)N)C(=O)CCC(=O)C2=CC=C(C=C2)F
...,...,...,...,...
1995,66339053,Inactive,919981,CC1=CC(=C(C(=C1)C)S(=O)(=O)N2C=CC3=CC=CC=C32)C
1996,66333172,Inactive,650428,C1COCCN1C(=O)C2=CC=C(C=C2)NC(=O)NC3=CC(=CC=C3)Cl
1997,66307825,Active,2305037,CCOC1=CC=C(C=C1)NC(=O)CC(=O)NC2=CC=C(C=C2)OCC
1998,66500074,Inactive,2810563,C1COCCN1S(=O)(=O)C2=CC=C(C=C2)NC(=O)C3=NC=CN=C...


In [58]:
# Retain just the structure (SMILES) and the label (Activity); the ID columns
# are not used for feature generation.
data = data.loc[:, ["SMILES", "Activity"]]
data

Unnamed: 0,SMILES,Activity
0,CN1CCN(CC1)C2=NC3=C(N2CCC4=CC=C(C=C4)OC)C(=O)N...,Inactive
1,CC1=CC(=C(C=C1)NC(=O)NCC2=CC=C(C=C2)OC)C,Active
2,CC1CCCCN1S(=O)(=O)C2=CC=CC3=C2N=CC=C3,Inactive
3,CC1=C(C=C(C=C1)C2=NN=C(C3=CC=CC=C32)NCC4=NC5=C...,Active
4,C1CN(CCC1C(=O)N)C(=O)CCC(=O)C2=CC=C(C=C2)F,Inactive
...,...,...
1995,CC1=CC(=C(C(=C1)C)S(=O)(=O)N2C=CC3=CC=CC=C32)C,Inactive
1996,C1COCCN1C(=O)C2=CC=C(C=C2)NC(=O)NC3=CC(=CC=C3)Cl,Inactive
1997,CCOC1=CC=C(C=C1)NC(=O)CC(=O)NC2=CC=C(C=C2)OCC,Active
1998,C1COCCN1S(=O)(=O)C2=CC=C(C=C2)NC(=O)C3=NC=CN=C...,Inactive


In [59]:
# descriptor function to include physicochemical and topological descriptors

from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski

# Function to compute extended descriptors

# Column labels of the descriptor row, in output order. Shared by the valid and
# the invalid path so both always return identically labelled Series.
DESCRIPTOR_NAMES = [
    "MolWt", "LogP", "NumHDonors", "NumHAcceptors",
    "NumRings", "NumRotatableBonds", "NumHeavyAtoms",
    "NumHeteroatoms", "NumAromaticRings", "TPSA", "FractionCSP3",
]


def compute_descriptors(smiles):
    """Compute 11 physicochemical/topological RDKit descriptors for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Anything that is not a non-blank string
        (e.g. the float NaN pandas produces for an empty CSV cell) is treated
        as an invalid structure instead of being handed to RDKit.

    Returns
    -------
    pandas.Series
        One value per entry of ``DESCRIPTOR_NAMES``, labelled with those names.
        Every value is NaN when the input is unusable or RDKit cannot parse it,
        so such rows can be removed afterwards with ``dropna``.
    """
    # Built with float("nan") rather than np.nan: numpy is not guaranteed to be
    # imported at the point in the notebook where this function is first used.
    missing_row = pd.Series([float("nan")] * len(DESCRIPTOR_NAMES),
                            index=DESCRIPTOR_NAMES)

    # Guard before calling RDKit: a blank string would otherwise be parsed as
    # an empty molecule and yield a row of meaningless descriptor values.
    if not isinstance(smiles, str) or not smiles.strip():
        return missing_row

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None
        return missing_row

    # Order of the values must match DESCRIPTOR_NAMES
    return pd.Series([
        Descriptors.MolWt(mol),                  # Molecular weight
        Crippen.MolLogP(mol),                    # LogP (hydrophobicity)
        Lipinski.NumHDonors(mol),                # H-bond donors
        Lipinski.NumHAcceptors(mol),             # H-bond acceptors
        Lipinski.RingCount(mol),                 # Number of rings
        Lipinski.NumRotatableBonds(mol),         # Rotatable bonds
        Descriptors.HeavyAtomCount(mol),         # Number of heavy atoms
        Descriptors.NumHeteroatoms(mol),         # Number of heteroatoms
        Lipinski.NumAromaticRings(mol),          # Number of aromatic rings
        Descriptors.TPSA(mol),                   # Topological polar surface area
        Lipinski.FractionCSP3(mol),              # Fraction of sp3 carbons
    ], index=DESCRIPTOR_NAMES)

# Build the descriptor table: compute_descriptors returns a Series per SMILES,
# so apply() expands the results into one column per descriptor.
desc_df = data.loc[:, "SMILES"].apply(compute_descriptors)

In [60]:
# Inspect the computed descriptor table (one row per SMILES)
desc_df

Unnamed: 0,MolWt,LogP,NumHDonors,NumHAcceptors,NumRings,NumRotatableBonds,NumHeavyAtoms,NumHeteroatoms,NumAromaticRings,TPSA,FractionCSP3
0,398.467,0.42640,1.0,8.0,4.0,5.0,29.0,9.0,3.0,88.39,0.450000
1,284.359,3.63374,2.0,2.0,2.0,4.0,21.0,4.0,2.0,50.36,0.235294
2,290.388,2.79790,0.0,3.0,3.0,2.0,20.0,5.0,2.0,50.27,0.400000
3,472.574,4.34392,2.0,6.0,5.0,6.0,34.0,9.0,5.0,103.87,0.160000
4,306.337,1.51250,1.0,3.0,2.0,5.0,22.0,6.0,1.0,80.47,0.437500
...,...,...,...,...,...,...,...,...,...,...,...
1995,299.395,3.80356,0.0,3.0,3.0,2.0,21.0,4.0,3.0,39.07,0.176471
1996,359.813,3.45640,2.0,3.0,3.0,3.0,25.0,7.0,2.0,70.67,0.222222
1997,342.395,3.45130,2.0,4.0,2.0,8.0,25.0,6.0,2.0,76.66,0.263158
1998,392.393,0.44800,2.0,7.0,3.0,5.0,27.0,11.0,2.0,138.79,0.250000


In [61]:
# Merge descriptors with the original SMILES/Activity columns (aligned on index).
# Selecting the two source columns explicitly means that re-running this cell
# replaces the descriptor columns instead of appending a duplicate set.
data = pd.concat([data[["SMILES", "Activity"]], desc_df], axis=1)
data

Unnamed: 0,SMILES,Activity,MolWt,LogP,NumHDonors,NumHAcceptors,NumRings,NumRotatableBonds,NumHeavyAtoms,NumHeteroatoms,NumAromaticRings,TPSA,FractionCSP3
0,CN1CCN(CC1)C2=NC3=C(N2CCC4=CC=C(C=C4)OC)C(=O)N...,Inactive,398.467,0.42640,1.0,8.0,4.0,5.0,29.0,9.0,3.0,88.39,0.450000
1,CC1=CC(=C(C=C1)NC(=O)NCC2=CC=C(C=C2)OC)C,Active,284.359,3.63374,2.0,2.0,2.0,4.0,21.0,4.0,2.0,50.36,0.235294
2,CC1CCCCN1S(=O)(=O)C2=CC=CC3=C2N=CC=C3,Inactive,290.388,2.79790,0.0,3.0,3.0,2.0,20.0,5.0,2.0,50.27,0.400000
3,CC1=C(C=C(C=C1)C2=NN=C(C3=CC=CC=C32)NCC4=NC5=C...,Active,472.574,4.34392,2.0,6.0,5.0,6.0,34.0,9.0,5.0,103.87,0.160000
4,C1CN(CCC1C(=O)N)C(=O)CCC(=O)C2=CC=C(C=C2)F,Inactive,306.337,1.51250,1.0,3.0,2.0,5.0,22.0,6.0,1.0,80.47,0.437500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,CC1=CC(=C(C(=C1)C)S(=O)(=O)N2C=CC3=CC=CC=C32)C,Inactive,299.395,3.80356,0.0,3.0,3.0,2.0,21.0,4.0,3.0,39.07,0.176471
1996,C1COCCN1C(=O)C2=CC=C(C=C2)NC(=O)NC3=CC(=CC=C3)Cl,Inactive,359.813,3.45640,2.0,3.0,3.0,3.0,25.0,7.0,2.0,70.67,0.222222
1997,CCOC1=CC=C(C=C1)NC(=O)CC(=O)NC2=CC=C(C=C2)OCC,Active,342.395,3.45130,2.0,4.0,2.0,8.0,25.0,6.0,2.0,76.66,0.263158
1998,C1COCCN1S(=O)(=O)C2=CC=C(C=C2)NC(=O)C3=NC=CN=C...,Inactive,392.393,0.44800,2.0,7.0,3.0,5.0,27.0,11.0,2.0,138.79,0.250000


In [62]:
# Drop any rows with invalid SMILES: compute_descriptors returns an all-NaN
# descriptor row when RDKit cannot parse the structure, so dropna() removes them
# (along with any row missing another value).
# Rebinding instead of inplace=True keeps the cell free of in-place mutation.
data = data.dropna()

In [63]:
# Check the table after rows with missing values were removed
data

Unnamed: 0,SMILES,Activity,MolWt,LogP,NumHDonors,NumHAcceptors,NumRings,NumRotatableBonds,NumHeavyAtoms,NumHeteroatoms,NumAromaticRings,TPSA,FractionCSP3
0,CN1CCN(CC1)C2=NC3=C(N2CCC4=CC=C(C=C4)OC)C(=O)N...,Inactive,398.467,0.42640,1.0,8.0,4.0,5.0,29.0,9.0,3.0,88.39,0.450000
1,CC1=CC(=C(C=C1)NC(=O)NCC2=CC=C(C=C2)OC)C,Active,284.359,3.63374,2.0,2.0,2.0,4.0,21.0,4.0,2.0,50.36,0.235294
2,CC1CCCCN1S(=O)(=O)C2=CC=CC3=C2N=CC=C3,Inactive,290.388,2.79790,0.0,3.0,3.0,2.0,20.0,5.0,2.0,50.27,0.400000
3,CC1=C(C=C(C=C1)C2=NN=C(C3=CC=CC=C32)NCC4=NC5=C...,Active,472.574,4.34392,2.0,6.0,5.0,6.0,34.0,9.0,5.0,103.87,0.160000
4,C1CN(CCC1C(=O)N)C(=O)CCC(=O)C2=CC=C(C=C2)F,Inactive,306.337,1.51250,1.0,3.0,2.0,5.0,22.0,6.0,1.0,80.47,0.437500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,CC1=CC(=C(C(=C1)C)S(=O)(=O)N2C=CC3=CC=CC=C32)C,Inactive,299.395,3.80356,0.0,3.0,3.0,2.0,21.0,4.0,3.0,39.07,0.176471
1996,C1COCCN1C(=O)C2=CC=C(C=C2)NC(=O)NC3=CC(=CC=C3)Cl,Inactive,359.813,3.45640,2.0,3.0,3.0,3.0,25.0,7.0,2.0,70.67,0.222222
1997,CCOC1=CC=C(C=C1)NC(=O)CC(=O)NC2=CC=C(C=C2)OCC,Active,342.395,3.45130,2.0,4.0,2.0,8.0,25.0,6.0,2.0,76.66,0.263158
1998,C1COCCN1S(=O)(=O)C2=CC=C(C=C2)NC(=O)C3=NC=CN=C...,Inactive,392.393,0.44800,2.0,7.0,3.0,5.0,27.0,11.0,2.0,138.79,0.250000


In [64]:
# Map Activity to numeric (Active=1, Inactive=0).
# The numeric keys make the cell safe to re-run: values that are already 1/0
# map to themselves instead of collapsing to NaN.
activity_codes = {"Active": 1, "Inactive": 0, 1: 1, 0: 0}
activity_encoded = data["Activity"].map(activity_codes)

# Missing values were already dropped before this cell, so a label outside the
# mapping would otherwise reach the saved CSV and the label vector as NaN.
# Fail loudly and name the offending labels instead.
unmapped = activity_encoded.isna()
if unmapped.any():
    unexpected_labels = sorted(data.loc[unmapped, "Activity"].astype(str).unique())
    raise ValueError(f"Unexpected Activity labels: {unexpected_labels}")

data["Activity"] = activity_encoded.astype("int64")

In [65]:
# Confirm the Activity column now holds the numeric labels
data

Unnamed: 0,SMILES,Activity,MolWt,LogP,NumHDonors,NumHAcceptors,NumRings,NumRotatableBonds,NumHeavyAtoms,NumHeteroatoms,NumAromaticRings,TPSA,FractionCSP3
0,CN1CCN(CC1)C2=NC3=C(N2CCC4=CC=C(C=C4)OC)C(=O)N...,0,398.467,0.42640,1.0,8.0,4.0,5.0,29.0,9.0,3.0,88.39,0.450000
1,CC1=CC(=C(C=C1)NC(=O)NCC2=CC=C(C=C2)OC)C,1,284.359,3.63374,2.0,2.0,2.0,4.0,21.0,4.0,2.0,50.36,0.235294
2,CC1CCCCN1S(=O)(=O)C2=CC=CC3=C2N=CC=C3,0,290.388,2.79790,0.0,3.0,3.0,2.0,20.0,5.0,2.0,50.27,0.400000
3,CC1=C(C=C(C=C1)C2=NN=C(C3=CC=CC=C32)NCC4=NC5=C...,1,472.574,4.34392,2.0,6.0,5.0,6.0,34.0,9.0,5.0,103.87,0.160000
4,C1CN(CCC1C(=O)N)C(=O)CCC(=O)C2=CC=C(C=C2)F,0,306.337,1.51250,1.0,3.0,2.0,5.0,22.0,6.0,1.0,80.47,0.437500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,CC1=CC(=C(C(=C1)C)S(=O)(=O)N2C=CC3=CC=CC=C32)C,0,299.395,3.80356,0.0,3.0,3.0,2.0,21.0,4.0,3.0,39.07,0.176471
1996,C1COCCN1C(=O)C2=CC=C(C=C2)NC(=O)NC3=CC(=CC=C3)Cl,0,359.813,3.45640,2.0,3.0,3.0,3.0,25.0,7.0,2.0,70.67,0.222222
1997,CCOC1=CC=C(C=C1)NC(=O)CC(=O)NC2=CC=C(C=C2)OCC,1,342.395,3.45130,2.0,4.0,2.0,8.0,25.0,6.0,2.0,76.66,0.263158
1998,C1COCCN1S(=O)(=O)C2=CC=C(C=C2)NC(=O)C3=NC=CN=C...,0,392.393,0.44800,2.0,7.0,3.0,5.0,27.0,11.0,2.0,138.79,0.250000


In [66]:
## Save the curated table (SMILES, numeric Activity, descriptors) to
## EBNA1_curated_data.csv; the row index is not written.

data.to_csv(path_or_buf="EBNA1_curated_data.csv", index=False)

In [None]:
## Morgan Fingerprint generator

from rdkit.Chem import rdFingerprintGenerator

# Re-read the curated table written by the previous step
data = pd.read_csv(filepath_or_buffer="EBNA1_curated_data.csv")

In [68]:
# Morgan fingerprint generator shared by every molecule:
# radius 2 (ECFP4-style environments), folded to a 1024-bit vector.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(fpSize=1024, radius=2)

# Function to generate Morgan fingerprint
def smiles_to_morgan(smiles, fp_size=1024):
    """Convert one SMILES string into a Morgan bit fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (e.g. NaN from a blank
        CSV cell) are treated as invalid rather than passed to RDKit.
    fp_size : int, default 1024
        Length of the all-zero placeholder returned for invalid input. It must
        equal the ``fpSize`` the module-level ``morgan_gen`` was created with,
        otherwise the rows cannot be stacked into one matrix.

    Returns
    -------
    numpy.ndarray
        1-D array of fingerprint bits from ``morgan_gen``; an all-zero array of
        length ``fp_size`` when the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # uint8 instead of the float64 default of np.zeros: a single float row
        # would upcast the whole matrix when the fingerprints are stacked.
        # NOTE(review): assumes GetFingerprintAsNumPy yields uint8 — confirm
        # for the installed RDKit version.
        return np.zeros(fp_size, dtype=np.uint8)
    return morgan_gen.GetFingerprintAsNumPy(mol)

# Attach one fingerprint array per molecule as a new object column
data = data.assign(MorganFingerprint=data['SMILES'].apply(smiles_to_morgan))

In [69]:
# Convert fingerprints to numpy array for ML
import numpy as np
# Stack the per-molecule fingerprint arrays row-wise into the feature matrix
# and pull the label column out as a plain array.
X = np.vstack(data['MorganFingerprint'].tolist())
y = data['Activity'].to_numpy()

print("Morgan fingerprints generated:", X.shape)
print("Activity labels:", y.shape)

Morgan fingerprints generated: (2000, 1024)
Activity labels: (2000,)


In [70]:
# Persist the fingerprint bits (one column per bit position) together with the
# Activity label as the last column.
fingerprint_df = pd.DataFrame(X).assign(Activity=y)
fingerprint_df.to_csv(path_or_buf="EBNA1_MorganFingerprints.csv", index=False)

print("Morgan fingerprints saved to EBNA1_MorganFingerprints.csv")

Morgan fingerprints saved to EBNA1_MorganFingerprints.csv


In [71]:
# Inspect the fingerprint table that was written to CSV
fingerprint_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,Activity
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
## Train/Test split
from sklearn.model_selection import train_test_split

# 80/20 split. stratify=y keeps the Active/Inactive ratio the same in the train
# and test sets; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train set:", X_train.shape, "Test set:", X_test.shape)

Train set: (1600, 1024) Test set: (400, 1024)


In [77]:
# Save processed data
import pickle
# Write each split to its own pickle file, in the same order as before
for _pkl_name, _split in (
    ("X_train.pkl", X_train),
    ("X_test.pkl", X_test),
    ("y_train.pkl", y_train),
    ("y_test.pkl", y_test),
):
    with open(_pkl_name, "wb") as f:
        pickle.dump(_split, f)

print(" Features extracted and data prepared.")

 Features extracted and data prepared.
