In [28]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [29]:
# Load the training set from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [30]:
# Preview the first 5 entries of the SMILES column
df['SMILES'].head(n=5)

0          FC1=C(F)C(F)(F)C1(F)F
1    c1ccc2c(c1)ccc3Nc4ccccc4c23
2            CCN1C(C)=Nc2ccccc12
3                     CC#CC(=O)O
4                      CCCCC(S)C
Name: SMILES, dtype: object

In [31]:
# Summarise the SMILES column: entry count, non-null count, dtype, memory usage
df.loc[:, 'SMILES'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2662 entries, 0 to 2661
Series name: SMILES
Non-Null Count  Dtype 
--------------  ----- 
2662 non-null   object
dtypes: object(1)
memory usage: 20.9+ KB


In [32]:
# Define a function that generates basic features from a SMILES string
from rdkit import Chem
from rdkit.Chem import Descriptors

# --------------------------------
# Function: Extract basic molecular features
# Input : SMILES string
# Output: Dictionary of numeric features
# --------------------------------
def basic_features(smiles):
    """
    Extract basic, count-based molecular features from a SMILES string.

    The features describe the size and elemental composition of a molecule.

    Parameters
    ----------
    smiles : str
        SMILES representation of a single molecule.

    Returns
    -------
    dict or None
        Dictionary with the keys ``mol_weight``, ``num_atoms``, ``num_rings``,
        ``num_C``, ``num_N``, ``num_O`` and ``num_S``.
        ``None`` is returned when ``smiles`` is not a non-blank string or
        when RDKit cannot parse it.
    """

    # Missing values in a CSV column arrive as NaN/None rather than str.
    # Reject those (and blank strings, which describe no molecule) up front
    # so the caller gets the documented None instead of an exception.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    # Convert SMILES string into an RDKit molecule object
    # If SMILES is invalid, RDKit returns None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Collect the element symbols in a single pass over the atoms; the
    # per-element counts below are all derived from this one list.
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]

    return {
        # Molecular weight (sum of atomic weights of all atoms)
        "mol_weight": Descriptors.MolWt(mol),

        # Number of atoms in the molecule graph; hydrogens that are only
        # implied by the SMILES string are not counted
        "num_atoms": mol.GetNumAtoms(),

        # Number of rings in the molecule (important for rigidity & packing)
        "num_rings": len(Chem.GetSSSR(mol)),

        # Number of carbon atoms
        "num_C": symbols.count("C"),

        # Number of nitrogen atoms (often affects polarity & H-bonding)
        "num_N": symbols.count("N"),

        # Number of oxygen atoms (strong effect on polarity)
        "num_O": symbols.count("O"),

        # Number of sulfur atoms (heavier heteroatom, affects melting point)
        "num_S": symbols.count("S"),
    }

In [33]:
# Apply feature extraction to dataset

# Run the feature extraction function on each SMILES string.
# This yields a Series holding one dictionary per molecule
# (or None where the SMILES could not be parsed).
feature_records = df["SMILES"].map(basic_features)

# Build the feature DataFrame in a single constructor call instead of
# `.apply(pd.Series)`, which creates one Series per row and upcasts the
# integer counts to float. Unparseable molecules become empty records,
# i.e. rows of NaN, and the original index is kept so the rows stay
# aligned with `df`.
basic_features_df = pd.DataFrame(
    [record if record is not None else {} for record in feature_records],
    index=df.index,
)

In [34]:
# Preview the first 5 rows of the extracted feature table
basic_features_df.head(n=5)

Unnamed: 0,mol_weight,num_atoms,num_rings,num_C,num_N,num_O,num_S
0,162.032,10.0,1.0,4.0,0.0,0.0,0.0
1,217.271,17.0,4.0,16.0,1.0,0.0,0.0
2,160.22,12.0,2.0,10.0,2.0,0.0,0.0
3,84.074,6.0,0.0,4.0,0.0,2.0,0.0
4,118.245,7.0,0.0,6.0,0.0,0.0,1.0


In [35]:
# Add the basic features to the original dataframe.
# Any feature columns left in `df` by an earlier run of this cell are dropped
# first, so re-running the cell replaces them instead of appending a second
# copy of every feature column.
df = pd.concat(
    [df.drop(columns=basic_features_df.columns, errors="ignore"), basic_features_df],
    axis=1,
)
df.head()

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 422,Group 423,Group 424,mol_weight,num_atoms,num_rings,num_C,num_N,num_O,num_S
0,2175,FC1=C(F)C(F)(F)C1(F)F,213.15,0,0,0,0,0,0,0,...,0,0,0,162.032,10.0,1.0,4.0,0.0,0.0,0.0
1,1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,0,0,0,0,0,0,0,...,0,0,0,217.271,17.0,4.0,16.0,1.0,0.0,0.0
2,2994,CCN1C(C)=Nc2ccccc12,324.15,2,1,0,0,0,0,0,...,0,0,0,160.22,12.0,2.0,10.0,2.0,0.0,0.0
3,1704,CC#CC(=O)O,351.15,1,0,0,0,0,0,0,...,0,0,0,84.074,6.0,0.0,4.0,0.0,2.0,0.0
4,2526,CCCCC(S)C,126.15,2,3,0,0,0,0,0,...,0,0,0,118.245,7.0,0.0,6.0,0.0,0.0,1.0
