# In [1]:
#Assignment_1
#Importing required libraries

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit import DataStructs
from rdkit.Chem import PandasTools
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem 
from rdkit.Chem import rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem import AtomPairs
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate
from rdkit.Chem import Descriptors
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler




#-----------------------------------------------------------------------------------------------------------------------------------
#Part A



#(1)Importing dataset_1

ds1 = pd.read_csv('Assignment_1/dataset_1.csv', header=0)

#(2)Removing rows with a missing SMILES or LABELS value
#   ignore_index=True renumbers the remaining rows 0..n-1.

ds1.dropna(subset=["SMILES", "LABELS"], inplace=True, ignore_index=True)

#(3)Checking for invalid SMILES
#   A row is invalid when the SMILES cannot be parsed (MolFromSmiles returns None),
#   when sanitization fails, or when the cell is not a string at all.

invalid_SMILES = []

for row_label, smile_val in ds1['SMILES'].items():
    try:
        mol = Chem.MolFromSmiles(smile_val, sanitize=False)
        if mol is not None:
            Chem.SanitizeMol(mol)
    except Exception:
        # RDKit raises different exception types for sanitization failures and
        # for wrong argument types; any of them marks the row as invalid.
        mol = None

    if mol is None:
        print(f"Invalid SMILES at index {row_label}: {smile_val}")
        invalid_SMILES.append(row_label)

# Delete rows with invalid SMILES
ds1.drop(invalid_SMILES, inplace=True)
print(f"Dataset had {len(invalid_SMILES)} invalid SMILES")
print(f"Dataset shape after cleaning : {ds1.shape}")

#Resetting Indexes
ds1 = ds1.reset_index(drop=True)
#Proceeding with (4) generating MACCS fingerprints

PandasTools.RenderImagesInAllDataFrames(True)

# Adds a 'ROMol' column with one RDKit molecule per SMILES; later steps reuse it.
PandasTools.AddMoleculeColumnToFrame(frame=ds1, smilesCol='SMILES')

macckeys_list = [MACCSkeys.GenMACCSKeys(molecule) for molecule in ds1['ROMol']]

ds1['MACCKEYS'] = macckeys_list
ds1['MACCKEYS_STRING'] = ds1['MACCKEYS'].apply(lambda x: x.ToBitString())
ds1[['SMILES', 'MACCKEYS_STRING']].to_csv('macckeys_output.csv', index=False)


#If we want each bit with separate column
# NOTE: the RDKit MACCS bit vector is 167 bits long (bit 0 is unused, the keys
# are bits 1-166), so 167 column names are required.
#
# macckeys_bits = ds1['MACCKEYS'].apply(lambda x: list(x.ToBitString()))
# macckeys_df = pd.DataFrame(macckeys_bits.tolist(), columns=[f'bit_{i}' for i in range(167)])

#To load MACCKEYS again
# NOTE: read the bit string as text; otherwise read_csv may infer a numeric
# dtype for it and drop the leading zeros.
#
# from rdkit.DataStructs import CreateFromBitString
# df_example = pd.read_csv('macckeys_output.csv', dtype={'MACCKEYS_STRING': str})
#
# df_example['MACCKEYS'] = df_example['MACCKEYS_STRING'].apply(lambda x: CreateFromBitString(x))


#-----------------------------------------------------------------------------------------------------------------------------------

#Part B


#(1) Generate circular fingerprints (ECFP)/Morgan fingerprints

#Old_function (kept for reference only; superseded by the generator below)
#
# for mfp in range(len(ds1)):
#     molmp=Chem.MolFromSmiles(ds1['SMILES'][mfp])
#     mfp_mol = AllChem.GetMorganFingerprintAsBitVect(molmp, radius=2, nBits=2048)
#     morgfp.append(mfp_mol)

#Morgan Generator (radius 2, folded to 2048 bits)
# Accessed through rdFingerprintGenerator, like the atom-pair and
# topological-torsion generators further down.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

# Reuse the molecules already stored in 'ROMol' instead of re-parsing every SMILES.
morgfp = [morgan_gen.GetFingerprint(molecule) for molecule in ds1['ROMol']]

ds1['ECFP'] = morgfp
ds1['ECFP_STRING'] = ds1['ECFP'].apply(lambda x: x.ToBitString())
ds1[['SMILES', 'ECFP_STRING']].to_csv('ecfp_output.csv', index=False)

#If we want each bit with separate column
'''
ecfp_bits = ds1['ECFP'].apply(lambda x: list(x.ToBitString()))
ecfp_df = pd.DataFrame(ecfp_bits.tolist(),  columns=[f'bit_{i}' for i in range(1,2049)])
'''

#To load MorganFP again
'''
from rdkit.DataStructs import CreateFromBitString
df_example = pd.read_csv('ecfp_output.csv')

df_example['ECFP'] = df_example['ECFP_STRING'].apply(lambda x: CreateFromBitString(x))
'''
#(2) Generating Atom Pair Fingerprints

# One generator (2048-bit output) is shared by all molecules.
apfpgen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)

# One fingerprint per molecule stored in the 'ROMol' column, in row order.
ap_list = [apfpgen.GetFingerprint(molecule) for molecule in ds1['ROMol']]

ds1['Atom_Pair'] = ap_list
ds1['Atom_Pair_STRING'] = [fingerprint.ToBitString() for fingerprint in ap_list]

# Only the SMILES and the bit string are written out.
atom_pair_export = ds1[['SMILES', 'Atom_Pair_STRING']]
atom_pair_export.to_csv('atom_pair_output.csv', index=False)

#To load APFP again
'''
from rdkit.DataStructs import CreateFromBitString
df_example = pd.read_csv('atom_pair_output.csv')

df_example['Atom_Pair'] = df_example['Atom_Pair_STRING'].apply(lambda x: CreateFromBitString(x))
'''
#(3) Generating Topological Torsion Fingerprints
ttfpgen = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048)

# Walk the stored molecules in row order and collect one bit vector each.
ttfp_list = []
for molecule in ds1['ROMol']:
    ttfp_list.append(ttfpgen.GetFingerprint(molecule))

ds1['Topological_Torsion'] = ttfp_list

# Text form of every fingerprint, used for the CSV export.
torsion_bit_strings = ds1['Topological_Torsion'].apply(lambda bit_vector: bit_vector.ToBitString())
ds1['Topological_Torsion_STRING'] = torsion_bit_strings

torsion_export_columns = ['SMILES', 'Topological_Torsion_STRING']
ds1[torsion_export_columns].to_csv('topological_torsion_output.csv', index=False)

#To load Topological Torsion Fingerprints again
'''
from rdkit.DataStructs import CreateFromBitString
df_example = pd.read_csv('topological_torsion_output.csv')

df_example['Topological_Torsion'] = df_example['Topological_Torsion_STRING'].apply(lambda x: CreateFromBitString(x))
'''

# (4) Generating Pharmacophore-based fingerprints
# Uses the Gobbi 2D pharmacophore factory shipped with RDKit. The molecules
# come from the existing 'ROMol' column (as in the atom-pair and torsion
# steps) instead of being parsed from SMILES a second time.
pharma_list = [
    Generate.Gen2DFingerprint(molecule, Gobbi_Pharm2D.factory)
    for molecule in ds1['ROMol']
]

ds1['Pharmacophore_FP'] = pharma_list
ds1['Pharmacophore_FP_STRING'] = ds1['Pharmacophore_FP'].apply(lambda x: x.ToBitString())
ds1[['SMILES', 'Pharmacophore_FP_STRING']].to_csv('pharmacophore_fp_output.csv', index=False)

#Check bit information
'''
GetNumOnBits()
Gobbi_Pharm2D.factory.GetBitDescription(int)
'''
# (5) Computing all RDKit 2D descriptors
# Descriptors._descList holds (name, function) pairs; only the names are needed here.
desc_list = [entry[0] for entry in Descriptors._descList]
print("Utilizing following number of descriptors : ", len(desc_list))

# One {descriptor name: value} dict per molecule, in row order.
desc_r = [
    Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(smiles_text))
    for smiles_text in ds1['SMILES']
]

df_descriptors = pd.DataFrame(desc_r)

# Place the descriptor columns next to the existing ds1 columns.
ds1f = pd.concat([ds1, df_descriptors], axis=1)

# Export only the SMILES followed by every descriptor column.
csv_list = ['SMILES', *desc_list]
ds1f[csv_list].to_csv('descriptors_2d_output.csv', index=False)

#-----------------------------------------------------------------------------------------------------------------------------------


#Part C

# Importing dataset-2

ds2 = pd.read_csv('Assignment_1/dataset_2.csv', header=0)

print("Raw data dimension :\t", ds2.shape)

# Cleaning, in this order:
#   1. infinite values become NaN
#   2. rows in which every value is missing are removed
#   3. columns that still contain any missing value are removed
#   4. the row index is renumbered
ds2 = (
    ds2.replace([np.inf, -np.inf], np.nan)
    .dropna(how='all')
    .dropna(axis=1, how='any')
    .reset_index(drop=True)
)

print("Clean data dimension :\t", ds2.shape)

# Scaling: Min-Max normalization of every remaining column
min_max_scaler = MinMaxScaler()
ds2_mm = min_max_scaler.fit_transform(ds2)
ds2_scaled = pd.DataFrame(ds2_mm, columns=ds2.columns, index=ds2.index)

# Z-score standardization, applied to the Min-Max normalized values
z_score_scaler = StandardScaler()
ds2_zs = z_score_scaler.fit_transform(ds2_scaled)
ds2_standardized = pd.DataFrame(ds2_zs, columns=ds2.columns, index=ds2.index)


#-----------------------------------------------------------------------------------------------------------------------------------



#Part D



def compute_rdkit_2d_descriptors(csv_file_path):
    """
    Takes a CSV file containing SMILES as input and outputs a DataFrame of RDKit 2D numerical descriptors.

    Rows with a missing SMILES value, and rows whose SMILES cannot be parsed or
    sanitized by RDKit, are removed before the descriptors are computed. The
    result is printed and the user is asked whether it should be saved as CSV;
    declining simply skips the save.

    Parameters:
    -----------
    csv_file_path : str
        Path to a CSV file with a header row and a SMILES column. Any other
        column (e.g. LABELS) is carried through unchanged.

    Returns:
    --------
    pd.DataFrame
        The cleaned input columns followed by one column per RDKit 2D descriptor.
    """
    # Function-local imports keep the function usable when copied on its own.
    import pandas as pd
    from rdkit import Chem
    from rdkit.Chem import Descriptors

    # Importing dataset
    molecules_df = pd.read_csv(csv_file_path, header=0)
    print(f"Raw Dataset shape : {molecules_df.shape}")

    # Removing rows without a SMILES value and renumbering the rows 0..n-1
    molecules_df = molecules_df.dropna(subset=["SMILES"]).reset_index(drop=True)
    print(f"Dataset shape after removing empty values : {molecules_df.shape}")

    # Checking for invalid SMILES
    invalid_smiles_indices = []
    for idx, smiles_string in molecules_df['SMILES'].items():
        try:
            mol = Chem.MolFromSmiles(smiles_string, sanitize=False)
            if mol is not None:
                Chem.SanitizeMol(mol)
        except Exception:
            # Sanitization failures and non-string cells raise different
            # exception types; any of them marks the row as invalid.
            mol = None

        if mol is None:
            print(f"Invalid SMILES at index {idx}: {smiles_string}")
            invalid_smiles_indices.append(idx)

    # Keeping the invalid rows (if any) so they can be saved separately
    invalid_smiles_df = molecules_df.loc[invalid_smiles_indices]

    # Deleting rows with invalid SMILES
    molecules_df = molecules_df.drop(invalid_smiles_indices).reset_index(drop=True)
    print(f"Dataset had {len(invalid_smiles_indices)} invalid SMILES")
    print(f"Dataset shape after cleaning : {molecules_df.shape}")

    # Computing all RDKit 2D descriptors
    descriptor_names = [n[0] for n in Descriptors._descList]
    print(f"Utilizing following number of descriptors : {len(descriptor_names)}")

    # One {descriptor name: value} dict per remaining molecule, in row order
    descriptors_df = pd.DataFrame(
        [
            Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(smiles_string))
            for smiles_string in molecules_df['SMILES']
        ]
    )

    # Concatenating original data with descriptors
    result_df = pd.concat([molecules_df, descriptors_df], axis=1)

    print(f"Successfully calculated 2D-Descriptors for given SMILES, here is the resulting dataset :\n", result_df)

    choice = input(" Do you want to save the result as CSV (y/n) :\t")
    if choice.strip().lower() in ('y', 'yes'):
        result_df.to_csv('rdkit_2d_descriptors_output.csv', index=False)
        if not invalid_smiles_df.empty:
            print(f"Invalid SMILES entries are also saved in a separate file.")
            invalid_smiles_df.to_csv('rdkit_2d_descriptors_output_invalid_smiles.csv', index=False)

    # Answering "no" only skips the save; the caller still receives the result.
    return result_df



        
def standardize_descriptors(raw_descriptors_df_csv_file_path):
    """
    Cleans, filters for numeric data, normalizes, and standardizes a DataFrame.

    The CSV is read, infinite values are treated as missing, non-numeric
    columns are discarded, rows that are entirely missing and columns with any
    missing value are removed. The remaining columns are Min-Max normalized
    and then Z-score standardized. Every numeric column is scaled, including
    numeric label or ID columns if the file contains any. The result is
    printed and the user is asked whether it should be saved as CSV; declining
    simply skips the save.

    Parameters:
    -----------
    raw_descriptors_df_csv_file_path : str
        Path to a CSV file (with a header row) holding the raw descriptors.

    Returns:
    --------
    pd.DataFrame
        The standardized numeric columns.

    Raises:
    -------
    ValueError
        If no rows or no numeric columns are left after cleaning.
    """
    # Function-local imports keep the function usable when copied on its own.
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    raw_descriptors_df = pd.read_csv(raw_descriptors_df_csv_file_path, header=0)
    print(f"Raw Dataset shape : {raw_descriptors_df.shape}")

    clean_df = (
        # Replacing infinite values with NaN
        raw_descriptors_df.replace([np.inf, -np.inf], np.nan)
        # Filtering to keep only numeric columns (float, int, etc.)
        .select_dtypes(include=[np.number])
        # Removing rows with all missing values
        .dropna(how='all')
        # Removing columns with any missing values
        .dropna(axis=1, how='any')
        # Resetting Indexes
        .reset_index(drop=True)
    )
    print(f"Cleaned Dataset shape : {clean_df.shape}")

    if clean_df.empty:
        # Fail with a clear message instead of an error from inside the scaler.
        raise ValueError("No numeric data left to standardize after cleaning")

    # Scaling: Applied Min-Max normalization
    normalized_features = MinMaxScaler().fit_transform(clean_df)
    normalized_df = pd.DataFrame(normalized_features, columns=clean_df.columns, index=clean_df.index)

    # Applying Z-score standardization after normalization
    standardized_features = StandardScaler().fit_transform(normalized_df)
    standardized_df = pd.DataFrame(standardized_features, columns=clean_df.columns, index=clean_df.index)

    print(f"Successfully standardized given 2D-Descriptors, here is the resulting dataset :\n", standardized_df)

    choice = input(" Do you want to save the result as CSV (y/n) :\t")
    if choice.strip().lower() in ('y', 'yes'):
        standardized_df.to_csv('rdkit_2d_descriptors_standardized_output.csv', index=False)

    # Answering "no" only skips the save; the caller still receives the result.
    return standardized_df



#-----------------------------------------------------------------------------------------------------------------------------------


# Output of the cell above:
# Dataset had 0 invalid SMILES
# Dataset shape after cleaning : (643, 2)
# Utilizing following number of descriptors :  217
# Raw data dimension :	 (643, 218)
# Clean data dimension :	 (643, 205)
