In [2]:
#Assignment_1_v2

#Importing required libraries

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit import DataStructs

#Part A

#(1) Loading dataset_1 from disk (the first row of the file is the header)
ds1 = pd.read_csv('Assignment_1/dataset_1.csv', header=0)
print(f'Raw dataset-1 shape : ',ds1.shape)

#(2) Counting the missing values per column before discarding incomplete rows
missing_count = ds1.isna().sum()
print(f'Missing values in each column :\n', missing_count)

# A row is only usable when it has both a SMILES string and a label;
# ignore_index renumbers the surviving rows 0..n-1 so positions and labels agree below
ds1 = ds1.dropna(subset=["SMILES","LABELS"], ignore_index=True)
print(f'Dataset-1 shape after removing missing values : ',ds1.shape)

#(3) Validating every SMILES: parse without sanitization, then sanitize explicitly
# so that a chemically invalid structure surfaces as an exception we can record
invalid_SMILES = []
for smile, smile_val in enumerate(ds1['SMILES']):
    mol = Chem.MolFromSmiles(smile_val, sanitize=False)
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Sanitization failed -> remember the row so it can be dropped afterwards
        print(f"Invalid SMILES at index {smile}: {smile_val}")
        invalid_SMILES.append(smile)

# Dropping the rejected rows so later fingerprint/descriptor steps never see them
ds1 = ds1.drop(index=invalid_SMILES)
print(f"Dataset-1 had {len(invalid_SMILES)} invalid SMILES")

# The shape is only reported again when something was actually removed
if invalid_SMILES:
     print(f"Dataset-1 shape after removing invalid SMILES : {ds1.shape}")

# Renumbering the rows 0..n-1 so the positional loops further down stay valid
ds1 = ds1.reset_index(drop=True)

#Summary:
#First, it checks for missing values in each column, then removes them. 
#After that, it creates a list of invalid SMILES (if any in the dataset) by checking whether the SMILES string at each row could be sanitized or not. 
#After this, code resets the index to reorder the DataFrame after removing invalid SMILES so that the DataFrame could be loaded for further iterative operations without indexing issues.

# Proceeding with (4) generating MACCS fingerprints
from rdkit.Chem import PandasTools
from rdkit.Chem import MACCSkeys
PandasTools.RenderImagesInAllDataFrames(True)


# Adding a 'ROMol' column to the DataFrame which stores RDKit molecule objects
# (this also allows direct visualization of the structures built from the SMILES)
PandasTools.AddMoleculeColumnToFrame(frame=ds1, smilesCol='SMILES')


# RDKit returns a 167-bit vector for MACCS: position 0 is an unused placeholder and
# positions 1..166 are the actual MACCS keys. Column k of the table must therefore hold
# bit k, so bit 0 is skipped. (Previously bit 0 was written into column 1, every key was
# shifted by one, and key 166 ended up in an extra column 167 that the CSV export dropped.)
maccs_key_ids = list(range(1, 167))

# Collecting one list of 166 integer bits per molecule and building the table once,
# instead of writing 167 individual cells per row with .loc
maccs_rows = []
for each_smiles in ds1['SMILES']:
    # Converting the SMILES into a molecule and generating its MACCS fingerprint
    given_smi2maccsfp = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(each_smiles))
    bit_string = given_smi2maccsfp.ToBitString()
    # bit_string[0] is the placeholder; the remaining characters are keys 1..166
    maccs_rows.append([int(each_bit) for each_bit in bit_string[1:]])

# One row per molecule (same index as ds1), one integer column per MACCS key
maccs_table = pd.DataFrame(maccs_rows, index=ds1.index, columns=maccs_key_ids)


# Combining the original dataset with the MACCS key table (keys 1..166)
maccs_ds1 = ds1.merge(maccs_table, left_index=True, right_index=True, how='inner')

print('Dataset-1 with MACCS fingerprint :\n', maccs_ds1)


# Setting up the column order for CSV file: SMILES followed by keys 1..166
macckeys_coln = ['SMILES']
macckeys_coln.extend(maccs_key_ids)

# Saving the SMILES and their MACCS fingerprints to a CSV file
maccs_ds1.to_csv('macckeys_output.csv', index=False, columns=macckeys_coln)


# Summary: This code generates MACCS keys for each SMILES string by converting each molecule to an RDKit object and extracting its 167-bit fingerprint,
# of which the unused bit 0 is discarded so that columns 1..166 correspond to MACCS keys 1..166.
# It then merges these numerical fingerprints with the original dataset and exports the combined data (SMILES strings + fingerprints) to a CSV file.


#Part B

from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem import AtomPairs

# Number of bits requested from every generator in this part
FP_SIZE = 2048


def _fingerprint_bit_frame(fp_generator, molecules, column_prefix, index, n_bits=FP_SIZE):
    """Expand bit-vector fingerprints into a DataFrame with one integer column per bit.

    Parameters
    ----------
    fp_generator : object exposing ``GetFingerprint(mol)`` whose result has ``ToBitString()``
    molecules : iterable of RDKit molecules, one per output row
    column_prefix : str
        Prefix of the generated column names; columns are ``f'{column_prefix}{i}'``
        for ``i`` in ``1..n_bits``.
    index : pandas index assigned to the rows (must match ``len(molecules)``)
    n_bits : int
        Length of each fingerprint / number of columns.

    Returns
    -------
    pd.DataFrame
        One row per molecule, one 0/1 integer column per bit.
    """
    bit_rows = [
        [int(bit) for bit in fp_generator.GetFingerprint(molecule).ToBitString()]
        for molecule in molecules
    ]
    column_names = [f'{column_prefix}{i}' for i in range(1, n_bits + 1)]
    return pd.DataFrame(bit_rows, index=index, columns=column_names)


#(1) Generate circular fingerprints (ECFP)/Morgan fingerprints

# radius=2 controls how far from each atom we look,
# fpSize=2048 means each fingerprint will have 2048 bits
morgan_gen = AllChem.GetMorganGenerator(radius=2, fpSize=FP_SIZE)

# Each column (bit_1 to bit_2048) is one position/bit of the fingerprint
ecfp_df = _fingerprint_bit_frame(
    morgan_gen,
    [Chem.MolFromSmiles(smiles_text) for smiles_text in ds1['SMILES']],
    'bit_',
    ds1.index,
)

# Joining the original data with the 2048-bit fingerprint columns
ecfp_ds1 = ds1.merge(ecfp_df, left_index=True, right_index=True, how='inner')

print('Dataset-1 with Morgan fingerprint :\n', ecfp_ds1)

# Saving the SMILES and their Morgan fingerprints to a CSV file
ecfp_coln = ['SMILES'] + list(ecfp_df.columns)
ecfp_ds1.to_csv('ecfp_output.csv', index=False, columns=ecfp_coln)


#(2) Generating Atom Pair Fingerprints

apfpgen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=FP_SIZE)

# Using the pre-existing RDKit molecule objects from the 'ROMol' column
# Each column (bit_number_1 to bit_number_2048) is one position/bit of the fingerprint
ap_df = _fingerprint_bit_frame(apfpgen, ds1['ROMol'], 'bit_number_', ds1.index)

ap_ds1 = ds1.merge(ap_df, left_index=True, right_index=True, how='inner')

print('Dataset-1 with Atom Pair fingerprint :\n', ap_ds1)

# Saving the SMILES and their Atom pair fingerprints to a CSV file
ap_coln = ['SMILES'] + list(ap_df.columns)
ap_ds1.to_csv('atom_pair_output.csv', index=False, columns=ap_coln)


#(3) Generating Topological Torsion Fingerprints

ttfpgen = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=FP_SIZE)

# Using the pre-existing RDKit molecule objects from the 'ROMol' column
# Each column (bit_pos_1 to bit_pos_2048) is one position/bit of the fingerprint
ttfp_df = _fingerprint_bit_frame(ttfpgen, ds1['ROMol'], 'bit_pos_', ds1.index)

ttfp_ds1 = ds1.merge(ttfp_df, left_index=True, right_index=True, how='inner')

print('Dataset-1 with Topological Torsion fingerprint :\n', ttfp_ds1)

# Saving the SMILES and their Topological Torsion fingerprints to a CSV file
ttfp_coln = ['SMILES'] + list(ttfp_df.columns)
ttfp_ds1.to_csv('topological_torsion_output.csv', index=False, columns=ttfp_coln)


#Summary: Morgan (ECFP), Atom Pair and Topological Torsion fingerprints are all generated the same way (code-wise):
#an RDKit fingerprint generator produces a 2048-bit vector for each molecule, the helper above expands every vector
#into 2048 individual 0/1 columns, and the resulting table is merged with the original dataset and exported
#(SMILES + bits) to its own CSV file. Only the generator and the column prefix differ between the three.

# (4) Generating Pharmacophore-based fingerprints 
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate

# One Gobbi 2D pharmacophore fingerprint per molecule, built from a freshly parsed SMILES
pharma_list = [
    Generate.Gen2DFingerprint(Chem.MolFromSmiles(smiles_text), Gobbi_Pharm2D.factory)
    for smiles_text in ds1['SMILES']
]

# Expanding each fingerprint's bit string into a list of single-character bits
pharma_bits = [list(fingerprint.ToBitString()) for fingerprint in pharma_list]

# One row per molecule, one column per pharmacophore bit
pharma_df = pd.DataFrame(pharma_bits)

print(pharma_df.shape)

# Renaming the bit columns to 1..N (N = number of bits in the fingerprint)
ncoln = list(range(1, len(pharma_df.columns) + 1))
pharma_df.columns = ncoln

# Attaching the bit columns to the original dataset, row by row via the shared index
pharma_ds1 = ds1.merge(pharma_df, left_index=True, right_index=True, how='inner')

print('Dataset-1 with Pharmacophore-based fingerprint :\n', pharma_ds1)

# Exporting everything except the label and the molecule-object column,
# i.e. SMILES plus all the pharmacophore bits
pcoln = [column for column in pharma_ds1.columns if column not in ('LABELS', 'ROMol')]

# Saving the SMILES and Pharmacophore fingerprints to a CSV file 
pharma_ds1.to_csv('pharmacophore_fp_output.csv', index=False, columns=pcoln)

# Note: To check which chemical features each bit represents, we can use:
# GetOnBits() and Gobbi_Pharm2D.factory.GetBitDescription(bit_number)


#Summary: This code generates Gobbi Pharmacophore fingerprints from SMILES and converting them into numeric bit in similar manner as Atom Pair and Topological Torsion Fingerprints generators (code wise).  


# (5) Computing all RDKit 2D descriptors
from rdkit.Chem import Descriptors

# Names of every descriptor registered in RDKit's descriptor list
desc_list = [name for name, *_ in Descriptors._descList]
print(f"Utilizing following number of descriptors : ", len(desc_list))

# One dictionary of descriptor values per molecule, each molecule parsed from its SMILES
desc_r = [
    Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(smiles_text))
    for smiles_text in ds1['SMILES']
]

# Rows are molecules, columns are descriptors
df_descriptors = pd.DataFrame(desc_r)

# Placing the descriptor columns next to the original dataset columns
ds1f = pd.concat([ds1, df_descriptors], axis=1)


print('Dataset-1 with 2D descriptors :\n', ds1f)

# Columns to export: SMILES followed by every descriptor name
csv_list = ['SMILES', *desc_list]

# Saving only SMILES and 2D descriptors to CSV
ds1f.loc[:, csv_list].to_csv('descriptors_2d_output.csv', index=False)


#Summary: The code firstly gets all available RDKit 2D descriptors, then looping through each SMILES string to convert it into a molecule and calculate every descriptor value. 
#These descriptor values are being stored in a DataFrame where each column represents one property, and then this DataFrame is being combined with the original dataset. 
#Finally, it saves the SMILES strings plus all these numeric descriptor values to a CSV file.

#Part C

# Importing dataset-2

ds2=pd.read_csv('Assignment_1/dataset_2.csv', header=0)

print(f"Raw data dimension :\t",ds2.shape)


#Replacing infinite values with NaN
ds2 = ds2.replace([np.inf, -np.inf], np.nan)

#Removing rows with all missing values
ds2 = ds2.dropna(how='all')

#Removing columns with any missing values
ds2 = ds2.dropna(axis=1, how='any')

#Resetting Indexes
ds2 = ds2.reset_index(drop=True)

print(f"Clean data dimension :\t",ds2.shape)


# Applying Min-Max normalization to the data.
from sklearn.preprocessing import MinMaxScaler
#MinMaxScaler is a data preprocessing technique that rescales features to a fixed range.
#By default each column is shifted and scaled so its minimum becomes 0 and its maximum becomes 1.

ds2_mm = MinMaxScaler().fit_transform(ds2)
# Kept under its own name so it is not overwritten by the mixed scaling further down
ds2_minmax_df = pd.DataFrame(ds2_mm, columns=ds2.columns, index=ds2.index)


# Applying Z-score standardization 
from sklearn.preprocessing import StandardScaler
#StandardScaler standardizes numeric features by removing the mean and scaling them to unit variance. 
#This is also called Z-score normalization: each feature is transformed independently to mean 0 and standard deviation 1.

ds2_zs= StandardScaler().fit_transform(ds2)
ds2_standardized = pd.DataFrame(ds2_zs, columns=ds2.columns, index=ds2.index)

#Example of finding standard deviation and mean for the first column

first_col = ds2.iloc[:, 0]


# Calculating mean and std values
# NOTE: pandas .std() uses the sample formula (ddof=1) while StandardScaler divides by the
# population standard deviation (ddof=0), so the two values differ slightly.
mean_val = first_col.mean()
std_val = first_col.std()

print(f"Column Name : {ds2.columns[0]}, Mean: {mean_val}, Std: {std_val}")

# Both scalers are linear rescalings, so neither changes the shape of a column's distribution.
# MinMaxScaler is driven entirely by the minimum and maximum and is therefore strongly affected by outliers;
# StandardScaler is also influenced by outliers (through mean and variance) and is most natural for roughly normal data.

#A simple rule is used below to choose a scaler per column from a normality test
#(more complex data requires additional context and specialized packages).
from scipy.stats import shapiro
from sklearn.compose import ColumnTransformer
import warnings


warnings.filterwarnings('ignore', category=UserWarning, module='scipy.stats')

# Showing the original dataset shape before scaling
print(ds2.shape)

scol = []  # Columns judged normally distributed (will use Z-score scaling)
mmcol = [] # Non-normal or constant columns (will use Min-Max scaling)

# Checking each column to see if its data follows a normal distribution
for col_name in ds2.columns:
    col = ds2[col_name]

    # Shapiro-Wilk test: p > 0.05 means normality is not rejected
    p = shapiro(col)[1]

    # A constant column (std == 0) is never treated as normal, whatever p-value is reported
    if p > 0.05 and col.std() != 0:
        scol.append(col_name)
    else:
        mmcol.append(col_name)

zscore_count = len(scol)   # Number of columns scaled with StandardScaler
minmax_count = len(mmcol)  # Number of columns scaled with MinMaxScaler

# Creating scaling transformers for different column groups
transformers = []
if scol:
    # StandardScaler for normally distributed columns (mean=0, std=1 after scaling)
    transformers.append(('zscore', StandardScaler(), scol))
if mmcol:
    # MinMaxScaler for non-normal/constant columns (scales to 0-1 range)
    transformers.append(('minmax', MinMaxScaler(), mmcol))

# Applying both scalers at once using ColumnTransformer
ct = ColumnTransformer(transformers, remainder='passthrough')
# Raw transformer output: its columns are ordered by transformer (all of scol, then all of mmcol),
# NOT in the original order of ds2.columns
ds2_scaled = ct.fit_transform(ds2)

# Labelling the output with the order the transformer actually produced, then restoring the
# original column order. (Labelling it directly with ds2.columns attached wrong names to the
# values whenever normal and non-normal columns were interleaved.)
transformed_order = scol + mmcol
ds2_scaled_df = pd.DataFrame(ds2_scaled, columns=transformed_order, index=ds2.index)[list(ds2.columns)]

# Showing the scaling summary and first few rows of scaled data
print(f'Summary: Columns with Not Normal distribution {minmax_count} and Normal distribution {zscore_count}')
print(ds2_scaled_df.head())


#Part D

def compute_rdkit_2d_descriptors(csv_file_path):
    """
    Takes a CSV file containing SMILES as input and outputs a DataFrame of RDKit 2D numerical descriptors.

    Rows with a missing SMILES and rows whose SMILES cannot be sanitized by RDKit are removed
    before the descriptors are computed. The user is then asked whether the result should be
    written to 'rdkit_2d_descriptors_output.csv'; if so, any invalid-SMILES rows are written to
    'rdkit_2d_descriptors_output_invalid_smiles.csv' as well.

    Parameters:
    -----------
    csv_file_path : str
        Path to a CSV file with a header row and a 'SMILES' column.
        Any other columns (e.g. LABELS) are carried through unchanged.

    Returns:
    --------
    pd.DataFrame
        The cleaned input columns followed by all RDKit 2D descriptors.
        The frame is returned whether or not the user chooses to save it.
    """
    import pandas as pd
    from rdkit import Chem
    from rdkit.Chem import Descriptors

    # Importing dataset
    molecules_df = pd.read_csv(csv_file_path, header=0)
    print(f"Raw Dataset shape : {molecules_df.shape}")

    # Removing rows without a SMILES; ignore_index renumbers rows 0..n-1 so that the
    # positions used below are also valid index labels
    molecules_df = molecules_df.dropna(subset=["SMILES"], ignore_index=True)
    print(f"Dataset shape after removing empty values : {molecules_df.shape}")

    # Checking for invalid SMILES: parse unsanitized, then sanitize explicitly so a
    # chemically invalid structure raises and can be recorded
    invalid_smiles_indices = []
    for idx, smiles_string in enumerate(molecules_df['SMILES']):
        mol = Chem.MolFromSmiles(smiles_string, sanitize=False)
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            print(f"Invalid SMILES at index {idx}: {smiles_string}")
            invalid_smiles_indices.append(idx)

    # Keeping the rejected rows so they can be exported separately
    invalid_smiles_df = molecules_df.loc[invalid_smiles_indices]

    # Deleting rows with invalid SMILES and renumbering what is left
    molecules_df = molecules_df.drop(index=invalid_smiles_indices)
    print(f"Dataset had {len(invalid_smiles_indices)} invalid SMILES")
    print(f"Dataset shape after cleaning : {molecules_df.shape}")
    molecules_df = molecules_df.reset_index(drop=True)

    # Computing all RDKit 2D descriptors
    descriptor_names = [n[0] for n in Descriptors._descList]
    print(f"Utilizing following number of descriptors : {len(descriptor_names)}")

    descriptors_df = pd.DataFrame([
        Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(smiles_string))
        for smiles_string in molecules_df['SMILES']
    ])

    # Concatenating original data with descriptors (both frames share the index 0..n-1)
    result_df = pd.concat([molecules_df, descriptors_df], axis=1)

    print(f"Successfully calculated 2D-Descriptors for given SMILES, here is the resulting dataset :\n", result_df)

    choice = str(input((f" Do you want to save the result as CSV (y/n) :\t")))
    # Accepting y/yes in any letter case, ignoring surrounding whitespace
    if choice.strip().lower() in ('y', 'yes'):
        result_df.to_csv('rdkit_2d_descriptors_output.csv', index=False)
        if len(invalid_smiles_df) > 0:
            print(f"Invalid SMILES entries are also saved in a separate file.")
            invalid_smiles_df.to_csv('rdkit_2d_descriptors_output_invalid_smiles.csv',index=False)
    else:
        # Declining to save must not terminate the interpreter/kernel (the old exit() did)
        print("No file saved.")

    return result_df


def standardize_descriptors(raw_descriptors_df_csv_file_path):
    '''
    Cleans, filters for numeric data, and standardizes a descriptor table
    using distribution-based scaling (StandardScaler for normal data, MinMaxScaler for non-normal).

    Steps: infinite values become NaN, non-numeric columns are dropped, rows that are entirely
    NaN and columns containing any NaN are removed, then every remaining column is tested with
    Shapiro-Wilk. Columns with p > 0.05 and non-zero standard deviation are Z-score scaled;
    all other columns (including constant ones) are Min-Max scaled. The user is finally asked
    whether to save the result to 'rdkit_2d_descriptors_standardized_output.csv'.

    Parameters:
    -----------
    raw_descriptors_df_csv_file_path : str
        Path to a CSV file (with header row) containing the raw descriptors.

    Returns:
    --------
    pd.DataFrame
        The scaled numeric columns, in the same column order as the cleaned input.

    Raises:
    -------
    ValueError
        If no numeric data is left after cleaning.
    '''
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    from scipy.stats import shapiro
    from sklearn.compose import ColumnTransformer
    import warnings

    # Suppressing scipy.stats UserWarnings (e.g. from the Shapiro-Wilk test on constant data)
    warnings.filterwarnings('ignore', category=UserWarning, module='scipy.stats')

    raw_descriptors_df=pd.read_csv(raw_descriptors_df_csv_file_path, header=0)

    print(f"Raw Dataset shape : {raw_descriptors_df.shape}")

    # Replacing infinite values with NaN
    clean_df = raw_descriptors_df.replace([np.inf, -np.inf], np.nan)

    # Filtering to keep only numeric columns (float, int, etc.)
    clean_df = clean_df.select_dtypes(include=[np.number])

    # Removing rows with all missing values
    clean_df = clean_df.dropna(how='all')

    # Removing columns with any missing values
    clean_df = clean_df.dropna(axis=1, how='any')

    # Resetting Indexes
    clean_df = clean_df.reset_index(drop=True)

    if clean_df.empty:
        raise ValueError("No numeric data left to scale after cleaning the input file.")

    scol = []  # Columns needing StandardScaler
    mmcol = [] # Columns needing MinMaxScaler

    # Testing each column's distribution to choose the scaling method
    for col_name in clean_df.columns:
        col = clean_df[col_name]
        p = shapiro(col)[1]  # Shapiro-Wilk test p-value

        # Normality not rejected AND the column is not constant -> Z-score; everything else -> Min-Max
        if p > 0.05 and col.std() != 0:
            scol.append(col_name)
        else:
            mmcol.append(col_name)

    zscore_count = len(scol)   # Normal distribution columns
    minmax_count = len(mmcol)  # Non-normal/constant columns

    # Creating scaling transformers based on column distribution
    transformers = []
    if scol:
        transformers.append(('zscore', StandardScaler(), scol))
    if mmcol:
        transformers.append(('minmax', MinMaxScaler(), mmcol))

    # Applying the scaling using ColumnTransformer
    ct = ColumnTransformer(transformers, remainder='passthrough')
    df_transformed_scaled = ct.fit_transform(clean_df)

    # The transformer emits its columns grouped by transformer (scol first, then mmcol), so the
    # output is labelled in that order and then put back into the original column order.
    # Labelling it directly with clean_df.columns would attach wrong names to the values.
    transformed_order = scol + mmcol
    df_scaled = pd.DataFrame(
        df_transformed_scaled, columns=transformed_order, index=clean_df.index
    )[list(clean_df.columns)]

    # Showing scaling summary and preview
    print(f'Summary: Columns with Not Normal distribution {minmax_count} and Normal distribution {zscore_count}')
    print("Successfully standardized given 2D-Descriptors, here is the resulting dataset :\n", df_scaled.head())
    print(f"Cleaned and Processed Dataset shape : {df_scaled.shape}")

    # Asking user if they want to save the scaled dataset (y/yes in any letter case)
    choice = str(input("Do you want to save the result as CSV (y/n) :\t"))
    if choice.strip().lower() in ('y', 'yes'):
        df_scaled.to_csv('rdkit_2d_descriptors_standardized_output.csv', index=False)
        print("File saved as 'rdkit_2d_descriptors_standardized_output.csv'")
    else: 
        print("No file saved.")

    return df_scaled
#Summary: This function is cleaning descriptor data by removing non-numeric columns and handling missing/infinite values.
#Then it is scaling each column based on its distribution (StandardScaler for normal data, MinMaxScaler for non-normal). 









Raw dataset-1 shape :  (757, 2)
Missing values in each column :
 SMILES    111
LABELS      3
dtype: int64
Dataset-1 shape after removing missing values :  (643, 2)
Dataset-1 had 0 invalid SMILES
Dataset-1 with MACCS fingerprint :
                                  SMILES  LABELS  \
0                          CC(Cl)(Cl)Cl   231.0   
1                             C=C(Cl)Cl   450.0   
2                                C=C=CC     7.0   
3             CC1(C)C(=O)N(CO)C(=O)N1CO    24.0   
4                             C=C/C=C/C    28.0   
..                                  ...     ...   
638       CCCCCCCCCCCCCCCCCCCCCCCCCCCCC    58.2   
639      CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC    60.3   
640     CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC    88.7   
641    CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC    42.0   
642  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC    67.5   

                                                 ROMol  1  2  3  4  5  6  7  \
0    <rdkit.Chem.rdchem.Mol object at 0x0000020B6F7...  0  0  0  0  0  0  0  

In [None]:
#C1
#Storing each bit as a separate descriptor → See Part A step (4) & Part B (1) - (4)

#C2
#Dataset-1 Issues → See Part A, steps (2)–(3): Missing values removed with dropna(), invalid SMILES are validated with Chem.SanitizeMol()
#Dataset-2 Issues → See Part C, data cleaning: Infinite values replaced, rows/columns with NaNs removed
#Why Shape was changed → See Part A, step (2): rows with missing SMILES/LABELS were removed → shape goes from (757, 2) to (643, 2) (step (3) found 0 invalid SMILES, so it removed nothing); Part C: columns containing infinite values or NaNs were removed, which made the shape go from (643, 218) to (643, 205)
#Why Cleaning was Needed → See Part A & Part C: (1) Prevents crashes from invalid SMILES when iterating, (2) Allows for accurate calculations of descriptors without NaN values

#C3
#MinMax and StandardScaler Description → See Part C
#Mean/Std Code → See Part C, first column example: added in code as a standalone provided with .mean() and .std()
#Normalization  → See Part C, distribution-based scaling i.e., Z-score standardization on columns with normal distribution and MinMax normalization on non-normal distributed data and constant range of data(std = 0)