In [93]:
import numpy as np
import pandas as pd

In [94]:
# READ the files
f_mord = pd.read_csv("FGFR4_desc2.csv", low_memory=False, index_col=0)
i_mord = pd.read_csv("IGF1R_desc2.csv", low_memory=False, index_col=0)
f_padel = pd.read_csv("FGFR4_descriptor_padelpy.csv", low_memory=False)
i_padel = pd.read_csv("IGF1R_descriptor_padelpy.csv", low_memory=False)
# we only need the smile (identifier) and the IC50
f_ = pd.read_csv("bindingdb_11012022_FGFR4.tsv", low_memory=False, sep='\t')[['Ligand SMILES','IC50 (nM)']]
i_ = pd.read_csv("bindingdb_11012022_IGF1R.tsv", low_memory=False, sep='\t')[['Ligand SMILES','IC50 (nM)']]

In [95]:
# del columns; seems irrelevant
f_mord = f_mord.drop(['ABC', 'ABCGG'], axis=1)
i_mord = i_mord.drop(['ABC', 'ABCGG'], axis=1)
# take smile out of list and string symbols ([''])
f_padel['smile'] = f_padel['smile'].apply(lambda x: x[2:-2])
i_padel['smile'] = i_padel['smile'].apply(lambda x: x[2:-2])
# change the names and remove the duplicated rows
f_ = f_.rename(columns={'Ligand SMILES': 'smile', 'IC50 (nM)': 'IC50'}).drop_duplicates(subset="smile")
i_ = i_.rename(columns={'Ligand SMILES': 'smile', 'IC50 (nM)': 'IC50'}).drop_duplicates(subset="smile")

In [96]:
#the order of mordred and padel descriptors are the same, so simply concatenate them
f_features = pd.concat([f_padel, f_mord], axis=1)
i_features = pd.concat([i_padel, i_mord], axis=1)

In [97]:
# add the IC50 column to the features based on the smile which is a common column in both datasets
i = i_features.merge(i_[['smile', 'IC50']], on='smile', how='left')
f = f_features.merge(f_[['smile', 'IC50']], on='smile', how='left')

##  clean and preprocess:

1. Identify columns with non-numeric data types.
2. Attempt to convert non-numeric values to NaN.
3. Fill the Nan values with the mean value of the feature for all samples
4. Drop columns with over 1% non-numeric values.
5. Convert 'IC50' column to float and handle '>' or '<' characters. Then convert to pIC50

In [98]:
columns_to_drop = []  # Initialize a list to store columns to drop
# Assuming you have a DataFrame i
for column_name in i.columns:
    # Check if all values in the column have the same data type
    same_data_type = i[column_name].apply(type).nunique() == 1
    
    if not same_data_type:
        print(f"Not all values in column '{column_name}' have the same data type.")
    else:
        column_data_type = i[column_name].apply(type).iloc[0]
        if column_data_type not in (float, int) and column_name != 'smile':
            #print(f"Column '{column_name}' has data type: {column_data_type.__name__}")
            
            # Initialize a count for non-convertible values
            non_convertible_count = 0
            
            # Attempt to convert each value to float and count non-convertible values
            for index, value in enumerate(i[column_name]):
                try:
                    float_value = float(value)
                except (ValueError, TypeError):
                    non_convertible_count += 1
            
            # Print the count and percentage of non-convertible values
            total_values = len(i[column_name])
            non_convertible_percentage = (non_convertible_count / total_values) * 100
            if non_convertible_percentage > 1:
                #print(f"Number of non-convertible values in {column_name}: {non_convertible_count} ({non_convertible_percentage:.2f}%)")
                # Add the column to the list of columns to drop
                columns_to_drop.append(column_name)
                
            # replace non-convertible values with NaN
            i[column_name] = pd.to_numeric(i[column_name], errors='coerce')
print ("dataframe i has %d columns full of not float type values"%(len(columns_to_drop)))
i.drop(columns=columns_to_drop, inplace=True)
i['IC50'] = i['IC50'].str.replace(">", "").str.replace("<", "").astype(float)

Not all values in column 'IC50' have the same data type.
dataframe i has 397 columns full of not float type values


In [99]:
columns_to_drop = []  # Initialize a list to store columns to drop
# Assuming you have a DataFrame f
for column_name in f.columns:
    # Check if all values in the column have the same data type
    same_data_type = f[column_name].apply(type).nunique() == 1
    
    if not same_data_type:
        print(f"Not all values in column '{column_name}' have the same data type.")
    else:
        column_data_type = f[column_name].apply(type).iloc[0]
        if column_data_type not in (float, int) and column_name != 'smile':
            #print(f"Column '{column_name}' has data type: {column_data_type.__name__}")
            
            # Initialize a count for non-convertible values
            non_convertible_count = 0
            
            # Attempt to convert each value to float and count non-convertible values
            for index, value in enumerate(f[column_name]):
                try:
                    float_value = float(value)
                except (ValueError, TypeError):
                    non_convertible_count += 1
            
            # Print the count and percentage of non-convertible values
            total_values = len(f[column_name])
            non_convertible_percentage = (non_convertible_count / total_values) * 100
            if non_convertible_percentage > 1:
                #print(f"Number of non-convertible values in {column_name}: {non_convertible_count} ({non_convertible_percentage:.2f}%)")
                # Add the column to the list of columns to drop
                columns_to_drop.append(column_name)
                
            # replace non-convertible values with NaN
            f[column_name] = pd.to_numeric(f[column_name], errors='coerce')
print ("dataframe i has %d columns full of not float type values"%(len(columns_to_drop)))
f.drop(columns=columns_to_drop, inplace=True)
f['IC50'] = f['IC50'].str.replace(">", "").str.replace("<", "").astype(float)

Not all values in column 'IC50' have the same data type.
dataframe i has 397 columns full of not float type values


In [100]:
i['Lipinski'] = i['Lipinski'].astype(int)
i['GhoseFilter'] = i['GhoseFilter'].astype(int)
f['Lipinski'] = f['Lipinski'].astype(int)
f['GhoseFilter'] = f['GhoseFilter'].astype(int)

In [101]:
non_numeric_columns = i.select_dtypes(exclude=['float', 'int'])
num_non_numeric_columns = non_numeric_columns.shape[1]
print(f"Number of non-numeric columns: {num_non_numeric_columns}")
non_numeric_columns = f.select_dtypes(exclude=['float', 'int'])
num_non_numeric_columns = non_numeric_columns.shape[1]
print(f"Number of non-numeric columns: {num_non_numeric_columns}")


Number of non-numeric columns: 1
Number of non-numeric columns: 1


In [102]:
# Calculate the mean of each numeric column
column_means = i.mean(numeric_only=True)
# Fill NaN values with the mean of each column
i.fillna(column_means, inplace=True)

# Calculate the mean of each column
column_means = f.mean(numeric_only=True)
# Fill NaN values with the mean of each column
f.fillna(column_means, inplace=True)

In [103]:
# Create a new column 'pIC50' by applying the negative logarithm
i['pIC50'] = -np.log10(i['IC50'])
i.drop('IC50', axis=1, inplace=True)

f['pIC50'] = -np.log10(f['IC50'])
f.drop('IC50', axis=1, inplace=True)

In [104]:
f.to_csv('final_FGFR4.csv')
i.to_csv('final_IGF1R.csv')