<h1 style = "color : navy;"> QSAR Model Building_MF</h1>

## Suppress any warnings for all the upcoming cells
- Call `warnings.filterwarnings` with the "ignore" action to ignore all warnings

In [24]:
# Imports used by the cells below. Every later cell calls pd.read_csv, so
# pandas is imported here to let the notebook run top to bottom on a fresh kernel.
import warnings

import pandas as pd

# Blanket filter: ignore every warning raised by the upcoming cells.
# NOTE: this also hides deprecation/future warnings emitted by libraries.
warnings.filterwarnings('ignore')

# To ignore only one type of warning, pass its class via `category`
# (replace UserWarning with the specific warning class you wish to ignore).
warnings.filterwarnings('ignore', category=UserWarning)

# To ignore warnings coming from one library only, pass its name via `module`.
warnings.filterwarnings('ignore', module='numpy')  # Ignore warnings from numpy


<h1 style="color:red;">Data Manipulation for machine learning model building</h1>

<h3 style="color:navy;">Read the "SMILES_SPECIES_SEX_abs_max_zscore" data frame</h3>

In [335]:
# Load "SMILES_SPECIES_SEX_abs_max_zscore.csv" into a data frame.
SMILES_SPECIES_SEX_abs_max_zscore = pd.read_csv(
    "SMILES_SPECIES_SEX_abs_max_zscore.csv"
)

# Report the (rows, columns) shape, then show the first row as the cell output.
print(SMILES_SPECIES_SEX_abs_max_zscore.shape)
SMILES_SPECIES_SEX_abs_max_zscore.iloc[:1]

(2051, 1617)


Unnamed: 0,SMILES,zscore,TSSPECIES,SEX,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,Br.Br.CCC[C@H](N[C@H]1CCc2cc(F)cc(F)c2C1)C(=O)...,-1.230864,RAT,F,27.102784,21.266255,0,2,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),...,10.429842,87.000352,649.180242,8.114753,7100004000.0,52,182,208,divide by zero encountered in power (mZagreb1),7.5


<h2 style="color:purple;">Manually add a "Target" column and assign values based on zscore</h2>
<ul style="color:blue;">
    <li>Manually create a column and name it "Target"</li>
    <li>Assign values based on zscore:</li>
    <ul>
        <li>zscore &lt;= -2: assigned as toxic</li>
        <li>-2 &lt; zscore &lt; -1: assigned as mild_toxic</li>
        <li>zscore &gt;= -1: assigned as non_toxic</li>
    </ul>
 </ul>


<h3 style="color:black;">Read the "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" data frame</h3>

In [386]:
# Load the data frame that already carries the "Target" column.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv(
    "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv"
)

# Report the (rows, columns) shape, then show the first three rows as the cell output.
print(SMILES_SPECIES_SEX_abs_max_zscore_Target.shape)
SMILES_SPECIES_SEX_abs_max_zscore_Target.iloc[:3]

(2051, 1618)


Unnamed: 0,SMILES,zscore,Target,TSSPECIES,SEX,ABC,ABCGG,nAcid,nBase,SpAbs_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,toxic,DOG,M,22.563116,17.190573,0,0,37.8284886,...,10.35917,78.040268,394.130887,8.385764,1873.0,51,156,189,7.111111111,6.027778
1,COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-...,-40.997577,toxic,MOUSE,M,28.548139,22.171626,0,1,47.48848942,...,10.503724,87.974766,499.269573,7.132422,4557.0,60,192,226,12.69444444,8.277778
2,CNC(=O)c1cn(C[C@H](F)CCc2ccc(NC(=O)Cc3cc(OC4CC...,-28.290163,toxic,DOG,F,29.706334,21.101928,0,0,46.88133674,...,10.515235,88.468481,532.215821,8.187936,6289.0,50,198,225,13.03472222,8.194444


<h1 style = " color : red"> Cleaning the "SMILES_SPECIES_SEX_abs_max_zscore_Target" df</h1>
 <ul style="color:blue;">
    <li>This data frame has mixed data types in columns</li>
    <ul>
    <li>Convert all columns except the first five (SMILES, zscore, Target, TSSPECIES, SEX) to numeric</li>
   <ul>
    <li>During the conversion, non-numeric entries are coerced to "NaN"</li>
       <ul>
       <li>Finally, the "0" & "NaN" values need to be handled</li>
    </ul>
    </ul>
    </ul>
    </ul>

#### Counting zeros per column
To count the number of 0 values in every column of a DataFrame `df`, use `df.eq(0)` to create a Boolean mask where `True` marks a 0 value, then call `.sum()` on the mask to count the `True` values in each column.

<h2 style = " color : navy"> Convert all the columns except the first five columns to numeric</h2>

In [407]:
# Read "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" as a data frame.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv("SMILES_SPECIES_SEX_abs_max_zscore_Target.csv")

# df1 is a second name for the same object (not a copy), so the conversion
# below is also visible through SMILES_SPECIES_SEX_abs_max_zscore_Target.
df1 = SMILES_SPECIES_SEX_abs_max_zscore_Target

# Convert every column after the first five to numeric; entries that cannot be
# parsed (error strings in the descriptor columns) become NaN.
# The columns are assigned by label rather than through `df1.iloc[:, 5:] = ...`:
# on pandas >= 2.0 an iloc assignment writes into the existing columns and keeps
# their object dtype, whereas label assignment replaces each column so the
# converted columns really end up with a numeric dtype.
descriptor_cols = df1.columns[5:]
df1[descriptor_cols] = df1[descriptor_cols].apply(pd.to_numeric, errors='coerce')

# Print the shape and show the first row.
print(df1.shape)
df1.head(1)

(2051, 1618)


Unnamed: 0,SMILES,zscore,Target,TSSPECIES,SEX,ABC,ABCGG,nAcid,nBase,SpAbs_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,toxic,DOG,M,22.563116,17.190573,0,0,37.828489,...,10.35917,78.040268,394.130887,8.385764,1873.0,51,156,189,7.111111,6.027778


<h3 style = " color : blue"> Check the data type of a column</h3>

In [409]:
# Look up the dtype of the 'SEX' column in the per-column dtype table.
print(df1.dtypes['SEX'])

object


<h2 style="color:red;"> Handling the NaN values ("NaNs")</h2>

In [404]:
# Per-column tally of missing values in df1.
nan_counts = df1.isnull().sum(axis=0)

# Labels of the columns that hold at most 5 NaN values.
columns_up_to_defined_NaN = nan_counts.index[(nan_counts <= 5).to_numpy()]

# Split df1 column-wise: the columns within the NaN limit ...
df_up_to_defined_NaN = df1.loc[:, columns_up_to_defined_NaN]

# ... and every other column.
df_remaining_NaN = df1.drop(columns=columns_up_to_defined_NaN)

# Report the shape of both halves.
print("Data Frame with up to defined NaN value:", df_up_to_defined_NaN.shape)
print("Data Frame with remaining (have zeors and NaNs) columns:", df_remaining_NaN.shape)

# Cell output: the frame restricted to the columns within the NaN limit.
df_up_to_defined_NaN

Data Frame with up to defined NaN value: (2051, 821)
Data Frame with remaining (have zeors and NaNs) columns: (2051, 797)


Unnamed: 0,SMILES,zscore,Target,TSSPECIES,SEX,ABC,ABCGG,nAcid,nBase,nAromAtom,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,toxic,DOG,M,22.563116,17.190573,0,0,17,...,7.002156,10.359170,78.040268,394.130887,8.385764,1.873000e+03,51,156,189,6.027778
1,COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-...,-40.997577,toxic,MOUSE,M,28.548139,22.171626,0,1,21,...,7.034388,10.503724,87.974766,499.269573,7.132422,4.557000e+03,60,192,226,8.277778
2,CNC(=O)c1cn(C[C@H](F)CCc2ccc(NC(=O)Cc3cc(OC4CC...,-28.290163,toxic,DOG,F,29.706334,21.101928,0,0,17,...,6.580639,10.515235,88.468481,532.215821,8.187936,6.289000e+03,50,198,225,8.194444
3,Cn1nc(nc1Nc2ccc3[nH]ncc3c2C4CC4)-c5ccc(cc5)C(=...,-18.000000,toxic,DOG,M,25.610323,19.488794,0,0,20,...,8.028455,10.447642,88.519310,437.177565,8.248633,3.319000e+03,50,176,211,6.861111
4,Cc1cc(Nc2cc(N)ncn2)c(=O)n3c1C(=O)NC34CCCCC4,-15.928106,toxic,DOG,F,19.972739,16.141582,0,0,12,...,7.247793,10.425757,75.161421,340.164774,7.559217,1.390000e+03,45,140,171,5.347222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,COc1cccc(CNc2ccc(cc2)S(=O)(=O)Nc3nc4ccccc4s3)c1O,4.478343,non_toxic,DOG,F,23.578821,17.507081,0,0,21,...,6.823286,10.331920,79.779030,441.081698,9.001667,2.901000e+03,47,160,188,6.541667
2047,CC1(C)COc2ccc(F)cc2CN3[C@@H](COc4cn5ncc(C(=O)N...,5.047919,non_toxic,RAT,F,25.887720,18.783195,0,0,15,...,6.842683,10.599705,82.686236,447.151824,8.599074,2.450000e+03,60,182,220,6.638889
2048,Cl.CCOc1c(cc(Cl)c(F)c1[C@@H]2CNC(=O)C2)[C@H](C...,6.187184,non_toxic,PIG,F,23.546021,20.077258,0,0,15,...,7.525640,10.512900,83.206785,468.124358,8.668970,3.000002e+09,54,164,201,6.555556
2049,OP(O)(O)=O.CN(C)CCCNC(=O)C(\COc1cccc2ccccc12)=...,6.863036,non_toxic,DOG,F,26.407757,21.393109,3,1,10,...,0.000000,10.027606,71.307560,525.224002,7.294778,1.550000e+10,41,164,175,8.250000


<h2 style="color:red;"> Handling the zero values ("0")</h2>
 <ul style="color:blue;">
    <li>use "df_up_to_defined_NaN " from previous step</li>
     <li>Final data frame will have "0" and "NaN" based on criteria from previous steps</li>
    </ul>

In [405]:
# Per-column tally of cells equal to 0 in df_up_to_defined_NaN.
zero_counts = (df_up_to_defined_NaN == 0).sum(axis=0)

# Labels of the columns that hold at most 5 zeros.
columns_up_to_defined_zero = zero_counts.index[(zero_counts <= 5).to_numpy()]

# Split column-wise: the columns within the zero limit ...
df_up_to_defined_zero = df_up_to_defined_NaN.loc[:, columns_up_to_defined_zero]

# ... and every other column.
df_remaining_zeros = df_up_to_defined_NaN.drop(columns=columns_up_to_defined_zero)

# Report the shape of both halves.
print("Data Frame with up to defined zero value:", df_up_to_defined_zero.shape)
print("Data Frame with remaining columns:", df_remaining_zeros.shape)


Data Frame with up to defined zero value: (2051, 194)
Data Frame with remaining columns: (2051, 627)


<h2 style="color:red;"> Final Handling of "0" & "NaN" per row</h2>
<ul style="color:blue;">
    <li>count the total number of 'zero' and 'NaN' in each row</li>
     <li>Final data frame will have no "0" and "NaN" </li>
    </ul>


## Create two data frames: one with no "0" & "NaN" values and one with the remaining rows

In [406]:
# Number of cells per row that are either NaN or exactly 0
# (a cell cannot be both, so the OR-count equals the sum of the two counts).
total_zeros_and_nans_per_row = (
    df_up_to_defined_zero.isna() | df_up_to_defined_zero.eq(0)
).sum(axis=1)

# Rows without any zero/NaN cell.
row_is_clean = total_zeros_and_nans_per_row.eq(0)

# Clean rows, re-indexed from 0.
df_no_zeros_no_NaNs = df_up_to_defined_zero.loc[row_is_clean].reset_index(drop=True)

# All remaining rows (at least one zero or NaN), re-indexed from 0.
df_remaining_rows_withzeros_NaN = df_up_to_defined_zero.loc[~row_is_clean].reset_index(drop=True)

# Report the shape of the clean frame and show its first two rows.
print("DataFrame with no zeros or NaNs:", df_no_zeros_no_NaNs.shape)
df_no_zeros_no_NaNs.head(2)


DataFrame with no zeros or NaNs: (2030, 194)


Unnamed: 0,SMILES,zscore,Target,TSSPECIES,SEX,nAtom,nHeavyAtom,nHetero,ATS0dv,ATS1dv,...,SRW06,SRW08,SRW10,TSRW10,MW,AMW,WPath,Zagreb1,Zagreb2,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,toxic,DOG,M,47,28,8,383.604938,425.111111,...,7.058758,8.686261,10.35917,78.040268,394.130887,8.385764,1873.0,156,189,6.027778
1,COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-...,-40.997577,toxic,MOUSE,M,70,37,9,496.0,522.0,...,7.236339,8.843471,10.503724,87.974766,499.269573,7.132422,4557.0,192,226,8.277778


<h3 style ="color : navy;">Write the "df_no_zeros_no_NaNs" as csv file </h3> 
   <ul style="color:red;">
    <li>df_no_zeros_no_NaNs.to_csv("df_no_zeros_no_NaNs.csv", index = False)</li>
    </ul>

In [401]:
# # Write the "df_no_zeros_no_NaNs" as csv file 
# df_no_zeros_no_NaNs.to_csv("df_no_zeros_no_NaNs.csv", index = False)

<hr style="border: 1px solid red;">

<h2 style="color:red;"> Testing Different Combinations of '0' & 'NaN' </h2>
<ul style="color:blue;">
     <li style="color: purple;">11 results in total (thresholds 0 to 10, with the same threshold applied to the '0' and 'NaN' counts)</li>
    <li>Assess the effect of different combinations of '0' & 'NaN' on row numbers </li>
    <ul style="color:purple;">
     <li>Final data frame will have no "0" and "NaN" </li>
    </ul>
    </ul>

In [None]:
# Read "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" as a data frame.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv("SMILES_SPECIES_SEX_abs_max_zscore_Target.csv")

# df1 is a second name for the same object (not a copy).
df1 = SMILES_SPECIES_SEX_abs_max_zscore_Target

# Convert every column after the first five to numeric (unparseable entries
# become NaN). Assigning by column label replaces the columns, so they get a
# numeric dtype; `df1.iloc[:, 5:] = ...` keeps object dtype on pandas >= 2.0.
# This is done once: the conversion does not depend on the loop variable.
descriptor_cols = df1.columns[5:]
df1[descriptor_cols] = df1[descriptor_cols].apply(pd.to_numeric, errors='coerce')

# Print the df1 data frame.
print(df1.shape)
print(df1.head(1))

# Class labels of the "Target" column; one result column is reported per label.
target_unique_values = ['toxic', 'mild_toxic', 'non_toxic']

# NaN count per column; independent of the threshold, so computed once.
nan_counts = df1.isna().sum()

# One record per threshold value.
results = []

# Loop through the thresholds 0 to 10; the same value caps both the NaN count
# and the zero count allowed per column.
for value in range(11):
    # Keep the columns with at most `value` NaN values.
    columns_up_to_one_nan = nan_counts[nan_counts <= value].index
    df_up_to_one_nan = df1[columns_up_to_one_nan]

    # Of those, keep the columns with at most `value` zero values.
    zero_counts = df_up_to_one_nan.eq(0).sum()
    columns_up_to_one_zero = zero_counts[zero_counts <= value].index
    df_up_to_one_zero = df_up_to_one_nan[columns_up_to_one_zero]

    # Count the zeros and NaNs in each row of the retained columns.
    total_zeros_and_nans_per_row = df_up_to_one_zero.isna().sum(axis=1) + df_up_to_one_zero.eq(0).sum(axis=1)

    # Rows that contain no zeros or NaNs.
    df_no_zeros_nans_3nonn = df_up_to_one_zero[total_zeros_and_nans_per_row == 0].copy()

    # The rows that would be discarded (at least one zero or NaN).
    df_remaining_rows = df_up_to_one_zero.drop(df_no_zeros_nans_3nonn.index)
    df_remaining_rows.reset_index(drop=True, inplace=True)

    # Target classes present among the discarded rows.
    unique_values = df_remaining_rows["Target"].unique()
    present_labels = set(unique_values)

    # Build the record by column name so each class is reported under its own
    # column. Unpacking `unique_values` positionally would put, for example, a
    # lone 'mild_toxic' under the 'toxic' column.
    record = {
        'Value': value,
        'DataFrame Shape': df_up_to_one_zero.shape,
        'No Zeros/NaNs Shape': df_no_zeros_nans_3nonn.shape,
    }
    for label in target_unique_values:
        record[label] = label if label in present_labels else None
    results.append(record)

# Create a DataFrame from the records.
df_results = pd.DataFrame(results, columns=['Value', 'DataFrame Shape', 'No Zeros/NaNs Shape', *target_unique_values])

# Display the DataFrame.
display(df_results)


<h2 style="color:green;"> Creating columns having no 'Zeros' or 'NaNs' (single-cell code) </h2>
   <ul style = "color:red;">
    <li> The desired maximum numbers of "0" and "NaN" values must be entered manually</li>
</ul>

<h2 style="color:green;"> Creating a data frame having the desired combination of 'Zeros' and 'NaNs' (single-cell code) </h2>

In [None]:
# Maximum number of NaN values / zero values a column may contain to be kept.
# Edit these two numbers to try another combination.
NAN_THRESHOLD = 4
ZERO_THRESHOLD = 4

# Read "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" as a data frame.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv("SMILES_SPECIES_SEX_abs_max_zscore_Target.csv")

# df1 is a second name for the same object (not a copy).
df1 = SMILES_SPECIES_SEX_abs_max_zscore_Target

# Convert every column after the first five to numeric (unparseable entries
# become NaN). Assigning by column label replaces the columns, so they get a
# numeric dtype; `df1.iloc[:, 5:] = ...` keeps object dtype on pandas >= 2.0.
descriptor_cols = df1.columns[5:]
df1[descriptor_cols] = df1[descriptor_cols].apply(pd.to_numeric, errors='coerce')

# --- Step 1: column filter on the NaN count ---------------------------------

# Count the number of NaN values in each column of df1.
nan_counts = df1.isna().sum()

# Columns with at most NAN_THRESHOLD NaN values.
columns_up_to_defined_NaN = nan_counts[nan_counts <= NAN_THRESHOLD].index

# Data frame restricted to those columns, and the frame of the other columns.
df_up_to_defined_NaN = df1[columns_up_to_defined_NaN]
df_remaining_NaN = df1.drop(columns_up_to_defined_NaN, axis=1)

# --- Step 2: column filter on the zero count --------------------------------

# Count the number of zero values in each column of df_up_to_defined_NaN.
zero_counts = df_up_to_defined_NaN.eq(0).sum()

# Columns with at most ZERO_THRESHOLD zero values.
columns_up_to_defined_zero = zero_counts[zero_counts <= ZERO_THRESHOLD].index

# Data frame restricted to those columns, and the frame of the other columns.
df_up_to_defined_zero = df_up_to_defined_NaN[columns_up_to_defined_zero]
df_remaining_zeros = df_up_to_defined_NaN.drop(columns_up_to_defined_zero, axis=1)

# --- Step 3: row filter ------------------------------------------------------

# Count the total number of zeros and NaNs in each row.
total_zeros_and_nans_per_row = df_up_to_defined_zero.isna().sum(axis=1) + df_up_to_defined_zero.eq(0).sum(axis=1)

# Rows that have no zeros or NaNs, and the remaining rows.
df_no_zeros_no_nans = df_up_to_defined_zero[total_zeros_and_nans_per_row == 0].copy()
df_remaining_rows_withzeros_NaN = df_up_to_defined_zero[total_zeros_and_nans_per_row != 0].copy()

# Reset the index of both new data frames.
df_no_zeros_no_nans.reset_index(drop=True, inplace=True)
df_remaining_rows_withzeros_NaN.reset_index(drop=True, inplace=True)

# Print the shape of the data frame with no zeros or NaNs and show its first row.
print("DataFrame with no zeros or NaNs:", df_no_zeros_no_nans.shape)
df_no_zeros_no_nans.head(1)


In [None]:
# # Write the df_no_zeros_no_nans data frame as csv file 
# df_no_zeros_no_nans.to_csv("MLMB_df_no_zeros_no_nans4x4.csv",  index = False)

<h2 style="color:green;"> Creating columns having no 'Zeros' or 'NaNs' (single-cell code) </h2>
   <ul style = "color:red;">
    <li> The desired maximum numbers of "0" and "NaN" values must be entered manually</li>
</ul>

In [408]:
# Maximum number of NaN values / zero values a column may contain to be kept.
# Edit these two numbers to try another combination.
NAN_THRESHOLD = 5
ZERO_THRESHOLD = 5

# Read "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" as a data frame.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv("SMILES_SPECIES_SEX_abs_max_zscore_Target.csv")

# df1 is a second name for the same object (not a copy).
df1 = SMILES_SPECIES_SEX_abs_max_zscore_Target

# Convert every column after the first five to numeric (unparseable entries
# become NaN). Assigning by column label replaces the columns, so they get a
# numeric dtype; `df1.iloc[:, 5:] = ...` keeps object dtype on pandas >= 2.0.
descriptor_cols = df1.columns[5:]
df1[descriptor_cols] = df1[descriptor_cols].apply(pd.to_numeric, errors='coerce')

# --- Step 1: column filter on the NaN count ---------------------------------

# Count the number of NaN values in each column of df1.
nan_counts = df1.isna().sum()

# Columns with at most NAN_THRESHOLD NaN values.
columns_up_to_defined_NaN = nan_counts[nan_counts <= NAN_THRESHOLD].index

# Data frame restricted to those columns, and the frame of the other columns.
df_up_to_defined_NaN = df1[columns_up_to_defined_NaN]
df_remaining_NaN = df1.drop(columns_up_to_defined_NaN, axis=1)

# --- Step 2: column filter on the zero count --------------------------------

# Count the number of zero values in each column of df_up_to_defined_NaN.
zero_counts = df_up_to_defined_NaN.eq(0).sum()

# Columns with at most ZERO_THRESHOLD zero values.
columns_up_to_defined_zero = zero_counts[zero_counts <= ZERO_THRESHOLD].index

# Data frame restricted to those columns, and the frame of the other columns.
df_up_to_defined_zero = df_up_to_defined_NaN[columns_up_to_defined_zero]
df_remaining_zeros = df_up_to_defined_NaN.drop(columns_up_to_defined_zero, axis=1)

# --- Step 3: row filter ------------------------------------------------------

# Count the total number of zeros and NaNs in each row.
total_zeros_and_nans_per_row = df_up_to_defined_zero.isna().sum(axis=1) + df_up_to_defined_zero.eq(0).sum(axis=1)

# Rows that have no zeros or NaNs, and the remaining rows.
df_no_zeros_no_nans = df_up_to_defined_zero[total_zeros_and_nans_per_row == 0].copy()
df_remaining_rows_withzeros_NaN = df_up_to_defined_zero[total_zeros_and_nans_per_row != 0].copy()

# Reset the index of both new data frames.
df_no_zeros_no_nans.reset_index(drop=True, inplace=True)
df_remaining_rows_withzeros_NaN.reset_index(drop=True, inplace=True)

# Print the shape of the data frame with no zeros or NaNs and show its first row.
print("DataFrame with no zeros or NaNs:", df_no_zeros_no_nans.shape)
df_no_zeros_no_nans.head(1)


DataFrame with no zeros or NaNs: (2030, 194)


Unnamed: 0,SMILES,zscore,Target,TSSPECIES,SEX,nAtom,nHeavyAtom,nHetero,ATS0dv,ATS1dv,...,SRW06,SRW08,SRW10,TSRW10,MW,AMW,WPath,Zagreb1,Zagreb2,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,toxic,DOG,M,47,28,8,383.604938,425.111111,...,7.058758,8.686261,10.35917,78.040268,394.130887,8.385764,1873.0,156,189,6.027778


In [411]:
# Look up the dtype of the 'ATS0dv' column in the per-column dtype table.
print(df1.dtypes['ATS0dv'])

float64


<h2 style="color:red;"> Selection of Different Combinations of '0' & 'NaN' </h2>
<ul style="color:blue;">
     <li style="color: purple;">Total 100 combination results </li>
    <li>Assess the effect of different combinations of '0' & 'NaN' on row numbers </li>
    <ul style="color:purple;">
     <li>Final data frame will have no "0" and "NaN" </li>
    </ul>
    </ul>

In [413]:
from IPython.display import display

# Read "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" as a data frame.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv("SMILES_SPECIES_SEX_abs_max_zscore_Target.csv")

# df1 is a second name for the same object (not a copy).
df1 = SMILES_SPECIES_SEX_abs_max_zscore_Target

# Convert every column after the first five to numeric (unparseable entries
# become NaN). Assigning by column label replaces the columns, so they get a
# numeric dtype; `df1.iloc[:, 5:] = ...` keeps object dtype on pandas >= 2.0.
descriptor_cols = df1.columns[5:]
df1[descriptor_cols] = df1[descriptor_cols].apply(pd.to_numeric, errors='coerce')

# Print the df1 data frame.
print(df1.shape)
print(df1.head(1))

# NaN count per column, computed here from the frame just loaded so the cell
# does not rely on a `nan_counts` left over from a previously executed cell.
nan_counts = df1.isna().sum()

# One record per (nan_threshold, zero_threshold) combination.
results = []

# Iterate over the thresholds 1..10 for the NaN count and for the zero count.
for nan_threshold in range(1, 11):
    # The NaN filter depends only on nan_threshold, so it is evaluated once per
    # outer iteration instead of once per combination.
    columns_up_to_one_nan = nan_counts[nan_counts <= nan_threshold].index
    df_up_to_one_nan = df1[columns_up_to_one_nan]

    # Zero count per retained column (also independent of zero_threshold).
    zero_counts = df_up_to_one_nan.eq(0).sum()

    for zero_threshold in range(1, 11):
        # Keep the columns with at most zero_threshold zero values.
        columns_up_to_one_zero = zero_counts[zero_counts <= zero_threshold].index
        df_up_to_one_zero = df_up_to_one_nan[columns_up_to_one_zero]

        # Count the zeros and NaNs in each row of the retained columns.
        total_zeros_and_nans_per_row = df_up_to_one_zero.isna().sum(axis=1) + df_up_to_one_zero.eq(0).sum(axis=1)

        # Rows that contain no zeros or NaNs.
        df_no_zeros_nans_3nonn = df_up_to_one_zero[total_zeros_and_nans_per_row == 0].copy()

        # The rows that would be discarded (at least one zero or NaN).
        df_remaining_rows = df_up_to_one_zero.drop(df_no_zeros_nans_3nonn.index)
        df_remaining_rows.reset_index(drop=True, inplace=True)

        # Number of discarded rows per "Target" class.
        target_value_counts = df_remaining_rows["Target"].value_counts()

        # Store the outcome of this combination.
        results.append({
            'nan_threshold': nan_threshold,
            'zero_threshold': zero_threshold,
            'df_up_to_one_zero_shape': df_up_to_one_zero.shape,
            'df_no_zeros_nans_3nonn_shape': df_no_zeros_nans_3nonn.shape,
            'target_value_counts': target_value_counts,
            'df_remaining_rows_shape': df_remaining_rows.shape
        })

# Create a data frame from the results and render it as a formatted table.
df_results = pd.DataFrame(results)
display(df_results)

(2051, 1618)
                                            SMILES     zscore Target  \
0  Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5 -44.513706  toxic   

  TSSPECIES SEX        ABC      ABCGG  nAcid  nBase    SpAbs_A  ...     SRW10  \
0       DOG   M  22.563116  17.190573      0      0  37.828489  ...  10.35917   

      TSRW10          MW       AMW   WPath  WPol  Zagreb1  Zagreb2  mZagreb1  \
0  78.040268  394.130887  8.385764  1873.0    51      156      189  7.111111   

   mZagreb2  
0  6.027778  

[1 rows x 1618 columns]


Unnamed: 0,nan_threshold,zero_threshold,df_up_to_one_zero_shape,df_no_zeros_nans_3nonn_shape,target_value_counts,df_remaining_rows_shape
0,1,1,"(2051, 96)","(2051, 96)","Series([], Name: Target, dtype: int64)","(0, 96)"
1,1,2,"(2051, 114)","(2049, 114)","mild_toxic 2 Name: Target, dtype: int64","(2, 114)"
2,1,3,"(2051, 115)","(2046, 115)","non_toxic 3 mild_toxic 2 Name: Target, ...","(5, 115)"
3,1,4,"(2051, 144)","(2034, 144)",mild_toxic 7 non_toxic 6 toxic ...,"(17, 144)"
4,1,5,"(2051, 144)","(2034, 144)",mild_toxic 7 non_toxic 6 toxic ...,"(17, 144)"
...,...,...,...,...,...,...
95,10,6,"(2051, 360)","(2011, 360)",mild_toxic 18 toxic 15 non_toxic ...,"(40, 360)"
96,10,7,"(2051, 360)","(2011, 360)",mild_toxic 18 toxic 15 non_toxic ...,"(40, 360)"
97,10,8,"(2051, 372)","(2011, 372)",mild_toxic 18 toxic 15 non_toxic ...,"(40, 372)"
98,10,9,"(2051, 373)","(2011, 373)",mild_toxic 18 toxic 15 non_toxic ...,"(40, 373)"


<h2 style="color:red;"> Selection of Different Combinations of '0' & 'NaN' </h2>
<ul style="color:blue;">
     <li style="color: purple;">11 results in total (thresholds 0 to 10, with the same threshold applied to the '0' and 'NaN' counts)</li>
    <li>Assess the effect of different combinations of '0' & 'NaN' on row numbers </li>
    <ul style="color:purple;">
     <li>Final data frame will have no "0" and "NaN" </li>
    </ul>
    </ul>

In [416]:
# Read "SMILES_SPECIES_SEX_abs_max_zscore_Target.csv" as a data frame.
SMILES_SPECIES_SEX_abs_max_zscore_Target = pd.read_csv("SMILES_SPECIES_SEX_abs_max_zscore_Target.csv")

# df1 is a second name for the same object (not a copy).
df1 = SMILES_SPECIES_SEX_abs_max_zscore_Target

# Convert every column after the first five to numeric (unparseable entries
# become NaN). Assigning by column label replaces the columns, so they get a
# numeric dtype; `df1.iloc[:, 5:] = ...` keeps object dtype on pandas >= 2.0.
# This is done once: the conversion does not depend on the loop variable.
descriptor_cols = df1.columns[5:]
df1[descriptor_cols] = df1[descriptor_cols].apply(pd.to_numeric, errors='coerce')

# Print the df1 data frame.
print(df1.shape)
print(df1.head(1))

# Class labels of the "Target" column; one result column is reported per label.
target_unique_values = ['toxic', 'mild_toxic', 'non_toxic']

# NaN count per column; independent of the threshold, so computed once.
nan_counts = df1.isna().sum()

# One record per threshold value.
results = []

# Loop through the thresholds 0 to 10; the same value caps both the NaN count
# and the zero count allowed per column.
for value in range(11):
    # Keep the columns with at most `value` NaN values.
    columns_up_to_one_nan = nan_counts[nan_counts <= value].index
    df_up_to_one_nan = df1[columns_up_to_one_nan]

    # Of those, keep the columns with at most `value` zero values.
    zero_counts = df_up_to_one_nan.eq(0).sum()
    columns_up_to_one_zero = zero_counts[zero_counts <= value].index
    df_up_to_one_zero = df_up_to_one_nan[columns_up_to_one_zero]

    # Count the zeros and NaNs in each row of the retained columns.
    total_zeros_and_nans_per_row = df_up_to_one_zero.isna().sum(axis=1) + df_up_to_one_zero.eq(0).sum(axis=1)

    # Rows that contain no zeros or NaNs.
    df_no_zeros_nans_3nonn = df_up_to_one_zero[total_zeros_and_nans_per_row == 0].copy()

    # The rows that would be discarded (at least one zero or NaN).
    df_remaining_rows = df_up_to_one_zero.drop(df_no_zeros_nans_3nonn.index)
    df_remaining_rows.reset_index(drop=True, inplace=True)

    # Target classes present among the discarded rows.
    unique_values = df_remaining_rows["Target"].unique()
    present_labels = set(unique_values)

    # Build the record by column name so each class is reported under its own
    # column. Unpacking `unique_values` positionally would put, for example, a
    # lone 'mild_toxic' under the 'toxic' column.
    record = {
        'Value': value,
        'DataFrame Shape': df_up_to_one_zero.shape,
        'No Zeros/NaNs Shape': df_no_zeros_nans_3nonn.shape,
    }
    for label in target_unique_values:
        record[label] = label if label in present_labels else None
    results.append(record)

# Create a DataFrame from the records.
df_results = pd.DataFrame(results, columns=['Value', 'DataFrame Shape', 'No Zeros/NaNs Shape', *target_unique_values])

# Display the DataFrame.
display(df_results)


(2051, 1618)
                                            SMILES     zscore Target  \
0  Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5 -44.513706  toxic   

  TSSPECIES SEX        ABC      ABCGG  nAcid  nBase    SpAbs_A  ...     SRW10  \
0       DOG   M  22.563116  17.190573      0      0  37.828489  ...  10.35917   

      TSRW10          MW       AMW   WPath  WPol  Zagreb1  Zagreb2  mZagreb1  \
0  78.040268  394.130887  8.385764  1873.0    51      156      189  7.111111   

   mZagreb2  
0  6.027778  

[1 rows x 1618 columns]


Unnamed: 0,Value,DataFrame Shape,No Zeros/NaNs Shape,toxic,mild_toxic,non_toxic
0,0,"(2051, 96)","(2051, 96)",,,
1,1,"(2051, 96)","(2051, 96)",,,
2,2,"(2051, 150)","(2045, 150)",toxic,mild_toxic,non_toxic
3,3,"(2051, 151)","(2042, 151)",toxic,mild_toxic,non_toxic
4,4,"(2051, 194)","(2030, 194)",toxic,mild_toxic,non_toxic
5,5,"(2051, 194)","(2030, 194)",toxic,mild_toxic,non_toxic
6,6,"(2051, 273)","(2028, 273)",toxic,mild_toxic,non_toxic
7,7,"(2051, 273)","(2028, 273)",toxic,mild_toxic,non_toxic
8,8,"(2051, 288)","(2028, 288)",toxic,mild_toxic,non_toxic
9,9,"(2051, 288)","(2028, 288)",toxic,mild_toxic,non_toxic


<h1 style="color:red;"> Model-1 (Model-RandomForestClassifier model) :  </h1>