# Data Splitting

#just in case for one specific species group, we have more than one experimental data, we just use this method:
#the ratio between maximum value and the minimum experimental value (x/y) (compounds with difference of >3 were removed)
#each compound had reported ecotoxicity data available for at least three different groups of species
#The rule of three or more species groups is crucial for the calculation of mean and standard deviation (Posthuma et al. 2019)
#Where multiple responses for a chemical within a particular group of species, the observation with the lowest ecotoxicity value was selected 

In [10]:
import pandas as pd
df = pd.read_csv('MaindatasetNOEC-CompoundsCurated.csv')
df

Unnamed: 0,CAS Number,Chemical Name,Species Scientific Name,Species Group,Conc Mean,Conc Units,Endpoint,Standardized_InChI,InChI_Verification,Curation_Status,Cleaned_SMILES
0,50022,"11beta,16alpha-9-Fluoro-11,17,21-Trihydroxy-16...",Xenopus laevis,Amphibians,0.003925,mg/L,NOEC,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,Validated InChI found from NIST. Original name...,"SMILES conversion successful, Passed initial f...",[CH]=C[CH2]
1,50022,"11beta,16alpha-9-Fluoro-11,17,21-Trihydroxy-16...",Xenopus laevis,Amphibians,0.196234,mg/L,NOEC,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,Validated InChI found from NIST. Original name...,"SMILES conversion successful, Passed initial f...",[CH]=C[CH2]
2,50022,"11beta,16alpha-9-Fluoro-11,17,21-Trihydroxy-16...",Xenopus laevis,Amphibians,0.039247,mg/L,NOEC,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,Validated InChI found from NIST. Original name...,"SMILES conversion successful, Passed initial f...",[CH]=C[CH2]
3,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Pleurodeles waltl,Amphibians,100.000000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C
4,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Rhinella arenarum,Amphibians,40.000000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...
...,...,...,...,...,...,...,...,...,...,...,...
10549,60207901,"1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...",Hyperamoeba sp.,Other Invertebrates,8.200000,mg/L,NOEC,InChI=1S/C15H17Cl2N3O2/c1-2-3-12-7-21-15(22-12...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]c1nc([H])n(C([H])([H])C2(c3c([H])c([H])c(Cl...
10550,60207901,"1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...",Mastigophora,Other Invertebrates,2.500000,mg/L,NOEC,InChI=1S/C15H17Cl2N3O2/c1-2-3-12-7-21-15(22-12...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]c1nc([H])n(C([H])([H])C2(c3c([H])c([H])c(Cl...
10551,120067836,"5-Amino-1-[2,6-dichloro-4-(trifluoromethyl)phe...",Folsomia candida,Other Invertebrates,0.005500,mg/L,NOEC,"InChI=1/C12H4Cl2F6N4S/c13-5-1-4(11(15,16)17)2-...",Validated InChI found from OPSIN. Original nam...,"SMILES conversion successful, Passed initial f...",[H]c1c(Cl)c(-n2nc(C#N)c(SC(F)(F)F)c2N([H])[H])...
10552,120068362,"5-Amino-1-[2,6-dichloro-4-(trifluoromethyl)phe...",Folsomia candida,Other Invertebrates,0.031100,mg/L,NOEC,"InChI=1S/C12H4Cl2F6N4O2S/c13-5-1-4(11(15,16)17...",Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]c1c(Cl)c(-n2nc(C#N)c(S(=O)(=O)C(F)(F)F)c2N(...


In [11]:
#each compound had reported ecotoxicity data available for at least three different groups of species
# Group by CAS number and count unique species groups for each CAS number
species_count = df.groupby('CAS Number')['Species Group'].nunique()

# Filter the DataFrame to keep only CAS numbers with at least three species groups
filtered_cas_numbers = species_count[species_count >= 3].index

# Filter the original DataFrame to keep only the relevant rows
df_filtered = df[df['CAS Number'].isin(filtered_cas_numbers)]

# Display the filtered DataFrame
print(df_filtered)

       CAS Number                                      Chemical Name  \
3           50066  5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione   
4           50180  N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...   
5           50180  N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...   
6           50180  N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...   
7           50180  N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...   
...           ...                                                ...   
10547    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
10548    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
10549    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
10550    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
10553   120068373  5-Amino-1-[2,6-dichloro-4-(trifluoromethyl)phe...   

          Species Scientific Name        Species Group  Conc Mean   \
3               Pleurodeles waltl           Amphibians     100.00

In [13]:
#Where multiple responses for a chemical within a particular group of species, the observation with the lowest ecotoxicity value was selected
# For each CAS number and Species Group, select the row with the minimum ecotoxicity value
df_min = df.loc[df.groupby(['CAS Number', 'Species Group'])['Conc Mean '].idxmin()]

# Filter the original DataFrame to keep only the relevant rows
df_filtered = df_min[df_min['CAS Number'].isin(filtered_cas_numbers)]
df_filtered

Unnamed: 0,CAS Number,Chemical Name,Species Scientific Name,Species Group,Conc Mean,Conc Units,Endpoint,Standardized_InChI,InChI_Verification,Curation_Status,Cleaned_SMILES
8050,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Chlamydomonas reinhardtii,Algae,23.20000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C
3,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Pleurodeles waltl,Amphibians,100.00000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C
1185,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Daphnia magna,Crustaceans,5.00000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C
10089,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Raphidocelis subcapitata,Algae,12.50000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...
6,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Xenopus laevis,Amphibians,20.00000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...
...,...,...,...,...,...,...,...,...,...,...,...
1166,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",Bufo bufo ssp. gargarizans,Amphibians,0.03000,mg/L,NOEC,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C
2761,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",Daphnia magna,Crustaceans,2.00000,mg/L,NOEC,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C
1169,210880925,[C(E)]-N-[(2-Chloro-5-thiazolyl)methyl]-N'-met...,Lithobates pipiens,Amphibians,0.25000,mg/L,NOEC,InChI=1S/C6H8ClN5O2S/c1-8-6(11-12(13)14)10-3-4...,Validated InChI found from PubChem (formatted ...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N([H])C(=NC([H])([H])...
4953,210880925,[C(E)]-N-[(2-Chloro-5-thiazolyl)methyl]-N'-met...,Penaeus monodon,Crustaceans,0.00495,mg/L,NOEC,InChI=1S/C6H8ClN5O2S/c1-8-6(11-12(13)14)10-3-4...,Validated InChI found from PubChem (formatted ...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N([H])C(=NC([H])([H])...


In [14]:
df_filtered.to_csv('NOECreadytolog.csv', index=False)

# Log

In [34]:
import pandas as pd
df= pd.read_csv("NOECreadytolog.csv")

In [35]:
# Function to get molecular weight from PubChem based on SMILES
def get_molecular_weight(smiles):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/property/MolecularWeight/txt"
    try:
        response = requests.get(url, timeout=10)  # Setting a timeout for the request
        response.raise_for_status()
        
        # Process the response text, ensuring it's clean and numeric
        weight_text = response.text.strip().replace('\n', '')  # Remove any newline characters
        
        # Check if the cleaned weight_text is a valid float number
        try:
            molecular_weight = float(weight_text)  # Attempt to convert to float
            return molecular_weight
        except ValueError:
            print(f"Non-numeric data received for SMILES {smiles}: {weight_text}")
            return None  # Return None if the data is not valid
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for SMILES {smiles}: {e}")
        return None  # Return None if there's an issue with the request

# Initialize a new column for molecular weight
merged_df['MolecularWeight'] = None

# Iterate over each SMILES in the DataFrame without a progress bar
for i, smiles in enumerate(merged_df['Cleaned_SMILES']):
    print(f"Processing SMILES {i+1}/{len(merged_df)}: {smiles}")
    molecular_weight = get_molecular_weight(smiles)
    merged_df.at[i, 'MolecularWeight'] = molecular_weight

# Save the updated DataFrame to a new CSV file
try:
    merged_df.to_csv("merged_with_molecular_weights.csv", index=False)
    print("CSV file saved successfully as 'merged_with_molecular_weights.csv'.")
except Exception as e:
    print(f"Error saving the DataFrame to CSV: {e}")



Processing SMILES 1/744: [H]O=C
Processing SMILES 2/744: [H]O=C
Processing SMILES 3/744: [H]O=C
Processing SMILES 4/744: [H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N(C([H])([H])C([H])([H])Cl)C([H])([H])C([H])([H])Cl
Processing SMILES 5/744: [H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N(C([H])([H])C([H])([H])Cl)C([H])([H])C([H])([H])Cl
Processing SMILES 6/744: [H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N(C([H])([H])C([H])([H])Cl)C([H])([H])C([H])([H])Cl
Processing SMILES 7/744: [H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N(C([H])([H])C([H])([H])Cl)C([H])([H])C([H])([H])Cl
Processing SMILES 8/744: [H]Oc1c([H])c([H])c2c(c1[H])C([H])([H])C([H])([H])C1([H])C2([H])C([H])([H])C([H])([H])C2(C([H])([H])[H])C1([H])C([H])([H])C([H])([H])[C@@]2([H])O[H]
Processing SMILES 9/744: [H]Oc1c([H])c([H])c2c(c1[H])C([H])([H])C([H])([H])C1([H])C2([H])C([H])([H])C([H])([H])C2(C([H])([H])[H])C1([H])C([H])([H])C([H])([H])[C@@]2([H])O[H]
Processing SMILES 10/744: [H]Oc1c([H])c([H])c2c(c1[H])C([H])([H])C([

In [73]:
df2=pd.read_csv("merged_with_molecular_weights.csv")
df2

Unnamed: 0,CAS Number,Chemical Name,Species Scientific Name,Species Group,Conc Mean,Conc Units,Endpoint,Standardized_InChI,InChI_Verification,Curation_Status,Cleaned_SMILES,MolecularWeight
0,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Chlamydomonas reinhardtii,Algae,23.20000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
1,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Pleurodeles waltl,Amphibians,100.00000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
2,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Daphnia magna,Crustaceans,5.00000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
3,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Raphidocelis subcapitata,Algae,12.50000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...,261.080
4,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Xenopus laevis,Amphibians,20.00000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...,261.080
...,...,...,...,...,...,...,...,...,...,...,...,...
739,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",Bufo bufo ssp. gargarizans,Amphibians,0.03000,mg/L,NOEC,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
740,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",Daphnia magna,Crustaceans,2.00000,mg/L,NOEC,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
741,210880925,[C(E)]-N-[(2-Chloro-5-thiazolyl)methyl]-N'-met...,Lithobates pipiens,Amphibians,0.25000,mg/L,NOEC,InChI=1S/C6H8ClN5O2S/c1-8-6(11-12(13)14)10-3-4...,Validated InChI found from PubChem (formatted ...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N([H])C(=NC([H])([H])...,249.680
742,210880925,[C(E)]-N-[(2-Chloro-5-thiazolyl)methyl]-N'-met...,Penaeus monodon,Crustaceans,0.00495,mg/L,NOEC,InChI=1S/C6H8ClN5O2S/c1-8-6(11-12(13)14)10-3-4...,Validated InChI found from PubChem (formatted ...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N([H])C(=NC([H])([H])...,249.680


In [74]:
# Function to calculate pNOEC
import numpy as np
def calculate_pNOEC(NOEC, molecular_weight):
    concentration_m = (NOEC / molecular_weight) * 1000
    # Check for zero or negative values to avoid log errors
    if concentration_m <= 0:
        return None  # Or another placeholder that signifies an invalid or undefined pEC50
    pNOEC = -np.log10(concentration_m)
    return pNOEC

In [75]:
df2['pNOEC'] = df2.apply(lambda row: calculate_pNOEC(row['Conc Mean '], row['MolecularWeight']), axis=1)

In [76]:
df2

Unnamed: 0,CAS Number,Chemical Name,Species Scientific Name,Species Group,Conc Mean,Conc Units,Endpoint,Standardized_InChI,InChI_Verification,Curation_Status,Cleaned_SMILES,MolecularWeight,pNOEC
0,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Chlamydomonas reinhardtii,Algae,23.20000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034,-2.873650
1,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Pleurodeles waltl,Amphibians,100.00000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034,-3.508162
2,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Daphnia magna,Crustaceans,5.00000,mg/L,NOEC,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034,-2.207132
3,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Raphidocelis subcapitata,Algae,12.50000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...,261.080,-1.680136
4,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Xenopus laevis,Amphibians,20.00000,mg/L,NOEC,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...,261.080,-1.884256
...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",Bufo bufo ssp. gargarizans,Amphibians,0.03000,mg/L,NOEC,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034,0.014717
740,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",Daphnia magna,Crustaceans,2.00000,mg/L,NOEC,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034,-1.809192
741,210880925,[C(E)]-N-[(2-Chloro-5-thiazolyl)methyl]-N'-met...,Lithobates pipiens,Amphibians,0.25000,mg/L,NOEC,InChI=1S/C6H8ClN5O2S/c1-8-6(11-12(13)14)10-3-4...,Validated InChI found from PubChem (formatted ...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N([H])C(=NC([H])([H])...,249.680,-0.000556
742,210880925,[C(E)]-N-[(2-Chloro-5-thiazolyl)methyl]-N'-met...,Penaeus monodon,Crustaceans,0.00495,mg/L,NOEC,InChI=1S/C6H8ClN5O2S/c1-8-6(11-12(13)14)10-3-4...,Validated InChI found from PubChem (formatted ...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N([H])C(=NC([H])([H])...,249.680,1.702779


In [77]:
df2.columns

Index(['CAS Number', 'Chemical Name', 'Species Scientific Name',
       'Species Group', 'Conc Mean ', 'Conc Units ', 'Endpoint',
       'Standardized_InChI', 'InChI_Verification', 'Curation_Status',
       'Cleaned_SMILES', 'MolecularWeight', 'pNOEC'],
      dtype='object')

In [78]:
# Create a pivot table to transform data
transformed_df = df2.pivot_table(
    index=['CAS Number', 'Chemical Name'],
    columns='Species Group',
    values='pNOEC',
    aggfunc='first'  
)
# Rename columns to match the new original file structure
transformed_df.columns = [f'pNOEC_{col}' for col in transformed_df.columns]
#Reset the index to make 'CAS Number' and 'ChemicalName' as regular columns
transformed_df.reset_index(inplace=True)
transformed_df

Unnamed: 0,CAS Number,Chemical Name,pNOEC_Algae,pNOEC_Amphibians,pNOEC_Crustaceans,pNOEC_Other Invertebrates
0,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",-2.873650,-3.508162,-2.207132,
1,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",-1.680136,-1.884256,-1.680136,-1.000013
2,50282,"(17beta)Estra-1,3,5(10)triene-3,17-diol",1.532117,3.435207,3.435207,4.891139
3,50293,"1,1'-(2,2,2-Trichloroethylidene)bis[4-chlorobe...",0.549616,-0.342411,4.018137,2.868375
4,50782,2-(Acetyloxy)benzoic acid,-2.809192,,-1.508162,-0.508162
...,...,...,...,...,...,...
199,143545908,"6-[(R)-Hydroxy[(2aS,3R,4S,5aS,7R)-2,2a,3,4,5,5...",2.618466,,2.442375,3.761134
200,153719234,3-[(2-Chloro-5-thiazolyl)methyl]tetrahydro-5-m...,,0.137137,1.766866,2.973605
201,175013180,[2-[[[1-(4-Chlorophenyl)-1H-pyrazol-3-yl]oxy]m...,1.412517,3.588608,2.889638,1.588608
202,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",-1.508162,0.014717,-1.809192,


In [80]:
pNOEC_columns = ['pNOEC_Algae', 'pNOEC_Amphibians', 'pNOEC_Crustaceans', 'pNOEC_Other Invertebrates']
transformed_df['pNOEC_Mean'] = transformed_df[pNOEC_columns].fillna(0).mean(axis=1)
transformed_df['pNOEC_SD'] = transformed_df[pNOEC_columns].fillna(0).std(axis=1)
transformed_df

Unnamed: 0,CAS Number,Chemical Name,pNOEC_Algae,pNOEC_Amphibians,pNOEC_Crustaceans,pNOEC_Other Invertebrates,pNOEC_Mean,pNOEC_SD
0,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",-2.873650,-3.508162,-2.207132,,-2.147236,1.526871
1,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",-1.680136,-1.884256,-1.680136,-1.000013,-1.561136,0.386259
2,50282,"(17beta)Estra-1,3,5(10)triene-3,17-diol",1.532117,3.435207,3.435207,4.891139,3.323418,1.377377
3,50293,"1,1'-(2,2,2-Trichloroethylidene)bis[4-chlorobe...",0.549616,-0.342411,4.018137,2.868375,1.773429,2.017599
4,50782,2-(Acetyloxy)benzoic acid,-2.809192,,-1.508162,-0.508162,-1.206379,1.238674
...,...,...,...,...,...,...,...,...
199,143545908,"6-[(R)-Hydroxy[(2aS,3R,4S,5aS,7R)-2,2a,3,4,5,5...",2.618466,,2.442375,3.761134,2.205494,1.582285
200,153719234,3-[(2-Chloro-5-thiazolyl)methyl]tetrahydro-5-m...,,0.137137,1.766866,2.973605,1.219402,1.418354
201,175013180,[2-[[[1-(4-Chlorophenyl)-1H-pyrazol-3-yl]oxy]m...,1.412517,3.588608,2.889638,1.588608,2.369842,1.046005
202,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",-1.508162,0.014717,-1.809192,,-0.825659,0.969724


In [87]:
# Load and ensure 'CAS Number' column is a string in both DataFrames
transformed_df['CAS Number'] = transformed_df['CAS Number'].astype(str)
df2['CAS Number'] = df2['CAS Number'].astype(str)

# Define columns to include from df2, ensuring 'CAS Number' is present for the merge
columns_to_include = ['CAS Number', 'Standardized_InChI', 'InChI_Verification', 
                      'Curation_Status', 'Cleaned_SMILES', 'MolecularWeight']

# Perform the left merge to keep all rows from df
merged_df = pd.merge(df, df2[columns_to_include], on='CAS Number', how='left')
merged_df = merged_df.drop_duplicates()
merged_df



Unnamed: 0,CAS Number,Chemical Name,pNOEC_Algae,pNOEC_Amphibians,pNOEC_Crustaceans,pNOEC_Other Invertebrates,pNOEC_Mean,pNOEC_SD,Standardized_InChI,InChI_Verification,Curation_Status,Cleaned_SMILES,MolecularWeight
0,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",-2.873650,-3.508162,-2.207132,,-2.147236,1.526871,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
3,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",-1.680136,-1.884256,-1.680136,-1.000013,-1.561136,0.386259,InChI=1S/C7H15Cl2N2O2P/c8-2-5-11(6-3-9)14(12)1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]N1C([H])([H])C([H])([H])C([H])([H])OP1(=O)N...,261.080
7,50282,"(17beta)Estra-1,3,5(10)triene-3,17-diol",1.532117,3.435207,3.435207,4.891139,3.323418,1.377377,InChI=1S/C18H24O2/c1-18-9-8-14-13-5-3-12(19)10...,Validated InChI found from NIST. Original name...,"SMILES conversion successful, Passed initial f...",[H]Oc1c([H])c([H])c2c(c1[H])C([H])([H])C([H])(...,272.400
11,50293,"1,1'-(2,2,2-Trichloroethylidene)bis[4-chlorobe...",0.549616,-0.342411,4.018137,2.868375,1.773429,2.017599,InChI=1S/C14H9Cl5/c15-11-5-1-9(2-6-11)13(14(17...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]c1c([H])c(C([H])(c2c([H])c([H])c(Cl)c([H])c...,354.500
15,50782,2-(Acetyloxy)benzoic acid,-2.809192,,-1.508162,-0.508162,-1.206379,1.238674,InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034
...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,143545908,"6-[(R)-Hydroxy[(2aS,3R,4S,5aS,7R)-2,2a,3,4,5,5...",2.618466,,2.442375,3.761134,2.205494,1.582285,InChI=1/C15H21N5O7S/c1-6-10-5-16-14-17-8(13(22...,Validated InChI found from OPSIN. Original nam...,"SMILES conversion successful, Passed initial f...",[H]O[C@@]([H])(c1c([H])c(=O)n([H])c(=O)n1[H])[...,415.400
676,153719234,3-[(2-Chloro-5-thiazolyl)methyl]tetrahydro-5-m...,,0.137137,1.766866,2.973605,1.219402,1.418354,InChI=1/C8H10ClN5O3S/c1-12-4-17-5-13(8(12)11-1...,Validated InChI found from OPSIN. Original nam...,"SMILES conversion successful, Passed initial f...",[H]c1nc(Cl)sc1C([H])([H])N1C(=N[N+](=O)[O-])N(...,291.720
679,175013180,[2-[[[1-(4-Chlorophenyl)-1H-pyrazol-3-yl]oxy]m...,1.412517,3.588608,2.889638,1.588608,2.369842,1.046005,InChI=1S/C19H18ClN3O4/c1-25-19(24)23(26-2)17-6...,Validated InChI found from NIST. Original name...,"SMILES conversion successful, Passed initial f...",[H]c1c([H])c([H])c(C([H])([H])Oc2nn(-c3c([H])c...,387.800
683,203313251,"Carbonic acid, cis-3-(2,5-Dimethylphenyl)-8-me...",-1.508162,0.014717,-1.809192,,-0.825659,0.969724,InChI=1S/C21H27NO5/c1-5-26-20(24)27-18-17(16-1...,Validated InChI found from PubChem. Original n...,"SMILES conversion successful, Passed initial f...",[H]O=C,31.034


In [88]:
merged_df.to_csv('NOEC_meansd.csv', index=False)