In [1]:
import requests
import pandas as pd

In [2]:
df=pd.read_csv ('NOECremovedunspeciesandmergedspecies.csv')
df

Unnamed: 0,CAS Number,Chemical Name,Species Scientific Name,Species Group,Conc Mean,Conc Units
0,50000,Formaldehyde,Lithobates catesbeianus,Amphibians,0.50,AI mg/L
1,50000,Formaldehyde,Lithobates catesbeianus,Amphibians,0.50,AI mg/L
2,50066,"5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione",Pleurodeles waltl,Amphibians,100.00,AI mg/L
3,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Rhinella arenarum,Amphibians,40.00,AI mg/L
4,50180,"N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...",Hypsiboas pulchellus,Amphibians,40.00,AI mg/L
...,...,...,...,...,...,...
19135,104405,4-Nonylphenol,Desmodesmus subspicatus,Algae,450.00,µg/l
19136,104405,4-Nonylphenol,Desmodesmus subspicatus,Algae,450.00,µg/l
19137,104405,4-Nonylphenol,Daphnia magna,Crustaceans,0.12,µg/l
19138,104405,4-Nonylphenol,Daphnia magna,Crustaceans,0.04,µg/l


In [3]:
# Function to retrieve molecular weight (molar mass) using PubChem API
def get_chemical_properties(cas_number):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas_number}/JSON"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if 'PC_Compounds' in data and len(data['PC_Compounds']) > 0:
            cid = data['PC_Compounds'][0]['id']['id']['cid']
            prop_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/MolecularWeight/JSON"
            prop_response = requests.get(prop_url)
            if prop_response.status_code == 200:
                prop_data = prop_response.json()
                return prop_data['PropertyTable']['Properties'][0]['MolecularWeight']
    return None

In [4]:
# Conversion function
def convert_to_standard(value, unit, cas_number=None):
    if unit in ['AI mg/L', 'mg/L', 'ae mg/L']:
        return value, 'mg/L'  # Already in mg/L
    elif unit == 'ul/L':
        return value, 'mg/L'  # Assuming 1 ul = 1 mg for water
    elif unit == '%':
        return value * 10000, 'mg/L'  # Convert % to mg/L
    elif unit in ['ug/L', 'Âµg/l']:    #Gh. included this Âµg/l
        return value / 1000, 'mg/L'  # Convert ug/L to mg/L
    elif unit == 'ug/ml':
        return value, 'mg/L'  # Convert ug/ml to mg/L
    elif unit == 'g/L':         #Gh. included this g/l
        return value *1000, 'mg/L'  # Convert g/l to mg/L
    elif unit in ['uM', 'Âµmol/l'] :  #Gh. included this Âµmol/l
        molar_mass = get_chemical_properties(cas_number)
        if molar_mass is None:
            return value, unit  # Return original if molar mass not found
        else:
            return value * molar_mass / 1000, 'mg/L'  # Convert uM to mg/L
    elif unit == 'ng/ml':
        return value / 1000, 'mg/L'  # Convert ng/ml to mg/L
    elif unit == 'g/L':   #Gh. included this g/l
        return value * 1000, 'mg/L'  # Convert ng/ml to mg/L
    elif unit == 'ppm':
        return value, 'mg/L'  # ppm is approximately mg/L
    elif unit == 'pg/g egg':
        return value * 1e-9, 'mg/L'  # Convert pg/g egg to mg/L
    elif unit == 'AI ug/L':
        return value / 1000, 'mg/L'  # Convert AI ug/L to mg/L
    elif unit == 'ug/g wet wt diet':
        return value / 1000, 'mg/L'  # Convert ug/g wet wt diet to mg/L
    elif unit == 'mmol/kg':
        molar_mass = get_chemical_properties(cas_number)
        if molar_mass is None:
            return value, unit  # Return original if molar mass not found
        else:
            return value * molar_mass * 1000, 'mg/L'  # Convert mmol/kg to mg/L
    elif unit == 'mM':
        molar_mass = get_chemical_properties(cas_number)
        if molar_mass is None:
            return value, unit  # Return original if molar mass not found
        else:
            return value * molar_mass, 'mg/L'  # Convert mM to mg/L
    elif unit == 'umol/L':
        molar_mass = get_chemical_properties(cas_number)
        if molar_mass is None:
            return value, unit  # Return original if molar mass not found
        else:
            return value * molar_mass * 0.001, 'mg/L'  # Convert umol/L to mg/L
    elif unit == 'ug/g lipid diet':
        return value / 1000, 'mg/L'  # Convert ug/g lipid diet to mg/L
    elif unit == 'umol/ml':
        molar_mass = get_chemical_properties(cas_number)
        if molar_mass is None:
            return value, unit  # Return original if molar mass not found
        else:
            return value * molar_mass, 'mg/L'  # Convert umol/ml to mg/L
    elif unit == '% dry wght':
        return value * 10, 'mg/L'  # Convert % dry weight to mg/L
    elif unit == 'M':
        molar_mass = get_chemical_properties(cas_number)
        if molar_mass is None:
            return value, unit  # Return original if molar mass not found
        else:
            return value * molar_mass * 1000, 'mg/L'  # Convert M to mg/L
    elif unit == '% w/v':
        return value * 10000, 'mg/L'  # Convert % w/v to mg/L
    else:
        return value, unit  # For unspecified conversions

In [5]:
df.columns

Index(['CAS Number', 'Chemical Name', 'Species Scientific Name',
       'Species Group', 'Conc Mean ', 'Conc Units '],
      dtype='object')

In [6]:
# Apply conversions
# Copy the original DataFrame to a new DataFrame
new_df = df.copy()

# Apply the transformation to the new DataFrame
new_df[['Conc Mean ', 'Conc Units ']] = new_df.apply(
    lambda row: pd.Series(convert_to_standard(row['Conc Mean '], row['Conc Units '], row['CAS Number'])),
    axis=1)


In [7]:
def debug_conversion(row):
    new_values = convert_to_standard(row['Conc Mean '], row['Conc Units '], row['CAS Number'])
    print("Converting:", row['Conc Mean '], row['Conc Units '], "to", new_values)
    return pd.Series(new_values)

new_df[['Conc Mean ', 'Conc Units ']] = new_df.apply(debug_conversion, axis=1)


Converting: 0.5 mg/L to (0.5, 'mg/L')
Converting: 0.5 mg/L to (0.5, 'mg/L')
Converting: 100.0 mg/L to (100.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Converting: 40.0 mg/L to (40.0, 'mg/L')
Co

In [8]:

new_df.to_csv('NOECafterunitconversion.csv', index=False)

In [9]:
# More robust filtering to handle variations in the 'mg/L' representation
filtered_df = new_df.loc[new_df['Conc Units '].str.strip().str.lower() == 'mg/l']

print(filtered_df)


       CAS Number                                      Chemical Name  \
0           50000                                       Formaldehyde   
1           50000                                       Formaldehyde   
2           50066  5-Ethyl-5-phenyl-2,4,6(1H,3H,5H)-pyrimidinetrione   
3           50180  N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...   
4           50180  N,N-Bis(2-chloroethyl)tetrahydro-2H-1,3,2-oxaz...   
...           ...                                                ...   
19088    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
19089    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
19090    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
19093    60207901  1-[[2-(2,4-Dichlorophenyl)-4-propyl-1,3-dioxol...   
19108   120068373  5-Amino-1-[2,6-dichloro-4-(trifluoromethyl)phe...   

          Species Scientific Name        Species Group  Conc Mean  Conc Units   
0         Lithobates catesbeianus           Amphibians

In [10]:
filtered_df.to_csv('NOECtodosplit.csv', index=False)