In [None]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote

import pandas as pd
import requests
from rdkit import Chem

In [None]:
import os
import psutil

def get_optimal_worker_count(io_bound=True):
    """Suggest a worker-pool size based on the machine's CPU count.

    Args:
        io_bound: When true (the default), size the pool for I/O-bound work;
            otherwise size it for CPU-bound work.

    Returns:
        4 when ``os.cpu_count()`` cannot determine the CPU count. Otherwise
        four workers per CPU capped at 32 for I/O-bound work, or exactly the
        CPU count for CPU-bound work.
    """
    cores = os.cpu_count()

    # The CPU count may be undeterminable; fall back to a fixed default.
    if cores is None:
        return 4

    # CPU-bound work: one worker per core.
    if not io_bound:
        return cores

    # I/O-bound work: oversubscribe the cores, but keep the pool bounded.
    return min(32, cores * 4)

# Recommended pool sizes for this machine: first I/O-bound, then CPU-bound.
io_bound_workers, cpu_bound_workers = (
    get_optimal_worker_count(io_bound=flag) for flag in (True, False)
)

print(
    f"Recommended workers for I/O-bound tasks: {io_bound_workers}",
    f"Recommended workers for CPU-bound tasks: {cpu_bound_workers}",
    sep="\n",
)

# Hardware summary reported by psutil: physical cores, logical processors,
# and total RAM converted from bytes to GiB.
print(f"\nCPU cores: {psutil.cpu_count(logical=False)}")
print(f"Logical processors: {psutil.cpu_count(logical=True)}")
print(f"Total RAM: {psutil.virtual_memory().total / (1024**3):.2f} GB")

In [None]:
# Column names for the handbook table, listed in column order.
headers = [
    "Index",
    "Name",
    "Synonym",
    "Structure",
    "Mol. form.",
    "CAS Reg. No.",
    "Beilstein Reg. No.",
    "Mol. wt.",
    "Physical form",
    "tmp/ºC",
    "tbp/ºC",
    "Other bp/ºC",
    "ρ/g cm-3",
    "nD",
    "s/g kg-1",
    "vp/kPa (25 °C)",
    "Solubility",
]
# Normalise away any accidental leading/trailing whitespace in the names.
headers = list(map(str.strip, headers))
# Show the names together with how many there are.
headers, len(headers)

In [None]:
# Read the handbook CSV: skip the file's first row, apply our own column
# names, index the rows by the 'Index' column, and discard the 'Structure'
# column.
filename = 'physical_constants_of_organic_compounds_CRC_handbook_data_10902entries.csv'
df = pd.read_csv(
    filename,
    names=headers,
    skiprows=1,
    index_col='Index',
).drop(columns=['Structure'])
df

In [None]:
# CAS numbers whose HTTP request raised a requests.RequestException.
# Non-200 responses (e.g. "not found") are NOT recorded here.
invalid_cas_numbers = []


def get_smiles_from_cas(cas_number, timeout=10):
    """Look up a SMILES string on PubChem for a CAS number.

    The CAS number is sent to the PubChem PUG REST "compound by name"
    endpoint, requesting the ``CanonicalSMILES`` property.

    Args:
        cas_number: CAS Registry Number as a string. Non-string values
            (None, or the float NaN pandas uses for missing cells) and blank
            strings return None without any network request.
        timeout: Seconds to wait for PubChem before giving up. Without a
            timeout a stalled connection would block the caller (or a pool
            worker) indefinitely.

    Returns:
        The SMILES string of the first matching compound, or None when the
        input is missing, the response status is not 200, the request fails,
        or the payload does not have the expected shape.

    Side effects:
        Appends ``cas_number`` to the module-level ``invalid_cas_numbers``
        list when the request raises ``requests.RequestException``.
    """
    # Missing cells must not be interpolated into the URL (NaN would be
    # looked up as the literal compound name "nan").
    if not isinstance(cas_number, str) or not cas_number.strip():
        return None

    # Percent-encode the identifier so characters such as '/' cannot alter
    # the URL path.
    identifier = quote(cas_number.strip(), safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{identifier}/property/CanonicalSMILES/JSON"

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None
        properties = response.json()['PropertyTable']['Properties'][0]
    except requests.RequestException:
        invalid_cas_numbers.append(cas_number)
        return None
    except (ValueError, KeyError, IndexError, TypeError):
        # A 200 response whose body is not the expected JSON structure.
        return None

    if not isinstance(properties, dict):
        return None

    # NOTE(review): PubChem appears to return this property under the key
    # 'ConnectivitySMILES' in newer API versions even when 'CanonicalSMILES'
    # is requested -- confirm against the current PUG REST documentation.
    for key in ('CanonicalSMILES', 'ConnectivitySMILES'):
        smiles = properties.get(key)
        if smiles:
            return smiles
    return None
    
# Try the lookup on a single CAS number and draw the resulting molecule.
# NOTE(review): 3383-96-8 does not look like water's registry number
# (7732-18-5); it appears to be temephos -- confirm.
cas_number = "3383-96-8"
smiles = get_smiles_from_cas(cas_number)
if not smiles:
    print(f"Could not retrieve SMILES for CAS {cas_number}")
else:
    print(f"SMILES for CAS {cas_number}: {smiles}")
    # Parse the SMILES with RDKit and render the molecule in the notebook.
    m = Chem.MolFromSmiles(smiles)
    display(m)

In [None]:
# Expected CAS number shape: 2-7 digits, 2 digits, 1 digit, hyphen-separated.
cas_pattern = r'^\d{2,7}-\d{2}-\d$'
# Find invalid CAS numbers.
# na=False: for a missing CAS value .str.match yields NaN instead of a
# boolean, and `~` cannot negate a mask containing NaN. Treating missing
# values as non-matching keeps the mask boolean and reports those rows as
# invalid.
invalid_cas = df[~df['CAS Reg. No.'].str.match(cas_pattern, na=False)]

# Display invalid CAS numbers
invalid_cas, df['CAS Reg. No.']

In [None]:
def process_cas_numbers(df, column_name, max_workers=10):
    """Look up SMILES for every value in a DataFrame column on a thread pool.

    Each value of ``df[column_name]`` is passed to ``get_smiles_from_cas`` on
    a worker thread. Results are stored by row position rather than by index
    label, so rows sharing an index label each keep their own result, and the
    returned Series follows the row order of ``df`` no matter which request
    finishes first.

    Args:
        df: DataFrame holding the identifiers.
        column_name: Name of the column whose values are looked up.
        max_workers: Maximum number of worker threads.

    Returns:
        An object-dtype ``pd.Series`` carrying the same index as ``df``. An
        entry is None when the lookup returned None or raised.

    NOTE(review): every row is submitted at once; PubChem publishes
    request-rate limits, so a large ``max_workers`` may get throttled --
    confirm against their usage policy.
    """
    column = df[column_name]
    smiles_by_position = [None] * len(column)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all lookups, remembering which row each future belongs to.
        future_to_position = {
            executor.submit(get_smiles_from_cas, cas): position
            for position, cas in enumerate(column)
        }

        # Collect results as they complete; a failed lookup leaves None.
        for future in as_completed(future_to_position):
            position = future_to_position[future]
            try:
                smiles_by_position[position] = future.result()
            except Exception as exc:
                index = column.index[position]
                print(f'CAS number at index {index} generated an exception: {exc}')

    return pd.Series(smiles_by_position, index=column.index, dtype=object)


In [None]:
# Run the thread-pooled lookup over every CAS number and time it. The
# measured interval covers both the lookups and the column assignment.
start_time = time.time()
smiles_by_index = process_cas_numbers(df, 'CAS Reg. No.', max_workers=io_bound_workers)
df['SMILES'] = smiles_by_index
end_time = time.time()

print(df)
print(f"Processing time: {end_time - start_time:.2f} seconds")

In [None]:
# Sequential variant: look up the CAS numbers one after another and store
# the results in the SMILES column.
cas_numbers = df['CAS Reg. No.']
df['SMILES'] = cas_numbers.map(get_smiles_from_cas)

# Show the table with the new column.
df

In [None]:
# Write the table (including the SMILES column) to a CSV whose name is the
# input file name with ".smiles.csv" appended, then show the SMILES column.
output_path = f"{filename}.smiles.csv"
df.to_csv(output_path)
df['SMILES']

In [None]:
# Helper that maps a function over one DataFrame column using threads.
def apply_parallel(df, func, column_name):
    """Apply ``func`` to each value of ``df[column_name]`` on a thread pool.

    Args:
        df: DataFrame containing the column.
        func: Callable taking a single column value.
        column_name: Name of the column to iterate over.

    Returns:
        A list of ``func`` results in the same order as the column's values.
    """
    column_values = df[column_name]
    with ThreadPoolExecutor() as pool:
        # pool.map yields results in input order, not completion order.
        return [outcome for outcome in pool.map(func, column_values)]

# Thread-pooled variant of the lookup: the returned list is in row order and
# is assigned to the SMILES column by position.
smiles_values = apply_parallel(df, get_smiles_from_cas, 'CAS Reg. No.')
df['SMILES'] = smiles_values

# Show the table with the new column.
df