In [2]:
import pandas as pd
import json



# Load the drug table. pandas labels a header-less first column
# 'Unnamed: 0'; give it the meaningful name 'drug_name'. rename() silently
# ignores the mapping when that column is absent.
df = pd.read_csv('ddms1.csv').rename(columns={'Unnamed: 0': 'drug_name'})

# Load the drug records. The encoding is given explicitly because open()
# otherwise falls back to the platform locale (e.g. cp1252 on Windows), which
# can fail on, or mis-decode, non-ASCII characters in a UTF-8 JSON file.
with open('DP.json', 'r', encoding='utf-8') as f:
    drug_data = json.load(f)

print(f"Loaded JSON with {len(drug_data)} entries")


# Lookup table: lower-cased drug name -> SMILES string, built only from the
# JSON records whose 'organism' field equals 'human'.
drug_to_smiles = {}

for record in drug_data:
    if record.get('organism') != 'human':
        continue

    name = record.get('drug_name', '')
    structure = record.get('smiles', '')
    # Skip records where either field is missing or empty.
    if not (name and structure):
        continue

    # Keys are lower-cased; a later record with the same lower-cased name
    # replaces the earlier one.
    drug_to_smiles[name.lower()] = structure

print(f"Found {len(drug_to_smiles)} human drugs with SMILES")


# Case-insensitive lookup: lower-case the table's drug names once and reuse
# the result for both the JSON-derived mapping and the manual overrides.
names_lower = df['drug_name'].str.lower()
df['smiles'] = names_lower.map(drug_to_smiles)

# Hand-entered SMILES, applied only to rows the JSON lookup left empty.
# NOTE(review): trastuzumab is presumably a monoclonal antibody, and other
# biologics in this table (e.g. Cetuximab, Rituximab) end up without SMILES
# and are dropped -- confirm this entry is intended.
manual_smiles = {
    'plx4032': 'CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C=C(C=N3)C4=CC=C(C=C4)Cl)F',
    'trastuzumab': 'CCCCN1C(=O)C(NC(=O)C12CCN(CC2)CC3=CC=C(C=C3)OC4=CC=C(C=C4)C(=O)O)C(C5CCCCC5)O',
    'cholestyramine': 'CCC1=CC=C(C=C1)C(C)CCC2=CC=C(C=C2)[N+](C)(C)C.[Cl-]'
}

for manual_key, manual_value in manual_smiles.items():
    # Never overwrite a SMILES that the JSON lookup already supplied.
    needs_fill = df['smiles'].isna() & (names_lower == manual_key)
    df.loc[needs_fill, 'smiles'] = manual_value





# Summarise how many rows received a SMILES string.
has_smiles = df['smiles'].notna()
matched = has_smiles.sum()
total = len(df)
print(f"\nMatched {matched} out of {total} drugs ({matched/total*100:.1f}%)")

# List the names that found no SMILES (these rows have a missing 'smiles').
unmatched = df.loc[~has_smiles, 'drug_name'].tolist()
if unmatched:
    print(f"\nUnmatched drugs to be dropped ({len(unmatched)}):")
    # Show at most the first 10 names, one per line.
    print("\n".join(f"  - {drug}" for drug in unmatched[:10]))
    if len(unmatched) > 10:
        print(f"  ... and {len(unmatched) - 10} more")


# Remove the rows that still have no SMILES and report how many were removed.
df_original_len = len(df)
df = df.dropna(subset=['smiles'])
print(f"\nDropped {df_original_len - len(df)} drugs without SMILES")
print(f"Final dataset: {len(df)} drugs")

# Persist the filtered table without the index column.
# NOTE(review): this writes to 'ddms1.csv', the same path this cell reads its
# input from, so the dropped rows are no longer present in that file
# afterwards. Confirm that overwriting the input is intended.
df.to_csv("ddms1.csv", index=False)

# Preview the name/SMILES columns for the first 10 rows.
print("\n top 10 smiles")
print(df.head(10)[['drug_name', 'smiles']])

Loaded JSON with 875 entries
Found 160 human drugs with SMILES

Matched 149 out of 158 drugs (94.3%)

Unmatched drugs to be dropped (9):
  - Etanercept
  - Oprelvekin
  - Interferon beta-1a
  - Cetuximab
  - Interferon beta-1b
  - Natural alpha interferon
  - Interferon gamma-1b
  - Rituximab
  - Casiigly

Dropped 9 drugs without SMILES
Final dataset: 149 drugs

 top 10 smiles
         drug_name                                             smiles
0      Anastrozole      CC(C)(C#N)C1=CC(=CC(=C1)CN2C=NC=N2)C(C)(C)C#N
1      Abiraterone  O[C@H]1CC[C@]2(C(=CC[C@@H]3[C@@H]2CC[C@]2([C@H...
2       Bexarotene  CC1=CC2=C(C=C1C(=C)C3=CC=C(C=C3)C(=O)O)C(CCC2(...
3        Metformin                                  CN(C)C(=N)N=C(N)N
4    Dexamethasone  CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...
5  Cigarette smoke  C1=CC(=CC=C1N)S(=O)(=O)OC2=C(C=C(C=C2[N+](=O)[...
6           Harman                         CC1=NC=CC2=C1NC3=CC=CC=C23
7          Plx4032  CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CN