In [1]:
print("hello")

hello


# データセットダウンロード

In [2]:
import os
import gzip
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

try:
    from rdkit import Chem
    from rdkit.Chem import AllChem
except ImportError:
    Chem = None
    AllChem = None
    print("⚠ RDKit が import できませんでした。conda などでインストールしてください。")

DATA_DIR = "/Users/kakuayato/Development/PharmFoundation/dataset"  # ファイルを置いたディレクトリに変更

STRUCT_FILE = os.path.join(DATA_DIR, "structures.smiles.tsv")  # タイプミスっぽいけどユーザーの実ファイル名に合わせる
FDA_FILE    = os.path.join(DATA_DIR, "FDA_Approved.csv")
EMA_FILE    = os.path.join(DATA_DIR, "EMA_Approved.csv")
PMDA_FILE   = os.path.join(DATA_DIR, "PMDA_Approved.csv")
DTI_FILE    = os.path.join(DATA_DIR, "drug.target.interaction.tsv.gz")  # ひとまず未使用

⚠ RDKit が import できませんでした。conda などでインストールしてください。


# 構造情報（SMILES）の読み込み

In [3]:
struct_df = pd.read_csv(STRUCT_FILE, sep="\t")

print("structures.smiles.tsv の列:")
print(struct_df.columns.tolist())
struct_df.head()

structures.smiles.tsv の列:
['SMILES', 'InChI', 'InChIKey', 'ID', 'INN', 'CAS_RN']


Unnamed: 0,SMILES,InChI,InChIKey,ID,INN,CAS_RN
0,CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC...,InChI=1S/C23H17FN6O/c1-25-22(31)18-6-5-16(11-1...,LIOLIMKSCNQPLV-UHFFFAOYSA-N,5392,capmatinib,1029712-80-8
1,CC(C)(COC1=CN2C(=C(C=N2)C#N)C(=C1)C3=CN=C(C=C3...,"InChI=1S/C29H31N7O3/c1-29(2,37)18-39-24-9-25(2...",XIIOFHFUYBLOLW-UHFFFAOYSA-N,5393,selpercatinib,2152628-33-4
2,CCN1C2=CC(=NC=C2C=C(C1=O)C3=CC(=C(C=C3Br)F)NC(...,InChI=1S/C24H21BrFN5O2/c1-3-31-21-12-22(27-2)2...,CEFJVGZHQAGLHS-UHFFFAOYSA-N,5394,ripretinib,1442472-39-0
3,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@@H]2O)[1...,InChI=1S/C18H23FO2/c1-18-7-6-13-12-5-3-11(20)8...,KDLLNMRYZGUVMA-ZYMZXAKXSA-N,5395,fluoroestradiol F 18,94153-53-4
4,C1=CC2=C(C=C1C3=CN=C(C=C3)[18F])NC4=C2C=NC=C4,InChI=1S/C16H10FN3/c17-16-4-2-11(8-19-16)10-1-...,GETAAWDSFUCLBS-SJPDSGJFSA-N,5396,flortaucipir F 18,1522051-90-6


In [4]:
# 必要な列名を柔軟に拾う
def pick_col(candidates, columns):
    cols_lower = {c.lower(): c for c in columns}
    for cand in candidates:
        if cand.lower() in cols_lower:
            return cols_lower[cand.lower()]
    return None

id_col      = pick_col(["ID", "STRUCT_ID", "DRUGCENTRAL_ID"], struct_df.columns)
name_col    = pick_col(["INN", "DRUG_NAME", "NAME"], struct_df.columns)
smiles_col  = pick_col(["SMILES"], struct_df.columns)

print("ID列:", id_col, "  Name列:", name_col, "  SMILES列:", smiles_col)

# 必要列だけに絞る
struct_core = struct_df[[c for c in [id_col, name_col, smiles_col] if c is not None]].copy()
struct_core = struct_core.rename(columns={
    id_col: "drug_id",
    name_col: "drug_name",
    smiles_col: "smiles",
})
struct_core.head()

ID列: ID   Name列: INN   SMILES列: SMILES


Unnamed: 0,drug_id,drug_name,smiles
0,5392,capmatinib,CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC...
1,5393,selpercatinib,CC(C)(COC1=CN2C(=C(C=N2)C#N)C(=C1)C3=CN=C(C=C3...
2,5394,ripretinib,CCN1C2=CC(=NC=C2C=C(C1=O)C3=CC(=C(C=C3Br)F)NC(...
3,5395,fluoroestradiol F 18,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@@H]2O)[1...
4,5396,flortaucipir F 18,C1=CC2=C(C=C1C3=CN=C(C=C3)[18F])NC4=C2C=NC=C4


# 承認薬リストの読み込み & 結合

In [5]:
def load_approved(path, source):
    df = pd.read_csv(path)
    df["regulator"] = source
    return df

fda_df  = load_approved(FDA_FILE, "FDA")
ema_df  = load_approved(EMA_FILE, "EMA")
pmda_df = load_approved(PMDA_FILE, "PMDA")

approved_raw = pd.concat([fda_df, ema_df, pmda_df], ignore_index=True)

print("Approved CSV の列名:")
print(approved_raw.columns.tolist())
approved_raw.head()

Approved CSV の列名:
['2104', 'perflutren', 'regulator', '5405', 'belantamab mafodotin', '5392', 'capmatinib']


Unnamed: 0,2104,perflutren,regulator,5405,belantamab mafodotin,5392,capmatinib
0,1834.0,monobenzone,FDA,,,,
1,2684.0,tobramycin,FDA,,,,
2,3051.0,butamben,FDA,,,,
3,3103.0,citrulline,FDA,,,,
4,3347.0,methionine,FDA,,,,


In [6]:
app_id_col   = pick_col(["ID", "STRUCT_ID", "DRUG_ID", "DRUGCENTRAL_ID"], approved_raw.columns)
app_name_col = pick_col(["DRUG_NAME", "INN", "NAME"], approved_raw.columns)

print("Approved 側 ID列:", app_id_col, " Name列:", app_name_col)

Approved 側 ID列: None  Name列: None


In [7]:
approved_df = approved_raw.copy()

if app_id_col is not None and "drug_id" in struct_core.columns:
    approved_df = approved_df.rename(columns={app_id_col: "drug_id"})
    merged = approved_df.merge(struct_core, on="drug_id", how="inner")
else:
    # 名前で join（簡易）
    print("ID で join できなさそうなので drug_name ベースで join します。")
    approved_df["drug_name_tmp"] = approved_df[app_name_col].astype(str).str.upper()
    struct_core["drug_name_tmp"] = struct_core["drug_name"].astype(str).str.upper()
    merged = approved_df.merge(struct_core, on="drug_name_tmp", how="inner", suffixes=("_app", "_struct"))

print("マージ後 shape:", merged.shape)
merged.head()

ID で join できなさそうなので drug_name ベースで join します。


KeyError: None

# 鎮痛薬ラベルの作成

In [None]:
# テキスト系の候補列
text_candidate_cols = [c for c in merged.columns 
                       if any(k in c.lower() for k in ["indication", "use", "desc", "label"])]
print("適応症っぽい列候補:", text_candidate_cols)

# ATC 系の候補列
atc_candidate_cols = [c for c in merged.columns if "atc" in c.lower()]
print("ATC コードっぽい列候補:", atc_candidate_cols)

NameError: name 'merged' is not defined

In [None]:
analgesic_keywords = [
    "pain",
    "analgesic",
    "analgesia",
    "neuropathic",
    "headache",
    "migraine",
    "nociceptive",
]

def label_row(row,
              text_cols=text_candidate_cols,
              atc_cols=atc_candidate_cols,
              keywords=analgesic_keywords):
    text_hit = False
    for c in text_cols:
        val = str(row.get(c, "")).lower()
        if any(kw in val for kw in keywords):
            text_hit = True
            break
    
    atc_hit = False
    for c in atc_cols:
        val = str(row.get(c, "")).upper()
        # 複数コードが | や , で繋がっているケースもあるので split
        for token in [t.strip() for t in val.replace("|", ",").split(",")]:
            if token.startswith("N02"):
                atc_hit = True
                break
        if atc_hit:
            break
    
    return int(text_hit or atc_hit)

merged["is_analgesic"] = merged.apply(label_row, axis=1)

merged["is_analgesic"].value_counts()

NameError: name 'text_candidate_cols' is not defined

# SMILES → Fingerprint 変換（RDKit）

In [None]:
if Chem is None:
    raise RuntimeError("RDKit が import できていません。先にインストールしてください。")

def smiles_to_morgan_fp(smiles, radius=2, n_bits=2048):
    if pd.isna(smiles):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=np.int8)
    # 直接 bit を numpy array にコピー
    Chem.DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Fingerprint を作る
fps = []
labels = []

for _, row in merged.iterrows():
    fp = smiles_to_morgan_fp(row["smiles"])
    if fp is None:
        continue
    fps.append(fp)
    labels.append(row["is_analgesic"])

X = np.stack(fps).astype(np.float32)
y = np.array(labels, dtype=np.int64)

X.shape, Counter(y)

RuntimeError: RDKit が import できていません。先にインストールしてください。

# 学習・評価（RandomForest）

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced_subsample",
    random_state=42,
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=3))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

NameError: name 'X' is not defined

# ざっくり可視化（重要度など）

In [None]:
import matplotlib.pyplot as plt

importances = clf.feature_importances_
plt.figure(figsize=(6,4))
plt.hist(importances, bins=50)
plt.xlabel("Feature importance")
plt.ylabel("Count")
plt.title("RandomForest feature importances (Morgan bits)")
plt.show()

NameError: name 'clf' is not defined