In [None]:

# Cell 1 setup Google Drive (boleh diubah)
from google.colab import drive
drive.mount('/content/drive')

import os

# Ganti dengan folder tempat dataset kamu
DATA_DIR = "/content/drive/MyDrive/capstone_project"
print("Folder data:", DATA_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder data: /content/drive/MyDrive/capstone_project


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb


In [None]:
def load_dictionary(file_name, key_col, value_col):
    """
    Memuat kamus dari file CSV di DATA_DIR ke dalam dictionary.
    file_name: nama file di dalam DATA_DIR (bukan full path).
    """
    file_path = os.path.join(DATA_DIR, file_name)
    try:
        kamus_df = pd.read_csv(file_path, dtype=str).fillna('')
        kamus_dict = dict(zip(kamus_df[key_col].str.lower(), kamus_df[value_col].str.lower()))
        print(f"Kamus berhasil dimuat: {file_path}")
        return kamus_dict
    except FileNotFoundError:
        print(f"PERINGATAN: File kamus tidak ditemukan: {file_path}. Pembersihan dengan kamus tersebut dilewati.")
        return {}
    except Exception as e:
        print(f"PERINGATAN: Gagal memuat kamus {file_path}: {e}")
        return {}

# Pemuatan Kamus Global
KAMUS_SINGKATAN       = load_dictionary('kamus_singkatan.csv',    'Singkatan',      'Bentuk Lengkap')
KAMUS_MEDIS_SINONIM   = load_dictionary('kamus_medis.csv',        'istilah_lama',   'istilah_standar')
KAMUS_KEYWORD_TRIAGE  = load_dictionary('kamus_keyword_poli.csv', 'Keyword Keluhan','Poli Tujuan')

Kamus berhasil dimuat: /content/drive/MyDrive/capstone_project/kamus_singkatan.csv
Kamus berhasil dimuat: /content/drive/MyDrive/capstone_project/kamus_medis.csv
Kamus berhasil dimuat: /content/drive/MyDrive/capstone_project/kamus_keyword_poli.csv


In [None]:
# =====================================================
# 2. LOAD DATASET
# =====================================================
print("Loading data...")

data_file_path  = os.path.join(DATA_DIR, 'data_diagnosa_filtered.xlsx')
kamus_icd_path  = os.path.join(DATA_DIR, 'icd10_poli_clean.csv')

df = pd.read_excel(data_file_path, dtype=str, engine='openpyxl')
kamus = pd.read_csv(kamus_icd_path, dtype=str)

df.columns = [c.strip() for c in df.columns]

Loading data...


In [None]:
# =====================================================
# 3. FILTER DATASET (KONTROL/FOLLOW UP)
# =====================================================
print("Filter data: Menghapus kasus 'kontrol', 'follow up', 'MCU'...")

initial_row_count = len(df)
if 'keluhan' in df.columns:
    pattern = (
        r'\bkontrol\b'
        r'|\bcontrole?\b'                 # controle / control
        r'|\bfollow\s*up\b'               # follow up / followup
        r'|\bmedical\s*check\s*up\b'      # medical check up
        r'|\bmcu\b'                       # MCU = medical check up
        r'|\bfup\b'
        r'|\bulang\b'
        r'|\bkembali\b'
        r'|\bcek\s*up\b'
    )

    mask_kontrol = df['keluhan'].astype(str).str.lower().str.contains(
        pattern,
        na=False,
        regex=True
    )
    df = df[~mask_kontrol].copy()
    print(f"Jumlah baris yang dihapus: {mask_kontrol.sum()}")
else:
    print("Kolom 'keluhan' tidak ditemukan, filter kontrol dilewati.")

Filter data: Menghapus kasus 'kontrol', 'follow up', 'MCU'...
Jumlah baris yang dihapus: 161595


In [None]:
# =====================================================
# 4. EKSPANSI ISTILAH MEDIS
# =====================================================
medical_expansions = {
    'jantung': 'jantung cardiac cardio heart arrhythmia coronary angina pectoris cvd cardiovascular',
    'hipertensi': 'hipertensi hypertension htn tekanan tinggi darah tinggi blood pressure',
    'nyeri dada': 'nyeri dada chest pain angina pectoris cardiac ischemia',
    'sesak': 'sesak dyspnea dispnea shortness breath napas pendek respiratory breathing difficulty',
    'batuk': 'batuk cough tussis respiratory paru lung bronchitis pneumonia',
    'asma': 'asma asthma wheezing mengi bronchial bronchospasm',
    'paru': 'paru lung pulmonary respiratory chest thorax',
    'mual': 'mual nausea muntah vomit emesis gastric gastrointestinal',
    'diare': 'diare diarrhea gastroenteritis gastric loose stool',
    'perut': 'perut abdomen stomach gastric abdominal belly',
    'maag': 'maag gastritis peptic ulcer gastric dyspepsia',
    'lambung': 'lambung stomach gastric gastritis ulcer',
    'pusing': 'pusing dizziness vertigo neurologic headache cephalgia',
    'stroke': 'stroke cva cerebrovascular neurologic paralysis hemiparesis',
    'kejang': 'kejang seizure convulsion epilepsy neurologic',
    'saraf': 'saraf nerve neurologic neurology neuropathy',
    'diabetes': 'diabetes mellitus dm endocrine gula darah hyperglycemia',
    'gula': 'gula sugar diabetes glucose hyperglycemia',
    'anak': 'anak child pediatric pediatri bayi infant neonatus',
    'bayi': 'bayi infant baby neonatus newborn pediatric',
    'demam': 'demam fever febris panas pyrexia temperature elevated',
    'nyeri': 'nyeri pain sakit ache painful',
    'lemas': 'lemas weak weakness fatigue lethargy tired',
    'mata': 'mata eye ocular ophthalmology vision penglihatan',
    'gigi': 'gigi teeth dental tooth oral mulut',
    'kulit': 'kulit skin dermatology dermato rash ruam',
    'tulang': 'tulang bone orthopedi fracture patah sendi joint',
    'bedah': 'bedsah surgery surgical operasi operation',
    'gawat': 'gawat emergency urgent critical igd er acute severe',
    'darurat': 'darurat emergency critical urgent acute life-threatening'
}

In [None]:
# =====================================================
# 5. FUNGSI CLEANING TEKS (HYBRID: PHRASE + TOKEN)
# =====================================================

# Pisahkan kamus medis jadi phrase (ada spasi) dan token (satu kata)
KAMUS_MEDIS_PHRASE = {k: v for k, v in KAMUS_MEDIS_SINONIM.items() if " " in k}
KAMUS_MEDIS_TOKEN  = {k: v for k, v in KAMUS_MEDIS_SINONIM.items() if " " not in k}

def extract_core_keluhan(text):
    """
    Ambil inti keluhan dari format seperti:
    'KU; SEDANG. CM,KEL;-'
    'ku sedang, kes cm,kel : nyeri kaki kiri'
    Kalau ada 'kel' / 'keluhan', ambil teks setelahnya.
    Kalau kosong atau hanya '-' => return '' (bisa di-drop nanti).
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""

    s = str(text).lower()
    s = s.replace('\n', ' ')

    # buang spasi berlebih
    s = re.sub(r'\s+', ' ', s).strip()

    # cari pola 'kel' atau 'keluhan' lalu ambil teks setelahnya
    m = re.search(r'\bkel(?:uhan)?\b[^a-z0-9]*[:;\-]*\s*(.*)', s)
    if m:
        core = m.group(1).strip()
    else:
        core = s

    # kalau core cuma '-' atau kosong, anggap tidak ada keluhan
    core = core.strip()
    core = re.sub(r'^[\-\.,:]+$', '', core).strip()

    return core

def clean_text_with_dictionaries(text):
    """
    Pembersihan teks:
    1. Lowercase.
    2. Replace FRASA medis (yang mengandung spasi) pakai .replace() â€“ jumlahnya biasanya tidak banyak.
    3. Mapping per-kata untuk singkatan & istilah medis satu kata.
    Pendekatan ini mendekati versi lama (akurasinya bagus), tapi tetap lebih aman memori.
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    text = remove_corrupt_patterns(text).lower()

    # Hilangkan tag rumah sakit
    text = re.sub(r'\brspb\b', '', text)
    text = re.sub(r'\bkeluhan\b', '', text)

    # Pisahkan koma/titik koma jadi spasi
    text = re.sub(r'[,:;]', ' ', text)

    # Rapikan spasi
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. Ganti frasa medis (multi-kata)
    for old_phrase, new_phrase in KAMUS_MEDIS_PHRASE.items():
        if old_phrase in text:
            text = text.replace(old_phrase, new_phrase)

    # 3. Mapping per-kata: singkatan & istilah medis token
    tokens = text.split()
    # singkatan
    tokens = [KAMUS_SINGKATAN.get(tok, tok) for tok in tokens]
    # istilah medis satu kata
    tokens = [KAMUS_MEDIS_TOKEN.get(tok, tok) for tok in tokens]

    return " ".join(tokens)

def remove_corrupt_patterns(text):
    text = str(text)

    # 1. Hilangkan artefak encoding Word/Excel
    text = re.sub(r'_x000D_', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'x000d', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'\\x00', ' ', text)
    text = re.sub(r'\\u000d', ' ', text)

    # 2. Pertahankan pola medis penting
    important_patterns = re.findall(r'\d{1,4}/\d{1,4}', text)

    # 5. Hapus sisa koma/strip ganjil
    text = re.sub(r'[;,]', ' ', text)

    # 6. Rapikan spasi
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def enhance_medical_text(text):
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    text = str(text).lower()

    # pakai cleaning baru
    text = clean_text_with_dictionaries(text)

    # ekspansi istilah medis
    for term, expansion in medical_expansions.items():
        if term in text:
            text += " " + expansion

    stop_small = {'dan','atau','yang','ini','itu','ke','di','dgn','dgnnya'}
    tokens = [w for w in text.split() if len(w) >= 2 and w not in stop_small]
    return ' '.join(tokens)

def find_poli_from_keyword(keluhan):
    if keluhan is None or (isinstance(keluhan, float) and np.isnan(keluhan)) or not KAMUS_KEYWORD_TRIAGE:
        return ''
    keluhan = str(keluhan).lower()
    for keyword, poli in KAMUS_KEYWORD_TRIAGE.items():
        if keyword in keluhan:
            return poli.strip()
    return ''

preview = df[['keluhan']].head(10).copy()
preview['keluhan_cleaned']  = preview['keluhan'].apply(clean_text_with_dictionaries)
preview['keluhan_enhanced'] = preview['keluhan'].apply(enhance_medical_text)
display(preview)


Unnamed: 0,keluhan,keluhan_cleaned,keluhan_enhanced
0,"nyeri lutut kiri ,klien mengatakan mengeluh ra...",nyeri lutut kiri klien mengatakan mengeluh ras...,nyeri lutut kiri klien mengatakan mengeluh ras...
1,"FARING HIPEREMIS, RH +/+,usia 6 thn 1 bln _x00...",faring hiperemis rhesus positif dan sudah terv...,faring hiperemis rhesus positif sudah tervalid...
2,"obat rutin habis,OS TIDAK IKUT_x000D_\nobat ru...",obat rutin habis mata kiri tidak ikut obat rut...,obat rutin habis mata kiri tidak ikut obat rut...
3,"kel : terasa panas didada kiri, led 66, sputum...",keluhan terasa panas didada kiri laju endap da...,keluhan terasa panas didada kiri laju endap da...
4,"RKM, OS PTERIGIUM , OD POST CLG (2019 _x000D_\...",rekam medis mata kiri daging tumbuh pada mata ...,rekam medis mata kiri daging tumbuh pada mata ...
6,"KU SEDANG CM,KEL -",keadaan umum sedang hanya keluhan -,keadaan umum sedang hanya keluhan
15,"KU; SEDANG. CM,KEL;-",keadaan umum sedang. hanya keluhan -,keadaan umum sedang. hanya keluhan
16,"OD MERAH DAN GATAL 18/16, ods konjungtivitis_x...",mata kanan merah dan gatal 18/16 kedua mata ko...,mata kanan merah gatal 18/16 kedua mata konjun...
17,"ku sedang, kes cm,kel : nyeri kaki kiri",keadaan umum sedang kes hanya keluhan nyeri ka...,keadaan umum sedang kes hanya keluhan nyeri ka...
19,"t/f : hiperemis,",terlihat hiperemis,terlihat hiperemis


In [None]:
# =====================================================
# 6. NUMERIC HANDLING & FEATURE ENGINEERING
# =====================================================
numeric_cols = ['usia', 'pernafasan_x_per_menit', 'suhu_tubuh']

preview_before_num = df[numeric_cols].head(10).copy()
preview_before_num.columns = [c + '_before' for c in preview_before_num.columns]

for col in ['usia', 'pernafasan_x_per_menit', 'suhu_tubuh']:
    if col not in df.columns:
        df[col] = np.nan
    df[col] = pd.to_numeric(df[col].astype(str).str.replace(r'[^\d.-]', '', regex=True),
                            errors='coerce')
    median_val = df[col].median(skipna=True)
    df[col].fillna(median_val if not np.isnan(median_val) else 0, inplace=True)

df['age_category'] = pd.cut(df['usia'].astype(float),
                            bins=[-1,2,12,18,45,65,200],
                            labels=['infant','child','adolescent','adult','senior','geriatric']).astype(str)
df['fever_level'] = pd.cut(df['suhu_tubuh'].astype(float),
                           bins=[-1,36.5,37.5,38.5,60],
                           labels=['hypothermia','normal','lowfever','highfever']).astype(str)
df['resp_level'] = pd.cut(df['pernafasan_x_per_menit'].astype(float),
                          bins=[-1,12,20,30,200],
                          labels=['bradypnea','normal','tachypnea','severe']).astype(str)

df['critical_fever']  = (df['suhu_tubuh'].astype(float) > 38.5).astype(int)
df['critical_resp']   = (df['pernafasan_x_per_menit'].astype(float) > 30).astype(int)
df['pediatric_case']  = (df['usia'].astype(float) < 18).astype(int)
df['geriatric_case']  = (df['usia'].astype(float) > 65).astype(int)
df['adult_case']      = ((df['usia'].astype(float) >= 18) & (df['usia'].astype(float) <= 65)).astype(int)
df['vital_severity']  = (df['critical_fever']*2 + df['critical_resp']*2 +
                         (df['suhu_tubuh'].astype(float)>37.5).astype(int) +
                         (df['pernafasan_x_per_menit'].astype(float)>20).astype(int)).astype(int)
df['age_risk']        = (df['pediatric_case']*2 + df['geriatric_case']*3).astype(int)

if 'riwayat_penyakit' not in df.columns:
    df['riwayat_penyakit'] = ''
else:
    df['riwayat_penyakit'] = df['riwayat_penyakit'].fillna('')

preview_after_num = df[
    ['usia', 'pernafasan_x_per_menit', 'suhu_tubuh',
     'age_category','fever_level','resp_level',
     'critical_fever','critical_resp',
     'pediatric_case','geriatric_case','adult_case',
     'vital_severity','age_risk']
].head(10).copy()

# gabungkan before & after side-by-side
preview_num = pd.concat(
    [preview_before_num.reset_index(drop=True),
     preview_after_num.reset_index(drop=True)],
    axis=1
)

display(preview_num)


Unnamed: 0,usia_before,pernafasan_x_per_menit_before,suhu_tubuh_before,usia,pernafasan_x_per_menit,suhu_tubuh,age_category,fever_level,resp_level,critical_fever,critical_resp,pediatric_case,geriatric_case,adult_case,vital_severity,age_risk
0,62,20.0,36.0,62,20.0,36.0,senior,hypothermia,normal,0,0,0,0,1,0,0
1,7,22.0,370.0,7,22.0,370.0,child,,tachypnea,1,0,1,0,0,4,2
2,66,,,66,20.0,36.5,geriatric,hypothermia,normal,0,0,0,1,0,0,3
3,60,20.0,365.0,60,20.0,365.0,senior,,normal,1,0,0,0,1,3,0
4,54,,,54,20.0,36.5,senior,hypothermia,normal,0,0,0,0,1,0,0
5,66,18.0,36.0,66,18.0,36.0,geriatric,hypothermia,normal,0,0,0,1,0,0,3
6,51,20.0,36.0,51,20.0,36.0,senior,hypothermia,normal,0,0,0,0,1,0,0
7,10,,,10,20.0,36.5,child,hypothermia,normal,0,0,1,0,0,0,2
8,57,18.0,36.0,57,18.0,36.0,senior,hypothermia,normal,0,0,0,0,1,0,0
9,49,,,49,20.0,36.5,senior,hypothermia,normal,0,0,0,0,1,0,0


In [None]:
def clean_poli_label(poli):
    """
    Bersihkan nama poli:
    - hilangkan 'RSPB'
    - hilangkan kode (A)/(C)/dst di akhir
    - rapikan spasi
    """
    if pd.isna(poli):
        return np.nan
    text = str(poli)

    # hilangkan nama RS
    text = re.sub(r'\bRSPB\b', '', text, flags=re.IGNORECASE)

    # hilangkan kode kategori (A), (B), (C), dst di akhir string
    text = re.sub(r'\s*\([A-Z]\)\s*$', '', text)

    # rapikan spasi
    text = re.sub(r'\s+', ' ', text).strip()

    # Tambahkan logika khusus untuk KLINIK GIGI
    if 'KLINIK GIGI' in text.upper():
        return 'KLINIK GIGI & MULUT'
    if 'INTERNIS' in text.upper():
        return 'KLINIK PENYAKIT DALAM'
    if 'NEUROLOGI' in text.upper():
        return 'KLINIK NEUROLOGI'
    if 'THT' in text.upper():
        return 'KLINIK THT'

    # biar konsisten semua kapital (atau .title() kalau mau)
    return text.upper()

# =====================================================
# 7. MAPPING POLI
# =====================================================
preview_poli_before = df[['list_icd_10']].head(10).copy()
preview_poli_before.columns = ['list_icd_10_before']

map_poli = dict(zip(kamus['list_icd'], kamus['poli_name']))
df['poli_raw'] = df['poli']
df['poli'] = df['poli'].apply(clean_poli_label)
df = df.dropna(subset=['poli']).copy()

# =====================================================
# FILTER FOKUS POLI TERBANYAK
# =====================================================

# Hitung jumlah data per poli
poli_counts = df['poli'].value_counts()

# TENTUKAN mau fokus berapa poli
TOP_N_POLI = 10   # boleh kamu ubah: 5, 8, 12, dst.

print("=== DISTRIBUSI POLI (TOP 20) ===")
print(poli_counts.head(20))
print("\nTotal poli unik sebelum filter:", df['poli'].nunique())

# Ambil daftar poli terbanyak
top_poli = poli_counts.head(TOP_N_POLI).index.tolist()

print(f"\n=== POLI YANG DIPAKAI UNTUK TRAINING (TOP {TOP_N_POLI}) ===")
for i, p in enumerate(top_poli, 1):
    print(f"{i}. {p}")

# Filter dataset hanya pada poli-poli tersebut
df = df[df['poli'].isin(top_poli)].copy()

# Hitung ulang distribusi setelah filter
poli_counts_after = df['poli'].value_counts()

print("\nTotal data setelah filter poli terbanyak:", len(df))
print("Jumlah poli unik setelah filter:", df['poli'].nunique())
print("\n=== DISTRIBUSI POLI SETELAH FILTER (TOP N SAJA) ===")
display(poli_counts_after.to_frame('jumlah'))

df_focused = df.copy()
unique_poli_count = len(df_focused['poli'].unique())
print(f"Fokus pada semua {unique_poli_count} poli yang tersedia di data.")

MIN_SAMPLES = 5

=== DISTRIBUSI POLI (TOP 20) ===
poli
KLINIK ANAK            60594
KLINIK JANTUNG         19278
INTERNIS               17967
KLINIK GIGI & MULUT    16507
KLINIK ANASTESI        12282
NEUROLOGI               8585
THT                     6292
KLINIK MATA             5326
Name: count, dtype: int64

Total poli unik sebelum filter: 8

=== POLI YANG DIPAKAI UNTUK TRAINING (TOP 10) ===
1. KLINIK ANAK
2. KLINIK JANTUNG
3. INTERNIS
4. KLINIK GIGI & MULUT
5. KLINIK ANASTESI
6. NEUROLOGI
7. THT
8. KLINIK MATA

Total data setelah filter poli terbanyak: 146831
Jumlah poli unik setelah filter: 8

=== DISTRIBUSI POLI SETELAH FILTER (TOP N SAJA) ===


Unnamed: 0_level_0,jumlah
poli,Unnamed: 1_level_1
KLINIK ANAK,60594
KLINIK JANTUNG,19278
INTERNIS,17967
KLINIK GIGI & MULUT,16507
KLINIK ANASTESI,12282
NEUROLOGI,8585
THT,6292
KLINIK MATA,5326


Fokus pada semua 8 poli yang tersedia di data.


In [None]:
# =====================================================
# 8. EKSTRAK KELUHAN INTI & DROP YANG KOSONG
# =====================================================

# buat kolom keluhan_core sebagai basis teks
df['keluhan_core'] = df.get('keluhan', '').apply(extract_core_keluhan)

# drop baris yang keluhan_core kosong / sangat abstrak
mask_empty_core = df['keluhan_core'].astype(str).str.strip() == ''
print("Baris yang di-drop karena keluhan sangat abstrak / kosong:", mask_empty_core.sum())
df = df[~mask_empty_core].copy()

# baru setelah itu df_focused
df_focused = df.copy()

# pakai keluhan_core sebagai sumber utama teks keluhan
df_focused['keluhan_enhanced']  = df_focused.get('keluhan_core', '').apply(enhance_medical_text)
df_focused['diagnosa_enhanced'] = df_focused.get('diagnosa', '').apply(enhance_medical_text)
df_focused['riwayat_enhanced']  = df_focused.get('riwayat_penyakit', '').apply(enhance_medical_text)
df_focused['poli_triage']       = df_focused.get('keluhan_core', '').apply(find_poli_from_keyword)

def create_weighted_text(row):
    diagnosa_text = str(row.get('diagnosa_enhanced', ''))
    keluhan_text  = str(row.get('keluhan_enhanced', ''))
    riwayat_text  = str(row.get('riwayat_enhanced', ''))

    text = (diagnosa_text + " ") * 2
    text += (riwayat_text + " ") * 3
    text += (keluhan_text + " ")

    poli_triage = str(row.get('poli_triage', ''))
    if poli_triage:
        text += " " + poli_triage.replace(' ', '_').upper() * 5

    demo_text = f"{row.get('age_category','')} {row.get('jenis_kelamin','')} {row.get('fever_level','')} {row.get('resp_level','')}"
    text += " " + (demo_text + " ") * 3
    if row.get('vital_severity', 0) >= 4:
        text += " CRITICAL_CASE HIGH_ACUITY EMERGENCY URGENT_CARE SEVERE_CONDITION " * 2
    if row.get('pediatric_case', 0) == 1:
        text += " PEDIATRIC_SPECIALTY CHILD_MEDICINE ANAK_CARE PEDIATRI " * 2
    if row.get('geriatric_case', 0) == 1:
        text += " GERIATRIC_SPECIALTY ELDERLY_CARE LANSIA_CARE " * 2
    return text.replace('nan','').strip()

df_focused['weighted_text'] = df_focused.apply(create_weighted_text, axis=1)

preview_text = df_focused[[
    'keluhan',        # keluhan original dari RS
    'keluhan_core',   # hasil extract_core_keluhan
    'keluhan_enhanced',
    'diagnosa',
    'diagnosa_enhanced',
    'riwayat_penyakit',
    'riwayat_enhanced'
]].head(10).copy()

display(preview_text)


Baris yang di-drop karena keluhan sangat abstrak / kosong: 0


Unnamed: 0,keluhan,keluhan_core,keluhan_enhanced,diagnosa,diagnosa_enhanced,riwayat_penyakit,riwayat_enhanced
0,"nyeri lutut kiri ,klien mengatakan mengeluh ra...","nyeri lutut kiri ,klien mengatakan mengeluh ra...",nyeri lutut kiri klien mengatakan mengeluh ras...,OA knee sinistra,osteoarthritis knee sinistra,,
1,"FARING HIPEREMIS, RH +/+,usia 6 thn 1 bln _x00...","batuk(+) <> seminggu, demam(-), pilek(-), munt...",batuk(+) <> seminggu demam(-) pilek(-) muntah ...,BRONKOPNEUMONIA,bronkopneumonia,"- ACUTE UPPER RESPIRATORY INFECTION, UNSPECIFI...",acute upper respiratory infection unspecified ...
4,"RKM, OS PTERIGIUM , OD POST CLG (2019 _x000D_\...","rkm, os pterigium , od post clg (2019 _x000d_ ...",rekam medis mata kiri daging tumbuh pada mata ...,Tidak Dilakukan Pemeriksaan,tidak dilakukan pemeriksaan,,
16,"OD MERAH DAN GATAL 18/16, ods konjungtivitis_x...","od merah dan gatal 18/16, ods konjungtivitis_x...",mata kanan merah gatal 18/16 kedua mata konjun...,Tidak Dilakukan Pemeriksaan,tidak dilakukan pemeriksaan,,
17,"ku sedang, kes cm,kel : nyeri kaki kiri",nyeri kaki kiri,nyeri kaki kiri nyeri pain sakit ache painful,Tidak Dilakukan Pemeriksaan,tidak dilakukan pemeriksaan,,
19,"t/f : hiperemis,","t/f : hiperemis,",terlihat hiperemis,Tidak Dilakukan Pemeriksaan,tidak dilakukan pemeriksaan,,
20,"RH-/-,usia 2 thn 9 bln _x000D_\nkonttol post r...","demam(-), bapil(-), muntah(-), sesak(-), mak/m...",demam(-) bapil(-) muntah(-) sesak(-) mak/min +...,BP PERBAIKAN,bp perbaikan,,
22,"saat malam sesak ada mengi sedikit, tidur ngor...","saat malam sesak ada mengi sedikit, tidur ngor...",saat malam sesak ada mengi sedikit tidur ngoro...,asma,asma asma asthma wheezing mengi bronchial bron...,,
23,"os jalan sudah lebih kuat, nyeri lutut negatif...","os jalan sudah lebih kuat, nyeri lutut negatif...",mata kiri jalan sudah lebih kuat nyeri lutut n...,LBP + Iskialgia Sinistra _x000D_\nOA Left Knee...,low back pain iskialgia sinistra osteoarthriti...,- NON - INSULIN - DEPENDENT DIABETES MELLITUS ...,non insulin dependent diabetes mellitus type i...
27,"gigi berlubang,","gigi berlubang,",gigi berlubang gigi teeth dental tooth oral mulut,16 k02.1,16 k02.1,,


In [None]:
# =====================================================
# 9. ENCODING & TRAIN-TEST SPLIT
# =====================================================
le = LabelEncoder()
y = le.fit_transform(df_focused['poli'])

X_train, X_test, y_train, y_test = train_test_split(
    df_focused, y, test_size=0.2, random_state=42, stratify=y
)

print("Jumlah data latih:", len(X_train))
print("Jumlah data uji:", len(X_test))

Jumlah data latih: 117464
Jumlah data uji: 29367


In [None]:
# =====================================================
# 10. TF-IDF & FEATURE SELECTION
# =====================================================
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,5),
    min_df=2,
    max_df=0.92,
    sublinear_tf=True,
    token_pattern=r'[a-zA-Z_]{2,}'
)
X_train_tfidf = tfidf.fit_transform(X_train['weighted_text'])
X_test_tfidf  = tfidf.transform(X_test['weighted_text'])

selector = SelectKBest(chi2, k=min(12000, X_train_tfidf.shape[1]))
X_train_tfidf_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_tfidf_selected  = selector.transform(X_test_tfidf)

print("\n=== INFO TF-IDF ===")
print("Jumlah fitur TF-IDF sebelum seleksi :", X_train_tfidf.shape[1])
print("Jumlah fitur TF-IDF setelah seleksi  :", X_train_tfidf_selected.shape[1])
print("Jumlah dokumen train                :", X_train_tfidf.shape[0])
print("Jumlah dokumen test                 :", X_test_tfidf.shape[0])

print("\n=== INFO TF-IDF ===")
print("Jumlah fitur TF-IDF sebelum seleksi :", X_train_tfidf.shape[1])
print("Jumlah fitur TF-IDF setelah seleksi  :", X_train_tfidf_selected.shape[1])
print("Jumlah dokumen train                :", X_train_tfidf.shape[0])
print("Jumlah dokumen test                 :", X_test_tfidf.shape[0])

feature_names = np.array(tfidf.get_feature_names_out())

selected_mask = selector.get_support()
selected_features = feature_names[selected_mask]

print("\n=== 15 TOKEN TERATAS SETELAH FEATURE SELECTION ===")
print(selected_features[:15])

print("\n=== SAMPLE TF-IDF (dokumen ke-0, 20 nilai pertama) ===")
sample_vector = X_train_tfidf[0].toarray()[0][:20]
print(sample_vector)

print("\n=== PREVIEW TEXT VS TF-IDF TOKENS ===")
preview_tfidf = pd.DataFrame({
    "weighted_text": X_train['weighted_text'].head(3).values,
    "jumlah_token_setelah_filter": [len(t.split()) for t in X_train['weighted_text'].head(3)]
})
display(preview_tfidf)

numerical_features = [
    'usia','pernafasan_x_per_menit','suhu_tubuh',
    'critical_fever','critical_resp','pediatric_case',
    'geriatric_case','vital_severity','age_risk','adult_case'
]

print("\n=== INFO FITUR NUMERIK ===")
print("Daftar fitur numerik:", numerical_features)
print("Jumlah fitur numerik:", len(numerical_features))


for col in numerical_features:
    if col not in X_train.columns: X_train[col] = 0
    if col not in X_test.columns:  X_test[col] = 0



=== INFO TF-IDF ===
Jumlah fitur TF-IDF sebelum seleksi : 15000
Jumlah fitur TF-IDF setelah seleksi  : 12000
Jumlah dokumen train                : 117464
Jumlah dokumen test                 : 29367

=== INFO TF-IDF ===
Jumlah fitur TF-IDF sebelum seleksi : 15000
Jumlah fitur TF-IDF setelah seleksi  : 12000
Jumlah dokumen train                : 117464
Jumlah dokumen test                 : 29367

=== 15 TOKEN TERATAS SETELAH FEATURE SELECTION ===
['abadominal' 'abadominal pain' 'abadominal pain diarrhoea'
 'abadominal pain diarrhoea other'
 'abadominal pain diarrhoea other unspecified'
 'abadominal pain diarrhoea tidak' 'abadominal pain diarrhoea tidak ada'
 'abadominal pain dispepsia' 'abadominal pain dispepsia other'
 'abadominal pain dispepsia other unspecified' 'abadominal pain fever'
 'abadominal pain fever other' 'abadominal pain fever other unspecified'
 'abadominal pain gastritis' 'abadominal pain gastritis other']

=== SAMPLE TF-IDF (dokumen ke-0, 20 nilai pertama) ===
[0. 0. 0

Unnamed: 0,weighted_text,jumlah_token_setelah_filter
0,pneumonia asma bronchiale suspect suspect tube...,140
1,fever unspecified fever unspecified fever unsp...,61
2,other unspecified gastroenteritis colitis of i...,71



=== INFO FITUR NUMERIK ===
Daftar fitur numerik: ['usia', 'pernafasan_x_per_menit', 'suhu_tubuh', 'critical_fever', 'critical_resp', 'pediatric_case', 'geriatric_case', 'vital_severity', 'age_risk', 'adult_case']
Jumlah fitur numerik: 10


In [None]:
# =====================================================
# 11. LIGHTGBM EMBEDDING
# =====================================================
lgb_params = {
    'objective': 'multiclass',
    'num_class': len(le.classes_),
    'learning_rate': 0.08,
    'num_leaves': 127,
    'max_depth': 10,
    'min_child_samples': 10,
    'feature_fraction': 0.95,
    'bagging_fraction': 0.95,
    'bagging_freq': 3,
    'reg_alpha': 0.03,
    'reg_lambda': 0.03,
    'verbose': -1,
    'seed': 42,
    'force_col_wise': True
}
lgb_train = lgb.Dataset(X_train[numerical_features].astype(float), label=y_train)
lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=400)

emb_train = lgb_model.predict(X_train[numerical_features].astype(float))
emb_test  = lgb_model.predict(X_test[numerical_features].astype(float))

scaler = StandardScaler()
emb_train_scaled = scaler.fit_transform(emb_train)
emb_test_scaled  = scaler.transform(emb_test)

X_train_final = hstack([csr_matrix(emb_train_scaled * 2.0), X_train_tfidf_selected * 1.2])
X_test_final  = hstack([csr_matrix(emb_test_scaled * 2.0),  X_test_tfidf_selected * 1.2])

print("\n=== INFO EMBEDDING LIGHTGBM ===")
print("emb_train shape :", emb_train.shape)   # (n_train, num_class)
print("emb_test shape  :", emb_test.shape)

# Buat DataFrame kecil untuk lihat 5 baris pertama embedding + label poli
emb_preview_df = pd.DataFrame(
    emb_train[:5],
    columns=[f"emb_class_{cls}" for cls in le.classes_]  # satu kolom per poli
)
emb_preview_df.insert(0, "poli_asli", le.inverse_transform(y_train[:5]))

print("\n=== SAMPLE 5 BARIS EMBEDDING (TRAIN) ===")
display(emb_preview_df)


=== INFO EMBEDDING LIGHTGBM ===
emb_train shape : (117464, 8)
emb_test shape  : (29367, 8)

=== SAMPLE 5 BARIS EMBEDDING (TRAIN) ===


Unnamed: 0,poli_asli,emb_class_INTERNIS,emb_class_KLINIK ANAK,emb_class_KLINIK ANASTESI,emb_class_KLINIK GIGI & MULUT,emb_class_KLINIK JANTUNG,emb_class_KLINIK MATA,emb_class_NEUROLOGI,emb_class_THT
0,KLINIK ANAK,0.002509,0.912462,0.044212,1.914089e-06,1.4e-05,3e-06,0.001445,0.039353
1,KLINIK ANAK,0.107688,0.884221,0.001061,9.251713e-06,0.001337,9e-06,0.005626,4.9e-05
2,KLINIK ANAK,0.037586,0.836518,0.123228,8.572506e-07,8e-06,0.002335,0.000147,0.000177
3,KLINIK ANASTESI,0.280832,0.516564,0.171149,6.18016e-05,0.002551,0.011012,0.006143,0.011687
4,KLINIK MATA,0.002312,0.891059,0.043827,8.446821e-06,0.000258,0.05937,0.00085,0.002314


In [None]:
# =====================================================
# 12. TRAINING SVM & LOGISTIC REGRESSION (SGD)
# =====================================================

print("\nTraining SVM (Linear)...")
svm = SGDClassifier(
    loss='hinge',              # SVM linear
    alpha=3e-5,
    max_iter=2500,
    tol=5e-5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
svm.fit(X_train_final, y_train)
y_pred_svm = svm.predict(X_test_final)
acc_svm = accuracy_score(y_test, y_pred_svm)

print("\nTraining Logistic Regression (Linear, SGD)...")
logreg = SGDClassifier(
    loss='log_loss',           # logistic regression
    alpha=3e-5,
    max_iter=2500,
    tol=5e-5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
logreg.fit(X_train_final, y_train)
y_pred_logreg = logreg.predict(X_test_final)
acc_logreg = accuracy_score(y_test, y_pred_logreg)


Training SVM (Linear)...

Training Logistic Regression (Linear, SGD)...


In [None]:
# =====================================================
# 13. PILIH MODEL TERBAIK
# =====================================================
if acc_logreg > acc_svm:
    best_model = logreg
    best_acc   = acc_logreg
    model_name = 'Logistic Regression (SGD)'
else:
    best_model = svm
    best_acc   = acc_svm
    model_name = 'SVM (SGD)'

print("\n" + "="*80)
print(f"SVM (Fusion) Accuracy                : {acc_svm*100:.2f}%")
print(f"Logistic Regression (Fusion) Accuracy: {acc_logreg*100:.2f}%")
print(f"FINAL MODEL TERPILIH: {model_name} dengan AKURASI: {best_acc*100:.2f}%")
print("="*80)

print("Classification Report (SVM):")
print(classification_report(y_test, y_pred_svm, target_names=le.classes_, zero_division=0))

print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_logreg, target_names=le.classes_, zero_division=0))


SVM (Fusion) Accuracy                : 86.38%
Logistic Regression (Fusion) Accuracy: 85.56%
FINAL MODEL TERPILIH: SVM (SGD) dengan AKURASI: 86.38%
Classification Report (SVM):
                     precision    recall  f1-score   support

           INTERNIS       0.73      0.68      0.70      3594
        KLINIK ANAK       0.93      0.92      0.92     12119
    KLINIK ANASTESI       0.75      0.70      0.72      2456
KLINIK GIGI & MULUT       0.96      0.96      0.96      3302
     KLINIK JANTUNG       0.92      0.93      0.93      3856
        KLINIK MATA       0.70      0.86      0.77      1065
          NEUROLOGI       0.67      0.78      0.72      1717
                THT       0.85      0.84      0.85      1258

           accuracy                           0.86     29367
          macro avg       0.82      0.83      0.82     29367
       weighted avg       0.87      0.86      0.86     29367


Classification Report (Logistic Regression):
                     precision    recall  

In [None]:
# =====================================================
# 14. MODE INTERAKTIF PREDIKSI
# =====================================================

# --- daftar keluhan umum non-spesifik (untuk Klinik Umum) ---
KEYWORD_KLINIK_UMUM = [
    'demam', 'meriang', 'pusing', 'sakit kepala',
    'batuk pilek', 'pilek', 'flu',
    'panas', 'tidak enak badan', 'ga enak badan', 'nggak enak badan'
]

# =====================================================
# RULE: KAPAN KITA PAKSA KE KLINIK UMUM (SETELAH LIHAT CONFIDENCE)
# =====================================================
def RULE_KLINIK_UMUM(keluhan_raw, poli_triage, max_confidence, threshold):
    """
    Mengembalikan True kalau pasien sebaiknya diarahkan ke Klinik Umum.
    Dipakai di mode interaktif setelah model memberi skor.
    """
    text = str(keluhan_raw).lower().strip()
    poli_triage = str(poli_triage).upper().strip()

    # 1) Kalau kamus triase sudah bilang "KLINIK UMUM" â†’ langsung ke Klinik Umum
    if poli_triage == 'KLINIK UMUM':
        return True

    # 2) Kalau keluhan sangat umum + confidence model rendah â†’ ke Klinik Umum
    if any(kw in text for kw in KEYWORD_KLINIK_UMUM) and max_confidence < threshold:
        return True

    return False


print("\n" + "MODE INTERAKTIF SIAP!".center(80))
print("Ketik data pasien baru (atau ketik 'exit' untuk keluar)\n")

def predict_poli():
    while True:
        print("-" * 60)
        try:
            final_model = best_model
            current_acc = best_acc

            usia = input("Usia (tahun)                  : ").strip()
            if usia.lower() == 'exit':
                break
            usia = float(usia)

            jk = input("Jenis Kelamin (L/P)            : ").strip().upper()
            if jk not in ['L', 'P']:
                raise ValueError("Harus L atau P")

            riwayat = input("Riwayat Penyakit (kosongkan jika tidak ada) : ").strip()

            keluhan = input("Keluhan Utama                  : ").strip()
            if not keluhan:
                raise ValueError("Keluhan wajib diisi")

            # ----------------- DEFAULT VITAL SIGN (BISA DIKEMBANGKAN) -----------------
            nafas = 20.0
            suhu  = 36.5

            input_df = pd.DataFrame([{
                'usia': usia, 'jenis_kelamin': jk, 'riwayat_penyakit': riwayat,
                'keluhan': keluhan, 'diagnosa': '',
                'pernafasan_x_per_menit': nafas, 'suhu_tubuh': suhu
            }])

            # ----------------- FEATURE NUMERIK & KATEGORIK -----------------
            input_df['age_category'] = pd.cut(
                input_df['usia'],
                bins=[-1,2,12,18,45,65,200],
                labels=['infant','child','adolescent','adult','senior','geriatric']
            ).astype(str)

            input_df['fever_level']  = pd.cut(
                input_df['suhu_tubuh'],
                bins=[-1,36.5,37.5,38.5,60],
                labels=['hypothermia','normal','lowfever','highfever']
            ).astype(str)

            input_df['resp_level']   = pd.cut(
                input_df['pernafasan_x_per_menit'],
                bins=[-1,12,20,30,200],
                labels=['bradypnea','normal','tachypnea','severe']
            ).astype(str)

            input_df['critical_fever']   = (input_df['suhu_tubuh'] > 38.5).astype(int)
            input_df['critical_resp']    = (input_df['pernafasan_x_per_menit'] > 30).astype(int)
            input_df['pediatric_case']   = (input_df['usia'] < 18).astype(int)
            input_df['geriatric_case']   = (input_df['usia'] > 65).astype(int)
            input_df['adult_case']       = ((input_df['usia'] >= 18) & (input_df['usia'] <= 65)).astype(int)
            input_df['vital_severity']   = (
                input_df['critical_fever']*2 +
                input_df['critical_resp']*2 +
                (input_df['suhu_tubuh']>37.5).astype(int) +
                (input_df['pernafasan_x_per_menit']>20).astype(int)
            ).astype(int)
            input_df['age_risk']         = (
                input_df['pediatric_case']*2 + input_df['geriatric_case']*3
            ).astype(int)

            # ----------------- ENHANCED TEXT & TRIAGE KEYWORD -----------------
            input_df['keluhan_enhanced']  = input_df['keluhan'].apply(enhance_medical_text)
            input_df['diagnosa_enhanced'] = input_df['diagnosa'].apply(enhance_medical_text)
            input_df['riwayat_enhanced']  = input_df['riwayat_penyakit'].apply(enhance_medical_text)
            input_df['poli_triage']       = input_df.get('keluhan', '').apply(find_poli_from_keyword)
            input_df['weighted_text']     = input_df.apply(create_weighted_text, axis=1)

            poli_triage_input = str(input_df.loc[0, 'poli_triage']).strip()

            # --- RULE 1: JIKA KAMUS POLI TRIAGE PUNYA JAWABAN, PAKAI ITU DULU ---
            if poli_triage_input:
                poli_display = poli_triage_input.upper()

                # mapping khusus Gigi:
                if 'KLINIK GIGI UMUM KAPITASI' in poli_display:
                    poli_display = 'KLINIK GIGI & MULUT'

                print("\n" + "="*60)
                print("REKOMENDASI POLI       : " + poli_display)
                print(f"AKURASI MODEL (test)   : {current_acc*100:.2f}%")
                print("="*60)
                continue

            # ----------------- VEKTOR FITUR UNTUK MODEL -----------------
            text_vec = tfidf.transform(input_df['weighted_text'])
            text_sel = selector.transform(text_vec)

            num_pred   = lgb_model.predict(input_df[numerical_features].astype(float))
            num_scaled = scaler.transform(num_pred)

            final_vec = hstack([csr_matrix(num_scaled * 2.0), text_sel * 1.2])

            # ----------------- HITUNG CONFIDENCE -----------------
            if 'Logistic' in model_name:
                proba_vector = final_model.predict_proba(final_vec)[0]
                max_confidence = np.max(proba_vector)
                CONFIDENCE_THRESHOLD = 0.35
            else:
                scores = final_model.decision_function(final_vec)[0]
                if np.ndim(scores) == 0:
                    proba_vector = np.array([scores])
                else:
                    proba_vector = scores
                max_confidence = np.max(proba_vector)
                CONFIDENCE_THRESHOLD = 1.0

            # ----------------- RULE KLINIK UMUM (SETELAH LIHAT CONF) -----------------
            if RULE_KLINIK_UMUM(
                keluhan_raw=keluhan,
                poli_triage=poli_triage_input,
                max_confidence=max_confidence,
                threshold=CONFIDENCE_THRESHOLD
            ):
                poli_display = 'KLINIK UMUM'
                print(
                    f"Peringatan: Prediksi model ambigu / keluhan umum "
                    f"(Conf: {max_confidence:.2f} < {CONFIDENCE_THRESHOLD:.2f}). "
                    f"Pasien diarahkan ke KLINIK UMUM."
                )

            elif max_confidence < CONFIDENCE_THRESHOLD:
                # fallback umum kalau low confidence tapi tidak kena rule khusus
                poli_display = 'KLINIK UMUM'
                print(
                    f"Peringatan: Prediksi model ambigu (Conf: {max_confidence:.2f} "
                    f"< {CONFIDENCE_THRESHOLD:.2f}). Pasien diarahkan ke KLINIK UMUM."
                )

            else:
                # ----------------- LOGIC PREDIKSI NORMAL -----------------
                pred_class = np.argmax(proba_vector)
                poli_pred = le.inverse_transform([pred_class])[0]
                poli_display = re.sub(r'\s*\([A-Z]\)\s*$', '', poli_pred).strip()

                # mapping nama klinik gigi dari model -> tampilan user
                if 'KLINIK GIGI UMUM KAPITASI' in poli_display.upper():
                    poli_display = 'KLINIK GIGI & MULUT'

                # VALIDASI: Jika umur >= 18 tahun dan prediksi poli anak, cari alternatif dewasa
                if usia >= 18 and 'ANAK' in poli_pred.upper():
                    sorted_indices = np.argsort(proba_vector)[::-1]
                    for idx in sorted_indices[1:]:
                        alternative_poli = le.inverse_transform([idx])[0]
                        if 'ANAK' not in alternative_poli.upper():
                            poli_pred = alternative_poli
                            poli_display = re.sub(r'\s*\([A-Z]\)\s*$', '', poli_pred).strip()
                            break
                    else:
                        for poli_name in le.classes_:
                            if 'INTERNIS' in poli_name.upper() or 'PENYAKIT DALAM' in poli_name.upper():
                                poli_pred = poli_name
                                poli_display = re.sub(r'\s*\([A-Z]\)\s*$', '', poli_name).strip()
                                break

            print("\n" + "="*60)
            print(f"REKOMENDASI POLI       : {poli_display.upper()}")
            print(f"AKURASI MODEL (test)   : {current_acc*100:.2f}%")
            print("="*60)

        except Exception as e:
            print(f"Error: {e}. Coba lagi atau ketik 'exit'")

predict_poli()
print("\nTerima kasih! Model tetap aktif untuk prediksi kapan saja.")



                             MODE INTERAKTIF SIAP!                              
Ketik data pasien baru (atau ketik 'exit' untuk keluar)

------------------------------------------------------------

REKOMENDASI POLI       : KLINIK GIGI & MULUT
AKURASI MODEL (test)   : 86.38%
------------------------------------------------------------
Peringatan: Prediksi model ambigu (Conf: -0.36 < 1.00). Pasien diarahkan ke KLINIK UMUM.

REKOMENDASI POLI       : KLINIK UMUM
AKURASI MODEL (test)   : 86.38%
------------------------------------------------------------
Peringatan: Prediksi model ambigu (Conf: -0.36 < 1.00). Pasien diarahkan ke KLINIK UMUM.

REKOMENDASI POLI       : KLINIK UMUM
AKURASI MODEL (test)   : 86.38%
------------------------------------------------------------
Peringatan: Prediksi model ambigu (Conf: -1.17 < 1.00). Pasien diarahkan ke KLINIK UMUM.

REKOMENDASI POLI       : KLINIK UMUM
AKURASI MODEL (test)   : 86.38%
------------------------------------------------------------
