In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
# ================= CONFIGURATION =================
# Split settings handed to train_test_split by split_and_save_datasets().
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Input files are read from INPUT_FOLDER; the train/test CSVs are written to
# a separate OUTPUT_FOLDER so the originals stay untouched and tidy.
INPUT_FOLDER = "Dataset CSV"
OUTPUT_FOLDER = "Dataset Split"

# Names of the feature CSV files (inside INPUT_FOLDER) that will be split.
FILES_TO_PROCESS = [
    "efficientnet_features(non normalized).csv",
    "efficientnet_features(normalized).csv",
    "resnet50_features(non normalized).csv",
    "resnet50_features(normalized).csv",
]

# ================= SPLITTING PROCESS =================
def split_and_save_datasets():
    """Split every configured feature CSV into train and test CSV files.

    For each name in ``FILES_TO_PROCESS`` the file is read from
    ``INPUT_FOLDER`` and split with ``train_test_split`` using ``TEST_SIZE``
    and ``RANDOM_STATE``. The two parts are written to ``OUTPUT_FOLDER`` as
    ``train_<name>.csv`` and ``test_<name>.csv`` without the index column.

    The split is stratified on the ``label_encoded`` column when the file has
    one; otherwise the last column is used and a warning is printed.

    Input files that do not exist are reported and skipped, and the closing
    message states how many files were actually processed.

    Returns:
        None. Progress is reported on stdout.

    Note:
        Errors raised by ``pd.read_csv`` or ``train_test_split`` (for example
        when the chosen target column cannot be stratified) are not caught
        here and propagate to the caller.
    """
    # Create the output folder if it does not exist yet. exist_ok=True keeps
    # the call from failing if the folder appears between check and creation.
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        print(f"📂 Membuat folder baru: {OUTPUT_FOLDER}")

    # Build the banner from TEST_SIZE so it never disagrees with the split
    # that is actually performed. A float is shown as train/test percentages;
    # any other value (e.g. an absolute row count) is shown verbatim.
    if isinstance(TEST_SIZE, float):
        split_label = (
            f"TRAIN {(1 - TEST_SIZE) * 100:g}% - TEST {TEST_SIZE * 100:g}%"
        )
    else:
        split_label = f"TEST_SIZE = {TEST_SIZE}"

    print("=" * 80)
    print(f"PROSES PEMISAHAN DATASET ({split_label})")
    print("=" * 80 + "\n")

    # Names of input files that were not found, for the closing summary.
    skipped = []

    for filename in FILES_TO_PROCESS:
        file_path = os.path.join(INPUT_FOLDER, filename)

        # Skip (but remember) files that are not present.
        if not os.path.exists(file_path):
            print(f"⚠️ File tidak ditemukan: {filename} (Skip)")
            skipped.append(filename)
            continue

        print(f"🔄 Memproses: {filename}...")

        # 1. Read the CSV.
        df = pd.read_csv(file_path)

        # Pick the column used for the stratified split so class proportions
        # are kept in both parts.
        if 'label_encoded' in df.columns:
            target_col = df['label_encoded']
        else:
            # Fallback when the expected column name is absent: use the last
            # column as the target.
            print("   ⚠️ Kolom 'label_encoded' tidak ditemukan, mencoba kolom terakhir sebagai target.")
            target_col = df.iloc[:, -1]

        # 2. Split the whole DataFrame (features and label together) so each
        #    part can be saved as-is.
        train_df, test_df = train_test_split(
            df,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            stratify=target_col,
        )

        # 3. Save both parts; output names are the input name (without its
        #    extension) prefixed by "train_" / "test_".
        base_name = os.path.splitext(filename)[0]

        train_filename = f"train_{base_name}.csv"
        test_filename = f"test_{base_name}.csv"

        train_df.to_csv(os.path.join(OUTPUT_FOLDER, train_filename), index=False)
        test_df.to_csv(os.path.join(OUTPUT_FOLDER, test_filename), index=False)

        print(f"   ✅ Disimpan: {train_filename} ({len(train_df)} baris)")
        print(f"   ✅ Disimpan: {test_filename} ({len(test_df)} baris)")
        print("-" * 50)

    # Only claim that everything was saved when nothing was skipped.
    if skipped:
        total = len(FILES_TO_PROCESS)
        print(
            f"\n⚠️ Selesai: {total - len(skipped)} dari {total} file diproses, "
            f"{len(skipped)} dilewati karena tidak ditemukan."
        )
    else:
        print(f"\n🎉 Selesai! Semua file tersimpan di folder '{OUTPUT_FOLDER}'")

# Run the split only when this code is executed directly, not when imported.
if __name__ == "__main__":
    split_and_save_datasets()

📂 Membuat folder baru: Dataset Split
PROSES PEMISAHAN DATASET (TRAIN 80% - TEST 20%)

🔄 Memproses: efficientnet_features(non normalized).csv...
   ✅ Disimpan: train_efficientnet_features(non normalized).csv (3373 baris)
   ✅ Disimpan: test_efficientnet_features(non normalized).csv (844 baris)
--------------------------------------------------
🔄 Memproses: efficientnet_features(normalized).csv...
   ✅ Disimpan: train_efficientnet_features(normalized).csv (3373 baris)
   ✅ Disimpan: test_efficientnet_features(normalized).csv (844 baris)
--------------------------------------------------
🔄 Memproses: resnet50_features(non normalized).csv...
   ✅ Disimpan: train_resnet50_features(non normalized).csv (3373 baris)
   ✅ Disimpan: test_resnet50_features(non normalized).csv (844 baris)
--------------------------------------------------
🔄 Memproses: resnet50_features(normalized).csv...
   ✅ Disimpan: train_resnet50_features(normalized).csv (3373 baris)
   ‚úÖ Disimpa