In [1]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.model_selection import KFold
import warnings

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# --- EfficientNet-B4 run configuration ---
# Every tunable used by the later cells lives in this single mapping, so the
# cells read CONFIG[...] instead of repeating literal values.
CONFIG = dict(
    seed=42,             # passed to seed_everything() and to KFold(random_state=...)
    n_fold=5,            # number of cross-validation folds (one saved model per fold)
    epochs=15,           # training epochs per fold
    img_size=380,        # B4 is fed 380x380 images
    batch_size=8,        # kept small because B4 is memory-hungry on the GPU
    learning_rate=1e-4,  # initial AdamW learning rate
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # IMPORTANT: point this at the B4 .pth file attached through "Add Data"
    # (check your Kaggle input folder for the exact location). When the path
    # is empty or does not exist, RobustBiomassModel skips loading and the
    # backbone keeps its random initialization (it is built with weights=None).
    weights_path='/kaggle/input/efficientnetb4weight/pytorch/default/1/efficientnet_b4_rwightman-23ab8bcd.pth',
)

def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Covers Python's ``random`` module, NumPy's legacy global RNG, and PyTorch
    on the CPU and on all CUDA devices, and asks cuDNN for deterministic
    kernels with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        ``PYTHONHASHSEED`` is read at interpreter start-up, so setting it here
        only influences child processes launched afterwards, not the hash
        randomization of the already-running kernel.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which would
    # defeat the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Seed the RNGs once, up front, using the seed stored in CONFIG.
seed_everything(CONFIG['seed'])
# NOTE(review): this silences every warning category for the rest of the
# session, not just noisy ones - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Report which device (GPU or CPU) the run will use.
print(f"‚úÖ Siap menggunakan EfficientNet-B4 pada: {CONFIG['device']}")

‚úÖ Siap menggunakan EfficientNet-B4 pada: cuda


In [2]:
INPUT_ROOT = '/kaggle/input'
# Known location of the competition data. It is preferred when present and is
# also the fallback when the directory walk below finds nothing.
DEFAULT_DATASET_DIR = os.path.join(INPUT_ROOT, 'csiro-biomass')

# Locate the directory that holds train.csv. The CSV paths and the image
# folders below are all derived from this one directory so they cannot point
# at different datasets.
DATASET_DIR = DEFAULT_DATASET_DIR
if not os.path.isfile(os.path.join(DATASET_DIR, 'train.csv')):
    for dirname, subdirs, filenames in os.walk(INPUT_ROOT):
        subdirs.sort()  # in-place sort makes the traversal order deterministic
        if 'train.csv' in filenames:
            DATASET_DIR = dirname
            break

# Join with bare file names: joining with an absolute path would discard
# DATASET_DIR entirely.
TRAIN_CSV = os.path.join(DATASET_DIR, 'train.csv')
TEST_CSV = os.path.join(DATASET_DIR, 'test.csv')


def _has_jpg(path):
    """Return True when `path` is a directory containing at least one .jpg file."""
    if not os.path.isdir(path):
        return False
    return any(name.lower().endswith('.jpg') for name in os.listdir(path))


# Find the image folder: the first candidate that actually contains JPEGs.
possible_dirs = [os.path.join(DATASET_DIR, 'train'), os.path.join(DATASET_DIR, 'images'), DATASET_DIR]
TRAIN_IMG_DIR = next((d for d in possible_dirs if _has_jpg(d)), None)

# Derive the test image folder. Only the final path component is swapped, so
# a "train" substring elsewhere in the path is left alone.
if TRAIN_IMG_DIR is None:
    TEST_IMG_DIR = None
elif os.path.basename(os.path.normpath(TRAIN_IMG_DIR)) == 'train':
    TEST_IMG_DIR = os.path.join(os.path.dirname(os.path.normpath(TRAIN_IMG_DIR)), 'test')
else:
    # Train images are not in a folder named "train": reuse the same folder.
    TEST_IMG_DIR = TRAIN_IMG_DIR

# Load the long-format labels and pivot to one row per image with one column
# per target (pivot_table averages duplicate image/target pairs).
train_df = pd.read_csv(TRAIN_CSV)
train_pivot = train_df.pivot_table(index='image_path', columns='target_name', values='target').reset_index()

# Targets are trained in log1p space; predict_ensemble undoes this with expm1.
target_cols = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
train_pivot[target_cols] = np.log1p(train_pivot[target_cols])

print(f"üìÇ Train Image Dir: {TRAIN_IMG_DIR}")

üìÇ Train Image Dir: /kaggle/input/csiro-biomass/train


In [3]:
class BiomassDataset(Dataset):
    """Image dataset backed by a DataFrame with an ``image_path`` column.

    Images are looked up by file name (the basename of ``image_path``) inside
    ``img_dir``. In training mode each item is ``(image, targets)`` where
    ``targets`` is a float32 tensor holding the row's values for the
    module-level ``target_cols`` in that order. In test mode each item is
    ``(image, image_path)``.

    Args:
        df: DataFrame with one row per image.
        img_dir: Directory containing the image files.
        transforms: Optional callable applied to the PIL image.
        is_test: When True, return the image path instead of targets.
    """

    def __init__(self, df, img_dir, transforms=None, is_test=False):
        self.df = df
        self.img_dir = img_dir
        self.transforms = transforms
        self.is_test = is_test
        self.target_cols = target_cols
        # Paths already reported as unreadable, so each one is logged once
        # per dataset copy instead of on every access.
        self._reported_unreadable = set()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_name = os.path.basename(row['image_path'])
        img_path = os.path.join(self.img_dir, img_name)

        try:
            # The context manager closes the file as soon as the pixel data
            # has been decoded by convert().
            with Image.open(img_path) as handle:
                image = handle.convert("RGB")
        except Exception as err:
            # Best-effort fallback is kept (a blank image keeps the batch
            # shape intact), but the failure is now reported, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            if img_path not in self._reported_unreadable:
                self._reported_unreadable.add(img_path)
                print(f"[BiomassDataset] could not read {img_path} ({err!r}); using a blank image")
            image = Image.new('RGB', (CONFIG['img_size'], CONFIG['img_size']))

        if self.transforms:
            image = self.transforms(image)

        if self.is_test:
            return image, row['image_path']

        targets = row[self.target_cols].values.astype(np.float32)
        return image, torch.tensor(targets)

# Preprocessing pipelines for EfficientNet-B4 (images resized to img_size x img_size).
def _make_pipeline(img_size, augmentations=()):
    """Compose resize -> optional augmentations -> tensor conversion -> normalization."""
    steps = [transforms.Resize((img_size, img_size))]
    steps.extend(augmentations)
    steps.append(transforms.ToTensor())
    # Per-channel mean/std: the values torchvision documents for its
    # ImageNet-pretrained models.
    steps.append(transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
    return transforms.Compose(steps)


data_transforms = {
    # Training: random flips, a rotation of up to 30 degrees and mild colour jitter.
    'train': _make_pipeline(
        CONFIG['img_size'],
        augmentations=[
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(30),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        ],
    ),
    # Validation / inference: deterministic resize and normalization only.
    'val': _make_pipeline(CONFIG['img_size']),
}

In [4]:
class RobustBiomassModel(nn.Module):
    """EfficientNet-B4 backbone followed by an MLP head with five regression outputs.

    Args:
        weights_path: Optional path to a backbone state-dict file. When the
            file exists it is loaded non-strictly into the backbone; when the
            path is given but missing, or loading fails, a warning is printed
            and the backbone keeps its random initialization. ``None`` or an
            empty string skips loading silently (used at inference time,
            where a full checkpoint is loaded afterwards).
    """

    def __init__(self, weights_path=None):
        super().__init__()

        # 1. Build B4 without triggering any download (works offline).
        self.backbone = models.efficientnet_b4(weights=None)

        # 2. Load the backbone weights manually from a local file.
        if weights_path:
            if os.path.exists(weights_path):
                try:
                    # map_location='cpu': the model is still on the CPU here
                    # and is moved to the target device by the caller.
                    state_dict = torch.load(weights_path, map_location='cpu')
                    # strict=False tolerates key differences between files,
                    # so report them instead of hiding a mismatch.
                    result = self.backbone.load_state_dict(state_dict, strict=False)
                    if result.missing_keys or result.unexpected_keys:
                        print(
                            f"Warning: backbone weights only partially matched "
                            f"({len(result.missing_keys)} missing, "
                            f"{len(result.unexpected_keys)} unexpected keys)"
                        )
                    print("‚úÖ Pretrained B4 weights berhasil dimuat!")
                except Exception as e:
                    print(f"‚ö†Ô∏è Warning: {e}")
            else:
                print(f"Warning: weights file not found at {weights_path}; backbone starts from random initialization")

        # 3. Replace the classification head with a wider regression head.
        num_features = self.backbone.classifier[1].in_features  # 1792 for B4
        self.backbone.classifier = nn.Identity()

        # Layer order and indices must stay as they are: saved fold
        # checkpoints address these layers by position.
        self.regressor = nn.Sequential(
            nn.Linear(num_features, 1024),
            nn.SiLU(),
            nn.Dropout(0.4),  # relatively high dropout to limit overfitting
            nn.Linear(1024, 512),
            nn.SiLU(),
            nn.Linear(512, 5)
        )

    def forward(self, x):
        """Return the five regression outputs for a batch of images ``x``."""
        return self.regressor(self.backbone(x))

In [5]:
def _train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimization pass over `loader`, updating `model` in place."""
    model.train()
    for imgs, targets in loader:
        imgs, targets = imgs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(imgs), targets)
        loss.backward()
        optimizer.step()


def _mean_val_loss(model, loader, criterion, device):
    """Return the sample-weighted mean of `criterion` over `loader`.

    Assumes `criterion` returns a per-batch mean. Weighting each batch by its
    size keeps a short final batch from being over-represented. Returns
    ``inf`` for an empty loader instead of dividing by zero.
    """
    model.eval()
    total_loss, n_samples = 0.0, 0
    with torch.no_grad():
        for imgs, targets in loader:
            imgs, targets = imgs.to(device), targets.to(device)
            batch_size = imgs.size(0)
            total_loss += criterion(model(imgs), targets).item() * batch_size
            n_samples += batch_size
    return total_loss / n_samples if n_samples else float('inf')


def train_fold(fold, train_idx, val_idx):
    """Train one cross-validation fold and checkpoint its best epoch.

    A fresh RobustBiomassModel is trained for CONFIG['epochs'] epochs with
    Huber loss on the (log1p-transformed) targets in ``train_pivot``.
    Whenever the validation loss improves, the model's state dict is written
    to ``model_fold_{fold}.pth``.

    Args:
        fold: Zero-based fold number (used in log output and the file name).
        train_idx: Positional row indices of ``train_pivot`` used for training.
        val_idx: Positional row indices of ``train_pivot`` used for validation.

    Returns:
        The lowest validation loss observed across the epochs.
    """
    print(f"\n{'='*10} FOLD {fold+1}/{CONFIG['n_fold']} (B4 Model) {'='*10}")
    device = CONFIG['device']

    train_sub = train_pivot.iloc[train_idx].reset_index(drop=True)
    val_sub = train_pivot.iloc[val_idx].reset_index(drop=True)

    train_loader = DataLoader(
        BiomassDataset(train_sub, TRAIN_IMG_DIR, transforms=data_transforms['train']),
        batch_size=CONFIG['batch_size'], shuffle=True, num_workers=2
    )
    val_loader = DataLoader(
        BiomassDataset(val_sub, TRAIN_IMG_DIR, transforms=data_transforms['val']),
        batch_size=CONFIG['batch_size'], shuffle=False, num_workers=2
    )

    model = RobustBiomassModel(weights_path=CONFIG['weights_path']).to(device)

    criterion = nn.HuberLoss(delta=1.0)
    optimizer = optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=1e-4)

    # The scheduler is stepped once per epoch and T_0 equals the epoch count,
    # so training ends right as the first cosine cycle completes.
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['epochs'], eta_min=1e-6)

    best_loss = float('inf')

    for epoch in range(CONFIG['epochs']):
        _train_one_epoch(model, train_loader, criterion, optimizer, device)
        scheduler.step()

        avg_val_loss = _mean_val_loss(model, val_loader, criterion, device)

        # Keep only the best epoch's weights on disk for the ensemble.
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            torch.save(model.state_dict(), f'model_fold_{fold}.pth')

    print(f"  üèÜ Best Val Loss Fold {fold+1}: {best_loss:.4f}")

    # Release this fold's model and optimizer state before the next fold
    # builds its own B4, so GPU memory does not accumulate across folds.
    del model, optimizer, scheduler
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return best_loss

# Run K-Fold cross-validation: train one model per fold and collect each
# fold's best validation loss. Splits are shuffled with the configured seed.
kf = KFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG['seed'])
losses = []
for fold, (train_idx, val_idx) in enumerate(kf.split(train_pivot)):
    loss = train_fold(fold, train_idx, val_idx)
    losses.append(loss)
# The reported "CV score" is the mean of the per-fold best validation losses
# returned by train_fold (Huber loss on the log1p-transformed targets).
print(f"\n‚ú® Average CV Score: {np.mean(losses):.4f}")


‚úÖ Pretrained B4 weights berhasil dimuat!
  üèÜ Best Val Loss Fold 1: 0.3108

‚úÖ Pretrained B4 weights berhasil dimuat!
  üèÜ Best Val Loss Fold 2: 0.3447

‚úÖ Pretrained B4 weights berhasil dimuat!
  üèÜ Best Val Loss Fold 3: 0.2887

‚úÖ Pretrained B4 weights berhasil dimuat!
  üèÜ Best Val Loss Fold 4: 0.2351

‚úÖ Pretrained B4 weights berhasil dimuat!
  üèÜ Best Val Loss Fold 5: 0.3032

‚ú® Average CV Score: 0.2965


In [6]:
def predict_ensemble(test_csv_path, img_dir):
    """Predict the test set with the per-fold models and write ``submission.csv``.

    Each unique test image is passed through every saved fold model. Outputs
    are mapped back from log1p space with ``expm1``, clipped at zero, and
    averaged across models. Predictions are then joined back onto the
    long-format test rows by image file name and target name.

    Args:
        test_csv_path: Path to the long-format test CSV (needs ``sample_id``,
            ``image_path`` and ``target_name`` columns).
        img_dir: Directory containing the test images.

    Returns:
        The first rows of the submission DataFrame (``sample_id``, ``target``).
    """
    print("\nüöÄ Memulai Inference B4 (5 Model Ensemble)...")
    device = CONFIG['device']

    test_df_long = pd.read_csv(test_csv_path)
    unique_imgs = test_df_long[['image_path']].drop_duplicates().reset_index(drop=True)

    test_ds = BiomassDataset(unique_imgs, img_dir, data_transforms['val'], is_test=True)
    test_dl = DataLoader(test_ds, batch_size=16, shuffle=False)

    models_list = []
    for i in range(CONFIG['n_fold']):
        model = RobustBiomassModel(weights_path=None)
        # map_location lets checkpoints saved from a GPU run load on a
        # CPU-only session (CONFIG['device'] falls back to the CPU).
        state_dict = torch.load(f'model_fold_{i}.pth', map_location=device)
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()
        models_list.append(model)

    image_path_keys = []
    final_preds = []

    with torch.no_grad():
        for xb, paths in test_dl:
            xb = xb.to(device)
            # Undo the log1p target transform, then clip negatives to zero,
            # before averaging the models in the original scale.
            batch_preds = [
                torch.relu(torch.expm1(model(xb))).cpu().numpy()
                for model in models_list
            ]
            avg_preds = np.mean(batch_preds, axis=0)

            image_path_keys.extend(os.path.basename(path) for path in paths)
            final_preds.extend(avg_preds)

    preds_wide = pd.DataFrame(final_preds, columns=target_cols)
    preds_wide['image_path_key'] = image_path_keys

    preds_long = preds_wide.melt(id_vars=['image_path_key'], value_vars=target_cols, var_name='target_name', value_name='target')
    test_df_long['image_path_key'] = test_df_long['image_path'].apply(os.path.basename)

    submission = pd.merge(test_df_long[['sample_id', 'image_path_key', 'target_name']], preds_long, on=['image_path_key', 'target_name'], how='left')
    submission = submission[['sample_id', 'target']]

    # The left merge leaves NaN where a test row has no matching prediction
    # (for example a target_name outside target_cols); make that visible.
    n_missing = int(submission['target'].isna().sum())
    if n_missing:
        print(f"Warning: {n_missing} submission rows have no prediction (NaN target)")

    submission.to_csv('submission.csv', index=False)
    print("üéâ Submission Created!")
    return submission.head()

# Run ensemble inference on the test set: writes submission.csv and displays
# the first rows of the submission as the cell output.
predict_ensemble(TEST_CSV, TEST_IMG_DIR)


üöÄ Memulai Inference B4 (5 Model Ensemble)...
üéâ Submission Created!


Unnamed: 0,sample_id,target
0,ID1001187975__Dry_Clover_g,0.273096
1,ID1001187975__Dry_Dead_g,14.639925
2,ID1001187975__Dry_Green_g,21.390858
3,ID1001187975__Dry_Total_g,38.992695
4,ID1001187975__GDM_g,21.511496
