# Durian Leaf Disease Classification - Exploratory Data Analysis

วิเคราะห์ข้อมูลโรคใบทุเรียน 4 คลาส:
- 0: ไม่เป็นโรค
- 1: หนอนและแมงปีกแข็ง
- 2: เชื้อรา (ใบจุด ใบจุดสาหร่าย ราสนิม ใบไหม้ ฟิวซาเรี่ยม รากเน่าโคนเน่า ราดำ)
- 3: เพลี้ย (จั๊กจั่นฝอย เพลี้ยไก่แจ้ เพลี้ยนาสาร เพลี้ยแป้ง เพลี้ยหอย ไรแดง)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os

# Notebook-wide plot defaults: seaborn's white grid theme and a wide
# (12, 6) default figure size for every figure created afterwards.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Read the three raw CSV files (train / test / submission template) in one
# pass and bind them to the names used by the rest of the notebook.
train_df, test_df, submit_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/train.csv',
        '../data/raw/test.csv',
        '../data/raw/submit.csv',
    )
)

# Report the number of rows in each frame, one line per split.
print(
    f"Train samples: {train_df.shape[0]}",
    f"Test samples: {test_df.shape[0]}",
    f"Submit samples: {submit_df.shape[0]}",
    sep="\n",
)

In [None]:
# Preview the top rows of the train and test frames, each under its own
# heading (the leading newline separates the two previews in the output).
for heading, frame in (("\nTrain data:", train_df), ("\nTest data:", test_df)):
    print(heading)
    display(frame.head())

## 2. Class Distribution Analysis

In [None]:
# Human-readable name for each integer class label (Thai with an English gloss).
class_names = dict(enumerate([
    'ไม่เป็นโรค (Healthy)',
    'หนอนและแมงปีกแข็ง (Worms & Beetles)',
    'เชื้อรา (Fungal Diseases)',
    'เพลี้ย (Aphids & Mites)',
]))

# Number of training rows per label, ordered by label id.
raw_counts = train_df['label'].value_counts()
label_counts = raw_counts.sort_index()

# Print one line per class: id, name, absolute count and share of the
# whole training frame.
n_train = len(train_df)
print("Class distribution:")
for cls_id, n_images in label_counts.items():
    share = n_images / n_train * 100
    print(f"{cls_id}: {class_names[cls_id]}: {n_images} ({share:.2f}%)")

In [None]:
# Visualize class distribution as a bar chart (left) and a pie chart (right).
#
# Tick positions, bar annotations and pie labels are all derived from
# label_counts.index rather than a hard-coded range(4). This keeps every
# label attached to the right bar/wedge even when a class is absent from
# the training set, and avoids the label/wedge length mismatch that makes
# pie() raise in that case.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
present_labels = list(label_counts.index)

# Bar plot
axes[0].bar(present_labels, label_counts.values, color='skyblue')
axes[0].set_xlabel('Class Label')
axes[0].set_ylabel('Count')
axes[0].set_title('Class Distribution in Training Set')
axes[0].set_xticks(present_labels)
axes[0].set_xticklabels([f'Class {label_id}' for label_id in present_labels])

# Add count labels on bars, anchored at each bar's actual x position
# (the label id), not at its position in the enumeration.
for label_id, n_images in label_counts.items():
    axes[0].text(label_id, n_images + 1, str(n_images), ha='center', va='bottom')

# Pie chart; unknown label ids fall back to a generic "Class <id>" name
# instead of raising KeyError.
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
pie_labels = [class_names.get(label_id, f'Class {label_id}')
              for label_id in present_labels]
axes[1].pie(label_counts.values, labels=pie_labels,
            autopct='%1.1f%%', colors=colors, startangle=90)
axes[1].set_title('Class Distribution (%)')

plt.tight_layout()
plt.show()

## 3. Sample Images Visualization

In [None]:
# Show up to n_samples randomly chosen training images for each class,
# one class per row. image_dir is also used by the image-statistics cell.
image_dir = '../data/images'
n_samples = 4  # Number of samples per class
class_ids = sorted(class_names)

# squeeze=False keeps `axes` two-dimensional even when n_samples == 1,
# so axes[row, col] indexing below always works.
fig, axes = plt.subplots(len(class_ids), n_samples, figsize=(15, 12),
                         squeeze=False)

# Hide the frame of every cell up front so cells that stay empty (a class
# with fewer than n_samples rows) do not render as blank axes.
for ax in axes.flat:
    ax.axis('off')

for row_idx, class_idx in enumerate(class_ids):
    # Row title goes on the first cell, whether or not an image lands there.
    axes[row_idx, 0].set_title(f"Class {class_idx}:\n{class_names[class_idx]}",
                               fontsize=10, loc='left')

    class_rows = train_df[train_df['label'] == class_idx]
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap n at what is available.
    class_samples = class_rows.sample(n=min(n_samples, len(class_rows)),
                                      random_state=42)

    for col_idx, img_name in enumerate(class_samples['id']):
        ax = axes[row_idx, col_idx]
        img_path = os.path.join(image_dir, img_name)

        if os.path.exists(img_path):
            # The context manager closes the file handle as soon as the
            # image has been handed to matplotlib.
            with Image.open(img_path) as img:
                ax.imshow(img)
        else:
            ax.text(0.5, 0.5, 'Image not found', ha='center', va='center')

plt.suptitle('Sample Images from Each Class', fontsize=14, y=0.995)
plt.tight_layout()
plt.show()

## 4. Image Statistics

In [None]:
# Analyze image sizes: collect (width, height) and aspect ratio for a subset
# of the training images and print min / max / mean of each.
image_sizes = []
image_aspects = []

print("Analyzing image dimensions...")
for img_id in train_df['id'].head(50):  # first 50 rows only (not a random sample)
    img_path = os.path.join(image_dir, img_id)
    if not os.path.exists(img_path):
        continue  # missing files are skipped silently
    # Open inside a context manager so the file handle is released right
    # after the size is read instead of leaking one handle per image.
    with Image.open(img_path) as img:
        width, height = img.size
    image_sizes.append((width, height))
    image_aspects.append(width / height)

if image_sizes:
    # Rows are images; column 0 is width, column 1 is height.
    image_sizes = np.array(image_sizes)
    widths = image_sizes[:, 0]
    heights = image_sizes[:, 1]
    print(f"\nImage dimensions (from {len(image_sizes)} samples):")
    print(f"Width: min={widths.min()}, max={widths.max()}, mean={widths.mean():.1f}")
    print(f"Height: min={heights.min()}, max={heights.max()}, mean={heights.mean():.1f}")
    print(f"Aspect ratio: min={min(image_aspects):.2f}, max={max(image_aspects):.2f}, mean={np.mean(image_aspects):.2f}")
else:
    print("No images found in the specified directory")

## 5. Data Summary

### Key Findings:

1. **Dataset Size:**
   - Training: 278 images
   - Testing: 278 images

2. **Class Distribution:**
   - Class 0 (ไม่เป็นโรค): 41 images (14.7%)
   - Class 1 (หนอนและแมงปีกแข็ง): 78 images (28.1%)
   - Class 2 (เชื้อรา): 87 images (31.3%)
   - Class 3 (เพลี้ย): 72 images (25.9%)

3. **Class Imbalance:**
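   - มีความไม่สมดุลเล็กน้อย โดย Class 0 มีข้อมูลน้อยที่สุด (41 ภาพ เทียบกับ 87 ภาพของ Class 2 หรือประมาณ 1:2.1)
   - ควรใช้เทคนิค: Data Augmentation, Class Weighting, หรือ Label Smoothing (หมายเหตุ: Label Smoothing เป็นเทคนิค regularization ไม่ได้แก้ความไม่สมดุลของคลาสโดยตรง จึงควรใช้ร่วมกับ Class Weighting หรือการ oversample คลาสที่มีข้อมูลน้อย)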

4. **Recommendations:**
   - ใช้ stratified split สำหรับ train/validation
   - ใช้ data augmentation ที่เหมาะสมกับใบไม้ (flips, rotations, color jitter)
   - Monitor metrics: accuracy, F1-score (weighted), confusion matrix