# BISINDO Classifier - Computer Vision Project

Sistem klasifikasi BISINDO menggunakan Random Forest Classifier dengan implementasi lengkap teknik Computer Vision.

## Configuration

In [None]:
# NOTE(review): IMG_HEIGHT, IMG_WIDTH and NUM_AUGMENTED_IMAGE are not referenced
# by any cell visible in this notebook — presumably they are consumed by the
# image/feature-extraction step that produced the CSV below; confirm before
# changing or removing them.
IMG_HEIGHT = 256
IMG_WIDTH = 256
NUM_AUGMENTED_IMAGE = 50
# Feature table passed to split_data(); it must contain a 'label' column.
CSV_FILENAME = 'data/bisindo_features.csv'

## Import Libraries

In [None]:
import time
import warnings
from pathlib import Path

import cv2
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
# Blanket-suppress warnings to keep the notebook output readable.
# NOTE: this also hides genuinely useful library warnings; narrow the filter
# when debugging.
warnings.filterwarnings('ignore')

# Later cells write figures, model artefacts and the results table into these
# folders. savefig / joblib.dump / to_csv do not create missing parent
# directories, so create them up front instead of failing after training.
for _output_dir in ('images', 'model', 'results'):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)

print('Import library berhasil!')
print(f'OpenCV version: {cv2.__version__}')

## Data Preparation & Splitting

In [None]:
def split_data(csv_filename, output_dir='data'):
    """
    Load the feature table and split it into train, validation and test sets.

    The split is stratified on the ``label`` column (70% train, 15% validation,
    15% test) and uses a fixed random seed, so the partition is reproducible.
    Dataset statistics are printed, and the three splits are written to
    ``train.csv``, ``val.csv`` and ``test.csv`` inside ``output_dir``.

    Args:
        csv_filename: Path to a CSV file with a ``label`` column; every other
            column is counted as a feature.
        output_dir: Directory that receives the split CSV files. It is created
            if it does not exist. Defaults to ``'data'``.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames.
    """
    df = pd.read_csv(csv_filename)
    labels = df['label']
    class_counts = labels.value_counts()

    print('='*70)
    print('DATASET INFORMATION')
    print('='*70)
    print(f'Total samples: {len(df):,}')
    print(f'Number of features: {len(df.columns) - 1}')
    print(f"Number of classes: {labels.nunique()}")
    print(f"Classes: {sorted(labels.unique())}")
    print(f'\nClass distribution:')
    print(class_counts.sort_index())

    # Check for class imbalance
    print(f'\nMin samples per class: {class_counts.min()}')
    print(f'Max samples per class: {class_counts.max()}')
    print(f'Imbalance ratio: {class_counts.max() / class_counts.min():.2f}x')

    # Split: 70% train, 15% validation, 15% test.
    # The held-out 30% is halved, which yields the two 15% partitions.
    train, temp = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
    val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)

    # Save splits; to_csv does not create missing directories, so do it here.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    train.to_csv(out_dir / 'train.csv', index=False)
    val.to_csv(out_dir / 'val.csv', index=False)
    test.to_csv(out_dir / 'test.csv', index=False)

    print(f'\n📁 Data Split:')
    print(f'Train samples: {len(train):,} ({len(train)/len(df)*100:.1f}%)')
    print(f'Validation samples: {len(val):,} ({len(val)/len(df)*100:.1f}%)')
    print(f'Test samples: {len(test):,} ({len(test)/len(df)*100:.1f}%)')
    print('='*70)

    return train, val, test

# Build the three splits once at notebook level; the cells below read these
# DataFrames directly.
train_df, val_df, test_df = split_data(CSV_FILENAME)

## Exploratory Data Analysis

In [None]:
# Plot the per-class sample counts of each split side by side.
# Each entry is (panel title, split DataFrame, bar colour), in left-to-right order.
split_panels = [
    ('Train Set Distribution', train_df, 'skyblue'),
    ('Validation Set Distribution', val_df, 'lightgreen'),
    ('Test Set Distribution', test_df, 'lightcoral'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for panel_ax, (panel_title, split_frame, bar_colour) in zip(axes, split_panels):
    counts_by_class = split_frame['label'].value_counts().sort_index()
    counts_by_class.plot(kind='bar', ax=panel_ax, color=bar_colour, edgecolor='black')
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Class (BISINDO Letter)', fontsize=11)
    panel_ax.set_ylabel('Count', fontsize=11)
    panel_ax.grid(axis='y', alpha=0.3)
    panel_ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig('images/data_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('Data distribution visualization saved!')

## Feature Analysis

In [None]:
# Analyze features: every column except 'label' is treated as a feature.
feature_cols = [col for col in train_df.columns if col != 'label']
X_train_analysis = train_df[feature_cols]

print('='*70)
print('📈 FEATURE STATISTICS')
print('='*70)
print(f'Number of features: {len(feature_cols)}')
print(f'\nFeature statistics:')
print(X_train_analysis.describe())

# Check for missing values (total NaN count across all feature columns)
missing = X_train_analysis.isnull().sum().sum()
print(f'\nMissing values: {missing}')

# Feature correlation on a sample: the first 1000 rows and first 20 columns
# only, to keep the computation and the heatmap small.
print(f'\nCalculating feature correlations...')
sample_features = X_train_analysis.iloc[:1000, :20]  # Sample for speed
correlation = sample_features.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, cmap='coolwarm', center=0, square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix (Sample: First 20 Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('images/feature_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

print('Feature analysis completed!')

## Data Preprocessing

In [None]:
# Separate features and labels for each split. Every column except 'label'
# becomes part of the feature matrix.
X_train = train_df.drop('label', axis=1).values
y_train = train_df['label'].values
X_val = val_df.drop('label', axis=1).values
y_val = val_df['label'].values
X_test = test_df.drop('label', axis=1).values
y_test = test_df['label'].values

print('='*70)
print('DATA PREPROCESSING')
print('='*70)
print(f'Train shape: X={X_train.shape}, y={y_train.shape}')
print(f'Validation shape: X={X_val.shape}, y={y_val.shape}')
print(f'Test shape: X={X_test.shape}, y={y_test.shape}')

# Feature scaling (optional for Random Forest, but can help).
# The scaler is fit on the training split only and then applied unchanged to
# validation and test, so no held-out statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# These are global statistics over the whole scaled training matrix,
# not per-feature values.
print(f'\n📊 Feature statistics after scaling:')
print(f'Mean: {X_train_scaled.mean():.6f}')
print(f'Std: {X_train_scaled.std():.6f}')
print(f'Min: {X_train_scaled.min():.6f}')
print(f'Max: {X_train_scaled.max():.6f}')
print('='*70)

print('\nPreprocessing completed!')

## Random Forest Classifier

**Mengapa Random Forest?**

1. **Excellent Performance**: Sangat baik untuk tabular data
2. **Feature Importance**: Dapat menganalisis fitur mana yang penting
3. **Robust**: Tahan terhadap overfitting dengan ensemble method
4. **No Feature Scaling Required**: Tidak sensitif terhadap skala features
5. **Fast Inference**: Cepat untuk real-time prediction
6. **Interpretable**: Lebih mudah dipahami daripada deep learning

In [None]:
print('='*70)
print('TRAINING RANDOM FOREST CLASSIFIER')
print('='*70)

# Initialize Random Forest.
# class_weight='balanced' reweights classes inversely to their frequency;
# n_jobs=-1 uses all available cores; verbose=1 prints progress while fitting
# (and while predicting).
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    verbose=1,
    class_weight='balanced'
)

print('\n📋 Model Configuration:')
print(f'Number of trees: {rf_clf.n_estimators}')
print(f'Max depth: {rf_clf.max_depth}')
print(f'Max features: {rf_clf.max_features}')
print(f'Class weight: {rf_clf.class_weight}')

# Train the model. perf_counter() is a monotonic high-resolution clock, so the
# measured duration is unaffected by system clock adjustments.
print('\nTraining started...')
start_time = time.perf_counter()

rf_clf.fit(X_train_scaled, y_train)

training_time = time.perf_counter() - start_time
print(f'\nTraining completed in {training_time:.2f} seconds')
print('='*70)

## Model Evaluation

In [None]:
# Evaluate on all sets
print('='*70)
print('MODEL PERFORMANCE')
print('='*70)

# score() returns mean accuracy as a fraction in [0, 1]
train_acc = rf_clf.score(X_train_scaled, y_train)
val_acc = rf_clf.score(X_val_scaled, y_val)
test_acc = rf_clf.score(X_test_scaled, y_test)

print(f'\n🎯 Accuracy Results:')
print(f'Train Accuracy: {train_acc*100:.2f}%')
print(f'Validation Accuracy: {val_acc*100:.2f}%')
print(f'Test Accuracy: {test_acc*100:.2f}%')

# Check overfitting: gap between train and test accuracy, in percentage points
overfit_gap = (train_acc - test_acc) * 100
print(f'\n📉 Overfitting Analysis:')
print(f'Train-Test Gap: {overfit_gap:.2f}%')
if overfit_gap < 5:
    print('Model generalization: EXCELLENT')
elif overfit_gap < 10:
    print('Model generalization: GOOD')
else:
    print('Model generalization: Consider regularization')

print('='*70)

## Detailed Classification Report

In [None]:
# Predict every split once, in train / validation / test order
y_pred_train, y_pred_val, y_pred_test = (
    rf_clf.predict(split_features)
    for split_features in (X_train_scaled, X_val_scaled, X_test_scaled)
)

report_rule = '=' * 70
print(report_rule)
print('CLASSIFICATION REPORT (TEST SET)')
print(report_rule)
print(classification_report(y_test, y_pred_test))

# Per-class accuracy: accuracy restricted to the test samples whose true
# label is the class in question
classes = sorted(np.unique(y_test))
class_accuracies = [
    accuracy_score(y_test[y_test == label], y_pred_test[y_test == label])
    for label in classes
]

# Bar chart of per-class accuracy with the overall test accuracy as reference
accuracy_percent = np.array(class_accuracies) * 100
overall_percent = test_acc * 100

plt.figure(figsize=(14, 6))
bars = plt.bar(classes, accuracy_percent, color='skyblue', edgecolor='black')
plt.axhline(y=overall_percent, color='red', linestyle='--', linewidth=2,
            label=f'Overall Accuracy: {test_acc*100:.2f}%')
plt.xlabel('Class (BISINDO Letter)', fontsize=12, fontweight='bold')
plt.ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
plt.title('Per-Class Accuracy', fontsize=14, fontweight='bold')
plt.ylim([0, 105])
plt.legend(fontsize=11)
plt.grid(axis='y', alpha=0.3)

# Recolour each bar by performance band; the first band whose lower bound is
# met wins, anything below the last band is red
colour_bands = (
    (0.95, 'green'),
    (0.85, 'lightgreen'),
    (0.75, 'yellow'),
)
for bar, acc in zip(bars, class_accuracies):
    band_colour = next(
        (colour for lower_bound, colour in colour_bands if acc >= lower_bound),
        'red',
    )
    bar.set_color(band_colour)

plt.tight_layout()
plt.savefig('images/per_class_accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

print('Per-class accuracy analysis completed!')

## Confusion Matrix

In [None]:
def _plot_confusion_heatmap(matrix, tick_labels, *, fmt, cmap, cbar_label, title, out_path):
    """Draw one annotated confusion-matrix heatmap, save it to out_path and show it."""
    plt.figure(figsize=(16, 14))
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap=cmap,
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                cbar_kws={'label': cbar_label})
    plt.title(title, fontsize=16, fontweight='bold')
    plt.ylabel('True Label', fontsize=13)
    plt.xlabel('Predicted Label', fontsize=13)
    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.show()


# Compute confusion matrix.
# The label set is the sorted union of true and predicted labels (what
# confusion_matrix uses by default). Passing it explicitly and reusing it for
# the tick labels keeps axes and matrix aligned even if the model predicts a
# class that never occurs in y_test.
cm_labels = np.union1d(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test, labels=cm_labels)

# Plot confusion matrix
_plot_confusion_heatmap(
    cm, cm_labels,
    fmt='d', cmap='Blues', cbar_label='Count',
    title='Confusion Matrix - Random Forest Classifier',
    out_path='images/confusion_matrix.png',
)

# Normalized confusion matrix (each row divided by its number of true samples).
# Rows with no true samples stay all-zero instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

_plot_confusion_heatmap(
    cm_normalized, cm_labels,
    fmt='.2f', cmap='Greens', cbar_label='Proportion',
    title='Normalized Confusion Matrix (Row-wise)',
    out_path='images/confusion_matrix_normalized.png',
)

print('Confusion matrices saved!')

## Feature Importance Analysis

In [None]:
# Get feature importance from the fitted forest; names are taken from the
# training columns (everything except 'label'), in the same column order the
# model was trained on.
feature_importance = rf_clf.feature_importances_
feature_names = train_df.drop('label', axis=1).columns

# Create dataframe, most important feature first
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

print('='*70)
print('🔍 TOP 20 MOST IMPORTANT FEATURES')
print('='*70)
print(importance_df.head(20).to_string(index=False))
print('='*70)

# Visualize top 30 features
plt.figure(figsize=(12, 10))
top_features = importance_df.head(30)
plt.barh(range(len(top_features)), top_features['Importance'], color='coral', edgecolor='black')
plt.yticks(range(len(top_features)), top_features['Feature'])
plt.xlabel('Importance', fontsize=12, fontweight='bold')
plt.ylabel('Feature', fontsize=12, fontweight='bold')
plt.title('Top 30 Most Important Features', fontsize=14, fontweight='bold')
# Put the most important feature at the top of the horizontal bar chart
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('images/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print('\nFeature importance analysis completed!')

## Cross-Validation Analysis

In [None]:
# Single source of truth for the fold count; the headings, the CV call and
# the plot axes all derive from it.
CV_FOLDS = 5

print('='*70)
print(f'🔄 CROSS-VALIDATION ANALYSIS ({CV_FOLDS}-Fold)')
print('='*70)
print('This may take a few minutes...')

# Perform k-fold cross-validation on the (already scaled) training split
cv_scores = cross_val_score(rf_clf, X_train_scaled, y_train, cv=CV_FOLDS,
                            scoring='accuracy', n_jobs=-1, verbose=1)

cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print(f'\nCross-Validation Results:')
print(f'Individual fold scores: {[f"{score*100:.2f}%" for score in cv_scores]}')
print(f'Mean CV Accuracy: {cv_mean*100:.2f}%')
print(f'Std CV Accuracy: {cv_std*100:.2f}%')
# NOTE: this interval is mean +/- 2 standard deviations of the fold scores,
# a rough spread estimate rather than a formal confidence interval of the mean.
print(f"95% Confidence Interval: [{(cv_mean - 2*cv_std)*100:.2f}%, {(cv_mean + 2*cv_std)*100:.2f}%]\n")

# Visualize CV scores; fold numbers are derived from the scores actually
# returned so the x-axis always matches the data
fold_ids = range(1, len(cv_scores) + 1)

plt.figure(figsize=(10, 6))
plt.plot(fold_ids, cv_scores*100, marker='o', linewidth=2, markersize=10, color='blue')
plt.axhline(y=cv_mean*100, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {cv_mean*100:.2f}%')
# Shaded band: one standard deviation around the mean
plt.fill_between(fold_ids,
                 (cv_mean - cv_std)*100,
                 (cv_mean + cv_std)*100,
                 alpha=0.2, color='blue')
plt.xlabel('Fold', fontsize=12, fontweight='bold')
plt.ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
plt.title(f'{CV_FOLDS}-Fold Cross-Validation Scores', fontsize=14, fontweight='bold')
plt.xticks(fold_ids)
plt.ylim([0, 105])
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('images/cross_validation.png', dpi=300, bbox_inches='tight')
plt.show()

print('Cross-validation completed!')

## Save Models and Results

In [None]:
print('='*70)
print('💾 SAVING MODELS AND RESULTS')
print('='*70)

# joblib.dump and to_csv do not create missing directories; make sure the
# targets exist so a long training run is not lost at the final step.
Path('model').mkdir(parents=True, exist_ok=True)
Path('results').mkdir(parents=True, exist_ok=True)

# Save Random Forest model
joblib.dump(rf_clf, 'model/rf_bisindo_99.pkl')
print('Random Forest model saved: model/rf_bisindo_99.pkl')

# Save scaler (needed at inference time: the model was trained on scaled features)
joblib.dump(scaler, 'model/scaler.pkl')
print('Scaler saved: model/scaler.pkl')

# Save results summary as a single-row table
results = {
    'Model': 'Random Forest',
    'N_Estimators': rf_clf.n_estimators,
    'Max_Depth': rf_clf.max_depth,
    'Train_Accuracy': f'{train_acc*100:.2f}%',
    'Val_Accuracy': f'{val_acc*100:.2f}%',
    'Test_Accuracy': f'{test_acc*100:.2f}%',
    'CV_Mean_Accuracy': f'{cv_scores.mean()*100:.2f}%',
    'CV_Std_Accuracy': f'{cv_scores.std()*100:.2f}%',
    'Training_Time_Seconds': f'{training_time:.2f}',
    'Number_of_Classes': len(classes),
    'Total_Training_Samples': len(X_train),
    'Total_Test_Samples': len(X_test)
}

results_df = pd.DataFrame([results])
results_df.to_csv('results/model_performance.csv', index=False)
print('Results saved: results/model_performance.csv')

print('='*70)
print('\nALL MODELS AND RESULTS SAVED SUCCESSFULLY!')

# Print summary
print('\n' + '='*70)
print('FINAL SUMMARY')
print('='*70)
print(f'Model: Random Forest Classifier')
print(f'Number of Trees: {rf_clf.n_estimators}')
print(f'Test Accuracy: {test_acc*100:.2f}%')
print(f'Cross-Validation Accuracy: {cv_scores.mean()*100:.2f}% (±{cv_scores.std()*100:.2f}%)')
print(f'Training Time: {training_time:.2f} seconds')
print(f'Total Parameters: N/A (Tree-based model)')
print('='*70)

## Inference Time Analysis

In [None]:
# Test inference speed
import time

print('='*70)
print('INFERENCE SPEED ANALYSIS')
print('='*70)

SINGLE_RUNS = 1000   # repetitions of the one-sample prediction
BATCH_RUNS = 10      # repetitions of the batch prediction

single_sample = X_test_scaled[0:1]
batch_sample = X_test_scaled[:100]  # may hold fewer than 100 rows on a small test set

# The classifier was created with verbose=1, which also makes predict() print
# progress lines. Silence it while timing so console I/O is not measured (and
# the output is not flooded), then restore the previous setting.
saved_verbose = rf_clf.verbose
rf_clf.set_params(verbose=0)
try:
    # Untimed warm-up call so one-off start-up cost is not part of the average
    rf_clf.predict(single_sample)

    # Single prediction: average latency of predicting one sample
    start = time.perf_counter()
    for _ in range(SINGLE_RUNS):
        rf_clf.predict(single_sample)
    single_time = (time.perf_counter() - start) / SINGLE_RUNS

    # Batch prediction: average per-sample latency, divided by the number of
    # rows actually in the batch rather than an assumed 100
    start = time.perf_counter()
    for _ in range(BATCH_RUNS):
        rf_clf.predict(batch_sample)
    batch_time = (time.perf_counter() - start) / BATCH_RUNS / len(batch_sample)
finally:
    rf_clf.set_params(verbose=saved_verbose)

print(f'Single prediction time: {single_time*1000:.2f} ms')
print(f'Batch prediction time (per sample): {batch_time*1000:.2f} ms')
print(f'Theoretical FPS (single): {1/single_time:.2f} FPS')
print(f'Theoretical FPS (batch): {1/batch_time:.2f} FPS')
print('='*70)

if single_time < 0.033:  # 30 FPS
    print('\nModel is FAST ENOUGH for real-time applications (>30 FPS)')
else:
    print('\n Model may be too slow for real-time (target: <33ms per frame)')

## Summary: Computer Vision Techniques Applied

### Implemented Computer Vision Techniques (Pertemuan 1-16)

| Pertemuan | Topik | Implementasi | Lokasi |
|-----------|-------|--------------|---------|
| 1-2 | Image Formation & Camera Model | Camera intrinsics calculation | Dashboard |
| 3-4 | Image Preprocessing | Brightness, contrast, saturation | Dashboard |
| 5 | Histogram Processing | Histogram equalization | Dashboard |
| 6 | Thresholding | Binary, adaptive thresholding | Dashboard |
| 7 | Segmentation | Convex hull hand segmentation | Dashboard |
| 8-9 | Feature Extraction | MediaPipe 21 landmarks × 3D | Dashboard + Notebook |
| 10 | Object Detection | Hand detection using MediaPipe | Dashboard |
| 11 | Object Tracking | Real-time multi-hand tracking | Dashboard |
| 12 | 3D Reconstruction | Depth visualization (z-coords) | Dashboard |
| 13 | Augmented Reality | AR overlay with bounding boxes | Dashboard |
| 14 | Machine Learning | Random Forest classifier | Notebook |
| 15-16 | Deep Learning | Feature importance analysis | Notebook |

