# Random Forest Experiment v2

## Objective
Train a **Random Forest Classifier** on our **Augmented (Salted)** dataset with all **6 Features** aligned to the simulator parameters.

### Features:
1. Pressure(Bar)
2. Drift_Velocity
3. Confidence_R2
4. **Part Temp(C)** - OK: 830-870°C
5. Scan Speed
6. **Quench Flow(LPM)** - OK: 80-150 LPM

### Test Cases:
- TC-01: Golden Run (All OK)
- TC-02: Slow Death (Drift Failure)
- TC-03: Flow Failure (Pump Issue)
- TC-04: Cold Shock (Temp Too Low)

In [2]:
!pip install pandas numpy matplotlib seaborn scikit-learn joblib



In [3]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

print("Libraries Loaded.")

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Cell 2: Load Data
# Use 'Augmented_Training_Data.csv' generated by step_5_final_model.py
#
# Produces the notebook-level names used by later cells:
#   df       - the full table read from the CSV
#   features - the 6 model input columns, in training order
#   X, y     - feature matrix and the 'Is Anomaly' label column

file_path = 'Data/Augmented_Training_Data.csv'
df = pd.read_csv(file_path)

# Updated Feature List (6 Features)
features = ['Pressure(Bar)', 'Drift_Velocity', 'Confidence_R2', 'Part Temp(C)', 'Scan Speed', 'Quench Flow(LPM)']
target = 'Is Anomaly'

# Verify every required column (features AND label) exists before going on.
# Raising here stops the notebook at the real cause; merely printing would
# leave X / y undefined -- or, worse, silently left over from an earlier run
# of this kernel -- and the failure would surface in a later cell.
missing = [col for col in features + [target] if col not in df.columns]
if missing:
    raise ValueError(
        f"Missing columns: {missing}. Available: {list(df.columns)}"
    )

X = df[features]
y = df[target]
print(f"Data Loaded: {len(df)} rows")
print(f"Features: {features}")

In [None]:
# Cell 3: Stratified Train/Test Split
# Hold out 20% of the rows for testing. Passing y as `stratify` asks the
# splitter to keep the class proportions of y in both partitions, and the
# fixed random_state makes the shuffle repeatable between runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    shuffle=True,
    test_size=0.2,
)

# Row counts of each partition.
print(f"Training Set: {X_train.shape[0]} samples")
print(f"Testing Set:  {X_test.shape[0]} samples")

In [None]:
# Cell 4: Initialize Random Forest with Regularization
# This cell only configures the estimator; nothing is fitted here.

print("Initializing Robust Model...")

rf_model = RandomForestClassifier(
    random_state=42,            # fixed seed for repeatable forests
    n_jobs=-1,                  # -1 = run on every available CPU core
    class_weight='balanced',    # reweight classes so failures are not drowned out
    min_samples_leaf=10,        # each leaf must hold >= 10 samples (smoother decisions)
    max_depth=12,               # cap tree depth to limit overfitting
    n_estimators=200,           # number of trees in the forest
)

print("Model configured.")

In [None]:
# Cell 5: Training
# Fit the configured forest on the training partition. fit() returns the
# estimator itself, so rebinding rf_model keeps pointing at the same object.

print("Training Random Forest...")
rf_model = rf_model.fit(X_train, y_train)
print("Training Complete.")

In [None]:
# Cell 6: Accuracy Evaluation
# Compare accuracy on the data the model was fitted on against accuracy on
# the held-out split; the difference is reported as the "overfit gap".

train_acc = accuracy_score(y_true=y_train, y_pred=rf_model.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=rf_model.predict(X_test))
gap = train_acc - test_acc

print(
    f"Training Accuracy: {train_acc:.2%}",
    f"Testing Accuracy:  {test_acc:.2%}",
    f"Overfit Gap:       {gap:.2%}",
    sep="\n",
)

# A gap above 5 percentage points is flagged as overfitting.
print("DIAGNOSIS: Overfitting (Gap > 5%)" if gap > 0.05 else "DIAGNOSIS: Model is Healthy!")

In [None]:
# Cell 7: Classification Report
# Text report of the per-class metrics computed on the held-out test split.

print(
    "--- Classification Report ---",
    classification_report(y_test, rf_model.predict(X_test)),
    sep="\n",
)

In [None]:
# Cell 8: Confusion Matrix
# Heatmap of test-set predictions: rows are the actual class, columns the
# predicted class (see the axis labels below).

# Pin the class order explicitly (0 = OK, 1 = NG, matching the test-case
# cells) so the matrix is always 2x2 and always lines up with the hard-coded
# 'OK'/'NG' tick labels, even if one class is absent from y_test or from the
# predictions.
cm = confusion_matrix(y_test, rf_model.predict(X_test), labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['OK', 'NG'], yticklabels=['OK', 'NG'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Cell 9: Feature Importance
# Tabulate the fitted forest's feature_importances_ against the feature names
# (same order as the training columns), sorted from most to least important,
# then show them as a horizontal bar chart.

importances = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(importances)

plt.figure(figsize=(10, 5))
# seaborn >= 0.13 deprecates `palette` without `hue` (to be removed in 0.14).
# Mapping hue to the same variable as y with legend=False is the documented
# replacement and yields the same one-colour-per-bar plot.
sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=importances,
    palette='viridis',
    legend=False,
)
plt.title('Random Forest Feature Importance')
plt.show()

---
## Test Cases

In [None]:
# Cell 10: TC-01 - Golden Run (All OK)
# One-row frame with zero drift and with part temperature and quench flow
# inside their stated OK bands; the model is expected to answer OK (0).

tc_01 = pd.DataFrame({
    'Pressure(Bar)': [3.5],
    'Drift_Velocity': [0.00],
    'Confidence_R2': [0.95],
    'Part Temp(C)': [850],        # inside the 830-870 OK band
    'Scan Speed': [10],
    'Quench Flow(LPM)': [120],    # inside the 80-150 OK band
})

pred = rf_model.predict(tc_01)[0]
# Column 1 of predict_proba -- taken as the NG (1) probability, assuming
# rf_model.classes_ is [0, 1].
prob = rf_model.predict_proba(tc_01)[0, 1]

print(
    "--- TC-01: Golden Run ---",
    f"Expected: OK (0)",
    f"Predicted: {pred}",
    f"Risk Score: {prob:.1%}",
    sep="\n",
)
if pred == 0:
    print("PASS")
else:
    print("FAIL")

In [None]:
# Cell 11: TC-02 - Slow Death (Drift Leak)
# Pressure is still acceptable, but the drift velocity is negative (the
# leak signature this case injects); the model is expected to answer NG (1).

tc_02 = pd.DataFrame({
    'Pressure(Bar)': [3.2],        # still acceptable on its own
    'Drift_Velocity': [-0.06],     # the injected fault: leaking
    'Confidence_R2': [0.95],
    'Part Temp(C)': [850],
    'Scan Speed': [10],
    'Quench Flow(LPM)': [120],
})

pred = rf_model.predict(tc_02)[0]
# Column 1 of predict_proba -- taken as the NG (1) probability, assuming
# rf_model.classes_ is [0, 1].
prob = rf_model.predict_proba(tc_02)[0, 1]

print(
    "--- TC-02: Slow Death ---",
    f"Expected: NG (1)",
    f"Predicted: {pred}",
    f"Risk Score: {prob:.1%}",
    sep="\n",
)
if pred == 1:
    print("PASS")
else:
    print("FAIL")

In [None]:
# Cell 12: TC-03 - Flow Failure (Pump Issue)
# Everything nominal except quench flow, which is set to 40 LPM -- below the
# 80-150 LPM OK band; the model is expected to answer NG (1).

tc_03 = pd.DataFrame({
    'Pressure(Bar)': [3.5],
    'Drift_Velocity': [0.00],
    'Confidence_R2': [0.95],
    'Part Temp(C)': [850],
    'Scan Speed': [10],
    'Quench Flow(LPM)': [40],      # the injected fault: flow down, < 50 LPM
})

pred = rf_model.predict(tc_03)[0]
# Column 1 of predict_proba -- taken as the NG (1) probability, assuming
# rf_model.classes_ is [0, 1].
prob = rf_model.predict_proba(tc_03)[0, 1]

print(
    "--- TC-03: Flow Failure ---",
    f"Expected: NG (1)",
    f"Predicted: {pred}",
    f"Risk Score: {prob:.1%}",
    sep="\n",
)
if pred == 1:
    print("PASS")
else:
    print("FAIL")

In [None]:
# Cell 13: TC-04 - Cold Shock (Temp Too Low)
# Everything nominal except part temperature, which is set to 42 -- far below
# the 830-870 OK band; the model is expected to answer NG (1).

tc_04 = pd.DataFrame({
    'Pressure(Bar)': [3.5],
    'Drift_Velocity': [0.00],
    'Confidence_R2': [0.95],
    'Part Temp(C)': [42],          # the injected fault: a water-temperature value, not a metal temperature
    'Scan Speed': [10],
    'Quench Flow(LPM)': [120],
})

pred = rf_model.predict(tc_04)[0]
# Column 1 of predict_proba -- taken as the NG (1) probability, assuming
# rf_model.classes_ is [0, 1].
prob = rf_model.predict_proba(tc_04)[0, 1]

print(
    "--- TC-04: Cold Shock ---",
    f"Expected: NG (1) - Temperature way outside OK range",
    f"Predicted: {pred}",
    f"Risk Score: {prob:.1%}",
    sep="\n",
)
if pred == 1:
    print("PASS")
else:
    print("FAIL")

---
## Save Model

In [None]:
# Cell 14: Save Model
# Serialize the fitted forest with joblib; the path is relative, so the file
# is written to the current working directory.

model_filename = 'final_random_forest.joblib'
joblib.dump(value=rf_model, filename=model_filename)

print(f"Model saved as '{model_filename}'")