# 🔥 Gas Leakage — Multi-Class Classification

This notebook loads MQ-sensor readings from `gas_data.csv` (6,400 samples, 4 balanced classes: **NoGas**, **Perfume**, **Smoke**, **Mixture**), performs EDA, trains a **Random Forest Classifier**, evaluates it, and exports the model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib, os, warnings
# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation notices raised by the libraries imported above.
# Narrow the filter (category=/module=) if only one noisy warning is the target.
warnings.filterwarnings('ignore')
# Global seaborn theme applied to every figure drawn below.
sns.set_style('whitegrid')
# The check mark was stored as mis-decoded bytes; restored to the intended glyph.
print('Libraries loaded ✅')

## 1 — Load Data

In [None]:
# Read the raw sensor readings. The path is relative, so it is resolved
# against the kernel's current working directory.
data_path = '../raw/gas_data.csv'
df = pd.read_csv(data_path)

# Report the size and schema of what was loaded, then preview the first rows.
print(f'Total samples: {len(df)}')
print(f'Columns: {list(df.columns)}')
df.head()

In [None]:
# Define feature columns (7 MQ sensors) and target
sensor_cols = ['MQ2', 'MQ3', 'MQ5', 'MQ6', 'MQ7', 'MQ8', 'MQ135']
target_col = 'Gas'

# Fail fast with a readable message if the CSV schema is not what the rest of
# the notebook expects, instead of a bare KeyError in a later cell.
missing_cols = [col for col in sensor_cols + [target_col] if col not in df.columns]
if missing_cols:
    raise KeyError(f'Columns missing from the dataset: {missing_cols}')

print('\nClass distribution:')
# dropna=False makes unlabeled rows visible as their own NaN bucket; the output
# is unchanged when every row has a label.
print(df[target_col].value_counts(dropna=False))
print(f'\nMissing values: {df[sensor_cols].isnull().sum().sum()}')
# Summary statistics of the sensor readings are the cell's displayed output.
df[sensor_cols].describe()

## 2 — Exploratory Data Analysis

In [None]:
# 2x2 overview: class balance, sensor correlations, and per-class histograms
# of two individual sensors.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One fixed color per class, shared by every panel so a class looks the same in
# the bar chart and in both histograms. Classes are taken in order of first
# appearance; the palette is cycled if there are more classes than colors, so
# no class is silently left out of a plot.
colors = ['#2ecc71', '#e74c3c', '#3498db', '#f39c12']
class_names = list(df[target_col].dropna().unique())
class_colors = {name: colors[i % len(colors)] for i, name in enumerate(class_names)}

# Class distribution
class_counts = df[target_col].value_counts()
class_counts.plot.bar(
    ax=axes[0, 0],
    # Look the color up per class instead of relying on the count ordering.
    color=[class_colors[name] for name in class_counts.index],
    edgecolor='white',
)
axes[0, 0].set_title('Class Distribution')
axes[0, 0].set_ylabel('Count')

# Correlation heatmap
corr = df[sensor_cols].corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=axes[0, 1], vmin=-1, vmax=1)
axes[0, 1].set_title('Sensor Correlation Matrix')

# Sensor distributions by class: one bottom-row panel per sensor, with the
# classes overlaid as semi-transparent histograms.
for ax, sensor in zip(axes[1], ['MQ2', 'MQ135']):
    for gas_type in class_names:
        subset = df[df[target_col] == gas_type]
        ax.hist(subset[sensor], bins=30, alpha=0.5, label=gas_type,
                color=class_colors[gas_type])
    ax.set_title(f'{sensor} Distribution by Gas Type')
    ax.set_xlabel(f'{sensor} Reading')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plots per sensor grouped by gas type
# The grid keeps four panels per row and derives the row count from the number
# of sensors, so editing sensor_cols does not break the layout (7 sensors still
# give the 2x4 grid at 18x8 inches).
n_cols = 4
n_rows = -(-len(sensor_cols) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, sensor_cols):
    df.boxplot(column=col, by=target_col, ax=ax)
    ax.set_title(col)
    ax.set_xlabel('')

# Hide every panel that did not receive a sensor, not just the last one.
for ax in axes[len(sensor_cols):]:
    ax.axis('off')

plt.suptitle('Sensor Readings by Gas Type', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 3 — Train / Test Split

In [None]:
# Features are the sensor columns; labels are the class names mapped to
# integer codes by the encoder (kept around to translate codes back to names).
X = df[sensor_cols]
le = LabelEncoder().fit(df[target_col])
y = le.transform(df[target_col])

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f'Train: {len(X_train)}, Test: {len(X_test)}')
print(f'Classes: {list(le.classes_)}')

## 4 — Train Random Forest Classifier

In [None]:
# Build the forest and fit it on the training split in one chained expression
# (fit returns the estimator itself). The fixed seed makes training repeatable;
# n_jobs=-1 uses all available cores.
rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=20, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)')
print('\nClassification Report:')
# target_names turns the integer class codes back into readable class names.
print(classification_report(y_test, y_pred, target_names=le.classes_))

In [None]:
# Confusion Matrix
# Passing `labels` pins the row/column order to the encoder's class codes
# (0..n_classes-1), so the tick labels taken from le.classes_ stay aligned even
# if some class never appears in y_test or y_pred.
class_ids = list(range(len(le.classes_)))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
# The dash in the title was stored as mis-decoded bytes; restored to an em dash.
ax.set_title(f'Confusion Matrix — Accuracy: {accuracy*100:.2f}%')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

In [None]:
# Feature Importance
# Pair each importance score from the fitted forest with its sensor name, then
# sort ascending so the most important sensor ends up at the top of the
# horizontal bar chart.
importances = pd.Series(rf_model.feature_importances_, index=sensor_cols)
importances = importances.sort_values(ascending=True)

plt.figure(figsize=(8, 5))
importances.plot.barh(color='#3498db', edgecolor='white')
plt.gca().set(title='Feature Importance (Random Forest)', xlabel='Importance')
plt.tight_layout()
plt.show()

## 5 — Export Model

In [None]:
# Single source of truth for the output location, so the directory that is
# created, the file that is written, and the path that is reported cannot drift
# apart.
model_dir = '../models'
model_path = f'{model_dir}/gas_model.pkl'
os.makedirs(model_dir, exist_ok=True)

# Bundle the fitted model with everything needed to use it later: the encoder
# to map predictions back to class names, the expected feature order, the
# class list, and the test accuracy measured above.
# NOTE: joblib files are pickle-based; only load this artifact from a trusted source.
joblib.dump({
    'random_forest': rf_model,
    'label_encoder': le,
    'features': sensor_cols,
    'classes': list(le.classes_),
    'accuracy': accuracy
}, model_path)

# Report the path that was actually written (the old message dropped the '../'
# prefix and carried a mis-decoded check mark).
print(f'✅ Model exported to {model_path}')