In [None]:
!pip install kagglehub --upgrade




In [None]:
import pandas as pd

df = pd.read_csv("/content/smoke_detection_iot.csv", sep=None, engine='python')

df = df.dropna(axis=1, how='all')
df.columns = df.columns.str.strip().str.replace('"', '').str.replace('\ufeff', '')

print("âœ… Auto-detected columns:")
print(df.columns.tolist())


âœ… Auto-detected columns:
['Unnamed: 0', 'UTC', 'Temperature[C]', 'Humidity[%]', 'TVOC[ppb]', 'eCO2[ppm]', 'Raw H2', 'Raw Ethanol', 'Pressure[hPa]', 'PM1.0', 'PM2.5', 'NC0.5', 'NC1.0', 'NC2.5', 'CNT', 'Fire Alarm']


In [None]:
target_col = 'Fire Alarm'

# Drop rows with missing target values
df = df.dropna(subset=[target_col])

# Prepare X and y
X = df.drop(columns=[target_col], errors='ignore')
y = df[target_col]

# Drop any remaining NaNs
X = X.dropna()
y = y.loc[X.index]

print("âœ… X shape:", X.shape, " | y shape:", y.shape)

âœ… X shape: (62630, 16)  | y shape: (62630,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import time

# Define pipeline
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train
start = time.time()
lr_pipeline.fit(X_train, y_train)
lr_time = time.time() - start

# Evaluate
y_pred_lr = lr_pipeline.predict(X_test)
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)

print(f"ðŸ”¹ Logistic Regression Accuracy: {lr_acc:.3f}")
print(f"ðŸ”¹ Logistic Regression F1 Score: {lr_f1:.3f}")
print(f"ðŸ”¹ Training Time: {lr_time:.3f} seconds")


ðŸ”¹ Logistic Regression Accuracy: 0.988
ðŸ”¹ Logistic Regression F1 Score: 0.992
ðŸ”¹ Training Time: 1.423 seconds


In [None]:
from xgboost import XGBClassifier

# Define pipeline
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # kept for MLOps consistency
    ('classifier', XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

# Train
start = time.time()
xgb_pipeline.fit(X_train, y_train)
xgb_time = time.time() - start

# Evaluate
y_pred_xgb = xgb_pipeline.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

print(f"ðŸŒ³ XGBoost Accuracy: {xgb_acc:.3f}")
print(f"ðŸŒ³ XGBoost F1 Score: {xgb_f1:.3f}")
print(f"ðŸŒ³ Training Time: {xgb_time:.3f} seconds")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


ðŸŒ³ XGBoost Accuracy: 1.000
ðŸŒ³ XGBoost F1 Score: 1.000
ðŸŒ³ Training Time: 0.851 seconds


In [None]:
import time

# Logistic Regression runtime
start_time = time.time()
_ = lr_pipeline.predict(X_test)
lr_time = time.time() - start_time

# XGBoost runtime
start_time = time.time()
_ = xgb_pipeline.predict(X_test)
xgb_time = time.time() - start_time

print(f"\nRuntime (seconds):")
print(f"LR-Pipeline: {lr_time:.5f}")
print(f"XGB-Pipeline: {xgb_time:.5f}")




Runtime (seconds):
LR-Pipeline: 0.00675
XGB-Pipeline: 0.02720


In [None]:
%pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
import joblib, os, psutil

# Save models
joblib.dump(lr_pipeline, "lr_pipeline.joblib")
joblib.dump(xgb_pipeline, "xgb_pipeline.joblib")

# Model sizes in KB
lr_size = os.path.getsize("lr_pipeline.joblib") / 1024
xgb_size = os.path.getsize("xgb_pipeline.joblib") / 1024

# Current system memory usage
mem_usage = psutil.virtual_memory().percent

print("\nðŸ’¾ Resource Summary:")
print(f"â€¢ Logistic Regression model size: {lr_size:.2f} KB")
print(f"â€¢ XGBoost model size: {xgb_size:.2f} KB")
print(f"â€¢ Current system RAM usage: {mem_usage:.2f}%")



ðŸ’¾ Resource Summary:
â€¢ Logistic Regression model size: 2.22 KB
â€¢ XGBoost model size: 126.30 KB
â€¢ Current system RAM usage: 8.90%


In [None]:
# ============================
# TP4 - Model Preparation Script
# Generates: lr_pipeline.pkl, xgb_pipeline.pkl
# ============================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
import pickle

# ----------------------------
# 1. Load and clean dataset
# ----------------------------
df = pd.read_csv("/content/smoke_detection_iot.csv", sep=None, engine='python')

df = df.dropna(axis=1, how='all')
df.columns = df.columns.str.strip().str.replace('"', '').str.replace('\ufeff', '')

print("âœ… Auto-detected columns:")
print(df.columns.tolist())

# df = pd.read_csv(file_path, sep=";", skiprows=1, names=columns)
df = df.dropna(axis=1, how="all")  # drop empty columns
df = df.dropna()  # drop missing rows

# Clean column names
df.columns = [c.strip().replace('"', "") for c in df.columns]

print("âœ… Cleaned columns:", df.columns.tolist()[:10], "...")

# ----------------------------
# 2. Select features + target
# ----------------------------
# Target = Fire Alarm
target_col = "Fire Alarm"
if target_col not in df.columns:
    raise ValueError(f"Column '{target_col}' not found! Check dataset structure.")

# Select numeric sensor features
features = [
   'Unnamed: 0', 'UTC', 'Temperature[C]', 'Humidity[%]', 'TVOC[ppb]', 'eCO2[ppm]', 'Raw H2', 'Raw Ethanol', 'Pressure[hPa]', 'PM1.0', 'PM2.5', 'NC0.5', 'NC1.0', 'NC2.5', 'CNT'
]

X = df[features]
y = df[target_col]

# ----------------------------
# 3. Train/test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ----------------------------
# 4. Logistic Regression Pipeline
# ----------------------------
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=500))
])

lr_pipeline.fit(X_train, y_train)

y_pred_lr = lr_pipeline.predict(X_test)
print("\nðŸ”¹ Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))

# Save model
# Create the directory if it doesn't exist
import os
os.makedirs("TP4/models", exist_ok=True)

with open("TP4/models/lr_pipeline.pkl", "wb") as f:
    pickle.dump(lr_pipeline, f)
print("âœ… Saved: TP4/models/lr_pipeline.pkl")

# ----------------------------
# 5. XGBoost Pipeline
# ----------------------------
xgb_pipeline = Pipeline([
    ("scaler", StandardScaler()),  # for consistency in MLOps
    ("model", XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=4,
        use_label_encoder=False,
        eval_metric="logloss"
    ))
])

xgb_pipeline.fit(X_train, y_train)

y_pred_xgb = xgb_pipeline.predict(X_test)
print("\nðŸ”¹ XGBoost Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))

# Save model
with open("TP4/models/xgb_pipeline.pkl", "wb") as f:
    pickle.dump(xgb_pipeline, f)
print("âœ… Saved: TP4/models/xgb_pipeline.pkl")

# ----------------------------
# 6. Optional: Model sizes
# ----------------------------
import os
lr_size = os.path.getsize("TP4/models/lr_pipeline.pkl") / 1024
xgb_size = os.path.getsize("TP4/models/xgb_pipeline.pkl") / 1024
print(f"\nðŸ“¦ Model sizes -> Logistic: {lr_size:.2f} KB | XGBoost: {xgb_size:.2f} KB")

âœ… Auto-detected columns:
['Unnamed: 0', 'UTC', 'Temperature[C]', 'Humidity[%]', 'TVOC[ppb]', 'eCO2[ppm]', 'Raw H2', 'Raw Ethanol', 'Pressure[hPa]', 'PM1.0', 'PM2.5', 'NC0.5', 'NC1.0', 'NC2.5', 'CNT', 'Fire Alarm']
âœ… Cleaned columns: ['Unnamed: 0', 'UTC', 'Temperature[C]', 'Humidity[%]', 'TVOC[ppb]', 'eCO2[ppm]', 'Raw H2', 'Raw Ethanol', 'Pressure[hPa]', 'PM1.0'] ...

ðŸ”¹ Logistic Regression Results:
Accuracy: 0.9869072329554527
F1 Score: 0.9908256880733946
âœ… Saved: TP4/models/lr_pipeline.pkl


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



ðŸ”¹ XGBoost Results:
Accuracy: 0.9999201660546064
F1 Score: 0.9999441371990392
âœ… Saved: TP4/models/xgb_pipeline.pkl

ðŸ“¦ Model sizes -> Logistic: 1.77 KB | XGBoost: 114.91 KB
