## 1) Imports

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import kagglehub

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from xgboost import XGBClassifier


## 2) Load dataset from Kaggle + read CSV

In [None]:
# Download the Kaggle dataset; the returned value is used below as the
# directory that contains the CSV file.
dataset_dir = kagglehub.dataset_download(
    "deepcontractor/smoke-detection-dataset"
)
csv_path = os.path.join(dataset_dir, "smoke_detection_iot.csv")

# Load the whole CSV into memory and report its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer=csv_path)
print("Data shape:", df.shape)

# Trailing expression: the notebook renders the first five rows.
df.head(n=5)


## 3) Quick inspection + missing values handling

In [None]:
# Print pandas' built-in summary of the DataFrame (column names, non-null
# counts, dtypes and memory usage) as a first sanity check of the load.
df.info()


In [None]:
# Tally the missing entries column by column, then show the tally.
missing_per_column = df.isna().sum()

print("Missing values per column:")
print(missing_per_column)


In [None]:
# Discard every row that has at least one missing value ...
df = df.dropna(axis=0, how="any")
# ... and renumber the index from 0 so the removed rows leave no gaps.
df = df.reset_index(drop=True)

print("After dropna:", df.shape)


## 4) Split features/target

In [None]:
target_col = "Fire Alarm"

# Bookkeeping columns that are not sensor readings: the saved row index, the
# acquisition timestamp and the running sample counter. Keeping them as
# features lets a model learn "when the row was recorded" instead of the
# sensor signal, which inflates test scores (target leakage).
# NOTE(review): names are taken from the published CSV — confirm against the
# df.info() output above; names that are absent are simply skipped.
non_feature_cols = ["Unnamed: 0", "UTC", "CNT"]

# The label must exist (KeyError otherwise); the bookkeeping columns are
# dropped only when present.
y = df[target_col]
X = df.drop(columns=[target_col]).drop(columns=non_feature_cols, errors="ignore")

print("X shape:", X.shape)
print("y shape:", y.shape)


## 5) Train/Test split (80/20)

In [None]:
# 80/20 hold-out split. The fixed seed makes the partition repeatable, and
# stratifying on y keeps the class proportions the same in both parts.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train:", X_train.shape, "Test:", X_test.shape)


## 6) Standardization + Logistic Regression training

In [None]:
# Learn the per-feature mean/std on the training split only, then apply that
# same transformation to both splits so no test statistics reach training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# max_iter is raised above the library default to give the solver more room
# to converge. The fit call stays the last expression so the notebook still
# displays the fitted estimator.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=X_train_s, y=y_train)


## 7) Logistic Regression evaluation + Confusion Matrix

In [None]:
# Predict on the standardized hold-out features.
y_pred_lr = lr_model.predict(X_test_s)

# Score the predictions against the true labels with each metric in turn;
# the order of the scorers matches the order of the names on the left.
acc_lr, pre_lr, rec_lr, f1_lr = (
    scorer(y_test, y_pred_lr)
    for scorer in (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
)

# Emit the report as one write; the text is the same as printing line by line.
lr_report = [
    "Logistic Regression results",
    f"Accuracy : {acc_lr:.4f}",
    f"Precision: {pre_lr:.4f}",
    f"Recall   : {rec_lr:.4f}",
    f"F1-score : {f1_lr:.4f}",
]
print("\n".join(lr_report))


In [None]:
# Rows of the matrix are the actual classes, columns the predicted ones.
cm_lr = metrics.confusion_matrix(y_test, y_pred_lr)

# Tick labels shared by both axes.
# NOTE(review): assumes label 0 means "No Fire" and 1 means "Fire" — confirm.
class_names = ["No Fire", "Fire"]

plt.figure(figsize=(5, 4))
ax = sns.heatmap(
    cm_lr,
    annot=True,
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title("Confusion Matrix — Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


## 8) Bonus: XGBoost training + evaluation

In [None]:
# `use_label_encoder` is intentionally not passed: the flag is deprecated and
# current XGBoost releases ignore it with an "unused parameter" warning.
# The seed mirrors the one used for the train/test split.
xgb_model = XGBClassifier(eval_metric="logloss", random_state=42)

# Train and predict on plain NumPy arrays rather than DataFrames. Some XGBoost
# releases reject feature names containing "[", "]" or "<"; passing arrays
# sidesteps that check without changing the fitted model.
# NOTE(review): this dataset's headers appear to use bracketed units
# (e.g. "Temperature[C]") — confirm against the df.info() output.
# Tree models do not need the standardized features, so the raw splits are used.
xgb_model.fit(X_train.to_numpy(), y_train)

y_pred_xgb = xgb_model.predict(X_test.to_numpy())

acc_x = metrics.accuracy_score(y_test, y_pred_xgb)
pre_x = metrics.precision_score(y_test, y_pred_xgb)
rec_x = metrics.recall_score(y_test, y_pred_xgb)
f1_x  = metrics.f1_score(y_test, y_pred_xgb)

print("XGBoost results")
print(f"Accuracy : {acc_x:.4f}")
print(f"Precision: {pre_x:.4f}")
print(f"Recall   : {rec_x:.4f}")
print(f"F1-score : {f1_x:.4f}")


## 9) Conclusion: focus on Recall and False Negatives

In [None]:
# Put the two recall scores side by side and state why recall matters here;
# the lines are joined so the output matches printing them one at a time.
summary_lines = [
    "Conclusion",
    f"- Recall (Logistic Regression): {rec_lr:.4f}",
    f"- Recall (XGBoost)          : {rec_x:.4f}",
    "In fire detection, the most dangerous error is a False Negative (fire exists but model predicts no fire).",
    "That’s why Recall is a key metric for reliability of an IoT fire alarm system.",
]
print("\n".join(summary_lines))
