<a href="https://colab.research.google.com/github/adamelouardi000-dot/energy-classification-ml/blob/main/energy_classification_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

# Load the dataset from CSV (on Colab, upload the file into /content first).
DATA_PATH = "/content/energy_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,surface_area,wall_area,roof_area,glazing_area,height,compactness,orientation,energy_class
0,274,378,272,19,4.702751,0.530007,4,1
1,339,359,175,18,4.972581,0.817926,1,0
2,314,297,140,17,3.80912,0.644334,2,0
3,332,448,170,17,5.141293,0.680344,4,0
4,313,342,260,28,4.652013,0.756585,2,1


In [2]:
# Structural overview of the loaded frame: dimensions, dtypes, summary stats.
print("Shape:", df.shape)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in display() renders a stray "None".
df.info()
display(df.describe(include="all").T)

# Missing values: NaN count per column, showing only columns that have any.
missing = df.isna().sum().sort_values(ascending=False)
display(missing[missing > 0])

# Duplicates: number of rows that are exact copies of an earlier row.
print("Doublons:", df.duplicated().sum())


Shape: (5000, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   surface_area  5000 non-null   int64  
 1   wall_area     5000 non-null   int64  
 2   roof_area     5000 non-null   int64  
 3   glazing_area  5000 non-null   int64  
 4   height        5000 non-null   float64
 5   compactness   5000 non-null   float64
 6   orientation   5000 non-null   int64  
 7   energy_class  5000 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 312.6 KB


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
surface_area,5000.0,204.9948,84.449791,60.0,132.0,206.0,279.0,349.0
wall_area,5000.0,296.979,115.159417,100.0,197.0,297.0,398.0,499.0
roof_area,5000.0,189.041,63.670434,80.0,133.0,189.0,244.0,299.0
glazing_area,5000.0,22.017,10.029558,5.0,13.0,22.0,31.0,39.0
height,5000.0,3.966727,0.871372,2.500358,3.210989,3.948918,4.720554,5.499744
compactness,5000.0,0.751081,0.141972,0.500037,0.63137,0.750931,0.872326,0.999992
orientation,5000.0,2.499,1.113125,1.0,2.0,2.0,3.0,4.0
energy_class,5000.0,0.491,0.60827,0.0,0.0,0.0,1.0,2.0


Unnamed: 0,0


Doublons: 0


In [3]:
# Show the column names in order to choose the target column
df.columns


Index(['surface_area', 'wall_area', 'roof_area', 'glazing_area', 'height',
       'compactness', 'orientation', 'energy_class'],
      dtype='object')

In [5]:
# Name of the target column (y).
TARGET_COL = "energy_class"

# Split the frame into the feature matrix X (every other column) and the
# target vector y.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

print("X shape:", X.shape)
# Per-class row counts of the target, printed once to check class balance.
print("y distribution:\n", y.value_counts())


X shape: (5000, 7)
y distribution:
 energy_class
0    2845
1    1855
2     300
Name: count, dtype: int64
y distribution:
 energy_class
0    2845
1    1855
2     300
Name: count, dtype: int64


In [6]:
# Stratify on the target only when it has more than one distinct value;
# otherwise fall back to a plain (unstratified) random split.
stratify_on = y if y.nunique() > 1 else None

# Hold out 20% of the rows as the test split, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (4000, 7) Test: (1000, 7)


In [7]:
# Partition the feature columns by dtype: numeric columns are standardized,
# all remaining columns are one-hot encoded.
# NOTE(review): `orientation` is an integer column, so it lands in the numeric
# group and is scaled like a magnitude; if it is a nominal code, it may belong
# in the one-hot group instead — confirm against the dataset definition.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(exclude=[np.number]).columns)

print("Num cols:", num_cols)
print("Cat cols:", cat_cols)

# One (name, transformer, columns) triple per column group.
scale_numeric = ("num", StandardScaler(), num_cols)
# handle_unknown="ignore": categories unseen during fit do not raise at
# transform time.
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

# remainder="drop": columns not listed in either group are discarded.
preprocess = ColumnTransformer(
    transformers=[scale_numeric, encode_categorical],
    remainder="drop",
)


Num cols: ['surface_area', 'wall_area', 'roof_area', 'glazing_area', 'height', 'compactness', 'orientation']
Cat cols: []


In [8]:
# Candidate classifiers; each one is trained behind the same preprocessing.
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42)
}

# One metrics record per model, turned into a DataFrame in a single step below.
results = []

for name, model in models.items():
    # Pipeline fits its steps in place, so each candidate gets its own clones
    # of the preprocessing and of the estimator. Without this, every pipeline
    # would share (and repeatedly refit) the same `preprocess` object and the
    # estimators stored in `models`.
    clf = Pipeline(steps=[
        ("preprocess", clone(preprocess)),
        ("model", clone(model))
    ])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Support-weighted averages across classes; zero_division=0 scores a class
    # with no predicted samples as 0 instead of emitting a warning.
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=0)

    results.append({
        "model": name,
        "accuracy": acc,
        "precision_weighted": p,
        "recall_weighted": r,
        "f1_weighted": f1
    })

# Rank the candidates by weighted F1, best first.
# NOTE(review): these scores come from the test split. The next cell picks the
# top row as the "best" model, so the test metrics reported for it are
# optimistic; consider ranking with cross-validation on the training split.
results_df = pd.DataFrame(results).sort_values("f1_weighted", ascending=False)
results_df


Unnamed: 0,model,accuracy,precision_weighted,recall_weighted,f1_weighted
0,LogisticRegression,0.866,0.864843,0.866,0.864353
3,RandomForest,0.864,0.859912,0.864,0.858841
1,KNN,0.822,0.811818,0.822,0.810502
2,DecisionTree,0.807,0.810607,0.807,0.808598


In [9]:
# results_df is sorted by weighted F1 (best first), so row 0 is the winner.
best_model_name = results_df.iloc[0]["model"]
print("Best model:", best_model_name)

# Build the final pipeline from clones so that it owns its own fitted
# preprocessing and estimator, instead of aliasing the shared `preprocess`
# object and the entry stored in `models`.
best_clf = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", clone(models[best_model_name]))
])

# Refit on the training split and predict the held-out test split.
best_clf.fit(X_train, y_train)
y_pred_best = best_clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("Classification report:\n")
print(classification_report(y_test, y_pred_best, zero_division=0))

# Rows are true classes, columns are predicted classes.
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred_best))


Best model: LogisticRegression
Classification report:

              precision    recall  f1-score   support

           0       0.93      0.92      0.92       569
           1       0.80      0.85      0.83       371
           2       0.62      0.47      0.53        60

    accuracy                           0.87      1000
   macro avg       0.78      0.75      0.76      1000
weighted avg       0.86      0.87      0.86      1000

Confusion matrix:

[[521  48   0]
 [ 37 317  17]
 [  0  32  28]]


In [10]:
# Persist the fitted best pipeline (preprocessing + model) to disk.
# `joblib` is imported in the notebook's import cell. The output path is named
# once so the dump target and the confirmation message cannot drift apart.
MODEL_PATH = "energy_classifier.pkl"
joblib.dump(best_clf, MODEL_PATH)
print(f"Saved: {MODEL_PATH}")


Saved: energy_classifier.pkl
