In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier

In [2]:
df=pd.read_csv("ENB_data.csv",sep = ',')

In [3]:
df.columns = ["Relative Compactness","Surface Area","Wall Area",
                "Roof Area", "Overall Height","Orientation","Glazing Area",
                "Glazing Area Distribution", "Heating Load", "Cooling Load"]

In [4]:
df

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61


In [5]:
df.corr()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
Relative Compactness,1.0,-0.9919015,-0.2037817,-0.8688234,0.8277473,4.6785920000000006e-17,-2.960552e-15,-7.107006e-16,0.622272,0.634339
Surface Area,-0.9919015,1.0,0.1955016,0.8807195,-0.8581477,-3.459372e-17,3.636925e-15,2.438409e-15,-0.65812,-0.672999
Wall Area,-0.2037817,0.1955016,1.0,-0.2923165,0.2809757,-2.429499e-17,-8.567455e-17,2.067384e-16,0.455671,0.427117
Roof Area,-0.8688234,0.8807195,-0.2923165,1.0,-0.9725122,-5.830058e-17,-1.759011e-15,-1.078071e-15,-0.861828,-0.862547
Overall Height,0.8277473,-0.8581477,0.2809757,-0.9725122,1.0,4.4922050000000005e-17,1.4891340000000002e-17,-2.9206130000000004e-17,0.889431,0.895785
Orientation,4.6785920000000006e-17,-3.459372e-17,-2.429499e-17,-5.830058e-17,4.4922050000000005e-17,1.0,-9.406007e-16,-2.549352e-16,-0.002587,0.01429
Glazing Area,-2.960552e-15,3.636925e-15,-8.567455e-17,-1.759011e-15,1.4891340000000002e-17,-9.406007e-16,1.0,0.2129642,0.269841,0.207505
Glazing Area Distribution,-7.107006e-16,2.438409e-15,2.067384e-16,-1.078071e-15,-2.9206130000000004e-17,-2.549352e-16,0.2129642,1.0,0.087368,0.050525
Heating Load,0.6222722,-0.6581202,0.4556712,-0.8618283,0.8894307,-0.002586534,0.269841,0.08736759,1.0,0.975862
Cooling Load,0.6343391,-0.6729989,0.427117,-0.8625466,0.8957852,0.0142896,0.207505,0.05052512,0.975862,1.0


In [6]:
#Roof Area et Overall Height sont les plus corrélées aux deux variables cibles	

In [7]:
df['total_charges']=df["Heating Load"]+df["Cooling Load"]

In [8]:
q=df['total_charges'].quantile([0.25, 0.5, 0.75]).values
def classify(x):
    if x<=q[0]:
        return 0
    if x>q[0] and x<=q[1]:
        return 1
    if x>q[1] and x<=q[2]:
        return 2
    else :
        return 3
charge_classes=df['total_charges'].apply(classify)

In [29]:
q

array([28.75 , 40.97 , 64.335])

In [28]:
charge_classes.value_counts()

1    192
2    192
3    192
0    192
Name: total_charges, dtype: int64

In [9]:
data=df.drop(['Heating Load','Cooling Load','total_charges'],axis=1)

In [10]:
X_train,X_test,y_train,y_test=train_test_split(data,charge_classes,test_size=0.2)
#PREPROCESSING
sc=preprocessing.StandardScaler()
X_train_transformed=sc.fit_transform(X_train)
X_test_transformed=sc.transform(X_test)

**K plus proches voisins**

In [11]:
parameters= { 'n_neighbors' : range(2,51) }
knn = neighbors.KNeighborsClassifier()        
grid=model_selection.GridSearchCV(knn,parameters)
grid.fit(X_train_transformed,y_train.values.ravel())

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(2, 51)})

In [12]:
grid.best_params_

{'n_neighbors': 3}

In [13]:
# Prediction on test set
Y_pred = grid.predict( X_test_transformed )

In [14]:
model_knn=grid.best_estimator_
model_knn.score(X_test_transformed,y_test)

0.8701298701298701

In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix=confusion_matrix(y_test,Y_pred)
confusion_matrix

array([[33,  1,  0,  0],
       [ 3, 28,  3,  0],
       [ 0,  3, 36,  4],
       [ 0,  1,  5, 37]], dtype=int64)

**SVM**

In [16]:
# import support vector classifier
parameters = { 'kernel' : ['rbf','linear'],'C':[0.1,1,10,50] }
model = svm.SVC()        
grid=model_selection.GridSearchCV(model,parameters)
grid.fit(X_train_transformed,y_train.values.ravel())

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 50], 'kernel': ['rbf', 'linear']})

In [17]:
grid.best_params_

{'C': 50, 'kernel': 'rbf'}

In [18]:
Y_pred = grid.predict( X_test_transformed )

In [19]:
model_sv=grid.best_estimator_
model_sv.score(X_test_transformed,y_test)

0.922077922077922

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix=confusion_matrix(y_test,Y_pred)
confusion_matrix

array([[34,  0,  0,  0],
       [ 1, 30,  3,  0],
       [ 0,  4, 39,  0],
       [ 0,  1,  3, 39]], dtype=int64)

**Random Forest**

In [21]:
parameters = { "max_features": ["sqrt","log2",None],
"min_samples_split": range(2,31,2) }
model = ensemble.RandomForestClassifier()        
grid=model_selection.GridSearchCV(model,parameters)
grid.fit(X_train_transformed,y_train.values.ravel())

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_features': ['sqrt', 'log2', None],
                         'min_samples_split': range(2, 31, 2)})

In [22]:
grid.best_params_

{'max_features': 'sqrt', 'min_samples_split': 6}

In [23]:
Y_pred = grid.predict( X_test_transformed )

In [24]:
model_forest=grid.best_estimator_
model_forest.score(X_test_transformed,y_test)

0.935064935064935

In [25]:
from sklearn.metrics import confusion_matrix
confusion_matrix=confusion_matrix(y_test,Y_pred)
confusion_matrix

array([[34,  0,  0,  0],
       [ 0, 31,  3,  0],
       [ 0,  3, 39,  1],
       [ 0,  0,  3, 40]], dtype=int64)