In [68]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost
import pickle as pkl
from sklearn.model_selection import cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE
import warnings

In [69]:
df = pd.read_csv('Train.csv')

In [70]:
df

Unnamed: 0,S.No.,datetime,temperature,humidity,fanSpeed
0,1,01/04/2021 0:01,32,34,4
1,2,01/04/2021 0:01,32,35,4
2,3,01/04/2021 0:01,32,35,4
3,4,01/04/2021 0:02,32,35,4
4,5,01/04/2021 0:04,32,34,4
...,...,...,...,...,...
79379,79380,31/07/2021 9:54,32,92,4
79380,79381,31/07/2021 9:55,31,93,4
79381,79382,31/07/2021 9:55,32,92,4
79382,79383,31/07/2021 9:56,31,93,4


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79384 entries, 0 to 79383
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   S.No.        79384 non-null  int64 
 1   datetime     79384 non-null  object
 2   temperature  79384 non-null  int64 
 3   humidity     79384 non-null  int64 
 4   fanSpeed     79384 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 3.0+ MB


In [72]:
print('Train data shape:', df.shape)

Train data shape: (79384, 5)


In [73]:
df_cleaned = df.drop(columns=['datetime','S.No.'])

In [74]:
df_cleaned 

Unnamed: 0,temperature,humidity,fanSpeed
0,32,34,4
1,32,35,4
2,32,35,4
3,32,35,4
4,32,34,4
...,...,...,...
79379,32,92,4
79380,31,93,4
79381,32,92,4
79382,31,93,4


## Handling Missing Values


In [75]:
df_cleaned.isnull().sum()

temperature    0
humidity       0
fanSpeed       0
dtype: int64

In [76]:
df_cleaned.isnull().any()

temperature    False
humidity       False
fanSpeed       False
dtype: bool

In [77]:
print(df_cleaned['fanSpeed'].value_counts())

fanSpeed
4    61989
5    10146
3     5495
8     1521
1      122
6       78
2       21
7        8
0        4
Name: count, dtype: int64


In [78]:
df_test = pd.read_csv('Testt.csv')

In [79]:
df_test

Unnamed: 0,S.No.,datetime,temperature,humidity,mode,speed,opTime,eSpent,eSaved,fanSpeed
0,0,10/11/2020 11:42,25,48,0,0,0,0,0,8
1,1,10/11/2020 11:42,25,48,0,0,0,0,0,8
2,2,11/11/2020 20:18,30,29,0,5,31,1488,1488,3
3,3,11/11/2020 20:18,30,25,0,2,31,465,589,8
4,4,11/11/2020 20:19,30,29,0,3,148,2960,5180,3
...,...,...,...,...,...,...,...,...,...,...
110404,110404,15/04/2022 16:31,35,31,0,5,31,1488,1488,8
110405,110405,15/04/2022 16:31,36,32,0,5,1241,59568,59568,8
110406,110406,15/04/2022 16:52,36,33,0,3,93,1860,3255,8
110407,110407,15/04/2022 16:54,35,33,0,3,93,1860,3255,8


## SMOTE

In [80]:
print(df_cleaned["fanSpeed"].value_counts())
print(df_test["fanSpeed"].value_counts())

fanSpeed
4    61989
5    10146
3     5495
8     1521
1      122
6       78
2       21
7        8
0        4
Name: count, dtype: int64
fanSpeed
4    45971
8    36618
3    21864
5     3076
6     1816
1     1052
2        7
7        5
Name: count, dtype: int64


In [81]:
smote=SMOTE(k_neighbors=3)

In [82]:
y_train = df_cleaned["fanSpeed"]
x_train = df_cleaned.drop(columns=['fanSpeed'])

In [83]:
x_res_train,y_res_train = smote.fit_resample(x_train,y_train)

In [84]:
print(y_train.value_counts())
print(y_res_train.value_counts())

fanSpeed
4    61989
5    10146
3     5495
8     1521
1      122
6       78
2       21
7        8
0        4
Name: count, dtype: int64
fanSpeed
4    61989
3    61989
8    61989
1    61989
5    61989
2    61989
7    61989
6    61989
0    61989
Name: count, dtype: int64


In [85]:
scaler = StandardScaler()
x_res_train = pd.DataFrame(scaler.fit_transform(x_res_train), columns=x_train.columns)

In [86]:
x_res_train

Unnamed: 0,temperature,humidity
0,-0.323454,-0.951826
1,-0.323454,-0.935172
2,-0.323454,-0.935172
3,-0.323454,-0.935172
4,-0.323454,-0.951826
...,...,...
557896,-0.281189,-1.201629
557897,-0.238923,-0.502180
557898,-0.281189,-0.935172
557899,-0.238923,-0.402259


In [87]:
y_test = df_cleaned["fanSpeed"]
x_test = df_cleaned.drop(columns=['fanSpeed'])

In [88]:
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns=x_test.columns)

In [89]:
x_test

Unnamed: 0,temperature,humidity
0,-0.298437,-1.890073
1,-0.298437,-1.826733
2,-0.298437,-1.826733
3,-0.298437,-1.826733
4,-0.298437,-1.890073
...,...,...
79379,-0.298437,1.783684
79380,-0.650831,1.847025
79381,-0.298437,1.783684
79382,-0.650831,1.847025


## Writing Fucntions to train the model

In [90]:
#temp_x = x_res_train
#temp_y = y_res_train

def train_models_eval(x_res_train,y_res_train,fts):
    print("\n---›RANDOM FOREST" )
    rf = RandomForestClassifier(random_state=1234)
    rf.fit(x_res_train[fts], y_res_train)
    y_pred=rf.predict(x_test[fts])
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("SCORE:",rf.score(x_test[fts],y_test))
    
    print("\n---›DECISION TREE")
    dtf = DecisionTreeClassifier(random_state=1234)
    dtf.fit(x_res_train[fts], y_res_train)
    y_pred=dtf.predict(x_test[fts])
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("SCORE:",dtf.score(x_test[fts],y_test))
    
    print(" \n--->KNN")
    knn = KNeighborsClassifier()
    knn.fit(x_res_train[fts], y_res_train)
    y_pred=knn.predict(x_test[fts])
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("SCORE:",knn.score(x_test[fts],y_test))
    
    print("\n---›EXTRAS TREES CLASSIFIER")
    etc = ExtraTreesClassifier(random_state=1234)
    etc.fit(x_res_train[fts],y_res_train)
    y_pred=etc.predict(x_test[fts])
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("SCORE:",etc.score(x_test[fts],y_test))
    
    print("\n---›XGBOOST" )
    xgb = xgboost.XGBClassifier ()
    xgb.fit(x_res_train[fts], y_res_train)
    y_pred=xgb.predict(x_test[fts])
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("SCORE:", xgb.score(x_test[fts],y_test))
    
    return rf,dtf,knn,etc,xgb

In [91]:
fts = ['temperature','humidity']
rf,dtf,knn,etc,xgb = train_models_eval(x_res_train,y_res_train,fts)


---›RANDOM FOREST
[[    4     0     0     0     0     0     0     0     0]
 [  122     0     0     0     0     0     0     0     0]
 [   21     0     0     0     0     0     0     0     0]
 [ 5495     0     0     0     0     0     0     0     0]
 [ 9895     0     0     0 31757   793 12140  2638  4766]
 [    0     0     0     0  8254   191   190    73  1438]
 [    0     0     0     0     0     0    16     0    62]
 [    0     0     0     0     0     0     0     8     0]
 [  268     0     0     0   293    23   820    59    58]]
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         4
           1       0.00      0.00      0.00       122
           2       0.00      0.00      0.00        21
           3       0.00      0.00      0.00      5495
           4       0.79      0.51      0.62     61989
           5       0.19      0.02      0.03     10146
           6       0.00      0.21      0.00        78
           7       0.00      1.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[    4     0     0     0     0     0     0     0     0]
 [  122     0     0     0     0     0     0     0     0]
 [   21     0     0     0     0     0     0     0     0]
 [ 5495     0     0     0     0     0     0     0     0]
 [ 9895     0     0     0 16616     0 12140  2638 20700]
 [    0     0     0     0     0     0   190    11  9945]
 [    0     0     0     0     0     0    16     0    62]
 [    0     0     0     0     0     0     0     8     0]
 [  268     0     0     0     0     0   820     9   424]]
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         4
           1       0.00      0.00      0.00       122
           2       0.00      0.00      0.00        21
           3       0.00      0.00      0.00      5495
           4       1.00      0.27      0.42     61989
           5       0.00      0.00      0.00     10146
           6       0.00      0.21      0.00        78
           7       0.00      1.00      0.01         8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[    4     0     0     0     0     0     0     0     0]
 [  102    20     0     0     0     0     0     0     0]
 [    3    18     0     0     0     0     0     0     0]
 [  969  3705     0     0   174     0     0     0   647]
 [  611  7860     0     0 37133  2329  4100     0  9956]
 [    0     0     0     0  7224  1102   992     0   828]
 [    0     0     0     0     0     0    66     0    12]
 [    0     0     0     0     0     0     0     8     0]
 [  109    58     0     0   178   174   353     5   644]]
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         4
           1       0.00      0.16      0.00       122
           2       0.00      0.00      0.00        21
           3       0.00      0.00      0.00      5495
           4       0.83      0.60      0.70     61989
           5       0.31      0.11      0.16     10146
           6       0.01      0.85      0.02        78
           7       0.62      1.00      0.76         8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SCORE: 0.49099314723369947

---›EXTRAS TREES CLASSIFIER
[[    4     0     0     0     0     0     0     0     0]
 [  108    14     0     0     0     0     0     0     0]
 [   21     0     0     0     0     0     0     0     0]
 [ 3392   970     0     0     0     0     0     0  1133]
 [ 7775   583     0     0 32508   793 11629  1159  7542]
 [    0     0     0     0  2409   191   430  5866  1250]
 [    0     0     0     0     0     0    15    63     0]
 [    0     0     0     0     0     0     0     8     0]
 [  106    32     0     0     0     0   769   433   181]]
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         4
           1       0.01      0.11      0.02       122
           2       0.00      0.00      0.00        21
           3       0.00      0.00      0.00      5495
           4       0.93      0.52      0.67     61989
           5       0.19      0.02      0.03     10146
           6       0.00      0.19      0.00        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[    4     0     0     0     0     0     0     0     0]
 [  122     0     0     0     0     0     0     0     0]
 [   21     0     0     0     0     0     0     0     0]
 [ 5495     0     0     0     0     0     0     0     0]
 [ 9895     0 16715     0 17680     0 12140     0  5559]
 [    0     0  8327     0     0     0   190     0  1629]
 [    0     0     0     0     0     0    16     0    62]
 [    0     0     0     0     0     0     0     8     0]
 [  268     0   352     0     0     0   820     0    81]]
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         4
           1       0.00      0.00      0.00       122
           2       0.00      0.00      0.00        21
           3       0.00      0.00      0.00      5495
           4       1.00      0.29      0.44     61989
           5       0.00      0.00      0.00     10146
           6       0.00      0.21      0.00        78
           7       1.00      1.00      1.00         8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
