Getting necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import time

ASSUMPTIONS:

1. I am considering 'plot2-IoT Sensor Data' as my test set.
2. 'plot2-Source 1 Weather' and 'plot2-Source 2 Weather' will be considered as train set.
3. For sensor data, I will convert KPa to Pa for standard to remain same throughout.
4. Datetime variable will be feature engineered.
5. Target variable will be 'sensor' and will be made categorical.

In [5]:
Train1 = pd.read_csv('plot2-Source 1 Weather.csv')
Train2 = pd.read_csv('plot2-Source 2 Weather.csv')
Train = Train1.append(Train2, ignore_index=True) #Appended to form Main train set
Train['datetime'] = Train['datetime'].astype('datetime64[ns]') #converting object type to datetime
del Train1, Train2
Train['sensor'] = Train['sensor'].astype('category') #converting dependent variable to categorical

Test = pd.read_csv('plot2-IoT Sensor Data.csv')
Test['value'] = Test['value']*1000 #all pressure are in Pa units
Test['datetime'] = Test['datetime'].astype('datetime64[ns]') #converting object type to datetime
Test['sensor'] = Test['sensor'].astype('category') #converting dependent variable to categorical

In [6]:
Train.head()

Unnamed: 0,sensor,value,datetime
0,TC,16.5,2019-01-01 00:00:00
1,HUM,44.0,2019-01-01 00:00:00
2,PRES,1020.0,2019-01-01 00:00:00
3,TC,17.3,2019-01-01 01:00:00
4,HUM,43.0,2019-01-01 01:00:00


In [7]:
Test.head()

Unnamed: 0,sensor,value,datetime
0,HUM,100000.0,2019-01-01 00:26:26.749
1,HUM,100000.0,2019-01-01 01:26:26.650
2,HUM,100000.0,2019-01-01 02:26:27.049
3,HUM,89860.0,2019-01-01 03:26:26.966
4,HUM,36460.0,2019-01-01 04:26:27.397


Feature engineering done on datetime variable:

1. Not creating year variable as test data had only 2019 as year, redundant in this case.
2. Not creating seconds variable as it is highly volatile in distribution and can cause trouble.
3. After creating new variables, will drop datetime variable.
4. Will one hot encode weekday variable.
5. Normalizing our independent variables.

In [8]:
min_max_scaler = preprocessing.MinMaxScaler()

Train['month'] = Train['datetime'].dt.month
Train['day'] = Train['datetime'].dt.day
Train['hour'] = Train['datetime'].dt.hour
Train['minute'] = Train['datetime'].dt.minute
Train['weekday'] = Train['datetime'].dt.day_name()
onehotTrain = pd.get_dummies(Train['weekday'])
Train = Train.drop('weekday',axis = 1)
Train = Train.join(onehotTrain)
del Train['datetime']
Train.iloc[:,1:] = Train.iloc[:,1:].astype(int)
Train.iloc[:,1:] = min_max_scaler.fit_transform(Train.iloc[:,1:].values)

Test['month'] = Test['datetime'].dt.month
Test['day'] = Test['datetime'].dt.day
Test['hour'] = Test['datetime'].dt.hour
Test['minute'] = Test['datetime'].dt.minute
Test['weekday'] = Test['datetime'].dt.day_name()
onehotTest = pd.get_dummies(Test['weekday'])
Test = Test.drop('weekday',axis = 1)
Test = Test.join(onehotTest)
del Test['datetime']
Test.iloc[:,1:] = Test.iloc[:,1:].astype(int)
Test.iloc[:,1:] = min_max_scaler.fit_transform(Test.iloc[:,1:].values)



In [9]:
Train.shape

(52419, 13)

In [10]:
Test.shape

(24966, 13)

Splitting original Train data into Train/Test set.

In [11]:
X = Train.iloc[:,1:]
y = Train.iloc[:,0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Method one:
    
Running a base model. We can choose any model however here I am selecting RandomForestClassifier for classification
and then calculate it's accuracy.

In [12]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

start_time = time.time()
clf.fit(X_train, y_train)
print('Time taken to fit is: ', time.time() - start_time, ' seconds')



Time taken to fit is:  0.3071005344390869  seconds


In [13]:
y_pred = clf.predict(X_test)
print('Accuracy is: ', accuracy_score(y_test, y_pred)*100, ' percent')

Accuracy is:  86.2552460892789  percent


Method Two:

Selecting best base model and improving it further using RandomizedSearchCV and GridSearchCV.
I am considering SVC and RandomForestClassifier as example, we can add more classifiers too.

In [14]:
list_of_models = [SVC(), RandomForestClassifier()]
list_of_acc = []

for i in list_of_models:
    start_time = time.time()
    i.fit(X_train, y_train)
    print(time.time() - start_time)

    predicted = i.predict(X_test) ## Prediction of data
        
    cc = accuracy_score(y_test, predicted)
    list_of_acc.append(cc)
    print('##################')

a = max([(v,i) for i,v in enumerate(list_of_acc)])
b = a[1]
c = list_of_models[b]
print('Best Model is: ', c)



68.15707349777222
##################




0.6257307529449463
##################
Best Model is:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Creating parameters grid for above base models:

In [18]:
gridSVC = {'kernel': ['linear', 'rbf', 'sigmoid'],
               'degree': [2,3,4,5],
               'shrinking': [True,False],
               'probability' : [True,False],
               'decision_function_shape': ['ovr', 'ovo'],
               'random_state' : [44,47,48,50,51]}

#----------------------------------------------------------------------------#

gridRandomForestClassifier = {'n_estimators': [10,100,1000],
               'criterion': ['gini', 'entropy'],
               'min_samples_split': [2,5,10],
               'min_samples_leaf': [1,2,5],
               'max_leaf_nodes' : [100,1000],
               'bootstrap' : [True,False]}

list_of_param = [gridSVC,
                 gridRandomForestClassifier]

d = a[1]
e = list_of_param[d]
print('Parameter grid for above selected best base model: ', e)

Parameter grid for above selected best base model:  {'n_estimators': [10, 100, 1000], 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 5], 'max_leaf_nodes': [100, 1000], 'bootstrap': [True, False]}


Let us try RandomizedSearchCV with cross validation as 3 over 5 iterations.

In [20]:
op = RandomizedSearchCV(c, e, cv=3, random_state=42, n_iter=5)

start_time = time.time()
op.fit(X_train, y_train) 
print('Time taken to fit is: ', time.time() - start_time, ' seconds')

predicted = op.predict(X_test)
print('Accuracy with RandomizedSearchCV: ', accuracy_score(y_test, predicted)*100, 'percent')

Time taken to fit is:  9.931749105453491  seconds
Accuracy with RandomizedSearchCV:  95.41205646699733 percent


As we can see we already got really better accuracy compared to just base model.
Let us try GridSearchCV on same base model

In [21]:
op =  GridSearchCV(c, e, cv=3)

start_time = time.time()
op.fit(X_train, y_train) 
print('Time taken to fit is: ', time.time() - start_time, ' seconds')

predicted = op.predict(X_test)
print('Accuracy with GridSearchCV: ', accuracy_score(y_test, predicted)*100, 'percent')

Time taken to fit is:  9472.738417387009  seconds
Accuracy with GridSearchCV:  96.19420068676078 percent


We can also try HyperOpt if needed based on our data, what is our objective etc instead of RandomizedSearchCV or GridSearchCV.