In [2]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn import metrics

import pandas as pd
import numpy as np
import seaborn as sns

# Summary
- A clean dataset with all continuous, numeric variables.
- I utilized logistic regression and random forest classifier. 
- The data required little cleaning or feature engineering.
- The random forest classifier had the best accuracy of the two models. 

### Data Import

In [3]:
PATH = 'C:\\Users\\corey\\Desktop\\git\\portfolio\\data_portfolio\\smoke_detection\\data\\smoke_detection_iot.csv'
data = pd.read_csv(PATH, index_col=0)
data.head()

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
0,1654733331,20.0,57.36,0,400,12306,18520,939.735,0.0,0.0,0.0,0.0,0.0,0,0
1,1654733332,20.015,56.67,0,400,12345,18651,939.744,0.0,0.0,0.0,0.0,0.0,1,0
2,1654733333,20.029,55.96,0,400,12374,18764,939.738,0.0,0.0,0.0,0.0,0.0,2,0
3,1654733334,20.044,55.28,0,400,12390,18849,939.736,0.0,0.0,0.0,0.0,0.0,3,0
4,1654733335,20.059,54.69,0,400,12403,18921,939.744,0.0,0.0,0.0,0.0,0.0,4,0


In [4]:
# No null values and each var is numeric
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62630 entries, 0 to 62629
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   UTC             62630 non-null  int64  
 1   Temperature[C]  62630 non-null  float64
 2   Humidity[%]     62630 non-null  float64
 3   TVOC[ppb]       62630 non-null  int64  
 4   eCO2[ppm]       62630 non-null  int64  
 5   Raw H2          62630 non-null  int64  
 6   Raw Ethanol     62630 non-null  int64  
 7   Pressure[hPa]   62630 non-null  float64
 8   PM1.0           62630 non-null  float64
 9   PM2.5           62630 non-null  float64
 10  NC0.5           62630 non-null  float64
 11  NC1.0           62630 non-null  float64
 12  NC2.5           62630 non-null  float64
 13  CNT             62630 non-null  int64  
 14  Fire Alarm      62630 non-null  int64  
dtypes: float64(8), int64(7)
memory usage: 7.6 MB


In [5]:
data.describe()

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
count,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0
mean,1654792000.0,15.970424,48.539499,1942.057528,670.021044,12942.453936,19754.257912,938.627649,100.594309,184.46777,491.463608,203.586487,80.049042,10511.386157,0.714626
std,110002.5,14.359576,8.865367,7811.589055,1905.885439,272.464305,609.513156,1.331344,922.524245,1976.305615,4265.661251,2214.738556,1083.383189,7597.870997,0.451596
min,1654712000.0,-22.01,10.74,0.0,400.0,10668.0,15317.0,930.852,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1654743000.0,10.99425,47.53,130.0,400.0,12830.0,19435.0,938.7,1.28,1.34,8.82,1.384,0.033,3625.25,0.0
50%,1654762000.0,20.13,50.15,981.0,400.0,12924.0,19501.0,938.816,1.81,1.88,12.45,1.943,0.044,9336.0,1.0
75%,1654778000.0,25.4095,53.24,1189.0,438.0,13109.0,20078.0,939.418,2.09,2.18,14.42,2.249,0.051,17164.75,1.0
max,1655130000.0,59.93,75.2,60000.0,60000.0,13803.0,21410.0,939.861,14333.69,45432.26,61482.03,51914.68,30026.438,24993.0,1.0


Unbalanced data in favor of the positive class. This can be fixed with downsampling the positive case, upsampling the negative case, or using weights for each class

In [6]:
fire = data.loc[data['Fire Alarm'] == 1]
noFire = data.loc[data['Fire Alarm'] == 0]
print(f'Proportion of positive class: {len(fire)/len(data):.2f}')

Proportion of positive class: 0.71


In [7]:
# UTC (timestamp) and CNT (sample counter) are most likely unhelpful data

data = data.drop(['UTC', 'CNT'], axis=1)

### Split data into train/test sets and normalize

In [8]:
# Split data into test/train
X = data.drop('Fire Alarm', axis=1)
y = data['Fire Alarm']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train_prep, X_test_prep = scaler.fit_transform(X_train), scaler.transform(X_test)

### Define util functions

In [9]:
def crossVal(pipeline, X, y):
    """
    Accepts pipelin and data
    Returns cross val accuracy scores and mean accuracy
    """
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

    print(f'Cross-validation scores: {scores}')
    print('---------------------------------')
    print(f'Avg Accuracy (std): {np.mean(scores)*100:.2f} ({np.std(scores)*100:.2f})')

In [10]:
def evalModel(model, X_train, y_train):
    """
    Accepts: model and prepared data
    Returns: prints precision, recall and AUC
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_train)
    tn, fp, fn, tp = confusion_matrix(y_train, predicted).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    print(f'Model precision: {precision}')
    print(f'Model recall:    {recall}')
    print(f'Model AUC:       {auc}')

## Begin with logistic regression

### Grid search for best parameters

In [11]:
# parameters = {"C": (1, 10, 100),
#               "solver": ('newton-cg', 'sag', 'saga'),
#               "max_iter": (500, 1000)}
# lr = LogisticRegression(penalty='l2', class_weight='balanced')
# gs = GridSearchCV(lr, parameters)
# gs.fit(X_train_prep, y_train)
# gs.best_params_
# output:{'C': 10, 'max_iter': 500, 'solver': 'saga'}

In [12]:
# Create pipeline and model using best params
steps = list()
steps.append(('std_scaler', StandardScaler()))
steps.append(('model', LogisticRegression(C=10, class_weight='balanced', penalty='l2', max_iter=500, solver='saga')))
pipeline = Pipeline(steps)

crossVal(pipeline, X_train_prep, y_train)

Cross-validation scores: [0.90846195 0.90544616 0.90286525 0.90277655 0.90304267 0.90642186
 0.90411566 0.90588131 0.9010024  0.90614743 0.90855065 0.90065638
 0.90490553 0.90437328 0.90437328]
---------------------------------
Avg Accuracy (std): 90.46 (0.23)


In [13]:
model = LogisticRegression(C=10, class_weight='balanced', penalty='l2', max_iter=1000, solver='saga')
evalModel(model, X_train_prep, y_train)

Model precision: 0.964257199127056
Model recall:    0.8997466971292342
Model AUC:       0.9081626833059054


## Random forest classifier

In [14]:
forest = RandomForestClassifier(max_depth=20, 
                               max_features='log2', 
                               min_samples_leaf=1, 
                               n_estimators=100, 
                               bootstrap=True, 
                               n_jobs=3)
# Create pipeline
steps = list()
steps.append(('std_scaler', StandardScaler()))
steps.append(('model', forest))
pipeline = Pipeline(steps)
crossVal(pipeline, X_train_prep, y_train)

Cross-validation scores: [0.9999113  1.         0.99982258 1.         1.         0.9997339
 1.         1.         1.         1.         0.9998226  1.
 1.         0.99982258 0.99982258]
---------------------------------
Avg Accuracy (std): 99.99 (0.01)


In [15]:
evalModel(forest, X_train_prep, y_train)

Model precision: 1.0
Model recall:    1.0
Model AUC:       1.0


In [16]:
forest.fit(X_train_prep, y_train)

In [18]:
pred = forest.predict(X_test_prep)

tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
precision = tp/(tp+fp)
recall = tp/(tp+fn)
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
auc = metrics.auc(fpr, tpr)
print(f'Test set precision: {precision}')
print(f'            recall: {precision}')
print(f'               AUC: {precision}')

Test set precision: 1.0
            recall: 1.0
               AUC: 1.0
