<a href="https://www.kaggle.com/code/anuptirpude/rf-classification-on-fire-extinguisher-dataset?scriptVersionId=94440346" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Files available in the dataset directory:
# ../input/acoustic-extinguisher-fire-dataset/Acoustic_Extinguisher_Fire_Dataset/Acoustic_Extinguisher_Fire_Dataset.xlsx
# ../input/acoustic-extinguisher-fire-dataset/Acoustic_Extinguisher_Fire_Dataset/Acoustic_Extinguisher_Fire_Dataset.arff
# ../input/acoustic-extinguisher-fire-dataset/Acoustic_Extinguisher_Fire_Dataset/Acoustic_Extinguisher_Fire_Dataset_Citation_Request.txt

# reading the data with .arff format
from scipy.io import arff
import pandas as pd

# Load the ARFF file; loadarff's result is indexed so that element 0 (the
# records) becomes the DataFrame.
arff_path = '../input/acoustic-extinguisher-fire-dataset/Acoustic_Extinguisher_Fire_Dataset/Acoustic_Extinguisher_Fire_Dataset.arff'
data = arff.loadarff(arff_path)
df = pd.DataFrame(data[0])

# Preview the first rows.
df.head()

In [None]:
# Convert the bytes values in FUEL and CLASS into regular strings.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched (a plain .str.decode on str values would yield NaN).
for column in ('FUEL', 'CLASS'):
    df[column] = df[column].map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
df.info()

In [None]:
# Cast the columns to their intended dtypes in a single pass.
# Columns not listed here keep their current dtype.
df = df.astype({
    'SIZE': 'int',
    'FUEL': 'str',
    'DISTANCE': 'int',
    'DESIBEL': 'int',
    'FREQUENCY': 'int',
    'CLASS': 'int',
})
print(df.head())
# info() prints directly and returns None, so it must not be wrapped in print().
df.info()

In [None]:
!pip install openpyxl

In [None]:
# Read the Excel copy of the data so it can be compared with the .arff version.
df_xl = pd.read_excel('../input/acoustic-extinguisher-fire-dataset/Acoustic_Extinguisher_Fire_Dataset/Acoustic_Extinguisher_Fire_Dataset.xlsx')
print(df_xl.head())
# info() prints directly and returns None, so it must not be wrapped in print().
df_xl.info()

In [None]:
# Check whether both formats yield the same column dtypes.
# DataFrame.info() returns None, so comparing two info() calls is always True;
# the dtypes themselves are compared instead, position by position (column
# names are ignored, a different number of columns compares unequal).
print(list(df.dtypes) == list(df_xl.dtypes))

In [None]:
# Print the (rows, columns) shape of the ARFF frame, then of the Excel frame.
for frame in (df, df_xl):
    print(frame.shape)

In [None]:
# Number of non-null SIZE values for each fuel type.
df["SIZE"].groupby(df["FUEL"]).count()

In [None]:
# Number of non-null SIZE values for each flame status (CLASS); the two
# classes look balanced.
df["SIZE"].groupby(df["CLASS"]).count()

In [None]:
# Summary statistics for the numeric fields, rounded to two decimals.
numeric_summary = df.describe()
numeric_summary.round(2)

In [None]:
# One-hot encode the string columns (the fuel type is non-ordinal, so dummy
# variables are used rather than integer codes).
# dtype=int keeps the dummy columns numeric on every pandas version; boolean
# dummies mixed with numeric columns would make the later np.array(features)
# conversion produce an object-dtype array.
df2 = pd.get_dummies(df, dtype=int)
df2.head()

In [None]:
# Show the type of df2; it is converted to NumPy arrays in the next cell.
type(df2)

In [None]:
# Target vector: the CLASS column as a NumPy array.
labels = np.array(df2['CLASS'])

# Column names of the encoded frame (CLASS included), kept for later use.
feature_list = list(df2.columns)

# Feature matrix: every column except CLASS, as a NumPy array.
features = np.array(df2.drop(columns='CLASS'))

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

# Report the shape of each of the four arrays.
for caption, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, array.shape)

In [None]:
# Baseline model: a random forest of 500 trees with a fixed seed.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
)
# Fit on the training split (the fitted estimator is the cell output).
rf.fit(X=train_features, y=train_labels)

In [None]:
from sklearn.metrics import classification_report
# Precision / recall / F1 of the baseline forest on the data it was trained on.
train_predictions = rf.predict(train_features)
print(classification_report(y_true=train_labels, y_pred=train_predictions))

In [None]:
# Confusion matrix of the baseline forest on the training set.
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_true=train_labels, y_pred=rf.predict(train_features))
print(conf_mat)

In [None]:
# Predict on the held-out test set with the baseline forest.
predictions = rf.predict(test_features)
# With 0/1 class labels the absolute error of a prediction is 1 when it is
# wrong and 0 when it is right, so the mean is the misclassification rate.
errors = abs(predictions - test_labels)
# The message previously reported this value in 'degrees', a unit that does
# not apply to class labels.
print('Mean Absolute Error (misclassification rate):', round(np.mean(errors), 2))

In [None]:
from sklearn.metrics import classification_report
# Precision / recall / F1 for the current test-set predictions.
test_report = classification_report(y_true=test_labels, y_pred=predictions)
print(test_report)

In [None]:
# Confusion matrix for the current test-set predictions.
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_true=test_labels, y_pred=predictions)
print(conf_mat)

In [None]:
from sklearn.metrics import roc_curve, auc
# roc_curve needs continuous scores to sweep thresholds; hard 0/1 predictions
# collapse the curve to a single operating point. Use the predicted
# probability of the second class in rf.classes_ (the positive class) instead.
positive_scores = rf.predict_proba(test_features)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
# Feature names for the tree visualisation: a copy of the column list with the
# CLASS target removed (the original list is left untouched).
import copy
feature_list_bc = copy.copy(feature_list)
feature_list_bc.remove('CLASS')
print(feature_list_bc)

In [None]:
# Small, shallow forest used only for visualisation (depth limited to 3).
# random_state is fixed so the tree picked below is the same on every run.
model = RandomForestClassifier(max_depth = 3, n_estimators=10, random_state = 42)
model.fit(train_features, train_labels)
# Pick one of the ten fitted trees to draw.
estimator_limited = model.estimators_[5]

In [None]:
from sklearn.tree import export_graphviz
# class_names must be a sequence with one name per class. A bare string such
# as 'CLASS' is indexed character by character (labelling nodes 'C' / 'L') or
# rejected outright by newer scikit-learn, so the class labels of the fitted
# forest are passed as strings instead.
export_graphviz(estimator_limited, out_file = 'tree_limited.dot', feature_names = feature_list_bc,
                class_names = [str(label) for label in model.classes_],
                rounded = True, proportion = False, precision = 2, filled = True)

In [None]:
# Render the exported .dot file to a 600-dpi PNG with the Graphviz `dot` command-line tool.
!dot -Tpng tree_limited.dot -o tree_limited.png -Gdpi=600

In [None]:
from IPython.display import Image
# Display the rendered tree image as the cell output.
Image(filename = 'tree_limited.png')

In [None]:
from pprint import pprint
# Show every hyperparameter of the baseline forest.
current_params = rf.get_params()
print('Parameters currently in use:\n')
pprint(current_params)

# Hyperparameters of interest:
# n_estimators      = number of trees in the forest
# max_features      = max number of features considered for splitting a node
# max_depth         = max number of levels in each decision tree
# min_samples_split = min number of data points placed in a node before the node is split
# min_samples_leaf  = min number of data points allowed in a leaf node
# bootstrap         = method for sampling data points (with or without replacement)

In [None]:
# Candidate values for the randomised hyperparameter search.
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 800, num = 10)]
# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted by
# recent scikit-learn releases, so 'sqrt' and 'log2' are searched instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree
max_depth = [int(x) for x in np.linspace(5, 110, num = 11)]
# Minimum number of samples required to split a node.
# An integer value must be at least 2; 1 is rejected by scikit-learn and would
# make every sampled candidate that uses it fail to fit.
min_samples_split = [2,3,4,5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,4,5]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; the fixed seed makes the candidate forests (and hence
# the search result) repeatable.
rf2 = RandomForestClassifier(random_state = 42)

# Randomised search: 50 sampled combinations (n_iter), 3-fold cross
# validation, using all available cores.
rf_random = RandomizedSearchCV(estimator = rf2, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train_features, train_labels)

# Best parameter combination found (shown as the cell output).
rf_random.best_params_

In [None]:
best_random = rf_random.best_estimator_

# Use the forest's predict method on the test data
predictions = best_random.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

# Earlier
# Mean Absolute Error: 0.03 degrees.

In [None]:
# classification_report was imported in an earlier cell.
random_search_report = classification_report(y_true=test_labels, y_pred=predictions)
print(random_search_report)

# Result recorded from an earlier run:
#               precision    recall  f1-score   support

#            0       0.97      0.97      0.97      1714
#            1       0.97      0.97      0.97      1775

#     accuracy                           0.97      3489
#    macro avg       0.97      0.97      0.97      3489
# weighted avg       0.97      0.97      0.97      3489

In [None]:
# Confusion matrix on the test set (confusion_matrix was imported in an earlier cell).

conf_mat = confusion_matrix(y_true=test_labels, y_pred=predictions)
print(conf_mat)

# Result recorded from an earlier run:
# [[1659   55]
#  [  46 1729]]

In [None]:
from sklearn.metrics import roc_curve, auc
# roc_curve needs continuous scores to sweep thresholds; hard 0/1 predictions
# collapse the curve to a single operating point. Use the predicted
# probability of the second class in best_random.classes_ instead.
positive_scores = best_random.predict_proba(test_features)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

print(roc_auc)

# Earlier run (that run scored the hard class predictions, not probabilities)
# 0.9709979127976729

In [None]:
# Grid search with cross validation, centred on the random-search result.
# The random search gave the result below:

# {'n_estimators': 216,
#  'min_samples_split': 3,
#  'min_samples_leaf': 2,
#  'max_features': 'sqrt',
#  'max_depth': 57,
#  'bootstrap': False}

from sklearn.model_selection import GridSearchCV
# Parameter grid built around the values found by the random search
param_grid = {
    'bootstrap': [False, True],
    'max_depth': [50,56,57,58,60],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1,2,3],
    'min_samples_split': [2,3,4],
    'n_estimators': [150,200,210,215,220,250]
}
# Base model; the fixed seed makes the comparison between grid points
# repeatable instead of depending on each forest's random initialisation.
rf3 = RandomForestClassifier(random_state = 42)
# Exhaustive search over param_grid with 2-fold cross validation on all cores
grid_search = GridSearchCV(estimator = rf3, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(train_features, train_labels)
# Best parameter combination found (shown as the cell output).
grid_search.best_params_

In [None]:
# Best model found by the grid search.
best_grid = grid_search.best_estimator_

# Predict on the held-out test set.
predictions = best_grid.predict(test_features)
# With 0/1 class labels the mean absolute error equals the misclassification rate.
errors = abs(predictions - test_labels)
# The message previously reported this value in 'degrees', a unit that does
# not apply to class labels.
print('Mean Absolute Error (misclassification rate):', round(np.mean(errors), 2))

# Earlier best (printed with the old message format)
# Mean Absolute Error: 0.02 degrees.

In [None]:
# classification_report was imported in an earlier cell.
grid_search_report = classification_report(y_true=test_labels, y_pred=predictions)
print(grid_search_report)

# Best result recorded from an earlier run:
#               precision    recall  f1-score   support

#            0       0.98      0.97      0.98      1714
#            1       0.97      0.98      0.98      1775

#     accuracy                           0.98      3489
#    macro avg       0.98      0.98      0.98      3489
# weighted avg       0.98      0.98      0.98      3489

In [None]:
# Confusion matrix on the test set (confusion_matrix was imported in an earlier cell).

conf_mat = confusion_matrix(y_true=test_labels, y_pred=predictions)
print(conf_mat)

# Best result recorded from an earlier run:
# [[1667   47]
#  [  38 1737]]

In [None]:
from sklearn.metrics import roc_curve, auc
# roc_curve needs continuous scores to sweep thresholds; hard 0/1 predictions
# collapse the curve to a single operating point. Use the predicted
# probability of the second class in best_grid.classes_ instead.
positive_scores = best_grid.predict_proba(test_features)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

print(roc_auc)

# Earlier best (that run scored the hard class predictions, not probabilities)
# 0.9755851562114812

In [None]:
# Final model, using the parameters found by the random search:

# {'n_estimators': 216,
#  'min_samples_split': 3,
#  'min_samples_leaf': 2,
#  'max_features': 'sqrt',
#  'max_depth': 57,
#  'bootstrap': False}

final_rf = RandomForestClassifier(n_estimators = 216, 
                                  min_samples_split= 3,                                
                                  min_samples_leaf = 2, 
                                  max_features = 'sqrt', 
                                  max_depth = 57, 
                                  bootstrap = False,
                                  random_state = 42)

final_rf.fit(train_features, train_labels)

# Predict on the held-out test set.
predictions = final_rf.predict(test_features)
# With 0/1 class labels the mean absolute error equals the misclassification rate.
errors = abs(predictions - test_labels)
# The message previously reported this value in 'degrees', a unit that does
# not apply to class labels.
print('Mean Absolute Error (misclassification rate):', round(np.mean(errors), 2))

print(classification_report(test_labels, predictions))

conf_mat = confusion_matrix(test_labels, predictions)
print(conf_mat)

# Score the ROC curve with the predicted probability of the second class in
# final_rf.classes_; hard 0/1 predictions would collapse the curve to a single
# operating point.
positive_scores = final_rf.predict_proba(test_features)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

print(roc_auc)