In [1]:
# Importing necessary libraries

import joblib
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the labelled dataset produced by the detection module and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.

labelled_data_path = 'D:/bmsAnomalyDetection/application/Anomaly-detection/notebook/data/labelledData.csv'
df = pd.read_csv(labelled_data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,timestamps,BMS_state,BMS_soc,BMS_soh,BMS_bus_voltage,BMS_bus_current,BMS_isolation,BMS_max_cell_temp,BMS_max_cell_temp_id,...,OBC_port_status,OBC_overvoltage_fault,OBC_overcurrent_fault,OBC_output_voltage,OBC_output_current,OBC_port_weld_fault,OBC_internal_voltage,OBC_internal_current,Anomaly,Anomaly_Score
0,0,2024-07-23 10:54:34.076000+05:30,3.0,95.0,96.5,443.8,223.3,1190.0,29.3,1.0,...,0.0,0.0,0.0,348.0,212.0,0.0,345.0,143.0,0,301.415981
1,1,2024-07-23 10:54:39.076000+05:30,3.0,95.0,96.106415,443.996792,222.414435,1190.0,29.102313,1.0,...,0.0,0.0,0.0,345.059976,251.200314,0.0,343.009653,161.410708,0,-47.702002
2,2,2024-07-23 10:54:44.076000+05:30,3.0,95.0,95.712831,444.193585,221.528869,1190.0,28.904626,1.0,...,0.0,0.0,0.0,342.119953,290.400627,0.0,341.019306,179.821416,0,30.631203
3,3,2024-07-23 10:54:49.076000+05:30,3.0,95.094741,96.173705,445.763226,220.126256,1205.0,28.661967,2.0,...,0.0,0.0,0.0,344.309767,291.076093,0.0,339.02896,198.232124,0,5.399433
4,4,2024-07-23 10:54:54.076000+05:30,3.0,95.192675,96.663373,447.37913,218.706219,1205.0,28.418231,2.0,...,0.0,0.0,0.0,346.717781,290.112888,0.0,337.038613,216.642832,0,255.746391


In [3]:
# Cast the identifier / state / fault columns (and the target) to 'object' so that they are
# recognised as categorical, then drop the columns that are not used as features:
# 'Anomaly_Score', 'Unnamed: 0' and 'timestamps'.
# NOTE(review): the df.head() preview also lists an 'Unnamed: 0.1' column that is not dropped
# here -- confirm whether it is actually present in the CSV.

categorical_columns = [
    'BMS_max_cell_temp_id',
    'BMS_state',
    'BMS_min_cell_temp_id',
    'BMS_max_cell_voltage_id',
    'BMS_min_cell_voltage_id',
    'OBC_mux',
    'OBC_port_status',
    'OBC_overvoltage_fault',
    'OBC_overcurrent_fault',
    'OBC_port_weld_fault',
    'Anomaly',
]

df = (
    df.astype({column: 'object' for column in categorical_columns})
      .drop(columns=['Anomaly_Score', 'Unnamed: 0', 'timestamps'])
)

In [4]:
# Display (rows, columns) of the frame after the dtype corrections and column drops
df.shape

(17280, 31)

In [5]:
# Separate the target column from the predictors; the target is cast back to int for the classifier

target_column = 'Anomaly'
y = df[target_column].astype('int')
x = df.drop(columns=[target_column])

In [6]:
# Partition predictor columns by dtype: 'object' columns are categorical, all others numerical

char = [column for column in x.columns if x[column].dtype == 'object']
num = [column for column in x.columns if x[column].dtype != 'object']

In [7]:
# Show the categorical and numerical column lists, separated by a blank line
print(char, num, sep='\n\n')


['BMS_state', 'BMS_max_cell_temp_id', 'BMS_min_cell_temp_id', 'BMS_max_cell_voltage_id', 'BMS_min_cell_voltage_id', 'OBC_mux', 'OBC_port_status', 'OBC_overvoltage_fault', 'OBC_overcurrent_fault', 'OBC_port_weld_fault']

['BMS_soc', 'BMS_soh', 'BMS_bus_voltage', 'BMS_bus_current', 'BMS_isolation', 'BMS_max_cell_temp', 'BMS_min_cell_temp', 'BMS_max_cell_voltage', 'BMS_min_cell_voltage', 'LV_soc', 'LV_soh', 'LV_voltage', 'LV_current', 'LV_temperature', 'MCU_motor_speed', 'MCU_motor_avg_temp', 'OBC_output_voltage', 'OBC_output_current', 'OBC_internal_voltage', 'OBC_internal_current']


In [6]:
# Numeric transformations: impute missing values with the column mean, then standardise

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical transformations: impute missing values with the most frequent category, then
# one-hot encode. The encoder was announced in the original comment but missing from the
# pipeline, so nominal IDs/states were passed to the model as if they were ordinal numbers.
# handle_unknown='ignore' keeps prediction from failing on categories unseen during fit.

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
# Route each column group through its own transformer:
# numerical columns -> numeric_transformer, categorical columns -> categorical_transformer

column_transformers = [
    ('num', numeric_transformer, num),
    ('cat', categorical_transformer, char),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [8]:
# Assemble the end-to-end pipeline: preprocessing followed by a random-forest classifier
# (100 trees, fixed seed for reproducibility)

classifier = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])


In [9]:
# Train/test split, then RandomOverSampling of the TRAINING portion only.
#
# Oversampling before the split duplicates minority rows across the whole dataset, so copies of
# the same row end up in both the training and the test set. That leaks test data into training
# and inflates the measured score. Splitting first keeps the test set untouched and
# representative of the real class balance.

# stratify=y preserves the anomaly/normal ratio in both partitions
# (requires at least two samples of each class).
x_train_raw, x_test, y_train_raw, y_test = train_test_split(
    x, y, test_size=0.3, random_state=101, stratify=y
)

# Balance the classes in the training data only
sampler = RandomOverSampler(random_state=0, sampling_strategy='auto')
x_train, y_train = sampler.fit_resample(x_train_raw, y_train_raw)

In [11]:
# Fit the pipeline on the training data, then predict labels for the held-out test data
# (fit() returns the fitted pipeline itself, so the two calls can be chained)
y_pred = pipeline.fit(x_train, y_train).predict(x_test)

In [None]:
# Checking accuracy on test data

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Accuracy alone can look high on an imbalanced anomaly target even when anomalies are missed,
# so also report per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

In [None]:
# Persist the artifacts with joblib: the preprocessor on its own, and the full pipeline
# (preprocessor + classifier) as the model.
# NOTE(review): these are absolute, machine-specific paths; consider a configurable artifacts directory.
preprocessor_path = 'D:/bmsAnomalyDetection/application/Anomaly-detection/notebook/artifacts/Preprocessor.pkl'
model_path = 'D:/bmsAnomalyDetection/application/Anomaly-detection/notebook/artifacts/Model.pkl'

# Save the preprocessor
joblib.dump(preprocessor, preprocessor_path)

# Save the model
joblib.dump(pipeline, model_path)