In [1]:
from imblearn.under_sampling import NearMiss
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pathlib

DATA_PATH = pathlib.Path("../Data")

In [2]:
df = pd.read_feather(DATA_PATH / "Classification.feather")
df["Target"].value_counts()

#df.head()

0    10539882
2      212106
1       21600
Name: Target, dtype: int64

In [None]:
X = df.drop(columns=["timestamp", "Target"]) # this is purely a classification no time steps are needed
y = df["Target"]

nm_undersampler = NearMiss(version=3, n_neighbors_ver3=3, n_jobs=-1) # Warning takes very long to run
X, y = nm_undersampler.fit_resample(X, y)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [10]:
svm = SVC(kernel="rbf")
svm.fit(X_train, y_train)

In [12]:
y_pred = svm.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.62      0.58      6529
           1       0.60      0.62      0.61      6444
           2       0.52      0.36      0.43      3303

    accuracy                           0.57     16276
   macro avg       0.56      0.53      0.54     16276
weighted avg       0.56      0.57      0.56     16276



In [18]:
confusion_matrix(y_test, y_pred, normalize="true")

array([[0.61969674, 0.26742227, 0.11288099],
       [0.32976412, 0.61530106, 0.05493482],
       [0.37874659, 0.26006661, 0.3611868 ]])

## SVM

- Accuracy is 57% (It's like slightly better than a random guess)
- Precision for class 1 (Pre Failure) is 0.6, 60% of instances are correctly classified as pre failure
- Getting confused between normal and pre failure at least 32% of the time
- It is not too good at predicting an actually failure interestingly enough

## Decision Tree (Hyper parameter have no significant differences)

- 66% Accuracy (Good)
- 69% of pre failures are correct (Nice 😉)
- Predicting an actual failure is still a coin toss (49%)
- Precision for class 1 is improvement 0.7
- Trains much much quicker than SVM and is more accurate

## Random Forest

- Slight improvement over Decision Tree 69% (Nice 😉)
- 1% decrease in predicting pre failures (68%)
- Slightly improved at predicting actual failures but still pretty much a coin toss (52%)
- Using Entropy is better, 1 addition percent accuracy

## Neural Nets

- Most Accurate (72%)
- It's Really good a predicting prefailures (90%)
- Still not very good at predicting actual failures
- Training time better than SVM But also use Cuda so it's not very fair
- Fairly small model is still doing good (25k Param)

In [24]:
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train, y_train)

In [21]:
y_pred_dt = dt.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.70      0.68      0.69      6529
           1       0.69      0.70      0.70      6444
           2       0.51      0.52      0.52      3303

    accuracy                           0.66     16276
   macro avg       0.63      0.63      0.63     16276
weighted avg       0.66      0.66      0.66     16276



In [23]:
confusion_matrix(y_test, y_pred, normalize="true")

array([[0.68019605, 0.18456119, 0.13524276],
       [0.18171943, 0.70251397, 0.1157666 ],
       [0.22615804, 0.25158946, 0.5222525 ]])

In [28]:
dt = DecisionTreeClassifier(criterion="log_loss")
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print(classification_report(y_test, y_pred_dt))
confusion_matrix(y_test, y_pred_dt, normalize="true")

              precision    recall  f1-score   support

           0       0.69      0.69      0.69      6529
           1       0.69      0.71      0.70      6444
           2       0.53      0.49      0.51      3303

    accuracy                           0.66     16276
   macro avg       0.63      0.63      0.63     16276
weighted avg       0.65      0.66      0.66     16276



array([[0.68816051, 0.18869658, 0.1231429 ],
       [0.18885785, 0.70732464, 0.1038175 ],
       [0.25158946, 0.25491977, 0.49349077]])

In [29]:
rf = RandomForestClassifier(criterion="gini", random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
confusion_matrix(y_test, y_pred_rf, normalize="true")

              precision    recall  f1-score   support

           0       0.71      0.72      0.71      6529
           1       0.68      0.76      0.72      6444
           2       0.68      0.52      0.59      3303

    accuracy                           0.69     16276
   macro avg       0.69      0.67      0.67     16276
weighted avg       0.69      0.69      0.69     16276



array([[0.71634247, 0.2014091 , 0.08224843],
       [0.20173805, 0.75713842, 0.04112353],
       [0.19194672, 0.28580079, 0.5222525 ]])

In [30]:
rf = RandomForestClassifier(criterion="entropy", random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
confusion_matrix(y_test, y_pred_rf, normalize="true")

              precision    recall  f1-score   support

           0       0.73      0.72      0.72      6529
           1       0.69      0.78      0.73      6444
           2       0.70      0.54      0.61      3303

    accuracy                           0.70     16276
   macro avg       0.70      0.68      0.69     16276
weighted avg       0.71      0.70      0.70     16276



array([[0.71634247, 0.20554449, 0.07811303],
       [0.1801676 , 0.77855369, 0.04127871],
       [0.17741447, 0.28670905, 0.53587648]])

In [31]:
rf = RandomForestClassifier(criterion="log_loss", random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
confusion_matrix(y_test, y_pred_rf, normalize="true")

              precision    recall  f1-score   support

           0       0.73      0.72      0.72      6529
           1       0.69      0.78      0.73      6444
           2       0.70      0.54      0.61      3303

    accuracy                           0.70     16276
   macro avg       0.70      0.68      0.69     16276
weighted avg       0.71      0.70      0.70     16276



array([[0.71634247, 0.20554449, 0.07811303],
       [0.1801676 , 0.77855369, 0.04127871],
       [0.17741447, 0.28670905, 0.53587648]])

# Trying a neural Net

In [47]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import lightning.pytorch as pl
import torch.nn.functional as F
import torch.nn as nn
from typing import Any
from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

In [41]:
# Convert the NumPy arrays to PyTorch tensors
X_train_t = torch.from_numpy(X_train)
y_train_t = torch.from_numpy(y_train.to_numpy())
X_test_t = torch.from_numpy(X_test)
y_test_t = torch.from_numpy(y_test.to_numpy())

# Create the dataset object
train_ds = TensorDataset(X_train_t, y_train_t)
test_ds = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_ds, batch_size=32)
test_loader = DataLoader(test_ds, batch_size=32)

X_train_t.shape

torch.Size([37976, 16])

In [64]:
class NN_Classifier(pl.LightningModule):
    def __init__(self, n_inputs) -> None:
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(16, 256),
            nn.Tanh(),
            nn.Linear(256, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 3)
        )
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)
    
    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        X, y = batch
        y_pred = self(X)
        loss = F.cross_entropy(y_pred, y)
        return loss
    
    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT:
        X, y = batch
        y_pred = self(X)
        y_lab = torch.argmax(F.softmax(y_pred, dim=1), dim=1)
        
        correct = (y_lab == y).sum()
        acc = correct / X.shape[0]
        
        loss = F.cross_entropy(y_pred, y)
        self.log("val_loss", loss, prog_bar=True, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True)
        return loss
    
    def configure_optimizers(self) -> OptimizerLRScheduler:
        return torch.optim.Adam(self.parameters(), lr = 1e-3)

In [65]:
model = NN_Classifier(X_train_t.shape[1])
es_callback = EarlyStopping(monitor="val_loss", min_delta=0, patience=3, strict=True)

trainer = pl.Trainer(accelerator="gpu", callbacks=[es_callback])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\loops\utilities.py:72: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 25.2 K
-------------------------------------
25.2 K    Trainable params
0         Non-trainable params
25.2 K    Total params
0.101     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

                                                                            

c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 33: 100%|██████████| 1187/1187 [00:07<00:00, 151.76it/s, v_num=5, val_loss=0.592, val_acc=0.716]


In [68]:
y_pred = model(X_test_t)
y_hat = torch.argmax(F.softmax(y_pred, dim=1), dim=1).detach().cpu().numpy()
y_true = y_test_t.numpy()

print(classification_report(y_true, y_hat))

              precision    recall  f1-score   support

           0       0.79      0.66      0.72      6529
           1       0.67      0.91      0.77      6444
           2       0.72      0.46      0.56      3303

    accuracy                           0.72     16276
   macro avg       0.72      0.67      0.68     16276
weighted avg       0.73      0.72      0.71     16276



In [71]:
confusion_matrix(y_true, y_hat, normalize="true")

array([[0.66089753, 0.2660438 , 0.07305866],
       [0.07479826, 0.90657976, 0.01862197],
       [0.20829549, 0.33636088, 0.45534363]])