In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

import numpy as np


In [2]:
# Load the entire dataset into a pandas dataframe

df = pd.read_parquet('/nobackup/amimalik/bits/dataset/dimensions/4Q/working_dataset.parquet')
model_df = pd.read_csv('/nobackup/amimalik/bits/dataset/dimensions/4Q/sorted_failures.csv')

model_df = model_df.head(5)

In [3]:
# make a group with serial no of drive and return the last count no of days

def group_by_serial_number(df, count):

    # Group the instances dataframe by serial number
    grouped_instances = df.groupby('serial_number')

    # Create an empty list to store the last 150 samples for each serial number
    last_n = []

    # Iterate over each group
    for name, group in grouped_instances:
        # Get the last n (count) samples for the current serial number
        last_n_group = group.tail(count)
        # Append the last n samples to the list
        last_n.append(last_n_group)

    # Concatenate the list of dataframes into a single dataframe
    instance_last_n = pd.concat(last_n)

    # Reset the index of the dataframe
    instance_last_n.reset_index(drop=True, inplace=True)

    # Drop the date column from the dataframe
    # instance_last_n.drop(columns=['date'], inplace=True)

    # reset index
    instance_last_n.reset_index(drop=True, inplace=True)

    # Print the updated dataframe
    # instance_last_n.head()

    return instance_last_n

In [4]:
df = group_by_serial_number(df, 15)
df.shape

(63537, 13)

In [5]:
def get_clear_data(df):
    df.drop(columns=['model', 'serial_number'], inplace=True)
    df_filled = df.fillna(df.mean())
    return df_filled

In [6]:
model_data = get_clear_data(df)

model_data = model_data.copy()


In [7]:
# Split the dataframe into features (X) and target (y)
X = model_data.drop(columns=['failure'])
y = model_data['failure']

# Split the data into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Initialize the models
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Train the models
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

In [12]:
# Make predictions
rf_predictions = rf_model.predict(X_test)
gb_predictions = gb_model.predict(X_test)

# Evaluate Random Forest
print("Random Forest Metrics:")
print(f"Accuracy: {accuracy_score(y_test, rf_predictions)}")
print(f"Precision: {precision_score(y_test, rf_predictions, average='weighted')}")
print(f"Recall: {recall_score(y_test, rf_predictions, average='weighted')}")
print(f"F1-Score: {f1_score(y_test, rf_predictions, average='weighted')}")
print(classification_report(y_test, rf_predictions))

# Evaluate Gradient Boosting
print("\nGradient Boosting Metrics:")
print(f"Accuracy: {accuracy_score(y_test, gb_predictions)}")
print(f"Precision: {precision_score(y_test, gb_predictions, average='weighted')}")
print(f"Recall: {recall_score(y_test, gb_predictions, average='weighted')}")
print(f"F1-Score: {f1_score(y_test, gb_predictions, average='weighted')}")
print(classification_report(y_test, gb_predictions))

Random Forest Metrics:
Accuracy: 0.9143321792047004
Precision: 0.8698378057451831
Recall: 0.9143321792047004
F1-Score: 0.8911925174103219
              precision    recall  f1-score   support

           0       0.93      0.98      0.96     17771
           1       0.02      0.01      0.01      1291

    accuracy                           0.91     19062
   macro avg       0.48      0.49      0.48     19062
weighted avg       0.87      0.91      0.89     19062


Gradient Boosting Metrics:
Accuracy: 0.9320637918371629
Precision: 0.8935204259402378
Recall: 0.9320637918371629
F1-Score: 0.9000026271786162
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     17771
           1       0.36      0.00      0.01      1291

    accuracy                           0.93     19062
   macro avg       0.64      0.50      0.49     19062
weighted avg       0.89      0.93      0.90     19062



In [13]:
df.columns

Index(['Reallocated_Sectors_Count', 'Power-On_Hours',
       'I/O_Error_Detection_and_Correction', 'Reported_Uncorrectable_Errors',
       'Load_Unload_Cycle', 'Current_Pending_Sector_Count',
       'Offline_Uncorrectable', 'Head_Flying_Hours', 'Total_LBAs_Written',
       'Total_LBAs_Read', 'failure'],
      dtype='object')