In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model

2024-09-13 17:38:00.320783: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-13 17:38:00.324372: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-13 17:38:00.334699: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-13 17:38:00.350865: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-13 17:38:00.355661: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-13 17:38:00.369112: I tensorflow/core/platform/cpu_feature_gu

In [2]:
def get_clear_data(df):
    # Drop the date column from the dataframe
    if 'data' in df.columns:
        df.drop(columns=['date'], inplace=True)
    if 'model' in df.columns:
        df.drop(columns=['model'], inplace=True)
    if 'serial_number' in df.columns:
        df.drop(columns=['serial_number'], inplace=True)

    df_filled = df.fillna(df.mean())

    # Keep the original 'failure' column unchanged
    df_filled['failure'] = df['failure']

    return df_filled

In [3]:
# Load the entire dataset into a pandas dataframe
column_list = []
column_list.append('date')
column_list.append('failure')
column_list.append('model')
column_list.append('serial_number')
column_list.append('smart_5_raw')
column_list.append('smart_9_raw')
column_list.append('smart_184_raw')
column_list.append('smart_187_raw')
column_list.append('smart_193_raw')
column_list.append('smart_197_raw')
column_list.append('smart_198_raw')
column_list.append('smart_240_raw')
column_list.append('smart_241_raw')
column_list.append('smart_242_raw')

df = pd.read_parquet('/nobackup/amimalik/bits/dataset/dimensions/4Q/failed_devices_df_all.parquet', columns=column_list)
model_df = pd.read_csv('/nobackup/amimalik/bits/dataset/dimensions/4Q/sorted_failures.csv')

model_df = model_df.head(5)

In [4]:
def get_df_with_iteration_count(df, count):
    serial_number_counts = df['serial_number'].value_counts()

    # Filter the serial numbers based on value counts
    filtered_serial_numbers = serial_number_counts[serial_number_counts > count].index

    # Create a new dataframe with filtered serial numbers
    filtered_df = df[df['serial_number'].isin(filtered_serial_numbers)].copy()

    # reset index inplace 
    filtered_df.reset_index(drop=True, inplace=True)

    # copy the dataframe to ddf
    filtered_df = filtered_df.copy()

    return filtered_df

In [5]:
def get_resample_data(df, sample_frequency):
    # Group the dataframe by serial number
    grouped_df = df.groupby('serial_number')

    # Create an empty list to store the resampled dataframes
    resampled_dfs = []

    # Iterate over each group
    for name, group in grouped_df:
        # Resample the group to weekly frequency
        resampled_group = group.resample(sample_frequency, on='date').max()
        # drop the tail row if the last row is not a failure
        while resampled_group.tail(1)['failure'].values[0] != 1:
            resampled_group = resampled_group[:-1]
        # Append the resampled group to the list
        resampled_dfs.append(resampled_group)

    # Concatenate the list of dataframes into a single dataframe
    resampled_data = pd.concat(resampled_dfs)

    # Reset the index of the dataframe
    # resampled_data.reset_index(drop=True, inplace=True)

    return resampled_data

In [6]:
def get_norm(ddf):
    raw_columns = [col for col in ddf.columns if 'raw' in col]

    scaler = MinMaxScaler()
    scaler.fit(ddf[raw_columns])
    normalized_data = scaler.transform(ddf[raw_columns])
    normalized_data = pd.DataFrame(normalized_data, columns=raw_columns)

    # Drop the raw columns from the original dataframe and combine the normalized data with the original dataframe
    combined_df = pd.concat([normalized_data, ddf.drop(columns=raw_columns)], axis=1)

    # rename the columns with raw to normalized
    combined_df.columns = combined_df.columns.str.replace('_raw', '_norm')


    combined_df.head()

    combined_df.rename(columns={'failure':'failure', 'smart_5_norm':'Reallocated_Sectors_Count', 'smart_9_norm':'Power-On_Hours',
                        'smart_184_norm':'I/O_Error_Detection_and_Correction','smart_187_norm':'Reported_Uncorrectable_Errors', 
                        'smart_193_norm':'Load_Unload_Cycle', 'smart_197_norm':'Current_Pending_Sector_Count', 'smart_198_norm':'Offline_Uncorrectable',
                        'smart_240_norm':'Head_Flying_Hours', 'smart_241_norm':'Total_LBAs_Written', 'smart_242_norm':'Total_LBAs_Read'}, inplace=True)

    ddf = combined_df.copy()
    return ddf

In [7]:
def split_dataset(df):
    # Split the dataset into features (X) and target variable (y)
    X = df.drop(columns=['failure'])
    y = df['failure']

    # Split the dataset into train and test sets
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

    # Print the shape of the train and test sets
    # print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

    train_y = np.array(train_y).reshape(-1, 1)
    test_y = np.array(test_y).reshape(-1, 1)

    # train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
    train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
    test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))

    # Print the shape of reshaped train and test sets
    # print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

    return train_X, train_y, test_X, test_y 

In [10]:
def model_fit(X_train, y_train, x_test, y_test, dev_model, epochs=50, batch_size=32, learning_rate=0.001):
    validation_number = int(0.2 * X_train.shape[0])
    validation_X = X_train[validation_number:]
    validation_y = y_train[validation_number:]
    train_X = X_train[:validation_number]
    train_y = y_train[:validation_number]

    # Calculate class weights using inverse frequency
    class_weights = {0: 0.5128462099125365, 1: 19.96099290780142}

    # Define the model
    model = Sequential()
    model.add(LSTM(64, activation='relu', input_shape=(train_X.shape[1], train_X.shape[2]),return_sequences=True, dropout=0.25))
    model.add(LSTM(64, return_sequences=True, dropout=0.25))
    model.add(LSTM(32, return_sequences=False, dropout=0.25))
    model.add(Dense(1, activation='sigmoid'))

    model.summary()

    # Compile the model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Fit the model
    # history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(validation_X, validation_y), verbose=0)
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(validation_X, validation_y), class_weight=class_weights, verbose=0)

    # summarize performance of the model
    scores = model.evaluate(train_X, train_y, verbose=0)

    # Use the trained model to predict on the test_X dataset
    predictions = model.predict(test_X)


    # Convert the predictions to binary values (0 or 1)
    binary_predictions = (predictions > 0.5).astype(int)


    # Calculate the evaluation metrics
    accuracy = accuracy_score(test_y, binary_predictions)
    precision = precision_score(test_y, binary_predictions)
    recall = recall_score(test_y, binary_predictions)
    f1 = f1_score(test_y, binary_predictions)
    roc_auc = roc_auc_score(test_y, binary_predictions)

    # Print the evaluation metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("ROC AUC score:", roc_auc)

    # plot confusion matrix
    cm = confusion_matrix(test_y, binary_predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    cm

    return model, binary_predictions, scores
"""
    # plot history
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    # Optional: Plot other metrics by adding similar lines
    plt.plot(epochs, history.history['accuracy'], label='Training Accuracy')
    plt.plot(epochs, history.history['val_accuracy'], label='Validation Accuracy')

    plt.title('model accuracy for model: ' + dev_model)
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend()
    plt.grid()
    plt.show()

    return model, binary_predictions, scores
"""

"\n    # plot history\n    plt.plot(history.history['loss'], label='train')\n    plt.plot(history.history['val_loss'], label='validation')\n    # Optional: Plot other metrics by adding similar lines\n    plt.plot(epochs, history.history['accuracy'], label='Training Accuracy')\n    plt.plot(epochs, history.history['val_accuracy'], label='Validation Accuracy')\n\n    plt.title('model accuracy for model: ' + dev_model)\n    plt.ylabel('accuracy')\n    plt.xlabel('epoch')\n    plt.legend()\n    plt.grid()\n    plt.show()\n\n    return model, binary_predictions, scores\n"

In [11]:
# Filter the dataframe to include only serial numbers with more than count records
ddf = get_df_with_iteration_count(df, 340)
ddf_norm = get_norm(ddf)

# Resample the dataframe to weekly frequency
weekly_df = get_resample_data(ddf_norm, 'W')

# Drop nan values from the dataframe
weekly_df = weekly_df.dropna(how='all')

# Get the clear data
weekly_df = get_clear_data(weekly_df)

# Convert the failure column to integer type
weekly_df['failure'] = weekly_df['failure'].astype(int)


train_X, train_y, test_X, test_y = split_dataset(weekly_df)

model_x, predict_y, score =  model_fit(train_X, train_y, test_X, test_y, epochs=200, batch_size=32, learning_rate=0.005, dev_model="all")


  super().__init__(**kwargs)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step   
Accuracy: 0.9289772727272727
Precision: 0.075
Recall: 0.1875
F1-score: 0.10714285714285714
ROC AUC score: 0.5668604651162791


In [14]:
# plot confusion_matrix for model_x
cm = confusion_matrix(test_y, predict_y)
cm

array([[1302,   74],
       [  26,    6]])

In [None]:
model_x, predict_y, score =  model_fit(train_X, train_y, test_X, test_y, epochs=200, batch_size=32, learning_rate=0.005, dev_model="all")

# Filter the dataframe to include only serial numbers with more than count records
ddf = get_df_with_iteration_count(df, 340)
ddf_norm = get_norm(ddf)

# Resample the dataframe to weekly frequency
weekly_df = get_resample_data(ddf_norm, '2W')

# Drop nan values from the dataframe
weekly_df = weekly_df.dropna(how='all')

# Get the clear data
weekly_df = get_clear_data(weekly_df)

# Convert the failure column to integer type
weekly_df['failure'] = weekly_df['failure'].astype(int)


train_X, train_y, test_X, test_y = split_dataset(weekly_df)

model_x, predict_y, score =  model_fit(train_X, train_y, test_X, test_y, epochs=500, batch_size=32, learning_rate=0.001, dev_model="all")

# Filter the dataframe to include only serial numbers with more than count records
ddf = get_df_with_iteration_count(df, 340)
ddf_norm = get_norm(ddf)

# Resample the dataframe to weekly frequency
weekly_df = get_resample_data(ddf_norm, '4W')

# Drop nan values from the dataframe
weekly_df = weekly_df.dropna(how='all')

# Get the clear data
weekly_df = get_clear_data(weekly_df)

# Convert the failure column to integer type
weekly_df['failure'] = weekly_df['failure'].astype(int)


train_X, train_y, test_X, test_y = split_dataset(weekly_df)

model_x, predict_y, score =  model_fit(train_X, train_y, test_X, test_y, epochs=500, batch_size=32, learning_rate=0.001, dev_model="all")

plot_model(model_x, show_shapes=True, show_layer_names=True)