In [None]:
import warnings
warnings.filterwarnings('ignore')

import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, scale
from sklearn import metrics
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Keep TensorFlow quiet: only errors are logged and autograph verbosity is set to 1.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')
tf.autograph.set_verbosity(1)

# Report how many GPUs this runtime exposes.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Enable soft device placement and turn off per-op placement logging.
tf.config.set_soft_device_placement(True)
tf.debugging.set_log_device_placement(False)

Num GPUs Available:  0


In [None]:
# Mount Google Drive at /content/drive so the project folder stored there is
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# cd /into/project/folder/
# Change the working directory to the project folder on Drive; the relative
# paths used by later cells (models_logs/, dataset/, results/) resolve against it.
cd /content/drive/MyDrive/Lab/PIVE/backblaze/

/content/drive/MyDrive/Lab/PIVE/backblaze


## Dataset

In [None]:
# Split every raw CSV matching "202*.csv" into one log file per drive model
# under models_logs/latest/.
files = sorted(glob.glob("202*.csv"))
models = ["ST12000NM0007", "ST8000NM0055", "ST8000DM002", "ST4000DM000"]
models_seen = set()

for fname in files:
  print("processing file -- {0}".format(fname))
  df = pd.read_csv(fname)

  for model in models:
    out_path = "models_logs/latest/{0}.csv".format(model)
    curr = df.loc[df.model == model].copy()

    # The first write for a model (re)creates its file with a header row;
    # every later write appends rows without repeating the header.
    first_write = model not in models_seen
    curr.to_csv(out_path, mode='w' if first_write else 'a', index=False, header=first_write)
    models_seen.add(model)

    print("model: {0}, size: {1}".format(model, len(curr)))
  print()

In [None]:
# Build the labelled dataset for a drive model from its raw log in models_logs/.
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007"]

# Output schema: identifiers, the selected raw SMART attributes, then the label.
features = [1, 4, 5, 7, 9, 12, 187, 188, 193, 194, 197, 198, 199]
columns = ["date", "serial_number"]
columns += ["smart_{0}_raw".format(feature) for feature in features]
columns.append("label")

# Only the first model in the list is processed by this cell.
for model in models[:1]:
  print("processing - model :: {0}".format(model))

  # NOTE(review): sorting and the per-drive diff below run per chunk, so a drive
  # whose rows straddle a chunk boundary would lose one comparison. The chunk
  # size is large enough that a file is presumably read as a single chunk.
  df_iterator = pd.read_csv("models_logs/{0}.csv".format(model), chunksize=250000000)
  for i, df in enumerate(df_iterator):
    print("\t processing - chunk :: {0}".format(i+1))
    df = df.sort_values(by=["date", "serial_number"])

    # Assign the filled column back explicitly: `df.col.fillna(..., inplace=True)`
    # is chained assignment, which warns in pandas 2.2 and is a no-op under
    # Copy-on-Write, leaving the NaNs in place.
    df["smart_187_raw"] = df["smart_187_raw"].fillna(0)

    # label = 1 when a drive's smart_187_raw differs from the value on that
    # drive's next row (rows are date-ordered); a drive's last row gets 0.
    next_delta = df.groupby("serial_number").smart_187_raw.diff(periods=-1).fillna(0)
    df["label"] = next_delta.ne(0).astype(int)

    # Select the output columns and drop incomplete rows without mutating a slice.
    df = df[columns].dropna()
    print(df.label.value_counts())
    print()

    # The first chunk creates the file with a header; later chunks append rows only.
    first_chunk = i == 0
    df.to_csv("dataset/{0}.csv".format(model), index=False,
              header=first_chunk, mode='w' if first_chunk else 'a')

In [None]:
# Collect the rows of the selected drive models from every raw CSV matching
# "2021*.csv" into a single combined log.
files = sorted(glob.glob("2021*.csv"))
models = ["ST12000NM0007", "ST8000NM0055", "ST8000DM002", "ST4000DM000"]

# All models share one output file. The original code called .format(model) on
# this path although it has no placeholder, which wrongly suggested per-model files.
combined_path = "models_logs/2021/combined.csv"
created = False

for fname in files:
  print("processing files -- {0}".format(fname))
  df = pd.read_csv(fname)
  for model in models:
    curr = df[df.model == model].copy()

    # The very first write (re)creates the file with a header; all later
    # writes append rows without a header.
    curr.to_csv(combined_path, mode='a' if created else 'w', index=False, header=not created)
    created = True

    print("model: {0}, size: {1}".format(model, len(curr)))
  print()

processing -- 2021q1.csv
model: ST12000NM0007, size: 1732307
model: ST8000NM0055, size: 1297674
model: ST8000DM002, size: 878106
model: ST4000DM000, size: 1701967

processing -- 2021q2.csv
model: ST12000NM0007, size: 726571
model: ST8000NM0055, size: 1310887
model: ST8000DM002, size: 885873
model: ST4000DM000, size: 1714240

processing -- 2021q3.csv
model: ST12000NM0007, size: 216486
model: ST8000NM0055, size: 1323645
model: ST8000DM002, size: 895281
model: ST4000DM000, size: 1724626



In [None]:
# Build the labelled dataset for the combined 2021 log in models_logs/2021/.
# Output schema: identifiers, the selected raw SMART attributes, then the label.
features = [1, 4, 5, 7, 9, 12, 187, 188, 193, 194, 197, 198, 199]
columns = ["date", "serial_number"]
columns += ["smart_{0}_raw".format(feature) for feature in features]
columns.append("label")

models = ["combined"]
for model in models:
  print("processing - model :: {0}".format(model))

  # NOTE(review): sorting and the per-drive diff below run per chunk, so a drive
  # whose rows straddle a chunk boundary would lose one comparison. The chunk
  # size is large enough that the file is presumably read as a single chunk.
  df_iterator = pd.read_csv("models_logs/2021/{0}.csv".format(model), chunksize=250000000)
  for i, df in enumerate(df_iterator):
    print("\t processing - chunk :: {0}".format(i+1))
    df = df.sort_values(by=["date", "serial_number"])

    # Assign the filled column back explicitly: `df.col.fillna(..., inplace=True)`
    # is chained assignment, which warns in pandas 2.2 and is a no-op under
    # Copy-on-Write, leaving the NaNs in place.
    df["smart_187_raw"] = df["smart_187_raw"].fillna(0)

    # label = 1 when a drive's smart_187_raw differs from the value on that
    # drive's next row (rows are date-ordered); a drive's last row gets 0.
    next_delta = df.groupby("serial_number").smart_187_raw.diff(periods=-1).fillna(0)
    df["label"] = next_delta.ne(0).astype(int)

    # Select the output columns and drop incomplete rows without mutating a slice.
    df = df[columns].dropna()
    print(df.label.value_counts())
    print()

    # The first chunk creates the file with a header; later chunks append rows only.
    first_chunk = i == 0
    df.to_csv("dataset/{0}.csv".format(model), index=False,
              header=first_chunk, mode='w' if first_chunk else 'a')

## Non DNN

In [None]:
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007", "combined"]


def _fit_and_export_roc(label, file_tag, clf, X_fit, y_fit, X_eval, y_eval, model):
  """Fit `clf`, print its AUC on the evaluation set and export its ROC curve.

  Args:
    label: name printed in front of the AUC.
    file_tag: prefix of the output file results/<file_tag>_<model>.csv.
    clf: unfitted estimator exposing fit() and predict_proba().
    X_fit, y_fit: training features and labels.
    X_eval, y_eval: evaluation features and labels.
    model: drive-model name used in the output file name.

  Returns:
    The ROC AUC computed from the positive-class probabilities.
  """
  clf.fit(X_fit, y_fit)
  # Column 1 of predict_proba holds the positive-class probability.
  y_pred_proba = clf.predict_proba(X_eval)[:, 1]
  fpr, tpr, _ = metrics.roc_curve(y_eval, y_pred_proba)
  auc = metrics.roc_auc_score(y_eval, y_pred_proba)
  print("{0} -- AUC:".format(label), np.round(auc, 3))
  with open("results/{0}_{1}.csv".format(file_tag, model), "w") as ff:
    ff.write("tpr,fpr\n")
    for tpr_value, fpr_value in zip(tpr, fpr):
      ff.write("{0},{1}\n".format(tpr_value, fpr_value))
  return auc


for model in models[:]:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  y = df["label"]
  X = df.drop(columns=["date", "serial_number", "label"])

  # 20% of the rows are used for training; the remaining 80% form the test pool.
  X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2)
  # Two undersampled training sets: minority/majority ratio 0.33 (used only by
  # the "Mahdisoltani" forest) and a fully balanced one (ratio 1) for the rest.
  X_train1, y_train1 = RandomUnderSampler(sampling_strategy=.33).fit_resample(X_train, y_train)
  X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train, y_train)
  # The test set is undersampled to a 0.1 minority/majority ratio.
  X_test, y_test = RandomUnderSampler(sampling_strategy=.1).fit_resample(X_test, y_test)

  # (printed label, output file tag, pipeline, training features, training labels).
  # The MLP keeps writing to results/DNN_<model>.csv, the file name the original
  # cell used, so existing result files stay compatible.
  # NOTE(review): eval_metric="mlogloss" is the multiclass log-loss although the
  # objective is binary; confirm whether "logloss" was intended.
  experiments = [
      ("XGB", "XGB",
       make_pipeline(StandardScaler(), XGBClassifier(n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss")),
       X_train, y_train),
      ("Mahdisoltani", "Mahdisoltani",
       make_pipeline(MinMaxScaler(), RandomForestClassifier(n_estimators=20)),
       X_train1, y_train1),
      ("RF", "RF",
       make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=75)),
       X_train, y_train),
      ("MLP", "DNN",
       make_pipeline(StandardScaler(), MLPClassifier()),
       X_train, y_train),
      ("SVC", "SVC",
       make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True)),
       X_train, y_train),
  ]

  for label, file_tag, clf, X_fit, y_fit in experiments:
    _fit_and_export_roc(label, file_tag, clf, X_fit, y_fit, X_test, y_test, model)

Model: ST3000DM001
XGB -- AUC: 0.912
Model: ST8000NM0055
XGB -- AUC: 0.951
Model: ST8000DM002
XGB -- AUC: 0.958
Model: ST4000DM000
XGB -- AUC: 0.942
Model: ST12000NM0007
XGB -- AUC: 0.975
Model: combined
XGB -- AUC: 0.966


## DNN

In [None]:
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007", "combined"]

# Train and score a dense network; only the first model in the list is processed.
for model in models[:1]:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  y = df["label"]
  X = df.drop(columns=["date", "serial_number", "label"])

  # 20% of the rows train the network; training data is balanced (ratio 1) and
  # the test data is undersampled to a 0.1 minority/majority ratio.
  X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2)
  X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train, y_train)
  X_test, y_test = RandomUnderSampler(sampling_strategy=.1).fit_resample(X_test, y_test)

  # The scaler is fit on the training split only and then applied to the test split.
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  dnn = tf.keras.Sequential([
                            #  tf.keras.Input(shape=(13,)),
                             tf.keras.layers.Dense(128, activation='relu'),
                             tf.keras.layers.Dense(64, activation='relu'),
                             tf.keras.layers.Dense(32, activation='relu'),
                             tf.keras.layers.Dense(1, activation='sigmoid')
                             ])

  dnn.compile(
      loss=tf.keras.losses.binary_crossentropy,
      # `lr` is a deprecated alias that newer Keras releases reject;
      # `learning_rate` is the supported argument name.
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.003),
      metrics=[
          tf.keras.metrics.AUC(name='AUC')
      ]
  )

  history = dnn.fit(X_train_scaled, y_train, epochs=25, verbose=0)
  # predict() returns one column of probabilities; flatten it to 1-D for sklearn.
  predictions = np.ravel(dnn.predict(X_test_scaled))

  auc = metrics.roc_auc_score(y_test, predictions)
  print("AUC:", np.round(auc* 100, 2))

## CNN-LSTM

In [None]:
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007", "combined"]

# Train a Conv1D + LSTM network per model and export its ROC curve to results/.
for model in models:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  y = df["label"]
  X = df.drop(columns=["date", "serial_number", "label"])

  # 20% of the rows train the network; training data is balanced (ratio 1) and
  # the test data is undersampled to a 0.1 minority/majority ratio.
  X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2)
  X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train, y_train)
  X_test, y_test = RandomUnderSampler(sampling_strategy=.1).fit_resample(X_test, y_test)

  # The scaler is fit on the training split only and then applied to the test split.
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  n_features = X_train_scaled.shape[1]
  X_test_scaled = scaler.transform(X_test)

  # Conv1D is declared with input_shape=(n_features, 1); add the trailing
  # channel axis explicitly instead of relying on Keras to reshape 2-D input.
  X_train_scaled = X_train_scaled[..., np.newaxis]
  X_test_scaled = X_test_scaled[..., np.newaxis]

  dnn = tf.keras.Sequential([
                             tf.keras.layers.Conv1D(filters=10, kernel_size=3, activation='relu', input_shape=(n_features, 1)),
                             tf.keras.layers.Conv1D(filters=20, kernel_size=3, activation='relu'),
                             tf.keras.layers.Dropout(rate=0.3),
                             tf.keras.layers.MaxPool1D(),
                            #  tf.keras.layers.Flatten(),
                             tf.keras.layers.LSTM(300),
                             tf.keras.layers.Dense(64, activation='relu'),
                             tf.keras.layers.Dense(1, activation='sigmoid')
                             ])

  dnn.compile(
      loss=tf.keras.losses.binary_crossentropy,
      # `lr` is a deprecated alias that newer Keras releases reject;
      # `learning_rate` is the supported argument name.
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
      metrics=[
          tf.keras.metrics.AUC(name='AUC')
      ]
  )

  history = dnn.fit(X_train_scaled, y_train, epochs=20, verbose=0)
  # predict() returns one column of probabilities; flatten it to 1-D for sklearn.
  predictions = np.ravel(dnn.predict(X_test_scaled))

  fpr, tpr, _ = metrics.roc_curve(y_test,  predictions)
  auc = metrics.roc_auc_score(y_test, predictions)
  print("AUC:", np.round(auc* 100, 2))

  with open("results/{0}_{1}.csv".format("CNNLSTM", model), "w") as ff:
    ff.write("tpr,fpr\n")
    for tpr_value, fpr_value in zip(tpr, fpr):
      ff.write("{0},{1}\n".format(tpr_value, fpr_value))

## Evaluation

In [None]:
models = ["ST3000DM001", "ST12000NM0007", "ST8000NM0055", "ST8000DM002", "ST4000DM000"]
ml_models = ["dt"] #, "svc", "cnnlstm","xgb", "mds", "rf", "dnn"


def _make_sklearn_pipeline(ml):
  """Return the unfitted scaler + classifier pipeline registered under `ml`.

  Raises:
    ValueError: if `ml` is not one of "xgb", "mds", "rf", "dt", "svc".
  """
  if ml == "xgb":
    return make_pipeline(StandardScaler(),
                         XGBClassifier(
                             n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss"))
  if ml == "mds":
    return make_pipeline(MinMaxScaler(), RandomForestClassifier(n_estimators=20))
  if ml == "rf":
    return make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=75))
  if ml == "dt":
    return make_pipeline(StandardScaler(), tree.DecisionTreeClassifier())
  if ml == "svc":
    return make_pipeline(StandardScaler(), SVC(kernel="linear", probability=True))
  # The original fell through to the linear SVC for any unrecognised key, so a
  # typo (or "dnn") was silently reported under the wrong model name.
  raise ValueError("unknown ml model key: {0}".format(ml))


def _make_cnnlstm(n_features):
  """Return the uncompiled Conv1D + LSTM network for `n_features` inputs."""
  return tf.keras.Sequential([
      tf.keras.layers.Conv1D(filters=10, kernel_size=3, activation='relu', input_shape=(n_features, 1)),
      tf.keras.layers.Conv1D(filters=20, kernel_size=3, activation='relu'),
      tf.keras.layers.Dropout(rate=0.3),
      tf.keras.layers.MaxPool1D(),
      tf.keras.layers.LSTM(300),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])


def _make_dnn():
  """Return the uncompiled dense network used for the "dnn" key."""
  return tf.keras.Sequential([
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dropout(rate=0.10),
      tf.keras.layers.Dense(128, activation='relu'),
      # The loss below is binary cross-entropy on probabilities, so the output
      # must be a sigmoid; the original used a linear output here.
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])


def _fit_predict_keras(net, learning_rate, X_fit, y_fit, X_eval):
  """Compile `net` (Adam, binary cross-entropy), train it for 25 epochs and
  return its predictions for `X_eval` flattened to 1-D."""
  net.compile(
      loss=tf.keras.losses.binary_crossentropy,
      # `lr` is a deprecated alias that newer Keras releases reject.
      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
      metrics=[
          tf.keras.metrics.AUC(name='AUC')
      ]
  )
  net.fit(X_fit, y_fit, epochs=25, verbose=0)
  return np.ravel(net.predict(X_eval))


with open("results/dt_latest_auc.csv", "w") as ff:
  ff.write("disk_model,ml_model,period,pos_sample,neg_sample,auc,tpr,fpr\n")

  for model in models:
    print("Model: {0}".format(model))
    df = pd.read_csv("dataset/{0}.csv".format(model))
    df.dropna(inplace=True)

    # Three evaluation rounds per model (written as periods 1..3).
    for i in range(3):
      if model == "ST3000DM001":
        # This model uses a fresh random 75/25 split in every round instead of
        # the date-based windows below.
        y = df["label"]
        X = df.drop(columns=["date", "serial_number", "label"])
        X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75)
      else:
        # Expanding training window; the test window is the following quarter
        # (open-ended from 2021-01-01 in the last round).
        if i==0:
          train = df[(df.date<"2020-07-01")].sort_values(by="date")
          test = df[(df.date>"2020-06-30") & (df.date<"2020-10-01")].sort_values(by="date")
        elif i==1:
          train = df[(df.date<"2020-10-01")].sort_values(by="date")
          test = df[(df.date>"2020-09-30") & (df.date<"2021-01-01")].sort_values(by="date")
        else:
          train = df[(df.date<"2021-01-01")].sort_values(by="date")
          test = df[(df.date>"2020-12-31")].sort_values(by="date")

        y_test = test["label"]
        X_test = test.drop(columns=["date", "serial_number", "label"])
        y_train = train["label"]
        X_train = train.drop(columns=["date", "serial_number", "label"])

      # Two undersampled training sets: ratio 0.33 (used only by "mds") and
      # ratio 0.1 (all other models). The test set is left untouched.
      X_train1, y_train1 = RandomUnderSampler(sampling_strategy=.33).fit_resample(X_train, y_train)
      X_train, y_train = RandomUnderSampler(sampling_strategy=.1).fit_resample(X_train, y_train)

      for ml in ml_models:
        # Dispatch on the key explicitly. The original chain
        # `if ml != "cnnlstm" / elif ml == "cnnlstm" / else` left the dense
        # network branch unreachable, so "dnn" silently trained a linear SVC.
        if ml == "cnnlstm" or ml == "dnn":
          scaler = StandardScaler()
          X_train_scaled = scaler.fit_transform(X_train)
          X_test_scaled = scaler.transform(X_test)
          y_fit = y_train

          if ml == "cnnlstm":
            n_features = X_train_scaled.shape[1]
            # Conv1D expects (n_features, 1): add the channel axis explicitly.
            y_pred_proba = _fit_predict_keras(
                _make_cnnlstm(n_features), 0.01,
                X_train_scaled[..., np.newaxis], y_fit,
                X_test_scaled[..., np.newaxis])
          else:
            y_pred_proba = _fit_predict_keras(
                _make_dnn(), 0.003, X_train_scaled, y_fit, X_test_scaled)
        else:
          clf = _make_sklearn_pipeline(ml)
          if ml == "mds":
            X_fit, y_fit = X_train1, y_train1
          else:
            X_fit, y_fit = X_train, y_train
          clf.fit(X_fit, y_fit)
          # Column 1 of predict_proba holds the positive-class probability.
          y_pred_proba = clf.predict_proba(X_test)[:, 1]

        fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
        temp = pd.DataFrame({
              "tpr":tpr,
              "fpr":fpr
            })

        # Report the smallest FPR among ROC points whose TPR reaches 0.9.
        target_tpr = 0.9
        fpr_at_target = np.round(temp[temp.tpr>=target_tpr].fpr.min(),4)
        auc = metrics.roc_auc_score(y_test, y_pred_proba)
        # Count the labels the model was actually fit on ("mds" uses the 0.33 set;
        # the original always counted the 0.1 set).
        neg, pos = np.sum(y_fit.values == 0), np.sum(y_fit.values == 1)
        row = "{0},{1},{2},{3},{4},{5},{6},{7}".format(model, ml, i+1, pos, neg, np.round(auc, 4), target_tpr, fpr_at_target)
        print(row)
        ff.write(row + "\n")

## Random Analysis

In [None]:
# Other candidates: "ST8000NM0055", "ST4000DM000", "ST12000NM0007"
models = ["ST8000DM002"]

# Train XGBoost on 2020-01-01 .. 2020-11-30, score December 2020, and dump the
# per-row probability next to the true label.
for model in models[:]:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))

  in_train_window = (df.date>"2019-12-31") & (df.date<"2020-12-01")
  in_test_window = (df.date>"2020-11-30") & (df.date<"2021-01-01")
  train = df[in_train_window].sort_values(by="date")
  test = df[in_test_window].sort_values(by="date")
  del df

  id_and_target = ["date", "serial_number", "label"]
  X_train, y_train = train.drop(columns=id_and_target), train["label"]
  X_test, y_test = test.drop(columns=id_and_target), test["label"]

  # Class counts (negatives, positives) before and after balancing the training set.
  print((y_train.values == 0).sum(), (y_train.values == 1).sum())
  sampler = RandomUnderSampler(sampling_strategy=1)
  X_train, y_train = sampler.fit_resample(X_train, y_train)
  print((y_train.values == 0).sum(), (y_train.values == 1).sum())

  booster = XGBClassifier(n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss")
  clf = make_pipeline(StandardScaler(), booster)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)
  y_pred_proba = clf.predict_proba(X_test)[:, 1]
  fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
  auc = metrics.roc_auc_score(y_test, y_pred_proba)
  print("XGB -- AUC:", np.round(auc, 3))

  # One output line per test row, in test-set order.
  with open("results/logs_{0}.csv".format(model), "w") as ff:
    ff.write("probability,original\n")
    for row_idx, probability in enumerate(y_pred_proba):
      ff.write("{0},{1}\n".format(probability, y_test.iloc[row_idx]))

In [None]:
# Print, per model, the smallest FPR among the exported XGB ROC points whose TPR
# reaches a threshold. The ROC points come from results/XGB_<model>.csv.
# (Disabled alternatives from earlier experiments: recomputing the ROC from
# results/logs2_<model>.csv, and the thresholds 0.95 / 0.99 / 0.999.)
models = ["ST8000NM0055", "ST4000DM000", "ST12000NM0007", "ST8000DM002"]

# (value printed, TPR cutoff actually applied); the 0.9 row uses a 0.899 cutoff.
tpr_levels = [(0.5, 0.5), (0.9, 0.899)]

for model in models:
  print(model)
  temp = pd.read_csv("results/XGB_{0}.csv".format(model))
  for shown, cutoff in tpr_levels:
    lowest_fpr = temp.loc[temp.tpr >= cutoff, "fpr"].min()
    print(shown, "-", np.round(lowest_fpr, 3))

In [None]:
# Mean over days after 2020-12-01 of (sum of labels / number of rows) for ST4000DM000.
df = pd.read_csv("dataset/{0}.csv".format("ST4000DM000"))
df = df.assign(date=pd.to_datetime(df["date"]))
df = df.loc[df.date > "2020-12-01"]

per_day = df.groupby("date")
df1 = per_day.label.sum().to_frame()            # positives per day
df2 = per_day.serial_number.count().to_frame()  # rows per day
np.mean(df1.label / df2.serial_number)


In [None]:
# Mean over days before 2020-12-01 of (sum of labels / number of rows) for ST8000NM0055.
df = pd.read_csv("dataset/{0}.csv".format("ST8000NM0055"))
df = df.assign(date=pd.to_datetime(df["date"]))
df = df.loc[df.date < "2020-12-01"]

per_day = df.groupby("date")
df1 = per_day.label.sum().to_frame()            # positives per day
df2 = per_day.serial_number.count().to_frame()  # rows per day
np.mean(df1.label / df2.serial_number)


In [None]:
# Load the ST12000NM0007 dataset and parse its date column into datetimes.
df = pd.read_csv("dataset/{0}.csv".format("ST12000NM0007"))
df = df.assign(date=pd.to_datetime(df["date"]))


In [None]:

# Uses the `df` already present in the kernel (this cell does not load it):
# mean over days before 2021-01-01 of (sum of labels / number of rows).
df = df.loc[df.date < "2021-01-01"]

per_day = df.groupby("date")
df1 = per_day.label.sum().to_frame()            # positives per day
df2 = per_day.serial_number.count().to_frame()  # rows per day
np.mean(df1.label / df2.serial_number)

In [None]:
# Mean AUC and FPR per (disk model, ML model), as percentages rounded to 2 decimals.
# NOTE(review): this reads results/latest_auc.csv, while the evaluation cell
# writes results/dt_latest_auc.csv — confirm which file is intended.
df = pd.read_csv("results/latest_auc.csv")
# Select the columns with a list: tuple selection on a groupby
# (`[...]["auc", "fpr"]`) raises in pandas >= 2.0.
np.round(df.groupby(["disk_model","ml_model"])[["auc", "fpr"]].mean()*100, 2).sort_values(by="disk_model")

In [None]:
## Feature scores
# Collect XGBoost feature importances per dataset: `score` maps each feature
# column to a list holding one importance value per model, in `models` order.
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007", "combined"]
score = {}
for model in models:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  X = df.drop(columns=["date", "serial_number", "label"])
  y = df["label"]

  # 75/25 split; training data balanced (ratio 1), test data undersampled to 0.1.
  X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75)
  X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train, y_train)
  X_test, y_test = RandomUnderSampler(sampling_strategy=.1).fit_resample(X_test, y_test)

  clf = XGBClassifier(n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss")
  clf.fit(X_train, y_train)
  imp = np.round(clf.feature_importances_, 2)
  print(imp)
  columns = X.columns

  # Append this model's importance to each feature's running list.
  for i, column in enumerate(columns):
    score.setdefault(column, []).append(imp[i])

In [None]:
# Collect random-forest feature importances per dataset: `score2` maps each
# feature column to a list holding one importance value per model. The forest
# is fit on every row of the dataset, with no split or resampling.
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007", "combined"]
score2 = {}
for model in models:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  X = df.drop(columns=["date", "serial_number", "label"])
  y = df["label"]

  clf = RandomForestClassifier(n_estimators=10)
  clf.fit(X, y)
  imp = np.round(clf.feature_importances_, 4)
  print(imp)
  columns = X.columns

  # Append this model's importance to each feature's running list.
  for i, column in enumerate(columns):
    score2.setdefault(column, []).append(imp[i])

In [None]:
# Re-run the XGBoost benchmark per dataset and print the AUC only; this cell
# does not write the ROC curve to results/.
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007", "combined"]

for model in models[:]:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  X = df.drop(columns=["date", "serial_number", "label"])
  y = df["label"]

  # 20% of the rows for training; the 0.33-ratio set is built but not used below.
  X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2)
  X_train1, y_train1 = RandomUnderSampler(sampling_strategy=.33).fit_resample(X_train, y_train)
  X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train, y_train)
  X_test, y_test = RandomUnderSampler(sampling_strategy=.1).fit_resample(X_test, y_test)

  booster = XGBClassifier(n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss")
  clf = make_pipeline(StandardScaler(), booster)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)
  y_pred_proba = clf.predict_proba(X_test)[:, 1]
  fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
  auc = metrics.roc_auc_score(y_test, y_pred_proba)
  print("XGB -- AUC:", np.round(auc, 3))

In [None]:
# Average each feature's importances in `score2` across models, print them,
# then show the averages normalised so that they sum to 1.
scores = [np.round(np.mean(values), 4) for values in score2.values()]
for key, mean_importance in zip(score2, scores):
  print(key, mean_importance)

print(scores)
np.round(np.array(scores) / np.sum(scores), 4)

In [None]:
## Disk error ratio
# For every model log, count the data rows, the distinct drives, the drives
# with at least one row whose column 10 is > 0, and the drives with at least
# two such rows; then print the share of drives with an error.
# NOTE(review): the columns are addressed by position (1 = drive id,
# 10 = error counter) on a plain comma split; this presumably matches the
# models_logs schema and contains no quoted commas — confirm.
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000", "ST12000NM0007"]
total = []
for model in models:
  disks = set()        # every drive id seen
  errors = set()       # drives with >= 1 error row
  double_errs = set()  # drives with >= 2 error rows
  print("Model: {0}".format(model))

  # Count data rows only. The original counter started at 1 and also ticked for
  # the header line, so it ended at (data rows + 2) and inflated the totals.
  count = 0
  with open("models_logs/{0}.csv".format(model), "r") as ff:
    next(ff, None)  # skip the header line (no-op on an empty file)
    for line in ff:
      count += 1
      row = line.split(",")
      serial = row[1]
      disks.add(serial)
      if row[10] != "" and float(row[10]) > 0:
        if serial in errors:
          double_errs.add(serial)
        else:
          errors.add(serial)

  total.append(count)
  # Guard against a log without data rows instead of dividing by zero.
  error_ratio = np.round(len(errors)/len(disks), 4) if disks else float("nan")
  print(count, len(disks), len(errors), len(double_errs), error_ratio)

print(np.sum(total))

In [None]:
## Transfer Learning
# Fit one XGBoost pipeline on ST12000NM0007 and score it, unchanged, on every
# other model's full dataset; each ROC curve is written to results/.
models = ["ST3000DM001", "ST8000NM0055", "ST8000DM002", "ST4000DM000"]

df = pd.read_csv("dataset/{0}.csv".format("ST12000NM0007"))
X_train = df.drop(columns=["date", "serial_number", "label"])
y_train = df["label"]
del df
# Undersample the source data to a 0.33 minority/majority ratio.
X_train, y_train = RandomUnderSampler(sampling_strategy=.33).fit_resample(X_train, y_train)

booster = XGBClassifier(n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss")
clf = make_pipeline(StandardScaler(), booster)
clf.fit(X_train, y_train)

for model in models:
  print("Model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))
  df.dropna(inplace=True)
  X_test = df.drop(columns=["date", "serial_number", "label"])
  y_test = df["label"]
  del df

  # The target data is scored as-is (no resampling of the test set).
  y_pred_proba = clf.predict_proba(X_test)[:, 1]
  fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  print("XGB -- AUC:", np.round(auc, 3))
  with open("results/{0}_ST12000NM0007_{1}.csv".format("XGB", model), "w") as ff:
    ff.write("tpr,fpr\n")
    for tpr_value, fpr_value in zip(tpr, fpr):
      ff.write("{0},{1}\n".format(tpr_value, fpr_value))


In [None]:
## Transfer Learning
# Sweep the training-window start month from November back to January 2020
# (the window always ends on 2020-11-30) and score each fit on December 2020.
models = ["ST4000DM000", "ST8000NM0055", "ST8000DM002", "ST12000NM0007"]
id_and_target = ["date", "serial_number", "label"]

# Only the models from index 3 onwards are processed.
for model in models[3:]:
  print("model: {0}".format(model))
  df = pd.read_csv("dataset/{0}.csv".format(model))

  in_december = (df.date>"2020-11-30") & (df.date<"2021-01-01")
  test = df[in_december].sort_values(by="date")
  y_test = test["label"]
  X_test = test.drop(columns=id_and_target)

  for i in range(11,0,-1):
    print("\tperiod: {0}".format(i))
    # Single-digit months need a zero-padded date string.
    start_template = "2020-{0}-01" if i>9 else "2020-0{0}-01"
    window_start = start_template.format(i)
    train = df[(df.date>=window_start) & (df.date<"2020-12-01")].sort_values(by="date")

    X_train = train.drop(columns=id_and_target)
    y_train = train["label"]
    X_train, y_train = RandomUnderSampler(sampling_strategy=.33).fit_resample(X_train, y_train)

    booster = XGBClassifier(n_estimators=125, max_depth=6, learning_rate=0.018, objective= 'binary:logistic', eval_metric="mlogloss")
    clf = make_pipeline(StandardScaler(), booster)
    clf.fit(X_train, y_train)

    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    print("\tXGB -- AUC:", np.round(auc, 3))

    # Smallest FPR among ROC points whose TPR reaches 0.9.
    temp = pd.DataFrame({"tpr": tpr, "fpr": fpr})
    lowest_fpr = temp.loc[temp.tpr >= 0.9, "fpr"].min()
    print("\ttpr=0.9 - fpr={0}".format(np.round(lowest_fpr, 3)))