In [34]:
import os
import gc
import json
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, RobustScaler
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from utils import (
    CYCLE_ID_FAILURE_FILEPATH,
    CYCLE_ID_FILEPATH,
    NO_DUP_FILEPATH,
    PREPROCESSED_FILEPATH,
    RANDOM_SEED,
    SPARK_MEMORY,
    TRAINING_FILEPATH,
    cycle_id_col,
    date_col,
    get_const_columns,
    get_failed_cycles,
    get_less_na_cols,
    id_cols,
    make_cycle_id,
    parse_na,
    sort_cols,
    target_label,
    scale_features,
    lag_features,
    model_transform_features
)

# Notebook-wide pandas display limits: at most 20 rows and 200 columns are
# rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 200

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# ---------------------------------------------------------------------------
# Filesystem layout used by this notebook.
#
# NOTE: NO_DUP_FILEPATH, PREPROCESSED_FILEPATH, CYCLE_ID_FILEPATH,
# CYCLE_ID_FAILURE_FILEPATH and TRAINING_FILEPATH are also imported from
# `utils` in the first cell; the assignments below rebind those names, so the
# values defined here are the ones every later cell sees.
# ---------------------------------------------------------------------------
DATA_DIR = '../data'
MODELS_DIR = '../models'


def _data_path(name):
    """Return the path of `name` inside DATA_DIR."""
    return os.path.join(DATA_DIR, name)


def _model_path(name):
    """Return the path of `name` inside MODELS_DIR."""
    return os.path.join(MODELS_DIR, name)


# Input data and intermediate datasets.
DATA_FILEPATH = _data_path('harddrive.csv')
NO_DUP_FILEPATH = _data_path('harddrive_nodup')
PREPROCESSED_FILEPATH = _data_path('harddrive_preprocessed')
CYCLE_ID_FILEPATH = _data_path('cycle_id.csv')
CYCLE_ID_FAILURE_FILEPATH = _data_path('cycle_id_failure.csv')
TRAINING_FILEPATH = _data_path('harddrive_training.csv')
TRAIN_FILEPATH = _data_path('df_train.csv')
VAL_FILEPATH = _data_path('df_val.csv')
TEST_FILEPATH = _data_path('df_test.csv')

# Model artifacts; the directory is created if it does not exist yet.
os.makedirs(MODELS_DIR, exist_ok=True)
SCALER_PIPELINE_FILEPATH = _model_path('scaler')
ML_MODEL_FILEPATH = _model_path('model')
ML_EVAL_SCORES_FILEPATH = _model_path('model_scores.json')

# JSON files holding the column lists used at each pipeline stage.
PREPROCESSED_COLUMNS_FILEPATH = _model_path('preprocessed_columns.json')
LAGGED_COLS_FILEPATH = _model_path('lagged_columns.json')
NUM_FEATURE_COLUMNS_FILEPATH = _model_path('num_feature_columns.json')
SCALED_COLUMNS_FILEPATH = _model_path('scaled_columns.json')
FEATURE_COLUMNS_FILEPATH = _model_path('feature_columns.json')

## Random seed

In [3]:
# Fixed seed for reproducibility. This rebinds the RANDOM_SEED name that the
# first cell imports from `utils`.
# NOTE(review): no later cell in this notebook reads RANDOM_SEED directly —
# confirm whether it is still needed here.
RANDOM_SEED = 42

## Load data with PySpark session

In [4]:
# Start (or reuse) the Spark session used by every later cell.
# NOTE(review): `SPARK_MEMORY` is imported from `utils` but not used; confirm
# whether the hard-coded "12g" driver memory should defer to it.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/08/24 19:08:06 WARN Utils: Your hostname, cedric-yu-work resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/24 19:08:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/24 19:08:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Names of the columns that play a special role in the pipeline. These rebind
# the identically named objects imported from `utils` in the first cell.
id_cols = ['serial_number', 'model']   # identify one physical drive
date_col = 'date'                      # observation date
cycle_id_col = 'cycle_id'              # column produced by make_cycle_id
target_label = 'failure'               # label column, stripped from the column lists loaded below


In [6]:
def _load_column_list(path, drop):
    """Load a JSON list of column names from `path`, omitting `drop`.

    Args:
        path: Location of a JSON file containing a list of column names.
        drop: Column name to exclude from the result (every occurrence).

    Returns:
        A new list with the stored columns in their original order, minus
        any entry equal to `drop`.
    """
    with open(path, 'r', encoding='utf-8') as f:
        columns = json.load(f)
    # Filter rather than list.remove(): remove() only deletes the first match,
    # so a duplicated label entry would otherwise survive.
    return [col for col in columns if col != drop]


# Column lists stored in MODELS_DIR, one per pipeline stage. The target label
# is stripped from each list because no later cell selects it.
preprocessed_cols = _load_column_list(PREPROCESSED_COLUMNS_FILEPATH, target_label)
lagged_cols = _load_column_list(LAGGED_COLS_FILEPATH, target_label)
num_feature_cols = _load_column_list(NUM_FEATURE_COLUMNS_FILEPATH, target_label)
scaled_cols = _load_column_list(SCALED_COLUMNS_FILEPATH, target_label)
feature_cols = _load_column_list(FEATURE_COLUMNS_FILEPATH, target_label)

In [7]:
# Number of rows (after sorting by `sort_cols`) used as the inference sample.
N_INFERENCE_ROWS = 100

# Read the preprocessed CSV data, keep the first N_INFERENCE_ROWS rows in
# `sort_cols` order (imported from `utils`), and project to the preprocessed
# feature columns. The projection happens *before* cache() so that the cached
# frame is the one bound to `df_inf` (and can later be released with
# `df_inf.unpersist()`), instead of caching a wider frame whose handle is
# immediately discarded.
df_inf = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(PREPROCESSED_FILEPATH)
    .sort(sort_cols)
    .limit(N_INFERENCE_ROWS)
    .select(*preprocessed_cols)
    .cache()
)

24/08/24 19:08:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [8]:
# Preview the inference sample (show() prints the first 20 rows by default).
df_inf.show()



+----------+-------------+------------------+--------------------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+-------------+-------------+-------------+
|      date|serial_number|             model|      capacity_bytes|smart_1_normalized|smart_1_raw|smart_3_normalized|smart_3_raw|smart_4_normalized|smart_4_raw|smart_5_normalized|smart_5_raw|smart_7_normalized|smart_7_raw|smart_9_normalized|smart_9_raw|smart_10_normalized|smart_10_raw|smart_12_normalized|smart_12_raw|smart_188_raw|smart_192_normalized|smart_192_raw|smart_193_normalized|smart_193_raw|smart_194_nor

                                                                                

## Make cycle id

In [9]:
# Build the cycle-id lookup with utils.make_cycle_id; per the displayed schema
# the result has columns serial_number, model, date and cycle_id.
# NOTE(review): the WindowExec warnings logged under this cell suggest the
# helper uses a window without a partition — confirm in utils.make_cycle_id.
df_cycle_id = make_cycle_id(df_inf)
# cache() returns the DataFrame itself, so as the last expression of the cell
# it also displays the frame's schema repr.
df_cycle_id.cache()

24/08/24 19:08:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/24 19:08:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/24 19:08:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/24 19:08:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/24 19:08:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


DataFrame[serial_number: string, model: string, date: date, cycle_id: int]

In [13]:
# Inspect the (serial_number, model, date) -> cycle_id mapping.
df_cycle_id.show()

+-------------+------------------+----------+--------+
|serial_number|             model|      date|cycle_id|
+-------------+------------------+----------+--------+
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-01|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-02|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-03|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-04|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-05|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-06|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-07|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-08|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-09|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-10|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-11|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-12|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-13|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-14|       1|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-15|       1|
|    13H2B

In [10]:
# Attach cycle_id to each observation by matching on drive identity and date,
# then order the rows by those keys followed by cycle_id.
# NOTE: this rebinds df_inf in place, so the cell is not meant to be re-run
# without re-running the load cell first.
df_inf = (
    df_inf
    .join(df_cycle_id, on=[*id_cols, date_col], how='inner')
    .sort(*id_cols, date_col, cycle_id_col)
)

## Lag features

In [11]:
# Build lagged features with utils.lag_features, then discard every row that
# contains a null and restore the `sort_cols` ordering.
# NOTE(review): presumably the nulls come from the first `num_lags` rows of
# each series, which have no history — confirm in utils.lag_features.
num_lags = 3
df_inf = (
    lag_features(df_inf, num_lags=num_lags)
    .dropna()
    .sort(sort_cols)
)

In [12]:
# Keep only the columns listed in lagged_columns.json (loaded earlier as
# `lagged_cols`, with the target label removed).
df_inf = df_inf.select(*lagged_cols)

In [14]:
# Preview the frame after lagging and column selection.
df_inf.show()

+-------------+------------------+----------+--------------------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+-------------+-------------+-------------+-------------+--------+--------------------+--------------------+--------------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+------------------------+------------------------+------------------------+-----------------+--------------

## Scale features

In [16]:
# Cache the identifying columns (drive id columns + date) of the rows that
# survived the lag/dropna step.
index_cols = df_inf.select(*id_cols, date_col).cache()

In [18]:
# Inspect the identifying columns of the rows that will be scored.
index_cols.show()

+-------------+------------------+----------+
|serial_number|             model|      date|
+-------------+------------------+----------+
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-04|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-05|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-06|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-07|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-08|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-09|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-10|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-11|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-12|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-13|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-14|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-15|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-16|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-17|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-18|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-19|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-20|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-21|
|    13H2B97AS|TOSHIBA DT01ACA300|

In [19]:
# Load the saved scaling pipeline from SCALER_PIPELINE_FILEPATH
# (MODELS_DIR/'scaler').
scalerModel = PipelineModel.load(SCALER_PIPELINE_FILEPATH)

In [20]:
# Display the loaded pipeline's repr as a quick check that loading succeeded.
scalerModel

PipelineModel_125d7a922609

In [35]:
# Apply the previously loaded scaler pipeline to the numeric feature columns
# via utils.scale_features, called with inference=True.
df_inf_scaled = scale_features(
    df=df_inf,
    scalerModel=scalerModel,
    num_feature_cols=num_feature_cols,
    inference=True,
)

In [36]:
# Keep the identifying columns followed by the columns listed in
# scaled_columns.json (loaded earlier as `scaled_cols`).
df_inf_scaled = df_inf_scaled.select(*id_cols, date_col, *scaled_cols)

In [37]:
# Preview the scaled feature frame.
df_inf_scaled.show()

+-------------+------------------+----------+------------------+--------------------+------------------+-----------+------------------+------------------+------------------+-----------+------------------+------------------+------------------+-------------------+------------+-------------------+------------+-------------+--------------------+-------------+--------------------+--------------------+--------------------+------------------+--------------------+-------------+--------------------+-------------+-------------+------------------------+------------------------+------------------------+--------------------+--------------------+--------------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+------------------------+------------------------+------------------------+------------------+------------------+------------------+------------------------+------------------------+------------------------+---

## ML model inference

In [29]:
# Load the saved gradient-boosted-tree classifier from ML_MODEL_FILEPATH
# (MODELS_DIR/'model').
model = GBTClassificationModel.load(ML_MODEL_FILEPATH)

In [38]:
# Turn the scaled columns into the model input via
# utils.model_transform_features, called with inference=True. Per the output
# displayed below, the returned frame holds the id columns, the date and a
# single `features` vector column.
# NOTE(review): `assembler_inf` is not used by any later cell in this notebook.
df_inf_scaled, assembler_inf = model_transform_features(
    df=df_inf_scaled,
    feature_cols=feature_cols,
    inference=True,
)

In [39]:
# Preview the assembled `features` vector next to the identifying columns.
df_inf_scaled.show()

+-------------+------------------+----------+--------------------+
|serial_number|             model|      date|            features|
+-------------+------------------+----------+--------------------+
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-04|(108,[0,2,5,8,9,1...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-05|(108,[0,2,5,8,9,1...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-06|(108,[0,1,2,5,8,9...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-07|(108,[0,1,2,5,8,9...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-08|(108,[0,2,5,8,9,1...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-09|(108,[0,1,2,5,8,9...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-10|(108,[0,2,5,8,9,1...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-11|(108,[0,1,2,5,8,9...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-12|(108,[0,2,5,8,9,1...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-13|(108,[0,2,5,8,9,1...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-14|(108,[0,1,2,5,8,9...|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-15|(108,[0,2,5,8,9,1

In [25]:
# Display the loaded classifier's summary (uid, numTrees, numClasses,
# numFeatures).
model

GBTClassificationModel: uid = GBTClassifier_559b68a00181, numTrees=20, numClasses=2, numFeatures=108

In [40]:
# Score every row with the loaded classifier; per the output displayed below,
# transform() adds rawPrediction, probability and prediction columns.
pred_inf = model.transform(df_inf_scaled)

In [41]:
# Preview the predictions in Spark's text table format.
pred_inf.show()

+-------------+------------------+----------+--------------------+--------------------+--------------------+----------+
|serial_number|             model|      date|            features|       rawPrediction|         probability|prediction|
+-------------+------------------+----------+--------------------+--------------------+--------------------+----------+
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-04|(108,[0,2,5,8,9,1...|[1.21542319000156...|[0.91914945276257...|       0.0|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-05|(108,[0,2,5,8,9,1...|[1.21542319000156...|[0.91914945276257...|       0.0|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-06|(108,[0,1,2,5,8,9...|[1.21542319000156...|[0.91914945276257...|       0.0|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-07|(108,[0,1,2,5,8,9...|[1.15980173125695...|[0.9104876281582,...|       0.0|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-01-08|(108,[0,2,5,8,9,1...|[1.21542319000156...|[0.91914945276257...|       0.0|
|    13H2B97AS|TOSHIBA DT01ACA300|2016-0

24/08/24 19:13:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [42]:
# Collect the predictions to the driver as a pandas DataFrame for rich
# display. This is safe here because the input was limited to 100 rows when it
# was loaded; avoid toPandas() on an unbounded frame.
pred_inf.toPandas()

Unnamed: 0,serial_number,model,date,features,rawPrediction,probability,prediction
0,13H2B97AS,TOSHIBA DT01ACA300,2016-01-04,"(14.285714285714285, 0.0, 16.75, 0.0, 0.0, 1.0...","[1.2154231900015608, -1.2154231900015608]","[0.9191494527625703, 0.08085054723742968]",0.0
1,13H2B97AS,TOSHIBA DT01ACA300,2016-01-05,"(14.285714285714285, 0.0, 16.75, 0.0, 0.0, 1.0...","[1.2154231900015608, -1.2154231900015608]","[0.9191494527625703, 0.08085054723742968]",0.0
2,13H2B97AS,TOSHIBA DT01ACA300,2016-01-06,"(14.142857142857142, 0.0007978138683865501, 16...","[1.2154231900015608, -1.2154231900015608]","[0.9191494527625703, 0.08085054723742968]",0.0
3,13H2B97AS,TOSHIBA DT01ACA300,2016-01-07,"(13.714285714285714, 0.002792333322377111, 16....","[1.159801731256955, -1.159801731256955]","[0.9104876281582, 0.08951237184180005]",0.0
4,13H2B97AS,TOSHIBA DT01ACA300,2016-01-08,"(14.285714285714285, 0.0, 16.75, 0.0, 0.0, 1.0...","[1.2154231900015608, -1.2154231900015608]","[0.9191494527625703, 0.08085054723742968]",0.0
...,...,...,...,...,...,...,...
83,13H32WEAS,TOSHIBA DT01ACA300,2016-04-25,"(14.285714285714285, 0.0, 12.5, 0.0, 0.0, 0.52...","[0.8605891359793435, -0.8605891359793435]","[0.8482805429642534, 0.15171945703574663]",0.0
84,13H32WEAS,TOSHIBA DT01ACA300,2016-04-26,"(14.285714285714285, 0.0, 12.5, 0.0, 0.0, 0.52...","[0.8605891359793435, -0.8605891359793435]","[0.8482805429642534, 0.15171945703574663]",0.0
85,13H32WEAS,TOSHIBA DT01ACA300,2016-04-27,"(14.285714285714285, 0.0, 12.5, 0.0, 0.0, 0.52...","[0.8605891359793435, -0.8605891359793435]","[0.8482805429642534, 0.15171945703574663]",0.0
86,13H32WEAS,TOSHIBA DT01ACA300,2016-04-28,"(14.285714285714285, 0.0, 12.5, 0.0, 0.0, 0.52...","[0.8605891359793435, -0.8605891359793435]","[0.8482805429642534, 0.15171945703574663]",0.0
