In [23]:
import os
import gc

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, RobustScaler
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Keep pandas reprs compact in rows but wide enough to show many columns.
for option_name, option_value in (
    ('display.max_rows', 20),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

# Load IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Input data: one CSV per split, all located under DATA_DIR.
DATA_DIR = '../data'
TRAIN_FILEPATH, VAL_FILEPATH, TEST_FILEPATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('df_train.csv', 'df_val.csv', 'df_test.csv')
)

# Directory reserved for model artifacts.
MODELS_DIR = '../models'

## Random seed

In [4]:
# Fixed seed intended for stochastic steps so that runs can be reproduced.
RANDOM_SEED = 42

## Load data with PySpark session

In [5]:
# Configure the session builder first, then create (or reuse) the session.
# The driver is given 12g of memory; the application is named "SparkSQL".
spark_builder = SparkSession.builder.config("spark.driver.memory", "12g")
spark_builder = spark_builder.appName("SparkSQL")
spark = spark_builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/08/24 17:10:15 WARN Utils: Your hostname, cedric-yu-work resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/24 17:10:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/24 17:10:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Names of the non-feature columns in the input CSVs.
id_cols = ['serial_number', 'model']   # identifier columns
date_col, cycle_id_col = 'date', 'cycle_id'
target_label = 'failure'               # label column used for training


In [7]:
def _read_csv_with_header(path):
    """Read a CSV at ``path`` into a Spark DataFrame.

    The first row is treated as the header and column types are inferred
    from the data.
    """
    reader = spark.read.option("header", "true").option("inferSchema", "true")
    return reader.csv(path)


# NOTE(review): the schema is inferred independently for each file, so the
# three splits are only guaranteed to share dtypes if their contents agree.
df_train = _read_csv_with_header(TRAIN_FILEPATH)
df_val = _read_csv_with_header(VAL_FILEPATH)
df_test = _read_csv_with_header(TEST_FILEPATH)


In [8]:
# Print the first 10 rows of the training split as a quick sanity check.
df_train.show(10)

24/08/24 17:10:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+--------------------+------------------+-----------+------------------+------------------+------------------+-----------+------------------+------------------+------------------+-------------------+------------+-------------------+------------+-------------+--------------------+-----------------+--------------------+--------------------+--------------------+------------------+--------------------+-------------+--------------------+-------------+-------------+------------------------+------------------------+------------------------+--------------------+--------------------+--------------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+------------------------+------------------------+------------------------+------------------+------------------+------------------+------------------------+------------------------+------------------------+-----------------+-----------------

## Prepare datasets for ML model

In [10]:
# Every column that is not an identifier, date, cycle id, or the label is
# used as a model feature; the column order of df_train is preserved.
non_feature_cols = set(id_cols) | {date_col, cycle_id_col, target_label}
feature_cols = [name for name in df_train.columns
                if name not in non_feature_cols]

In [12]:
# One VectorAssembler per split, all configured identically: pack the
# feature columns into a single vector column named 'features'.
assembler_train, assembler_val, assembler_test = (
    VectorAssembler(inputCols=feature_cols, outputCol='features')
    for _ in range(3)
)

In [15]:
# Assemble the feature vector for each split and keep only what the
# classifier needs: the label and the assembled 'features' column.
model_input_cols = [target_label, 'features']

df_train1 = assembler_train.transform(df_train).select(model_input_cols)
df_val1 = assembler_val.transform(df_val).select(model_input_cols)
df_test1 = assembler_test.transform(df_test).select(model_input_cols)

In [21]:
# Gradient-boosted tree classifier trained on the assembled training split.
# The seed is set explicitly from RANDOM_SEED so that training is
# reproducible across kernel restarts; previously the notebook defined
# RANDOM_SEED but never passed it to the estimator.
gbtc_estimator = GBTClassifier(labelCol=target_label,
                               featuresCol='features',
                               seed=RANDOM_SEED)

# `gbtc` is the fitted model (not the estimator); later cells call
# `gbtc.transform(...)` on it. Keeping the estimator under its own name
# avoids rebinding one variable to two different kinds of object.
gbtc = gbtc_estimator.fit(df_train1)

In [22]:
# Score every split with the fitted model; the order of the targets on the
# left matches the order of the inputs on the right.
pred_train, pred_val, pred_test = (
    gbtc.transform(split_df)
    for split_df in (df_train1, df_val1, df_test1)
)

In [24]:
# Preview the training predictions (label, features, rawPrediction,
# probability, prediction) using show()'s default row count.
pred_train.show()

+-------+--------------------+--------------------+--------------------+----------+
|failure|            features|       rawPrediction|         probability|prediction|
+-------+--------------------+--------------------+--------------------+----------+
|      0|(108,[0,1,2,5,8,9...|[1.18900284930927...|[0.91513467861905...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.18900284930927...|[0.91513467861905...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.16357792623223...|[0.91110124201837...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.16357792623223...|[0.91110124201837...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.16357792623223...|[0.91110124201837...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.16357792623223...|[0.91110124201837...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.18900284930927...|[0.91513467861905...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.18900284930927...|[0.91513467861905...|       0.0|
|      0|(108,[0,1,2,5,8,9...|[1.16357792623223...|[0.91110124201837...|    

24/08/24 17:22:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [26]:
# F1 evaluator comparing the 'prediction' column against the label column.
# NOTE(review): per the PySpark docs, 'f1' here is the class-weighted F1; if
# failures are rare this is dominated by the majority class -- confirm that
# this is the metric wanted.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    predictionCol='prediction',
    labelCol=target_label,
)

In [27]:
# F1 on the training split (the data the model was fitted on).
evaluator.evaluate(pred_train)

0.93245717112637

In [28]:
# F1 on the validation split.
evaluator.evaluate(pred_val)

0.8581985592123039

In [29]:
# F1 on the held-out test split.
evaluator.evaluate(pred_test)

0.9121424894252763