In [40]:
import os
import gc

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Cap how much of a pandas frame is rendered: at most 20 rows and 200 columns.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 200

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# File locations used by the project, all resolved relative to DATA_DIR.
DATA_DIR = '../data'
DATA_FILEPATH = os.path.join(DATA_DIR, 'harddrive_nodup')
CYCLE_ID_FILEPATH = os.path.join(DATA_DIR, 'cycle_id.csv')
CYCLE_ID_FAILURE_FILEPATH = os.path.join(DATA_DIR, 'cycle_id_failure.csv')
PREPROCESSED_FILEPATH = os.path.join(DATA_DIR, 'harddrive_preprocessed')
TRAINING_FILEPATH = os.path.join(DATA_DIR, 'harddrive_training.csv')

# Only the last expression of a cell is displayed, so the existence check is
# kept for the training CSV alone (the file that is loaded further down).
os.path.exists(TRAINING_FILEPATH)

True

## Random seed and hold-out fraction

In [3]:
# Seed handed to the random split so that it can be reproduced.
RANDOM_SEED: int = 42
# Weight given to the validation split and, again, to the test split.
test_frac: float = 0.1

## Load data with PySpark session

In [4]:
# Create the Spark session, or reuse the one already running, with the
# driver memory setting raised to 12g.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/08/24 13:50:14 WARN Utils: Your hostname, cedric-yu-work resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/24 13:50:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/24 13:50:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Names of the non-feature columns referenced throughout the notebook.
id_cols: list = ['serial_number', 'model']   # identifier columns
date_col: str = 'date'                       # date column
cycle_id_col: str = 'cycle_id'               # grouping key used for the split
target_label: str = 'failure'                # target column


In [6]:
# Read the training CSV (first row is the header, column types are inferred)
# and order the rows by the identifier columns, then by the date column.
df = (
    spark.read
    .csv(TRAINING_FILEPATH, header=True, inferSchema=True)
    .orderBy(*id_cols, date_col)
)


In [7]:
# Print the first 10 rows of the loaded frame.
df.show(n=10)

24/08/24 13:50:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------------+------------------+----------+-------+--------------------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+-------------+-------------+-------------+-------------+--------+--------------------+--------------------+--------------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+------------------------+------------------------+------------------------+-----------------+-----------------+-----------------+------------------------+------------------------+------------------------+-----------------+------

In [8]:
# Number of rows in the loaded training frame.
df.count()

2130

In [9]:
# Run a garbage-collection pass in the Python process; the displayed value is
# whatever gc.collect() returns (the number of unreachable objects it found).
gc.collect()

0

In [10]:
# Row count per value of the target column.
df.groupBy(target_label).count().show()

+-------+-----+
|failure|count|
+-------+-----+
|      1|  175|
|      0| 1955|
+-------+-----+



## Train-validation-test split, grouped by cycle_id

In [14]:
# List the distinct cycle ids in ascending order (show() prints the first 20).
df.select(cycle_id_col).distinct().orderBy(cycle_id_col).show()

+--------+
|cycle_id|
+--------+
|      14|
|    1340|
|    1927|
|    2176|
|    3043|
|    3594|
|    3605|
|    4323|
|    5635|
|    5905|
|    6514|
|    9017|
|    9623|
|   14154|
|   14446|
|   14841|
|   17279|
|   17507|
|   23583|
|   37632|
+--------+
only showing top 20 rows



In [15]:
# Split the distinct cycle ids, not the rows, so that every row of a given
# cycle ends up in the same split.
unique_cycle_ids = df.select(cycle_id_col).dropDuplicates().sort(cycle_id_col)

# Weights for train / validation / test, in that order.
split_weights = [1. - 2 * test_frac, test_frac, test_frac]

df_train_cycle, df_val_cycle, df_test_cycle = unique_cycle_ids.randomSplit(
    split_weights, seed=RANDOM_SEED)

In [19]:
# Number of distinct cycle ids assigned to the validation split.
df_val_cycle.count()

18

In [20]:
# Print how many cycle ids landed in the train, validation and test splits.
for cycle_split in (df_train_cycle, df_val_cycle, df_test_cycle):
    print(cycle_split.count())

146
18
11


In [33]:
def _rows_for_cycles(cycles):
    """Return the rows of `df` whose cycle id occurs in `cycles`.

    The identifier, date and cycle-id columns are dropped from the result,
    leaving the remaining columns of `df` untouched.
    """
    dropped_cols = id_cols + [date_col, cycle_id_col]
    return df.join(cycles, on=cycle_id_col, how='inner').drop(*dropped_cols)


df_train = _rows_for_cycles(df_train_cycle)
df_val = _rows_for_cycles(df_val_cycle)
df_test = _rows_for_cycles(df_test_cycle)


In [34]:
# Number of rows in the validation split after joining on the cycle id.
df_val.count()

174

In [35]:
# Feature columns: every column of the training frame that is not an
# identifier, the cycle id, the date or the target.
excluded_cols = set(id_cols) | {cycle_id_col, date_col, target_label}
num_cols = [name for name in df_train.columns if name not in excluded_cols]

In [41]:
# Min-max scale every feature column independently. MinMaxScaler works on
# vector columns, so each feature is first wrapped in a one-element vector
# ("<name>_vec") and then scaled into "<name>_scaled".
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in num_cols]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in num_cols]
pipeline = Pipeline(stages=assemblers + scalers)

# The scaling pipeline is fitted on the training split.
scalerModel = pipeline.fit(df_train)

# Keep the transformed frame under its own name and read the column list from
# it. Reading `df_train_scaled.columns` here would fail with NameError on a
# fresh kernel, because `df_train_scaled` is only defined by this statement.
df_train_transformed = scalerModel.transform(df_train)
df_train_scaled = df_train_transformed.select(
    *[col for col in df_train_transformed.columns
      if col.endswith("_scaled") or col == target_label])

In [42]:
# Keep only the target column and the "*_scaled" feature columns.
kept_cols = [name for name in df_train_scaled.columns
             if name == target_label or name.endswith("_scaled")]
df_train_scaled = df_train_scaled.select(*kept_cols)

In [43]:
# Preview the scaled training frame (show() prints the first 20 rows by default).
df_train_scaled.show()

[Stage 1535:>                                                       (0 + 1) / 1]

+---------------------+-------------------------+--------------------+-------------------------+--------------------+-------------------------+--------------------+-------------------------+--------------------+-------------------------+------------------+-------------------------+--------------------+--------------------------+-------------------+--------------------------+--------------------+--------------------+---------------------------+--------------------+---------------------------+--------------------+---------------------------+--------------------+---------------------------+--------------------+---------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------------+---------------------------+---------------------------+-------------------------------+-------------------------------+-------------------------------+------------------------+------------------------+----------------------

                                                                                

In [2]:
# Shut down the Spark session. This needs the `spark` object created by the
# session cell earlier in the notebook; in a kernel where that cell has not
# been run, the name is undefined and the call raises NameError.
spark.stop()

NameError: name 'spark' is not defined