In [1]:
import os
import gc

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 200)

%load_ext autoreload
%autoreload 2

In [2]:
data_dir = '../data'
data_filepath = os.path.join(data_dir, 'harddrive_nodup')
os.path.exists(data_filepath)

True

## Random seed

In [3]:
RANDOM_SEED = 42
test_frac = 0.1

## Load data with PySpark session

In [4]:
spark = (SparkSession.builder
         .config("spark.driver.memory", "12g")
         .appName("SparkSQL").getOrCreate())

your 131072x1 screen size is bogus. expect trouble
24/08/23 16:00:26 WARN Utils: Your hostname, cedric-yu-work resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/23 16:00:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 16:00:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
date_col = 'date'
target_label = 'failure'
id_cols = ['serial_number', 'model']


In [6]:
df = (spark.read.option("header", "true")
      .option("inferSchema", "true")
      .csv(data_filepath))
df = df.sort(id_cols + [date_col])


                                                                                

In [7]:
df.show(10)

24/08/23 16:00:41 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+----------+-------------+------------------+--------------------+-------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+----

                                                                                

In [8]:
df.count()

3058798

In [9]:
gc.collect()

166

In [10]:
df.groupby([target_label]).count().show()



+-------+-------+
|failure|  count|
+-------+-------+
|      1|    205|
|      0|3058593|
+-------+-------+



                                                                                

Missing values by column

In [28]:
df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns
           if c != 'date']).show()



+-------------+-----+--------------+-------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------

                                                                                

## Add 'id_index' column corresponding to unique id column values and identify existence of failure for each

In [11]:
id_cols_hasfailure = df.where(df[target_label] == 1).select(id_cols).distinct().sort('model', ascending=False)
id_cols_hasfailure = id_cols_hasfailure.withColumn("has_failure", F.lit(1))

id_cols_unique = df.select(id_cols).distinct().sort('model', ascending=False)

id_cols_unique = id_cols_unique.join(id_cols_hasfailure, on=id_cols, how='outer')
id_cols_unique = id_cols_unique.na.fill(0, subset='has_failure')
w = Window().orderBy(F.lit('A'))
id_cols_unique = id_cols_unique.withColumn("id_index", F.row_number().over(w))
id_cols_unique = id_cols_unique.cache()

24/08/23 16:00:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:00:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:00:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [12]:
id_cols_unique.toPandas()

24/08/23 16:01:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:01:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:01:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:01:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:01:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/23 16:01:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
          

Unnamed: 0,serial_number,model,has_failure,id_index
0,13H85BMGS,TOSHIBA DT01ACA300,0,1
1,2EG3NZ2P,HGST HUH728080ALE600,0,2
2,2EG4B1VR,HGST HUH728080ALE600,0,3
3,2EG4L4VP,HGST HUH728080ALE600,0,4
4,4596K0X6FMYB,TOSHIBA MD04ABA400V,0,5
...,...,...,...,...
65988,Z4D0DNLY,ST6000DX000,0,65989
65989,Z4D0E5MF,ST6000DX000,0,65990
65990,Z4D0E7MS,ST6000DX000,0,65991
65991,Z4D1CBVD,ST6000DX000,0,65992


In [13]:
df = df.join(id_cols_unique.drop('has_failure'), on=id_cols, how='left')

In [14]:
df.show(10)

+-------------+--------------------+----------+-------------------+-------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+---

In [15]:
id_cols_unique.groupby(['has_failure']).count().show()

+-----------+-----+
|has_failure|count|
+-----------+-----+
|          0|65788|
|          1|  205|
+-----------+-----+



## Train-validation-test split, grouped by id_index and stratified by target label

In [16]:
yes_failure_ind = id_cols_unique.where(id_cols_unique['has_failure'] == 1).select('id_index')
no_failure_ind = id_cols_unique.where(id_cols_unique['has_failure'] == 0).select('id_index')

[yes_failure_ind_train, yes_failure_ind_val, yes_failure_ind_test] = (
    yes_failure_ind
    .randomSplit([1.- 2 * test_frac, test_frac, test_frac],
                 seed=RANDOM_SEED))
[no_failure_ind_train, no_failure_ind_val, no_failure_ind_test] = (
    no_failure_ind
    .randomSplit([1.- 2 * test_frac, test_frac, test_frac],
                 seed=RANDOM_SEED))

In [17]:
for ind_ in [yes_failure_ind_train, yes_failure_ind_val, yes_failure_ind_test,
             no_failure_ind_train, no_failure_ind_val, no_failure_ind_test]:
    print(ind_.count())

173
18
14
52482
6629
6677


In [18]:
yes_failure_ind_train.show(20)

+--------+
|id_index|
+--------+
|      85|
|     353|
|    2100|
|    2567|
|    2627|
|    2756|
|    2917|
|    2940|
|    3338|
|    3383|
|    4094|
|    5003|
|    5153|
|    7307|
|    7428|
|    8012|
|    8255|
|    8811|
|    9011|
|    9700|
+--------+
only showing top 20 rows



In [22]:
df.where(df['id_index'].isin(yes_failure_ind_train['id_index'])).show(10)

+-------------+--------------------+----------+-------------------+-------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+---

In [2]:
spark.stop()

NameError: name 'spark' is not defined