In [1]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that the rest of the notebook reads data through.
spark = (
    SparkSession.builder
    .appName("data_exploration")
    .getOrCreate()
)

# Explicit schemas for the train and test CSVs.
# Both schemas share `id` plus the feature columns; the training schema adds the
# target columns at the end. The column names are declared once so the two
# schemas cannot drift apart.
_INT_FEATURE_NAMES = [
    "X_minimum", "X_maximum", "Y_minimum", "Y_maximum",
    "Pixels_areas", "X_perimeter", "Y_perimeter",
    "Sum_of_luminosity", "Minimum_luminosity", "Maximum_luminosity",
    "Length_conveyer", "TypeSteel_A300", "TypeSteel_A400",
    "SteelPlate_thickness",
]
_FLOAT_FEATURE_NAMES = [
    "Edges_index", "Empty_index", "Square_index", "OutsideX_index",
    "EdgesX_index", "EdgesY_index", "OutsideGlobal_index",
    "LogOfAreas", "LogX_index", "LogY_index",
    "Orientation_index", "Luminosity_index", "SigmoidOfAreas",
]
_TARGET_NAMES = [
    "Pastry", "Z_scratch", "K_scratch", "Stains",
    "Dirtiness", "Bumps", "Other_faults",
]


def _make_fields(names, dtype):
    """Return a fresh list of nullable StructFields, one per name, of type `dtype`."""
    return [StructField(name, dtype(), True) for name in names]


def _feature_fields():
    """Return new StructFields for `id` followed by every feature column, in file order."""
    return (
        _make_fields(["id"], IntegerType)
        + _make_fields(_INT_FEATURE_NAMES, IntegerType)
        + _make_fields(_FLOAT_FEATURE_NAMES, FloatType)
    )


# Each schema gets its own StructField instances so the two never share state.
steeltrain_schema = StructType(_feature_fields() + _make_fields(_TARGET_NAMES, IntegerType))
steeltest_schema = StructType(_feature_fields())

# Load the train and test CSVs using the explicit schemas declared above.
def _read_steel_csv(path, schema):
    """Read one headered CSV file at `path` into a DataFrame typed by `schema`."""
    return spark.read.csv(path, header=True, schema=schema)


steeltrain_path = "dataset/train.csv"
steeltest_path = "dataset/test.csv"
steel_train = _read_steel_csv(steeltrain_path, steeltrain_schema)
steel_test = _read_steel_csv(steeltest_path, steeltest_schema)

# Split the training columns into features and targets.
# Targets are the columns that exist only in the training frame; features are
# the shared columns minus the `id` key. Deriving the groups this way avoids
# hard-coding how many target columns sit at the end of the schema.
_test_col_names = set(steel_test.columns)
target_cols = [name for name in steel_train.columns if name not in _test_col_names]
feature_cols = [name for name in steel_train.columns
                if name in _test_col_names and name != "id"]
print(feature_cols,"\n", target_cols)

['X_minimum', 'X_maximum', 'Y_minimum', 'Y_maximum', 'Pixels_areas', 'X_perimeter', 'Y_perimeter', 'Sum_of_luminosity', 'Minimum_luminosity', 'Maximum_luminosity', 'Length_conveyer', 'TypeSteel_A300', 'TypeSteel_A400', 'SteelPlate_thickness', 'Edges_index', 'Empty_index', 'Square_index', 'OutsideX_index', 'EdgesX_index', 'EdgesY_index', 'OutsideGlobal_index', 'LogOfAreas', 'LogX_index', 'LogY_index', 'Orientation_index', 'Luminosity_index', 'SigmoidOfAreas'] 
 ['Pastry', 'Z_scratch', 'K_scratch', 'Stains', 'Dirtiness', 'Bumps', 'Other_faults']


In [2]:
## SCHEMA
# printSchema() only prints and returns None, so assigning its result left both
# variables as None. Keep the StructType objects in the variables and print the
# schema trees separately.
train_schema = steel_train.schema
test_schema = steel_test.schema
steel_train.printSchema()
steel_test.printSchema()

root
 |-- id: integer (nullable = true)
 |-- X_minimum: integer (nullable = true)
 |-- X_maximum: integer (nullable = true)
 |-- Y_minimum: integer (nullable = true)
 |-- Y_maximum: integer (nullable = true)
 |-- Pixels_areas: integer (nullable = true)
 |-- X_perimeter: integer (nullable = true)
 |-- Y_perimeter: integer (nullable = true)
 |-- Sum_of_luminosity: integer (nullable = true)
 |-- Minimum_luminosity: integer (nullable = true)
 |-- Maximum_luminosity: integer (nullable = true)
 |-- Length_conveyer: integer (nullable = true)
 |-- TypeSteel_A300: integer (nullable = true)
 |-- TypeSteel_A400: integer (nullable = true)
 |-- SteelPlate_thickness: integer (nullable = true)
 |-- Edges_index: float (nullable = true)
 |-- Empty_index: float (nullable = true)
 |-- Square_index: float (nullable = true)
 |-- OutsideX_index: float (nullable = true)
 |-- EdgesX_index: float (nullable = true)
 |-- EdgesY_index: float (nullable = true)
 |-- OutsideGlobal_index: float (nullable = true)
 |--

In [7]:
## SUMMARY
# Name the excluded features explicitly instead of relying on their positions
# (11 and 12) in feature_cols, which would silently point at other columns if
# the feature list ever changed.
excluded_features = ("TypeSteel_A300", "TypeSteel_A400")  # boolean pred features
# Positional form kept for any later cell that still expects the index list.
exclude_cols = [i for i, col_name in enumerate(feature_cols) if col_name in excluded_features]
selected_cols = [col_name for col_name in feature_cols if col_name not in excluded_features]
train_summary = steel_train.select(selected_cols).describe()
test_summary = steel_test.select(selected_cols).describe()

In [8]:
# Print the describe() statistics computed for the selected training columns.
train_summary.show()

+-------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+--------------------+-------------------+-------------------+------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+-------------------+--------------------+------------------+
|summary|        X_minimum|        X_maximum|         Y_minimum|        Y_maximum|      Pixels_areas|      X_perimeter|       Y_perimeter| Sum_of_luminosity|Minimum_luminosity|Maximum_luminosity|  Length_conveyer|SteelPlate_thickness|        Edges_index|        Empty_index|      Square_index|      OutsideX_index|       EdgesX_index|       EdgesY_index|OutsideGlobal_index|        LogOfAreas|         LogX_index|        LogY_index|  Orientation_index|    Luminosity_index|    SigmoidOfAreas|
+-------+-----

In [9]:
# Print the describe() statistics computed for the selected test columns.
test_summary.show()

+-------+-----------------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+--------------------+-------------------+-------------------+------------------+--------------------+-------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+
|summary|        X_minimum|        X_maximum|         Y_minimum|         Y_maximum|      Pixels_areas|      X_perimeter|       Y_perimeter|Sum_of_luminosity|Minimum_luminosity|Maximum_luminosity|   Length_conveyer|SteelPlate_thickness|        Edges_index|        Empty_index|      Square_index|      OutsideX_index|       EdgesX_index|      EdgesY_index|OutsideGlobal_index|        LogOfAreas|         LogX_index|         LogY_index|  Orientation_index|    Luminosity_index|     SigmoidOfAreas|
+-------+-

In [10]:
## MISSING VALUES
from pyspark.sql.functions import col, isnan, when, count

def _missing_value_counts(df):
    """Return a one-row DataFrame holding, per column of `df`, the count of NaN or null values."""
    per_column_counts = []
    for c in df.columns:
        is_missing = isnan(c) | col(c).isNull()
        per_column_counts.append(count(when(is_missing, c)).alias(c))
    return df.select(per_column_counts)


missing_train = _missing_value_counts(steel_train)

missing_test = _missing_value_counts(steel_test)

In [11]:
# Print the per-column NaN/null counts for the training data.
missing_train.show()

+---+---------+---------+---------+---------+------------+-----------+-----------+-----------------+------------------+------------------+---------------+--------------+--------------+--------------------+-----------+-----------+------------+--------------+------------+------------+-------------------+----------+----------+----------+-----------------+----------------+--------------+------+---------+---------+------+---------+-----+------------+
| id|X_minimum|X_maximum|Y_minimum|Y_maximum|Pixels_areas|X_perimeter|Y_perimeter|Sum_of_luminosity|Minimum_luminosity|Maximum_luminosity|Length_conveyer|TypeSteel_A300|TypeSteel_A400|SteelPlate_thickness|Edges_index|Empty_index|Square_index|OutsideX_index|EdgesX_index|EdgesY_index|OutsideGlobal_index|LogOfAreas|LogX_index|LogY_index|Orientation_index|Luminosity_index|SigmoidOfAreas|Pastry|Z_scratch|K_scratch|Stains|Dirtiness|Bumps|Other_faults|
+---+---------+---------+---------+---------+------------+-----------+-----------+-----------------+

In [12]:
# Print the per-column NaN/null counts for the test data.
missing_test.show()

+---+---------+---------+---------+---------+------------+-----------+-----------+-----------------+------------------+------------------+---------------+--------------+--------------+--------------------+-----------+-----------+------------+--------------+------------+------------+-------------------+----------+----------+----------+-----------------+----------------+--------------+
| id|X_minimum|X_maximum|Y_minimum|Y_maximum|Pixels_areas|X_perimeter|Y_perimeter|Sum_of_luminosity|Minimum_luminosity|Maximum_luminosity|Length_conveyer|TypeSteel_A300|TypeSteel_A400|SteelPlate_thickness|Edges_index|Empty_index|Square_index|OutsideX_index|EdgesX_index|EdgesY_index|OutsideGlobal_index|LogOfAreas|LogX_index|LogY_index|Orientation_index|Luminosity_index|SigmoidOfAreas|
+---+---------+---------+---------+---------+------------+-----------+-----------+-----------------+------------------+------------------+---------------+--------------+--------------+--------------------+-----------+---------

In [18]:
## OUTLIER DETECTION
#function to calculate outliers
def outliers_iqr(df, col_name, threshold=1.5, relative_error=0.01):
    """Return the rows of `df` whose `col_name` value falls outside the IQR fences.

    The fences are ``q1 - threshold * iqr`` and ``q3 + threshold * iqr``, where
    q1 and q3 are the approximate 25th and 75th percentiles reported by
    ``df.approxQuantile`` and ``iqr = q3 - q1``.

    Args:
        df: DataFrame to inspect.
        col_name: Name of the numeric column to test.
        threshold: Multiplier applied to the IQR when placing the fences.
        relative_error: Relative error passed to ``approxQuantile``; defaults
            to 0.01, the value that was previously hard-coded.

    Returns:
        A DataFrame with the rows strictly below the lower fence or strictly
        above the upper fence. When the quartiles cannot be computed, an empty
        DataFrame (``df.limit(0)``) is returned instead of raising.
    """
    quantiles = df.approxQuantile(col_name, [0.25, 0.75], relative_error)
    # approxQuantile yields an empty list when the column has no non-null
    # values; indexing it would raise IndexError, so report "no outliers".
    if len(quantiles) < 2:
        return df.limit(0)
    q1, q3 = quantiles
    iqr = q3 - q1
    lb = q1 - threshold * iqr
    ub = q3 + threshold * iqr
    return df.filter((col(col_name) < lb) | (col(col_name) > ub))

def _collect_outliers(df, columns):
    """Map each column name in `columns` to the DataFrame of its IQR outliers in `df`."""
    return {name: outliers_iqr(df, name) for name in columns}


def _print_outlier_counts(outliers_by_col):
    """Print, for every column, how many outlier rows were found ("0" when none)."""
    for name, outlier_df in outliers_by_col.items():
        print(f"Outliers in '{name}':")
        outlier_count = outlier_df.count()
        if outlier_count > 0:
            print(f"Number of outliers: {outlier_count}")
        else:
            print("0")


# The train and test reports share the two helpers above instead of repeating
# the same pair of loops.

#train data
print("Outliers in train data:")
outliers_train = _collect_outliers(steel_train, selected_cols)
_print_outlier_counts(outliers_train)

#test data
print("\nOutliers in test data:")
outliers_test = _collect_outliers(steel_test, selected_cols)
_print_outlier_counts(outliers_test)

Outliers in train data:
Outliers in column 'X_minimum':
No outliers found.
Outliers in column 'X_maximum':
No outliers found.
Outliers in column 'Y_minimum':
Number of outliers: 1126
Outliers in column 'Y_maximum':
Number of outliers: 1120
Outliers in column 'Pixels_areas':
Number of outliers: 3742
Outliers in column 'X_perimeter':
Number of outliers: 3737
Outliers in column 'Y_perimeter':
Number of outliers: 3008
Outliers in column 'Sum_of_luminosity':
Number of outliers: 3837
Outliers in column 'Minimum_luminosity':
Number of outliers: 135
Outliers in column 'Maximum_luminosity':
Number of outliers: 1555
Outliers in column 'Length_conveyer':
No outliers found.
Outliers in column 'SteelPlate_thickness':
Number of outliers: 2173
Outliers in column 'Edges_index':
No outliers found.
Outliers in column 'Empty_index':
Number of outliers: 61
Outliers in column 'Square_index':
No outliers found.
Outliers in column 'OutsideX_index':
Number of outliers: 3654
Outliers in column 'EdgesX_index':


In [19]:
# Stop the Spark session created in the first cell now that the exploration is finished.
spark.stop()