In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import DataFrameReader
from pyspark.sql.functions import col
from pyspark.ml.stat import Correlation
from pyspark.ml.stat import Summarizer
from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

In [2]:
path_parquet = "/mnt/d/desktop/Copernicus/classes/3-semester/bigdata/fractal/TRAIN-0934_6301-008094186.parquet"

### Pandas Exploration

In [3]:
df = pd.read_parquet("/mnt/d/desktop/Copernicus/classes/3-semester/bigdata/fractal/TRAIN-0934_6301-008094186.parquet",
                     engine='fastparquet')
df.head()

Unnamed: 0,x,y,z,intensity,returnnumber,numberofreturns,scandirectionflag,edgeofflightline,classification,synthetic,...,overlap,scananglerank,userdata,pointsourceid,gpstime,scanchannel,red,green,blue,infrared
0,934250.15,6300499.18,470.05,1646.0,2.0,2.0,1.0,0.0,2.0,0.0,...,0.0,3.0,0.0,41.0,308745607.159,0.0,10496.0,14592.0,16896.0,14336.0
1,934250.23,6300499.78,470.18,2186.0,2.0,2.0,1.0,0.0,2.0,0.0,...,0.0,3.0,0.0,41.0,308745607.159,0.0,9728.0,12544.0,15360.0,9984.0
2,934250.02,6300493.54,468.12,2733.0,1.0,1.0,1.0,0.0,2.0,0.0,...,0.0,3.0,0.0,41.0,308745607.165,0.0,13312.0,17152.0,16384.0,14848.0
3,934250.02,6300494.15,468.52,2376.0,2.0,2.0,1.0,0.0,3.0,0.0,...,0.0,3.0,0.0,41.0,308745607.165,0.0,17920.0,22016.0,20480.0,20992.0
4,934250.07,6300494.43,468.52,1390.0,2.0,2.0,1.0,0.0,2.0,0.0,...,0.0,3.0,0.0,41.0,308745607.165,0.0,19968.0,24064.0,22528.0,25088.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80953 entries, 0 to 80952
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   x                  80953 non-null  object
 1   y                  80953 non-null  object
 2   z                  80953 non-null  object
 3   intensity          80953 non-null  object
 4   returnnumber       80953 non-null  object
 5   numberofreturns    80953 non-null  object
 6   scandirectionflag  80953 non-null  object
 7   edgeofflightline   80953 non-null  object
 8   classification     80953 non-null  object
 9   synthetic          80953 non-null  object
 10  keypoint           80953 non-null  object
 11  withheld           80953 non-null  object
 12  overlap            80953 non-null  object
 13  scananglerank      80953 non-null  object
 14  userdata           80953 non-null  object
 15  pointsourceid      80953 non-null  object
 16  gpstime            80953 non-null  objec

In [32]:
df['red'] = df['red'].astype(float)

In [8]:
df['intensity'].describe()

count       80953
unique       3895
top       310.000
freq           78
Name: intensity, dtype: object

# Spark Apache

### Schema Custom

In [9]:
from pyspark.sql.types import *

In [None]:
## Build a session on pyspark
spark = SparkSession \
    .builder \
    .appName("Exploratory Analysis") \
    .master('local[*]') \ 
    ##.config("spark.some.config.option", "some-value") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/28 07:43:56 WARN Utils: Your hostname, DESKTOP-95V5VE8, resolves to a loopback address: 127.0.1.1; using 172.30.46.218 instead (on interface eth0)
25/10/28 07:43:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 07:44:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [19]:
# Step 1: Read everything as strings (matching what's actually in the file)
string_schema = StructType([
    StructField("x", StringType(), nullable=True),
    StructField("y", StringType(), nullable=True),
    StructField("z", StringType(), nullable=True),
    StructField("intensity", StringType(), nullable=True),
    StructField("returnnumber", StringType(), nullable=True),
    StructField("numberofreturns", StringType(), nullable=True),
    StructField("scandirectionflag", StringType(), nullable=True),
    StructField("edgeofflightline", StringType(), nullable=True),
    StructField("classification", StringType(), nullable=True),
    StructField("synthetic", StringType(), nullable=True),
    StructField("keypoint", StringType(), nullable=True),
    StructField("withheld", StringType(), nullable=True),
    StructField("overlap", StringType(), nullable=True),
    StructField("scananglerank", StringType(), nullable=True),
    StructField("userdata", StringType(), nullable=True),
    StructField("pointsourceid", StringType(), nullable=True),
    StructField("gpstime", StringType(), nullable=True),
    StructField("scanchannel", StringType(), nullable=True),
    StructField("red", StringType(), nullable=True),
    StructField("green", StringType(), nullable=True),
    StructField("blue", StringType(), nullable=True),
    StructField("infrared", StringType(), nullable=True)
])

# Step 2: Read the parquet file
spp = spark.read.schema(string_schema).parquet(path_parquet)

# Step 3: Convert to proper types
# Convert with proper casting - use IntegerType for values with decimals
spp = spp.select(
    col("x").cast("double").alias("x"),
    col("y").cast("double").alias("y"),
    col("z").cast("double").alias("z"),
    col("intensity").cast("double").alias("intensity"),
    
    # For fields with decimal points like '2.000', cast to int (not byte)
    col("returnnumber").cast("double").alias("returnnumber"),
    col("numberofreturns").cast("double").alias("numberofreturns"),
    col("scandirectionflag").cast("double").alias("scandirectionflag"),
    col("edgeofflightline").cast("double").alias("edgeofflightline"),
    col("classification").cast("double").alias("classification"),  # THIS ONE for your aggregation
    col("synthetic").cast("double").alias("synthetic"),
    col("keypoint").cast("double").alias("keypoint"),
    col("withheld").cast("double").alias("withheld"),
    col("overlap").cast("double").alias("overlap"),
    col("scananglerank").cast("double").alias("scananglerank"),
    col("userdata").cast("double").alias("userdata"),
    col("pointsourceid").cast("double").alias("pointsourceid"),
    col("gpstime").cast("double").alias("gpstime"),
    col("scanchannel").cast("double").alias("scanchannel"),
    col("red").cast("double").alias("red"),
    col("green").cast("double").alias("green"),
    col("blue").cast("double").alias("blue"),
    col("infrared").cast("double").alias("infrared")
)

spp.printSchema()

root
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- intensity: double (nullable = true)
 |-- returnnumber: double (nullable = true)
 |-- numberofreturns: double (nullable = true)
 |-- scandirectionflag: double (nullable = true)
 |-- edgeofflightline: double (nullable = true)
 |-- classification: double (nullable = true)
 |-- synthetic: double (nullable = true)
 |-- keypoint: double (nullable = true)
 |-- withheld: double (nullable = true)
 |-- overlap: double (nullable = true)
 |-- scananglerank: double (nullable = true)
 |-- userdata: double (nullable = true)
 |-- pointsourceid: double (nullable = true)
 |-- gpstime: double (nullable = true)
 |-- scanchannel: double (nullable = true)
 |-- red: double (nullable = true)
 |-- green: double (nullable = true)
 |-- blue: double (nullable = true)
 |-- infrared: double (nullable = true)



In [15]:
spp.groupBy("classification").count().show(truncate=True)

                                                                                

+--------------+-----+
|classification|count|
+--------------+-----+
|           1.0|    1|
|           4.0| 2966|
|           3.0| 4175|
|           2.0|18978|
|           5.0|54831|
|          65.0|    2|
+--------------+-----+



## Null Values

In [22]:
## Count null values for all columns
from pyspark.sql import functions as fn 
spp.select([fn.count(fn.when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

[Stage 6:>                                                          (0 + 1) / 1]

+---+---+---+---------+------------+---------------+-----------------+----------------+--------------+---------+--------+--------+-------+-------------+--------+-------------+-------+-----------+---+-----+----+--------+
|  x|  y|  z|intensity|returnnumber|numberofreturns|scandirectionflag|edgeofflightline|classification|synthetic|keypoint|withheld|overlap|scananglerank|userdata|pointsourceid|gpstime|scanchannel|red|green|blue|infrared|
+---+---+---+---------+------------+---------------+-----------------+----------------+--------------+---------+--------+--------+-------+-------------+--------+-------------+-------+-----------+---+-----+----+--------+
|  0|  0|  0|        0|           0|              0|                0|               0|             0|        0|       0|       0|      0|            0|       0|            0|      0|          0|  0|    0|   0|       0|
+---+---+---+---------+------------+---------------+-----------------+----------------+--------------+---------+--------

                                                                                

In [None]:
## Drop nan values in case
spp = spp.dropna()  # drop rows with missing values

In [20]:
spp.head()

                                                                                

Row(x=934250.15, y=6300499.18, z=470.05, intensity=1646.0, returnnumber=2.0, numberofreturns=2.0, scandirectionflag=1.0, edgeofflightline=0.0, classification=2.0, synthetic=0.0, keypoint=0.0, withheld=0.0, overlap=0.0, scananglerank=3.0, userdata=0.0, pointsourceid=41.0, gpstime=308745607.159, scanchannel=0.0, red=10496.0, green=14592.0, blue=16896.0, infrared=14336.0)

### UDF 
UDF stands for user-defined scalar/vectorized functions

In [None]:
from pyspark.sql.functions import udf

@udf(returnType='int')
def slen(s: str):
    return len(s)

## some process returning a scalar value in a column
spp.withColumn("process_row", process_row(df["text_column"], df["list_column"])).show(truncate=False)

## Summarize

In [41]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Create a vector from your double column
cols_to_summarize = ['x','y','z','intensity','red','green','blue','infrared']
assembler = VectorAssembler(inputCols=cols_to_summarize, outputCol="features")
spp_vector = assembler.transform(spp)

# Now apply the summarizer
summarizer = Summarizer.metrics("mean", "count", "std", "min", "max")
spp_vector.select(summarizer.summary(spp_vector.x_vector)).show(truncate=False)

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------+
|aggregate_metrics(x_vector, 1.0)                                          |
+--------------------------------------------------------------------------+
|{[934274.1193548144], 80953, [14.141179186942335], [934250.0], [934300.0]}|
+--------------------------------------------------------------------------+



                                                                                

In [46]:
spp.describe().show()

25/10/28 10:28:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 14:>                                                         (0 + 1) / 1]

+-------+------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+----------------+------------------+---------+--------+--------+-------+------------------+--------+-------------+-------------------+-----------+------------------+-----------------+------------------+-----------------+
|summary|                 x|                 y|                z|         intensity|      returnnumber|   numberofreturns|  scandirectionflag|edgeofflightline|    classification|synthetic|keypoint|withheld|overlap|     scananglerank|userdata|pointsourceid|            gpstime|scanchannel|               red|            green|              blue|         infrared|
+-------+------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+----------------+------------------+---------+--------+--------+-------+------------------+--------+-------------+------------------

                                                                                

### Rename Target Label

In [None]:
spp.rename()

## Pipeline

In [None]:
from pyspark.ml import Pipeline
from pyspark.mllib.feature import StandardScaler

In [48]:
spp.columns

['x',
 'y',
 'z',
 'intensity',
 'returnnumber',
 'numberofreturns',
 'scandirectionflag',
 'edgeofflightline',
 'classification',
 'synthetic',
 'keypoint',
 'withheld',
 'overlap',
 'scananglerank',
 'userdata',
 'pointsourceid',
 'gpstime',
 'scanchannel',
 'red',
 'green',
 'blue',
 'infrared']

In [52]:
[col for col in spp.columns if col not in ['label','classification']]

['x',
 'y',
 'z',
 'intensity',
 'returnnumber',
 'numberofreturns',
 'scandirectionflag',
 'edgeofflightline',
 'synthetic',
 'keypoint',
 'withheld',
 'overlap',
 'scananglerank',
 'userdata',
 'pointsourceid',
 'gpstime',
 'scanchannel',
 'red',
 'green',
 'blue',
 'infrared']

In [None]:
## Features

## Rename classification to label
spp = spp.withColumnRenamed('Classification', 'label')
target = spp.select("label")

## Select all features

## Drop unecessary columns
## A lot of metadata seems to be passed to the dataframe, this operation try to get rid off 
remove_cols = ['synthetic','keypoint','withheld','overlap','scananglerank','userdata',
                'pointsourceid','edgeofflightline']
feature_cols = [col for col in spp.columns if col not in remove_cols]
features = spp.select(feature_cols)



In [54]:
spp.columns

['x',
 'y',
 'z',
 'intensity',
 'returnnumber',
 'numberofreturns',
 'scandirectionflag',
 'edgeofflightline',
 'classification',
 'synthetic',
 'keypoint',
 'withheld',
 'overlap',
 'scananglerank',
 'userdata',
 'pointsourceid',
 'gpstime',
 'scanchannel',
 'red',
 'green',
 'blue',
 'infrared']

In [None]:
## Standard Scaler
scaler1 = StandardScaler().fit(features)

In [53]:
spp.head()

Row(x=934250.15, y=6300499.18, z=470.05, intensity=1646.0, returnnumber=2.0, numberofreturns=2.0, scandirectionflag=1.0, edgeofflightline=0.0, classification=2.0, synthetic=0.0, keypoint=0.0, withheld=0.0, overlap=0.0, scananglerank=3.0, userdata=0.0, pointsourceid=41.0, gpstime=308745607.159, scanchannel=0.0, red=10496.0, green=14592.0, blue=16896.0, infrared=14336.0)