### ML PySpark

The first thing to do is to create a `.env` file in the root of the directory. Add the following two variables to the file:
`ACCESS_KEY`, `ACCESS_SECRET`.
For a more detailed explanation, see [dotenv](https://pypi.org/project/python-dotenv/), which explains what the `.env` file should look like. After loading, the variables are added to `os.environ` and can be accessed like a simple dict.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, when, input_file_name
from functools import reduce
import sys
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
import os
from dotenv import load_dotenv

In [2]:
## Load the variables declared in the .env file into os.environ
load_dotenv()

## check that both credentials are present WITHOUT echoing them:
## anything printed here is saved in the notebook output and can leak
## the key when the notebook is shared or committed.
missing = [name for name in ("ACCESS_KEY", "ACCESS_SECRET") if not os.environ.get(name)]
if missing:
    # Same exception type a direct os.environ[...] lookup would raise
    raise KeyError(f"Missing environment variables: {', '.join(missing)}")
print("ACCESS_KEY and ACCESS_SECRET loaded from the environment")

[REDACTED: AWS access key ID removed from saved output]


## Console Login

The following class is used to handle Spark on AWS.

In [3]:
from src.s3handler import Sparker

In [4]:
## Build the Sparker helper from the two credentials held in the environment
spark = Sparker(
    os.environ["ACCESS_KEY"],
    os.environ["ACCESS_SECRET"],
)

## Start the local session on the helper
spark._create_local_session()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/30 13:26:02 WARN Utils: Your hostname, DESKTOP-95V5VE8, resolves to a loopback address: 127.0.1.1; using 172.30.46.218 instead (on interface eth0)
25/10/30 13:26:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/manecomaneca/venv/spark/.venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/manecomaneca/.ivy2.5.2/cache
The jars for the packages stored in: /home/manecomaneca/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6692e31b-c8b2-4e84-8263-b9416c14cd3f;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution 

In [5]:
## Read one parquet object from the bucket and keep the result in `df`
bucket_name = "ubs-datasets"
object_key = "FRACTAL/data/train/TRAIN-1200_6136-008972557.parquet"

# NOTE(review): read_all=False presumably restricts the read to this single
# object rather than a whole prefix; confirm against Sparker.read_parquet.
df = spark.read_parquet(bucket_name, object_key, read_all=False)

Reading from: s3a://ubs-datasets/FRACTAL/data/train/TRAIN-1200_6136-008972557.parquet


                                                                                

In [6]:
# Show the column names and types of the DataFrame that was just read
df.printSchema()


root
 |-- xyz: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- Intensity: integer (nullable = true)
 |-- ReturnNumber: short (nullable = true)
 |-- NumberOfReturns: short (nullable = true)
 |-- ScanDirectionFlag: short (nullable = true)
 |-- EdgeOfFlightLine: short (nullable = true)
 |-- Classification: short (nullable = true)
 |-- Synthetic: short (nullable = true)
 |-- KeyPoint: short (nullable = true)
 |-- Withheld: short (nullable = true)
 |-- Overlap: short (nullable = true)
 |-- ScanAngleRank: float (nullable = true)
 |-- UserData: short (nullable = true)
 |-- PointSourceId: integer (nullable = true)
 |-- GpsTime: double (nullable = true)
 |-- ScanChannel: short (nullable = true)
 |-- Red: integer (nullable = true)
 |-- Green: integer (nullable = true)
 |-- Blue: integer (nullable = true)
 |-- Infrared: integer (nullable = true)
 |-- wkb: binary (nullable = true)



Note that the schema here was inferred by Spark, and it is totally different from the one I got when I had to download the file.

In [7]:
# Peek at the first record to sanity-check the loaded values
df.head()

                                                                                

Row(xyz=[1200800.1400000001, 6135380.53, 1171.64], Intensity=342, ReturnNumber=2, NumberOfReturns=3, ScanDirectionFlag=1, EdgeOfFlightLine=0, Classification=5, Synthetic=0, KeyPoint=0, Withheld=0, Overlap=0, ScanAngleRank=16.00200080871582, UserData=0, PointSourceId=3125, GpsTime=304320466.8181704, ScanChannel=0, Red=11264, Green=15360, Blue=15872, Infrared=40192, wkb=bytearray(b'\x01\xe9\x03\x00\x00>\n\xd7#\xa0R2A\x1f\x85\xeb!\x95gWA\xc3\xf5(\\\x8fN\x92@'))

In [9]:
# count() is a Spark action, so this line runs a job over the data
n_rows = df.count()
print(f"Number of rows: {n_rows}")

Number of rows: 209895


In [None]:
## One "is NULL" test per column of the DataFrame
conditions = [col(c).isNull() for c in df.columns]

## OR the per-column tests together: the combined condition is True for
## any row where at least one column is NULL
combined_condition = reduce(lambda a, b: a | b, conditions)

## filter() keeps rows, so the number reported is a row count, not a column count
print(f"Number of rows with null values: {df.filter(combined_condition).count()}")

                                                                                

0

In [11]:
from pyspark.ml.feature import StandardScaler

In [19]:
# Expose the three entries of the `xyz` array as scalar columns x, y and z
df = (
    df.withColumn("x", col("xyz").getItem(0))
      .withColumn("y", col("xyz").getItem(1))
      .withColumn("z", col("xyz").getItem(2))
)

In [13]:
## Scale the assembled "features" vector to unit standard deviation, without centring.
## The output column is named "scaled_features" so that it agrees with the
## scaler defined next to the VectorAssembler further down in this notebook
## (that later assignment is the one that takes effect).
scaler = StandardScaler(inputCol="features",
                        outputCol="scaled_features",
                        withStd=True,
                        withMean=False)

In [20]:
## Columns fed to the model: coordinates, intensity and the colour/infrared channels
feature_cols = [
    "x", "y", "z",
    "Intensity",
    "Red", "Green", "Blue", "Infrared",
]

## Pack the selected columns into a single vector column called "features"
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)

## Scaler reading "features" and writing "scaled_features"
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
)


In [21]:
# Run the assembler over df; the result carries the assembled "features" column
output = assembler.transform(df)

In [22]:
# Print the assembled feature vectors without truncating long values
output.select("features").show(truncate=False)

[Stage 11:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+
|features                                                                        |
+--------------------------------------------------------------------------------+
|[1200800.1400000001,6135380.53,1171.64,342.0,11264.0,15360.0,15872.0,40192.0]   |
|[1200800.19,6135380.37,1171.34,301.0,11776.0,16128.0,16384.0,39168.0]           |
|[1200800.27,6135379.54,1171.03,259.0,9216.0,12800.0,12288.0,27648.0]            |
|[1200800.06,6135381.44,1164.2,317.0,9984.0,13568.0,14336.0,26624.0]             |
|[1200800.45,6135381.19,1166.57,281.0,9472.0,12800.0,13312.0,32000.0]            |
|[1200800.24,6135381.48,1164.3600000000001,826.0,9984.0,13568.0,14336.0,28416.0] |
|[1200801.3,6135379.9,1170.14,328.0,9472.0,12800.0,11776.0,27136.0]              |
|[1200801.43,6135379.59,1170.25,434.0,9472.0,12800.0,11776.0,24576.0]            |
|[1200801.53,6135379.33,1170.3,387.0,8704.0,11776.0,11008.0,25856.0]             |
|[12

                                                                                