### ML PySpark

The first thing to do is to create a `.env` file in the root of the directory. Add the following two variables to the file:
`ACCESS_KEY` and `ACCESS_SECRET`.
For a more detailed explanation, see [python-dotenv](https://pypi.org/project/python-dotenv/), which explains what the `.env` file should look like. After loading, the variables are added to `os.environ` and can be accessed like a simple dict.

In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, when, input_file_name
from functools import reduce
import sys
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
import os
from pyspark.ml.classification import LogisticRegression
from dotenv import load_dotenv
from sparkmeasure import TaskMetrics
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
import random
## load .env
# Adds the variables defined in the .env file to os.environ;
# ACCESS_KEY and ACCESS_SECRET are read from there when Sparker is created below.
load_dotenv()


True

## Console Login

The following classes handle Spark on AWS.

In [3]:
from src.s3handler import Sparker, PreProcessing, FeatureEngineering

In [4]:
## Initialize the class
# Fail fast with an actionable message when the credentials are absent:
# load_dotenv() does not raise when no .env file is found, so a missing
# variable would otherwise only surface as a bare KeyError on the lookup below.
missing_vars = [name for name in ("ACCESS_KEY", "ACCESS_SECRET") if name not in os.environ]
if missing_vars:
    raise KeyError(
        f"Missing environment variable(s): {', '.join(missing_vars)}. "
        "Define them in the .env file described at the top of this notebook."
    )

# Sparker (from src.s3handler) receives the AWS access key and secret.
spark = Sparker(os.environ['ACCESS_KEY'], os.environ['ACCESS_SECRET'])

## local session
# NOTE(review): the leading underscore suggests a private method -- confirm
# whether Sparker exposes a public way to obtain the session.
session = spark._create_local_session()

:: loading settings :: url = jar:file:/Users/devseed/Documents/repos/FRACTAL_Big_Data/.venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/devseed/.ivy2.5.2/cache
The jars for the packages stored in: /Users/devseed/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b219db10-4109-4039-9ae6-d7a639a37c69;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 110ms :: artifacts dl 3ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	----------------------------------------

## Read parquet

In [5]:
# Columns kept from each parquet file (reused for the inference tile later on).
parquet_cols = [
    "xyz",
    "Intensity",
    "Classification",
    "Red",
    "Green",
    "Blue",
    "Infrared",
    "ReturnNumber",
    "NumberOfReturns",
]

# Read a single training tile from the bucket and keep only the columns above.
train_tile = spark.read_parquet(
    "ubs-datasets",
    "FRACTAL/data/train/TRAIN-0436_6399-002955400.parquet",
    read_all=False,
)
df = train_tile.select(*parquet_cols)

# Alternative (disabled): pass a list of keys to read several tiles at once.
# list_s3 = ["FRACTAL/data/train/TRAIN-1200_6136-008972557.parquet", "FRACTAL/data/train/TRAIN-0436_6399-002955400.parquet"]
# df = spark.read_parquet("ubs-datasets",
#                     list_s3,
#                     read_all=False)

Reading from: ['s3a://ubs-datasets/FRACTAL/data/train/TRAIN-0436_6399-002955400.parquet']


                                                                                

In [6]:
# Show the column types, then trigger a count to report the size of the tile.
df.printSchema()
n_rows = df.count()
print(f"Number of rows: {n_rows}")

root
 |-- xyz: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- Intensity: integer (nullable = true)
 |-- Classification: short (nullable = true)
 |-- Red: integer (nullable = true)
 |-- Green: integer (nullable = true)
 |-- Blue: integer (nullable = true)
 |-- Infrared: integer (nullable = true)
 |-- ReturnNumber: short (nullable = true)
 |-- NumberOfReturns: short (nullable = true)



[Stage 1:>                                                          (0 + 1) / 1]

Number of rows: 90090


                                                                                

## Preprocessing & Feature Engineering

In [7]:
# Columns as loaded, before preprocessing and feature engineering.
df.columns

['xyz',
 'Intensity',
 'Classification',
 'Red',
 'Green',
 'Blue',
 'Infrared',
 'ReturnNumber',
 'NumberOfReturns']

In [8]:
# Step 1: PreProcessing.split_xyz() -- judging by the column listings before and
# after this cell, the packed ``xyz`` array is replaced by separate x / y / z columns.
preprocessing = PreProcessing(df)
df_xyz_split = preprocessing.split_xyz()

# Step 2: feature engineering on the split frame; the result replaces ``df``.
# NOTE(review): because ``df`` is overwritten, this cell is not meant to be run
# twice on the same kernel state.
engfeature = FeatureEngineering(df_xyz_split)
df = engfeature.apply_all()

In [9]:
# Columns after split_xyz() and apply_all().
df.columns

['Intensity',
 'Classification',
 'Red',
 'Green',
 'Blue',
 'Infrared',
 'ReturnNumber',
 'NumberOfReturns',
 'x',
 'y',
 'z',
 'height_above_ground',
 'local_density',
 'local_z_std',
 'local_z_range',
 'roughness',
 'return_ratio',
 'is_single_return',
 'is_last_return',
 'ndvi',
 'green_red_ratio',
 'ndwi']

In [10]:
from pyspark.sql.functions import col, sum as _sum

# Count NULL values per column: each column becomes a 0/1 "is null" flag
# that is summed, giving a single-row summary.
feature_cols = df.columns

null_exprs = []
for column_name in feature_cols:
    null_flag = col(column_name).isNull().cast("int")
    null_exprs.append(_sum(null_flag).alias(column_name))

null_counts = df.select(null_exprs)
null_counts.show()

[Stage 4:>                                                          (0 + 1) / 1]

+---------+--------------+---+-----+----+--------+------------+---------------+---+---+---+-------------------+-------------+-----------+-------------+---------+------------+----------------+--------------+----+---------------+----+
|Intensity|Classification|Red|Green|Blue|Infrared|ReturnNumber|NumberOfReturns|  x|  y|  z|height_above_ground|local_density|local_z_std|local_z_range|roughness|return_ratio|is_single_return|is_last_return|ndvi|green_red_ratio|ndwi|
+---------+--------------+---+-----+----+--------+------------+---------------+---+---+---+-------------------+-------------+-----------+-------------+---------+------------+----------------+--------------+----+---------------+----+
|        0|             0|  0|    0|   0|       0|           0|              0|  0|  0|  0|                  0|            0|          0|            0|        0|           0|               0|             0|   0|              0|   0|
+---------+--------------+---+-----+----+--------+------------+-----

                                                                                

### Load | Models
Prepare the variables for the models.

In [11]:
# Every column except the label is used as a model input.
df_without_label = df.drop("Classification")
feature_cols = df_without_label.columns

# Pack the input columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    # handleInvalid="skip"
)

# Scale the assembled vector; downstream stages read "scaled_features".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
)


In [36]:
# Random forest trained on the scaled feature vector to predict ``Classification``.
# The explicit seed fixes the randomness of bootstrap sampling and feature
# sub-sampling so that re-running the notebook on the same data trains the
# same forest.
rf = RandomForestClassifier(
    featuresCol="scaled_features",
    labelCol="Classification",
    bootstrap=True,
    seed=42,
)

### Pipeline

In [None]:
# Chain the stages in execution order: assemble -> scale -> classify.
pipeline_stages = [assembler, scaler, rf]
pipeline = Pipeline(stages=pipeline_stages)

### Train 

In [45]:
## Optional: uncomment to profile the fit with sparkmeasure's TaskMetrics (use instead of the plain fit in the next cell)
# taskmetrics = TaskMetrics(session)
# taskmetrics.begin()
# model = pipeline.fit(df)
# taskmetrics.end()
# taskmetrics.print_report()

In [46]:
# Fit every pipeline stage (assembler, scaler, random forest) on the training DataFrame.
model = pipeline.fit(df)

[Stage 497:>                                                        (0 + 1) / 1]

In [47]:
# Sanity check: score the training data itself and preview two rows; the output gains
# the features, scaled_features, rawPrediction, probability and prediction columns.
model.transform(df).show(2)

[Stage 500:>                                                        (0 + 1) / 1]

+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+-----------+------+--------------------+-------------+-------------------+-----------------+-------------------+------------+----------------+--------------+-------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Intensity|Classification|  Red|Green| Blue|Infrared|ReturnNumber|NumberOfReturns|         x|          y|     z| height_above_ground|local_density|        local_z_std|    local_z_range|          roughness|return_ratio|is_single_return|is_last_return|               ndvi|   green_red_ratio|              ndwi|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+-----------+------+--------------------+-------------+-------------------+-----------------+-------

                                                                                

## Hyperparameter tuning

In [19]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Inference

In [50]:
# Load a second tile for inference, keeping the same columns as the training tile.
# NOTE(review): this file also sits under data/train/ -- confirm it is meant to
# serve as held-out data.
test_tile = spark.read_parquet(
    "ubs-datasets",
    "FRACTAL/data/train/TRAIN-1200_6136-008972557.parquet",
    read_all=False,
)
df_test_raw = test_tile.select(*parquet_cols)

# Apply the same two steps used on the training frame: xyz split, then feature engineering.
preprocessing = PreProcessing(df_test_raw)
df_test_split = preprocessing.split_xyz()
eng_feature = FeatureEngineering(df_test_split)
df_test = eng_feature.apply_all()

Reading from: ['s3a://ubs-datasets/FRACTAL/data/train/TRAIN-1200_6136-008972557.parquet']


In [58]:
# Score the inference tile once and keep the result: the evaluation cell reads
# ``predictions``, so the preview shows the same DataFrame that gets evaluated
# instead of building a second, identical transform.
predictions = model.transform(df_test)
predictions.show(2)



+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+----------+------------------+-------------------+-------------+-----------------+------------------+-------------------+------------+----------------+--------------+-------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Intensity|Classification|  Red|Green| Blue|Infrared|ReturnNumber|NumberOfReturns|         x|         y|                 z|height_above_ground|local_density|      local_z_std|     local_z_range|          roughness|return_ratio|is_single_return|is_last_return|               ndvi|   green_red_ratio|                ndwi|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+----------+------------------+-------------------+-------------+-------------

                                                                                

## Evaluation

In [None]:
# Compare the predicted class with the ``Classification`` label using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Classification",
    predictionCol="prediction",
    metricName="accuracy",
)

# Computing the metric triggers evaluation of the predictions DataFrame.
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.2f}")