### ML PySpark

The first thing to do is to create a `.env` file in the root of the directory. Add the following two variables to the file:
`ACCESS_KEY`, `ACCESS_SECRET`.
For a more detailed explanation, see [dotenv](https://pypi.org/project/python-dotenv/), which describes what the `.env` file should look like. After `load_dotenv()` runs, the variables are added to `os.environ` and can be accessed like a simple dict.

In [129]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, when, input_file_name
from functools import reduce
import sys
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
import os
from pyspark.ml.classification import LogisticRegression
from dotenv import load_dotenv
from sparkmeasure import TaskMetrics
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [130]:
# NOTE(review): `random` is imported here but not used in the visible cells.
import random
# Load the variables defined in the project's .env file into os.environ;
# ACCESS_KEY and ACCESS_SECRET are read from there when Sparker is created.
load_dotenv()


True

## Console Login

The following classes handle Spark on AWS.

In [131]:
from src.s3handler import Sparker, PreProcessing, FeatureEngineering

In [132]:
# Fail early with an actionable message when the .env file was not set up.
# A bare KeyError from os.environ[...] does not tell the reader what to fix;
# the exception type is kept as KeyError so existing handling still applies.
required_env_vars = ("ACCESS_KEY", "ACCESS_SECRET")
missing_env_vars = [name for name in required_env_vars if name not in os.environ]
if missing_env_vars:
    raise KeyError(
        f"Missing environment variable(s): {', '.join(missing_env_vars)}. "
        "Add them to the .env file in the project root and re-run load_dotenv()."
    )

# Helper object built from the credentials found in os.environ
spark = Sparker(os.environ["ACCESS_KEY"], os.environ["ACCESS_SECRET"])

# Local session (referenced by the commented-out TaskMetrics cell further down).
# NOTE(review): this calls a private method of Sparker; prefer a public
# accessor if the class offers one.
session = spark._create_local_session()

## Read parquet

In [133]:
# Columns to keep from the parquet file
parquet_cols = [
    "xyz",
    "Intensity",
    "Classification",
    "Red",
    "Green",
    "Blue",
    "Infrared",
    "ReturnNumber",
    "NumberOfReturns",
]

# Read one file from the train split and keep only the columns listed above
df = (
    spark.read_parquet(
        "ubs-datasets",
        "FRACTAL/data/train/TRAIN-0436_6399-002955400.parquet",
        read_all=False,
    )
    .select(*parquet_cols)
)

# Read the list of parquet files
# list_s3 = ["FRACTAL/data/train/TRAIN-1200_6136-008972557.parquet", "FRACTAL/data/train/TRAIN-0436_6399-002955400.parquet"]
# df = spark.read_parquet("ubs-datasets",
#                     list_s3,
#                     read_all=False) \
#                     .select(*parquet_cols)

Reading from: ['s3a://ubs-datasets/FRACTAL/data/train/TRAIN-0436_6399-002955400.parquet']


In [134]:
# Show the column types, then count the rows that were loaded
df.printSchema()
row_count = df.count()
print(f"Number of rows: {row_count}")

root
 |-- xyz: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- Intensity: integer (nullable = true)
 |-- Classification: short (nullable = true)
 |-- Red: integer (nullable = true)
 |-- Green: integer (nullable = true)
 |-- Blue: integer (nullable = true)
 |-- Infrared: integer (nullable = true)
 |-- ReturnNumber: short (nullable = true)
 |-- NumberOfReturns: short (nullable = true)

Number of rows: 90090


## Preprocessing & Feature Engineering

In [135]:
# Columns before preprocessing: xyz is still a single array column
df.columns

['xyz',
 'Intensity',
 'Classification',
 'Red',
 'Green',
 'Blue',
 'Infrared',
 'ReturnNumber',
 'NumberOfReturns']

In [136]:
# NOTE: this cell overwrites `df`. Re-running it without re-reading the
# parquet file would feed the already-transformed frame back in.

# Preprocessing: split the xyz array column
preprocessing = PreProcessing(df)

# Feature engineering runs on the split frame; the resulting columns are
# listed in the next cell
engfeature = FeatureEngineering(preprocessing.split_xyz())
df = engfeature.apply_all()

In [137]:
# Columns after preprocessing and feature engineering
df.columns

['Intensity',
 'Classification',
 'Red',
 'Green',
 'Blue',
 'Infrared',
 'ReturnNumber',
 'NumberOfReturns',
 'x',
 'y',
 'z',
 'height_above_ground',
 'local_density',
 'local_z_std',
 'local_z_range',
 'roughness',
 'return_ratio',
 'is_single_return',
 'is_last_return',
 'ndvi',
 'green_red_ratio',
 'ndwi']

In [138]:
# Null check: count the missing values in every column before the features
# are assembled. `col` and `spark_sum` come from the import cell at the top of
# the notebook, so nothing is re-imported here under a second alias.
columns_to_check = df.columns  # every column, including the "Classification" label

null_counts = df.select(
    [
        spark_sum(col(name).isNull().cast("int")).alias(name)
        for name in columns_to_check
    ]
)
null_counts.show()

[Stage 760:>                                                        (0 + 1) / 1]

+---------+--------------+---+-----+----+--------+------------+---------------+---+---+---+-------------------+-------------+-----------+-------------+---------+------------+----------------+--------------+----+---------------+----+
|Intensity|Classification|Red|Green|Blue|Infrared|ReturnNumber|NumberOfReturns|  x|  y|  z|height_above_ground|local_density|local_z_std|local_z_range|roughness|return_ratio|is_single_return|is_last_return|ndvi|green_red_ratio|ndwi|
+---------+--------------+---+-----+----+--------+------------+---------------+---+---+---+-------------------+-------------+-----------+-------------+---------+------------+----------------+--------------+----+---------------+----+
|        0|             0|  0|    0|   0|       0|           0|              0|  0|  0|  0|                  0|            0|          0|            0|        0|           0|               0|             0|   0|              0|   0|
+---------+--------------+---+-----+----+--------+------------+-----

                                                                                

### Load | Models
Prepare the variables for the models.

In [139]:
# Every column except the label becomes a model input.
# NOTE(review): this includes the x / y / z coordinate columns; confirm they
# are meant to be features when scoring data from a different file.
feature_cols = df.drop("Classification").columns

# Pack the input columns into a single vector column
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    # handleInvalid="skip"  (left disabled)
)

# Scale the assembled vector; the classifier reads "scaled_features"
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
)


In [140]:
# Fixed seed so the forest's random choices repeat from run to run; without
# it the reported accuracy is not guaranteed to be the same after a kernel
# restart.
RF_SEED = 42

# Random forest trained on the scaled feature vector, predicting "Classification"
rf = RandomForestClassifier(
    featuresCol="scaled_features",
    labelCol="Classification",
    bootstrap=True,
    numTrees=60,
    maxDepth=10,
    seed=RF_SEED,
)

### Pipeline

In [141]:
# Chain the stages in order: assemble the features, scale them, then classify
pipeline = Pipeline(stages=[assembler, scaler, rf])

### Train 

In [142]:
## Import taskmetrics to see how the model is performing
# taskmetrics = TaskMetrics(session)
# taskmetrics.begin()
# model = pipeline.fit(df)
# taskmetrics.end()
# taskmetrics.print_report()

In [143]:
# Fit the assembler -> scaler -> random-forest pipeline on the training frame
model = pipeline.fit(df)

[Stage 839:>                                                        (0 + 1) / 1]

In [144]:
# Sanity check only: score the training frame itself and preview two rows
# (this is not an evaluation on held-out data)
model.transform(df).show(2)

[Stage 842:>                                                        (0 + 1) / 1]

+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+-----------+------+--------------------+-------------+-------------------+-----------------+-------------------+------------+----------------+--------------+-------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Intensity|Classification|  Red|Green| Blue|Infrared|ReturnNumber|NumberOfReturns|         x|          y|     z| height_above_ground|local_density|        local_z_std|    local_z_range|          roughness|return_ratio|is_single_return|is_last_return|               ndvi|   green_red_ratio|              ndwi|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+-----------+------+--------------------+-------------+-------------------+-----------------+-------

                                                                                

## Hyperparameter tuning

In [145]:
# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [146]:
# df_val = spark.read_parquet("ubs-datasets",
#                     "FRACTAL/data/val/VAL-0436_6406-003134108.parquet",
#                     read_all=False) \
#                     .select(*parquet_cols)

# preprocessing = PreProcessing(df_val)
# df_val = preprocessing.split_xyz()
# eng_feature = FeatureEngineering(df_val)
# df_val = eng_feature.apply_all()

In [147]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# paramGrid = (ParamGridBuilder()
#     .addGrid(rf.numTrees, [20, 50])
#     .addGrid(rf.maxDepth, [5, 20])
#     .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt'])
#     .build()
# )

# evaluator = MulticlassClassificationEvaluator(labelCol="Classification", 
#                                               predictionCol="prediction", 
#                                               metricName="accuracy"
#                                               )

# # 5️⃣ Cross Validator (3 folds)
# crossval = CrossValidator(
#     estimator=pipeline,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator,
#     numFolds=3,
#     parallelism=2  # number of parallel tasks
# )

# # 7️⃣ Fit cross validator
# cv_model = crossval.fit(df_val)

# # 8️⃣ Get best model
# best_model = cv_model.bestModel
# print("Best parameters:")
# print(f"  numTrees = {best_model.stages[-1]._java_obj.getNumTrees()}")
# print(f"  maxDepth = {best_model.stages[-1]._java_obj.getMaxDepth()}")
# print(f"  featureSubsetStrategy = {best_model.stages[-1]._java_obj.getFeatureSubsetStrategy()}")

# # # 9️⃣ Evaluate on test data
# # predictions = best_model.transform(test_df)
# # accuracy = evaluator.evaluate(predictions)
# # print(f"Test Accuracy = {accuracy:.3f}")


## Inference

In [148]:
# Hold-out data for inference.
# NOTE: the file comes from the val/ split even though the frame is named df_test.
df_test = (
    spark.read_parquet(
        "ubs-datasets",
        "FRACTAL/data/val/VAL-0436_6406-003134108.parquet",
        read_all=False,
    )
    .select(*parquet_cols)
)

# Same two steps that were applied to the training frame
preprocessing = PreProcessing(df_test)
eng_feature = FeatureEngineering(preprocessing.split_xyz())
df_test = eng_feature.apply_all()

Reading from: ['s3a://ubs-datasets/FRACTAL/data/val/VAL-0436_6406-003134108.parquet']


In [149]:
# Score the hold-out frame once and reuse the result for the preview,
# instead of building a second identical transform just to call show().
predictions = model.transform(df_test)
predictions.show(2)

[Stage 849:>                                                        (0 + 1) / 1]

+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+-----------+------+-------------------+-------------+-------------------+-------------------+-------------------+------------+----------------+--------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Intensity|Classification|  Red|Green| Blue|Infrared|ReturnNumber|NumberOfReturns|         x|          y|     z|height_above_ground|local_density|        local_z_std|      local_z_range|          roughness|return_ratio|is_single_return|is_last_return|                ndvi|   green_red_ratio|               ndwi|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---------+--------------+-----+-----+-----+--------+------------+---------------+----------+-----------+------+-------------------+-------------+-------------------+-------------------+

                                                                                

## Evaluation

In [150]:
# Compare the "prediction" column against the "Classification" label
evaluator = MulticlassClassificationEvaluator(
    labelCol="Classification",
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy of the fitted pipeline on the hold-out predictions
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.2f}")

[Stage 855:>                                                        (0 + 1) / 1]

Test Accuracy = 0.36


                                                                                