```
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
```

In [1]:
# pip install sklearn
# pip install pyarrow
# pip install fsspec

In [2]:
from IPython.display import display, HTML
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import pandas as pd
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    LongType,
    IntegerType,
    DoubleType,
    ArrayType,
)
from pyspark.sql.functions import regexp_replace
from sedona.spark import SedonaRegistrator
from sedona.spark import SedonaKryoRegistrator, KryoSerializer
from pyspark.sql.functions import col, split, expr
from pyspark.sql.functions import udf, lit
from sedona.spark import SedonaKryoRegistrator, KryoSerializer
from pyspark.sql.functions import col, split, expr
from pyspark.sql.functions import udf, lit, flatten
from pywebhdfs.webhdfs import PyWebHdfsClient
from datetime import date

In [3]:
analise_folder = "analise_teste_" + str(date.today())
hdfs = PyWebHdfsClient(host="179.106.229.159", port="50070", user_name="root")
hdfs.delete_file_dir(analise_folder, recursive=True)

True

In [4]:
# spark.scheduler.mode', 'FAIR'
spark = (
    SparkSession.builder.appName("Sentinel-app")
    .enableHiveSupport()
    .master("local[*]")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "15G")
    .config("spark.driver.maxResultSize", "135G")
    .config("spark.sql.shuffle.partitions", "500")
    .config(" spark.sql.adaptive.coalescePartitions.enabled", True)
    .config("spark.sql.adaptive.enabled", True)
    .config("spark.sql.adaptive.coalescePartitions.initialPartitionNum", 125)
    .config("spark.sql.execution.arrow.pyspark.enabled", True)
    .config("spark.sql.execution.arrow.fallback.enabled", True)
    .config("spark.kryoserializer.buffer.max", 2047)
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.1.0-incubating,org.datasyslab:geotools-wrapper:1.1.0-25.2",
    )
    .enableHiveSupport()
    .getOrCreate()
)

SedonaRegistrator.registerAll(spark)
sc = spark.sparkContext

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.sedona#sedona-python-adapter-3.0_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-475a8539-e626-41a7-8cca-cb3c72ad1694;1.0
	confs: [default]
	found org.apache.sedona#sedona-python-adapter-3.0_2.12;1.1.0-incubating in central
	found org.locationtech.jts#jts-core;1.18.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found com.fasterxml.jackson.core#jackson-databind;2.12.2 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.12.2 in central
	found com.fasterxml.jackson.core#jackson-core;2.12.2 in central
	found org.apache.sedona#sedona-core-3.0_2.12;1.1.0-incubating in central
	found org.apache.sedona#sedona-sql

In [5]:
# Path to directory of geotiff images
DATA_DIR = "hdfs://776faf4d6a1e:8020/sentinel2_tmp/*"
df = spark.read.format("geotiff").option("dropInvalid", True).load(DATA_DIR)

                                                                                

In [6]:
# SUPER IMPORTANT ULTRA MEGA POWER FOR MEMORY PROBLENS SOLVE
rdd = spark.sparkContext.parallelize((0, 20))
print("From local[5]" + str(rdd.getNumPartitions()))

From local[5]4


In [7]:
df.cache()
df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- wkt: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nBands: integer (nullable = true)
 |    |-- data: array (nullable = true)
 |    |    |-- element: double (containsNull = true)


22/01/06 20:04:57 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.


In [8]:
from pyspark.sql.functions import monotonically_increasing_id

# add ID
df_index = df.select("*").withColumn("id", monotonically_increasing_id())
df_index.explain()
df_index.show(5)

== Physical Plan ==
*(1) Project [image#14, monotonically_increasing_id() AS id#22L]
+- InMemoryTableScan [image#14]
      +- InMemoryRelation [image#14], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- FileScan geotiff [image#14] Batched: false, DataFilters: [], Format: org.apache.spark.sql.sedona_sql.io.GeotiffFileFormat@5f9f8a31, Location: InMemoryFileIndex[hdfs://776faf4d6a1e:8020/sentinel2_tmp/1, hdfs://776faf4d6a1e:8020/sentinel2_tm..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<image:struct<origin:string,wkt:string,height:int,width:int,nBands:int,data:array<double>>>



[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+---+
|               image| id|
+--------------------+---+
|[hdfs://776faf4d6...|  0|
|[hdfs://776faf4d6...|  1|
|[hdfs://776faf4d6...|  2|
|[hdfs://776faf4d6...|  3|
|[hdfs://776faf4d6...|  4|
+--------------------+---+


                                                                                

In [9]:
# "image.wkt as Geom",
df_export = df_index.selectExpr(
    "id",
    "image.origin as origin",
    "image.height as height",
    "image.width as width",
    "cast(image.data as string) as data",
    "image.nBands as bands",
)
print(df_export.dtypes)
df_export.explain()
df_export.createOrReplaceTempView("df_export")

[('id', 'bigint'), ('origin', 'string'), ('height', 'int'), ('width', 'int'), ('data', 'string'), ('bands', 'int')]
== Physical Plan ==
*(1) Project [id#22L, image#14.origin AS origin#65, image#14.height AS height#66, image#14.width AS width#67, cast(image#14.data as string) AS data#68, image#14.nBands AS bands#69]
+- *(1) Project [image#14, monotonically_increasing_id() AS id#22L]
   +- InMemoryTableScan [image#14]
         +- InMemoryRelation [image#14], StorageLevel(disk, memory, deserialized, 1 replicas)
               +- FileScan geotiff [image#14] Batched: false, DataFilters: [], Format: org.apache.spark.sql.sedona_sql.io.GeotiffFileFormat@5f9f8a31, Location: InMemoryFileIndex[hdfs://776faf4d6a1e:8020/sentinel2_tmp/1, hdfs://776faf4d6a1e:8020/sentinel2_tm..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<image:struct<origin:string,wkt:string,height:int,width:int,nBands:int,data:array<double>>>



In [10]:
# df_export.repartition("origin").write.format('csv').option('header', True).partitionBy("origin").mode('overwrite').option('sep', ',').save("hdfs://776faf4d6a1e:8020/"+analise_folder)
# df_export.write.format('csv').option('header', True).option('sep', ',').save("hdfs://776faf4d6a1e:8020/"+analise_folder)
# start = 0
# end = 10
# part_df_export =  spark.sql('select * from df_export where id between '+str(start)+' and '+str(end))
# part_df_export.show(7)

In [11]:
# df_writer = part_df_export.write.format('csv').option('header', True).option('sep', ',')
# df_writer.save("hdfs://776faf4d6a1e:8020/"+analise_folder)

In [12]:
# POR 1 LINHA SER GRANDE O SUFICIENTE PARA ESTOURO DE MEMORIA O COLLECT NÂO FUNCIONA E NEM SALVAR O DF_SPARK DIRETO
# (NECESSÀRIO TRANFORMAR PARA PANDAS LINHA A LINHA)
# part_df_export.take(3)
part_df_export = df_export.take(1)
# print(part_df_export)
pd.DataFrame(part_df_export).to_csv("teste.csv", sep=",", encoding="utf-8")

In [13]:
df = df.selectExpr(
    "image.origin as origin",
    "ST_GeomFromWkt(image.wkt) as Geom",
    "image.height as height",
    "image.width as width",
    "image.data as data",
    "image.nBands as bands",
).cache()
df.show(5)
print(df.dtypes)
df.explain()

22/01/06 20:05:02 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.


+--------------------+--------------------+------+-----+--------------------+-----+
|              origin|                Geom|height|width|                data|bands|
+--------------------+--------------------+------+-----+--------------------+-----+
|hdfs://776faf4d6a...|POLYGON ((-54.546...|   186|  300|[409.0, 404.0, 41...|    4|
|hdfs://776faf4d6a...|POLYGON ((-54.546...|   186|  300|[1838.0, 1778.0, ...|    4|
|hdfs://776faf4d6a...|POLYGON ((-54.274...|   199|  257|[931.0, 971.0, 95...|    4|
|hdfs://776faf4d6a...|POLYGON ((-54.274...|   199|  257|[957.0, 995.0, 97...|    4|
|hdfs://776faf4d6a...|POLYGON ((-54.274...|   199|  257|[428.0, 428.0, 43...|    4|
+--------------------+--------------------+------+-----+--------------------+-----+
only showing top 5 rows

[('origin', 'string'), ('Geom', 'udt'), ('height', 'int'), ('width', 'int'), ('data', 'array<double>'), ('bands', 'int')]
== Physical Plan ==
InMemoryTableScan [origin#111, Geom#112, height#113, width#114, data#115, ban

In [14]:
df = df.selectExpr(
    "origin",
    "Geom",
    "RS_GetBand(data, 1,bands) as B2",
    "RS_GetBand(data, 2,bands) as B3",
    "RS_GetBand(data, 3,bands) as B4",
    "RS_GetBand(data, 4,bands) as B8",
    "RS_Array(height * width, 2.4) as constant_evi_2",
    "RS_Array(height * width, 2.5) as constant_evi_1",
    "RS_Array(height * width, 1.0) as constant_evi_3",
    "RS_Array(height * width, -0.5) as constant_tgi_1",
    "RS_Array(height * width, 120.0) as constant_tgi_2",
    "RS_Array(height * width, 0.001) as corrector",
).cache()
df.createOrReplaceTempView("allbands")
df.show(5)

22/01/06 20:05:03 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.
[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              origin|                Geom|                  B2|                  B3|                  B4|                  B8|      constant_evi_2|      constant_evi_1|      constant_evi_3|      constant_tgi_1|      constant_tgi_2|           corrector|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|hdfs://776faf4d6a...|POLYGON ((-54.546...|[409.0, 404.0, 41...|[713.0, 673.0, 70...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|[2.4, 2.4, 2.4, 2...|[2.5, 2.5, 2.5, 2...|[1.0, 1.0, 1.0, 1...|[-0.5, -0.5, -0.5...|[120.0, 120.0, 12...|[0.001

                                                                                

In [15]:
# Não tem data da imagem
# Não tem parte a qual ela se refere
# Necessário adicionar
origin = df.selectExpr("origin")
split_origin = origin.select(split(col("origin"), "/"))
split_origin.head()
# 20211226T134212
split_origin = spark.sql(
    "select to_timestamp(REPLACE(SPLIT(SPLIT(origin,'/')[5], '_')[1],'T',' '),'yyyyMMdd HHmmss') as image_date, SPLIT(origin,'/')[4] as feature_name, * from allbands"
)
split_origin.show(5)

[Stage 9:>                                                          (0 + 1) / 1]

+-------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         image_date|feature_name|              origin|                Geom|                  B2|                  B3|                  B4|                  B8|      constant_evi_2|      constant_evi_1|      constant_evi_3|      constant_tgi_1|      constant_tgi_2|           corrector|
+-------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|2021-12-26 13:42:12|          70|hdfs://776faf4d6a...|POLYGON ((-54.546...|[409.0, 404.0, 41...|[713.0, 673.0, 70...|[0.0, 0.0, 0.0, 0...|

                                                                                

In [16]:
# Fator de correcao da banda para ficar com valores entre 0 e 1
correct_origin = split_origin.selectExpr(
    "RS_MultiplyBands(B2, corrector) as bluen",
    "RS_MultiplyBands(B3, corrector) as greenn",
    "RS_MultiplyBands(B4, corrector) as redn",
    "RS_MultiplyBands(B8, corrector) as nirn",
    "*",
).cache()
correct_origin = correct_origin.selectExpr(
    "RS_NormalizedDifference(nirn, redn) as gndvi",
    "RS_SubtractBands(nirn, redn) as sub_nirn_redn",
    "RS_AddBands(nirn,constant_evi_2) as add_nirn_contant_evi_2",
    "RS_AddBands(redn, constant_evi_3) as add_redn_contant_evi_3",
    "RS_DivideBands(nirn, greenn) as div_nirn_greenn",
    "RS_SubtractBands(greenn, redn) as sub_greenn_redn",
    "RS_SubtractBands(redn, greenn) as sub_redn_greenn",
    "RS_SubtractBands(redn, bluen) as sub_redn_bluen",
    "RS_AddBands(greenn, redn) as add_greenn_redn",
    "*",
).cache()

correct_origin = correct_origin.selectExpr(
    "RS_SubtractBands(add_greenn_redn, bluen) as greenn_redn_sub_bluen",
    "RS_AddBands(add_greenn_redn, bluen) as greenn_redn_add_bluen",
    "RS_SubtractBands(sub_greenn_redn, bluen) as sub_greenn_redn_bluen",
    "RS_SubtractBands(sub_redn_greenn, constant_tgi_2) as sub_red_gren_tgi_2",
    "*",
).cache()
correct_origin = correct_origin.selectExpr(
    "RS_MultiplyFactor(sub_redn_bluen,120) as ms_redn_bluen_120", "*"
).cache()
correct_origin = correct_origin.selectExpr(
    "RS_MultiplyFactor(sub_redn_greenn,190) as ms_redn_greenn_190", "*"
).cache()
correct_origin = correct_origin.selectExpr(
    "RS_SubtractBands(ms_redn_greenn_190,ms_redn_bluen_120) as sub_msrg_190_msrb_120",
    "*",
).cache()

22/01/06 20:05:06 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.
22/01/06 20:05:06 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.
22/01/06 20:05:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
22/01/06 20:05:06 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.
22/01/06 20:05:06 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.

In [17]:
# bluen = src.read(1, masked=True) / 10000
# greenn = src.read(2, masked=True) / 10000
# redn = src.read(3, masked=True) / 10000
# nirn = src.read(4, masked=True) / 10000
# evi = 2.5 * (nirn - redn) / (nirn + 2.4 * redn + 1)
# gci = (nirn / greenn) - 1
# gli = (2 * greenn - redn - bluen) / (2 * greenn + redn + bluen)
# gndvi = (nirn - greenn) / (nirn + greenn)
# tgi = (-0.5) * (190 * (redn - greenn) - 120 * (redn - bluen))
# vari = (greenn - redn) / (greenn + redn - bluen)


calculated = correct_origin.selectExpr(
    "RS_NormalizedDifference(nirn, redn) as gndvi",
    "RS_DivideBands(RS_MultiplyBands(constant_evi_1, sub_nirn_redn), RS_MultiplyBands(add_nirn_contant_evi_2, add_redn_contant_evi_3)) as evi",
    "RS_SubtractBands(div_nirn_greenn, constant_evi_3) as gci",
    "RS_DivideBands(sub_greenn_redn, greenn_redn_sub_bluen) as vari",
    "RS_DivideBands(RS_MultiplyFactor(sub_greenn_redn_bluen,2),RS_MultiplyFactor(greenn_redn_add_bluen, 2)) as gli",
    "RS_MultiplyBands(constant_tgi_1,sub_msrg_190_msrb_120)  as tgi",
    "origin",
    "image_date",
    "feature_name",
).cache()
calculated.show(5)
calculated.printSchema()

22/01/06 20:05:06 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.
[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------+
|               gndvi|                 evi|                 gci|                vari|                 gli|                 tgi|              origin|         image_date|feature_name|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------+
|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|[1.0, 1.0, 1.0, 1...|[2.35, 2.5, 2.45,...|[1.0, 1.0, 1.0, 1...|[43.1949999999999...|hdfs://776faf4d6a...|2021-12-26 13:42:12|          70|
|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|[1.0, 1.0, 1.0, 1...|[-1.41, -1.36, -1...|[1.0, 1.0, 1.0, 1...|[-8.25, -9.210000...|hdfs://776faf4d6a...|2021-12-21 13:42:05|          70|
|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|[1.0, 1.0, 1.0, 1...|[3.65, 3.52, 3.9,...|[1.0,

                                                                                

In [18]:
calculated_mean = calculated.selectExpr(
    "RS_Mean(gndvi) as gndvi",
    "RS_Mean(evi) as evi",
    "RS_Mean(gci) as gci",
    "RS_Mean(vari) as vari",
    "RS_Mean(gli) as gli",
    "RS_Mean(tgi) as tgi",
    "origin",
    "image_date",
    "feature_name",
).cache()
calculated_mean.show(5)
calculated_mean.printSchema()
calculated_mean.createOrReplaceTempView("all_mean")

22/01/06 20:05:11 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.fallback.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.fallback.enabled' instead of it.


+-----+---+---+----+---+------+--------------------+-------------------+------------+
|gndvi|evi|gci|vari|gli|   tgi|              origin|         image_date|feature_name|
+-----+---+---+----+---+------+--------------------+-------------------+------------+
|  0.0|0.0|1.0| 0.0|1.0| -64.0|hdfs://776faf4d6a...|2021-12-26 13:42:12|          70|
|  0.0|0.0|1.0| 0.0|1.0|-94.53|hdfs://776faf4d6a...|2021-12-21 13:42:05|          70|
|  0.0|0.0|1.0| 0.0|1.0|-49.08|hdfs://776faf4d6a...|2021-12-26 13:42:12|           3|
|  0.0|0.0|1.0| 0.0|1.0|-50.85|hdfs://776faf4d6a...|2021-12-26 13:42:12|           3|
|  0.0|0.0|1.0| 0.0|1.0| -23.8|hdfs://776faf4d6a...|2021-12-16 13:42:11|           3|
+-----+---+---+----+---+------+--------------------+-------------------+------------+
only showing top 5 rows

root
 |-- gndvi: double (nullable = false)
 |-- evi: double (nullable = false)
 |-- gci: double (nullable = false)
 |-- vari: double (nullable = false)
 |-- gli: double (nullable = false)
 |-- tgi: dou

In [35]:
# POR 1 LINHA SER GRANDE O SUFICIENTE PARA ESTOURO DE MEMORIA O COLLECT NÂO FUNCIONA E NEM SALVAR O DF_SPARK DIRETO
# (NECESSÀRIO TRANFORMAR PARA PANDAS LINHA A LINHA)
# part_df_export.take(3)
part_df_export = calculated_mean.limit(10).collect()
print(part_df_export)
pd.DataFrame(part_df_export).to_csv("teste.csv", sep=",", encoding="utf-8")

[Row(gndvi=0.0, evi=0.0, gci=1.0, vari=0.0, gli=1.0, tgi=-64.0, origin='hdfs://776faf4d6a1e:8020/sentinel2_tmp/70/20211226T134211_20211226T134212_T21JYM.tif', image_date=datetime.datetime(2021, 12, 26, 13, 42, 12), feature_name='70'), Row(gndvi=0.0, evi=0.0, gci=1.0, vari=0.0, gli=1.0, tgi=-94.53, origin='hdfs://776faf4d6a1e:8020/sentinel2_tmp/70/20211221T134209_20211221T134205_T21JYM.tif', image_date=datetime.datetime(2021, 12, 21, 13, 42, 5), feature_name='70'), Row(gndvi=0.0, evi=0.0, gci=1.0, vari=0.0, gli=1.0, tgi=-49.08, origin='hdfs://776faf4d6a1e:8020/sentinel2_tmp/3/20211226T134211_20211226T134212_T21KYP.tif', image_date=datetime.datetime(2021, 12, 26, 13, 42, 12), feature_name='3'), Row(gndvi=0.0, evi=0.0, gci=1.0, vari=0.0, gli=1.0, tgi=-50.85, origin='hdfs://776faf4d6a1e:8020/sentinel2_tmp/3/20211226T134211_20211226T134212_T21JYN.tif', image_date=datetime.datetime(2021, 12, 26, 13, 42, 12), feature_name='3'), Row(gndvi=0.0, evi=0.0, gci=1.0, vari=0.0, gli=1.0, tgi=-23.8, or

In [20]:
# SAVE COPY TO HDFS
# dá o mesmo problema de threadshod unsuficiente que ocorre no fit
import gc

collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

Garbage collector: collected 199 objects.


In [21]:
# calculated_mean.repartition("origin").write.format('csv').option('header', True).partitionBy("origin").mode('overwrite').option('sep', ',').save("hdfs://776faf4d6a1e:8020/"+analise_folder)

In [22]:
# Random Forest
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (
    IndexToString,
    StringIndexer,
    VectorIndexer,
    VectorAssembler,
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder

In [23]:
vari = calculated_mean.select("vari")
vari.printSchema()
vari.show(5)
df_rf_assembler = calculated_mean.selectExpr(
    "vari", "gndvi", "evi", "tgi", "gli", "cast(feature_name as long) as labels"
)
# FORMATO NECESSARIO PARA O FIT
feature_list = [col for col in df_rf_assembler.columns if col != "labels"]
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
# rf = RandomForestClassifier(labelCol="labels", featuresCol="features")
df_rf_assembler = assembler.transform(df_rf_assembler)
df_rf_assembler.show(5)
# (trainingData, testData) = df_rf_assembler.randomSplit([0.8, 0.2])
# trainingData.show(5)
# testData.show(5)

root
 |-- vari: double (nullable = false)

+----+
|vari|
+----+
| 0.0|
| 0.0|
| 0.0|
| 0.0|
| 0.0|
+----+
only showing top 5 rows

+----+-----+---+------+---+------+--------------------+
|vari|gndvi|evi|   tgi|gli|labels|            features|
+----+-----+---+------+---+------+--------------------+
| 0.0|  0.0|0.0| -64.0|1.0|    70|(5,[3,4],[-64.0,1...|
| 0.0|  0.0|0.0|-94.53|1.0|    70|(5,[3,4],[-94.53,...|
| 0.0|  0.0|0.0|-49.08|1.0|     3|(5,[3,4],[-49.08,...|
| 0.0|  0.0|0.0|-50.85|1.0|     3|(5,[3,4],[-50.85,...|
| 0.0|  0.0|0.0| -23.8|1.0|     3|(5,[3,4],[-23.8,1...|
+----+-----+---+------+---+------+--------------------+
only showing top 5 rows


In [24]:
hdfs.delete_file_dir("teste", recursive=True)

True

In [25]:
import numpy
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import (
    RandomForestClassifier,
    RandomForestClassificationModel,
)

# df = spark.createDataFrame([
#     (1.0, Vectors.dense(1.0)),
#     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])

# stringIndexer = StringIndexer(inputCol="labels", outputCol="indexed")
# si_model = stringIndexer.fit(df)
# td = si_model.transform(df)
# rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42,
#     leafCol="leafId")
# rf.getMinWeightFractionPerNode()

# model = rf.fit(td)
# model.getLabelCol()

# model.setFeaturesCol("features")

# model.setRawPredictionCol("newRawPrediction")

# model.getBootstrap()

# model.getRawPredictionCol()

# model.featureImportances

# allclose(model.treeWeights, [1.0, 1.0, 1.0])

# test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
# model.predict(test0.head().features)

# model.predictRaw(test0.head().features)

# model.predictProbability(test0.head().features)

# result = model.transform(test0).head()
# result.prediction

# numpy.argmax(result.probability)

# numpy.argmax(result.newRawPrediction)

# result.leafId

# test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
# model.transform(test1).head().prediction

# model.trees
# temp_path= 'hdfs://776faf4d6a1e:8020/teste'
# rfc_path = temp_path + "/rfc"
# rf.save(rfc_path)
# rf2 = RandomForestClassifier.load(rfc_path)
# rf2.getNumTrees()

# model_path = temp_path + "/rfc_model"
# model.save(model_path)
# model2 = RandomForestClassificationModel.load(model_path)
# model.featureImportances == model2.featureImportances

# model.transform(test0).take(1) == model2.transform(test0).take(1)

In [26]:
# spark.stop()