# **Processing of geo-referenced data from the Main Cropping System Experiment at the LTER (Kellogg Biological Station Long-Term Ecological Research)**

Data link: https://lter.kbs.msu.edu/datatables/185

In [None]:
!pip install pyspark



In [None]:
import multiprocessing

Obtenemos el nº de cores

In [None]:
# Number of CPUs reported for the machine running this notebook
cores = multiprocessing.cpu_count()
# Bare expression so the notebook shows the value as the cell output
cores

2

In [None]:
# Colab helper module for working with Google Drive
from google.colab import drive
# Mount Drive at /content/drive; later cells read and write files under that path
drive.mount('/content/drive')

Mounted at /content/drive


Creamos la sesión de spark.

In [None]:
# Spark SQL entry point plus the column/aggregate helper module
from pyspark.sql import SparkSession, functions

# No builder options are set, so the session comes up with default settings
# (an already-active session is reused if one exists)
spark_session = SparkSession.builder.getOrCreate()


Leemos el csv

In [None]:
# Load the processed yields CSV from the mounted Drive: the header row
# provides the column names and Spark infers each column's type
processed_data = (
    spark_session.read
    .format("csv")
    .options(header='true', inferschema='true')
    .load("/content/drive/MyDrive/master/Modulo 6/datasets/185-geo+referenced+annual+crop+yields+processed+1738260472.csv")
)

Mostramos el esquema y las primeras filas del DataFrame obtenido del CSV

In [None]:
# Print the column names, inferred types and nullability
processed_data.printSchema()
# Print the first rows of the DataFrame as a text table
processed_data.show()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- yield: double (nullable = true)
 |-- species: string (nullable = true)
 |-- moisture: double (nullable = true)
 |-- year: integer (nullable = true)

+------------------+------------------+-----+--------+--------+----+
|         longitude|          latitude|yield| species|moisture|year|
+------------------+------------------+-----+--------+--------+----+
|-85.37738389287209| 42.40794224482194|19.86|soybeans|    13.7|2012|
|-85.37738389287209| 42.40796205015469|11.95|soybeans|    13.7|2012|
|-85.37738024858697| 42.40763976337624|25.67|soybeans|    13.7|2012|
|-85.37738024858697|42.407658668466595|27.34|soybeans|    13.7|2012|
|-85.37738024858697| 42.40768117452654|22.31|soybeans|    13.7|2012|
|-85.37738024858697| 42.40770097985929|24.62|soybeans|    13.7|2012|
|-85.37738024858697| 42.40771988494965| 25.3|soybeans|    13.7|2012|
|-85.37738024858697|42.407739690282405|23.66|soybeans|    13.7|2012|
|

Vamos a ver los tipos de cultivos que hay

In [None]:
# Show every crop species present in the data, one row per species
processed_data.select("species").distinct().show()

+--------+
| species|
+--------+
|soybeans|
|    corn|
|   wheat|
+--------+



Si quisiéramos pasarlo a un DataFrame de pandas

In [None]:
# Bring the distinct species back to the driver as a pandas DataFrame
species = (
    processed_data
    .select("species")
    .distinct()
    .toPandas()
)
# Bare expression so the notebook renders the pandas table
species

Unnamed: 0,species
0,soybeans
1,corn
2,wheat


Vamos a contar los datos; continuamos con el DataFrame de Spark

In [None]:
# Number of rows recorded for each species, largest group first
(
    processed_data
    .groupBy("species")
    .count()
    .sort("count", ascending=False)
    .show()
)


+--------+------+
| species| count|
+--------+------+
|   wheat|106953|
|soybeans| 94061|
|    corn| 84140|
+--------+------+



Podríamos obtener el rendimiento mínimo, máximo, total y medio por tipo de cultivo

In [None]:
# Summarize yield per species: minimum, maximum, total and average.
# The "Maximum yield" column label was previously misspelled "Maximumn yield".
(
    processed_data
    .groupBy("species")
    .agg(
        functions.min("yield").alias("Minimum yield"),
        functions.max("yield").alias("Maximum yield"),
        functions.sum("yield").alias("Total yield"),
        functions.avg("yield").alias("Avg. yield"),
    )
    .show()
)


+--------+-------------+--------------+--------------------+------------------+
| species|Minimum yield|Maximumn yield|         Total yield|        Avg. yield|
+--------+-------------+--------------+--------------------+------------------+
|soybeans|          0.0|      20600.78|   2847072.830000039|30.268366591892914|
|    corn|          0.0|       28136.2|1.0091563540000062E7|119.93776491561758|
|   wheat|          0.0|      14799.12|   4791879.419999956|44.803599899020654|
+--------+-------------+--------------+--------------------+------------------+



# Example illustrating the use of Folium to draw on a map the five locations with the highest yield values

In [None]:
# Import Folium (https://python-visualization.github.io/folium/)
import folium

Se ordena de mayor a menor y me quedo con los 5 primeros

In [None]:
# Order rows by yield, largest first, and pull the top five back to the
# driver as a list of Row objects
higuest_yield = (
    processed_data
    .sort("yield", ascending=False)
    .head(5)
)
# Bare expression so the notebook displays the selected rows
higuest_yield

[Row(longitude=-85.36931668608149, latitude=42.4087856155255, yield=28136.2, species='corn', moisture=14.76, year=2005),
 Row(longitude=-85.36931668608149, latitude=42.4087856155255, yield=22129.59, species='corn', moisture=14.76, year=2005),
 Row(longitude=-85.37114882250998, latitude=42.40827121466143, yield=20600.78, species='soybeans', moisture=12.97, year=2003),
 Row(longitude=-85.37123750154909, latitude=42.40846296625799, yield=20109.12, species='soybeans', moisture=12.97, year=2003),
 Row(longitude=-85.36931668608149, latitude=42.4087856155255, yield=18652.08, species='corn', moisture=14.76, year=2005)]

In [None]:
 # Build one [latitude, longitude, yield] list per selected row.
# Fields are read by column name rather than by position, so the coordinates
# cannot be silently swapped if the column order of the CSV ever changes.
location_list = [
    [row["latitude"], row["longitude"], row["yield"]]
    for row in higuest_yield
]

print(location_list)

[[42.4087856155255, -85.36931668608149, 28136.2], [42.4087856155255, -85.36931668608149, 22129.59], [42.40827121466143, -85.37114882250998, 20600.78], [42.40846296625799, -85.37123750154909, 20109.12], [42.4087856155255, -85.36931668608149, 18652.08]]


In [None]:
def map(locations):
    """Plot the given locations as markers on a Folium map.

    Quick Guide: https://nbviewer.jupyter.org/github/python-visualization/folium/blob/master/examples/Quickstart.ipynb

    Note: this name shadows Python's built-in ``map`` in the notebook
    namespace; it is kept unchanged so existing calls keep working.

    Args:
        locations: Non-empty sequence of ``[latitude, longitude]`` or
            ``[latitude, longitude, yield]`` entries. When the third element
            is present it is shown in the marker's popup.

    Returns:
        A ``folium.Map`` centred on the first location, with one marker per
        entry in ``locations``.

    Raises:
        ValueError: If ``locations`` is empty, since there is no point to
            centre the map on.
    """
    if len(locations) == 0:
        raise ValueError(
            "locations must contain at least one [latitude, longitude, ...] entry"
        )

    # Centre the map on the first entry
    first = locations[0]
    folium_map = folium.Map(location=[first[0], first[1]])

    for location in locations:
        # Show the yield value when the entry carries one; otherwise no popup
        popup = f"Yield: {location[2]}" if len(location) > 2 else None
        folium.Marker(
            location=[location[0], location[1]],
            popup=popup,
            icon=folium.Icon(icon='cloud'),
        ).add_to(folium_map)

    return folium_map

In [None]:
# Build the Folium map from the collected locations (`map` here is the
# notebook's own plotting function, not Python's built-in)
result_map = map(location_list)
# Bare expression so the notebook renders the map as the cell output
result_map

In [None]:
# Write the map to an HTML file on the mounted Google Drive
result_map.save("/content/drive/MyDrive/Colab Notebooks/map.html")