In [None]:
'''
En este ejercicio se usarán Kafka, Spark, CSV, JSON y Avro

- Leemos un CSV, lo serializamos en AVRO y lo enviamos a un topic de Kafka, todo ello con Spark

- Posteriormente leemos este topic de Kafka, lo deserializamos de Avro y lo convertimos a JSON

El CSV es:

nombre;apellido;sexo;edad;peso;altura
Pedro;Pérez;m;30;60;1.70
María;Díaz;F;35;55;1.65
Marcos;Rojo;M;20;62;1.80
Carolina;Martínez;f;21;59;1.71

'''

In [1]:
# findspark locates the local Spark installation and adds its Python
# libraries to sys.path. It has to run BEFORE anything is imported from
# `pyspark`; calling it afterwards cannot affect which pyspark gets imported.
import findspark

findspark.init()

import pandas as pd
from deltalake.writer import write_deltalake
from pyspark.sql import Row, SparkSession
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.functions import col, expr

In [2]:
# Build (or reuse) the Spark session for this exercise.
# The Kafka and Avro connectors are requested through spark.jars.packages
# as a single comma-separated list of Maven coordinates.
spark = (
    SparkSession.builder
    .appName("EjercicioFinal")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.1,org.apache.spark:spark-avro_2.13:3.5.1",
    )
    .getOrCreate()
)

24/07/16 10:31:53 WARN Utils: Your hostname, bosonituser-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/07/16 10:31:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/bosonituser/spark-3.5.1-bin-hadoop3-scala2.13/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/bosonituser/.ivy2/cache
The jars for the packages stored in: /home/bosonituser/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
org.apache.spark#spark-avro_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2d31b8be-c4b2-4b23-97ca-064aff63cbb3;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collection

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, struct

# Explicit schema for the CSV: three string columns, two integer columns and
# one float column, all of them nullable.
csv_schema = (
    StructType()
    .add("nombre", StringType(), True)
    .add("apellido", StringType(), True)
    .add("sexo", StringType(), True)
    .add("edad", IntegerType(), True)
    .add("peso", IntegerType(), True)
    .add("altura", FloatType(), True)
)

# Load the semicolon-separated file (first line is the header) into a DataFrame.
df = (
    spark.read
    .schema(csv_schema)
    .option("header", True)
    .option("sep", ";")
    .csv("/home/bosonituser/Desktop/CSVFiles/mydoc.csv")
)

df.show()

# Avro schema (as a JSON string) for one "Person" record.
# Every field is a ["null", <type>] union with a null default.
avro_schema = '''
{
  "type": "record",
  "name": "Person",
  "fields": [
    {"name": "nombre", "type": ["null", "string"], "default": null},
    {"name": "apellido", "type": ["null", "string"], "default": null},
    {"name": "sexo", "type": ["null", "string"], "default": null},
    {"name": "edad", "type": ["null", "int"], "default": null},
    {"name": "peso", "type": ["null", "int"], "default": null},
    {"name": "altura", "type": ["null", "float"], "default": null}
  ]
}
'''

# Pack the six columns into one struct, then serialize that struct to Avro.
# The encoded bytes end up in a single column named "value".
person_struct = struct("nombre", "apellido", "sexo", "edad", "peso", "altura")
df_avro = df.select(to_avro(person_struct, avro_schema).alias("value"))

# Display the Avro-encoded rows without truncating the byte arrays.
df_avro.show(truncate=False)

+--------+--------+----+----+----+------+
|  nombre|apellido|sexo|edad|peso|altura|
+--------+--------+----+----+----+------+
|   Pedro|   Pérez|   m|  30|  60|   1.7|
|   María|    Díaz|   F|  35|  55|  1.65|
|  Marcos|    Rojo|   M|  20|  62|   1.8|
|Carolina|Martínez|   f|  21|  59|  1.71|
+--------+--------+----+----+----+------+

+----------------------------------------------------------------------------------------------------+
|value                                                                                               |
+----------------------------------------------------------------------------------------------------+
|[02 0A 50 65 64 72 6F 02 0C 50 C3 A9 72 65 7A 02 02 6D 02 3C 02 78 02 9A 99 D9 3F]                  |
|[02 0C 4D 61 72 C3 AD 61 02 0A 44 C3 AD 61 7A 02 02 46 02 46 02 6E 02 33 33 D3 3F]                  |
|[02 0C 4D 61 72 63 6F 73 02 08 52 6F 6A 6F 02 02 4D 02 28 02 7C 02 66 66 E6 3F]                     |
|[02 10 43 61 72 6F 6C 69 6E 61 02 12 4D 61 7

In [5]:
# Publish the Avro-encoded rows to the Kafka topic "ejFinal"
# through the broker at localhost:9092.
(
    df_avro.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "ejFinal")
    .save()
)

                                                                                

In [16]:
'''
bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:9092 --topic my_topic2 --from-beginning

Pedro
     Pérezm<x���?

María
DíazFFn33�?

MarcoRojoM(|ff�?
CarolinaMartínezf*vH��?
'''

'\nbin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:9092 --topic my_topic2 --from-beginning\n\nPedro\n     Pérezm<x���?\n\nMaría\nDíazFFn33�?\n\nMarcoRojoM(|ff�?\nCarolinaMartínezf*vH��?\n'

In [1]:
# Read the records back from Kafka as a batch DataFrame.
# The subscription must name the topic this notebook writes to ("ejFinal").
# Subscribing to a different topic would return whatever happens to be stored
# there instead of the rows produced by the write cell.
df_kafka = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "ejFinal")
    .load()
)

# Show the raw Kafka records without truncating any column.
df_kafka.show(truncate=False)

NameError: name 'spark' is not defined

In [19]:
# Avro schema (as a JSON string) used to decode the records read from Kafka.
avro_schema = '''
{
  "type": "record",
  "name": "Person",
  "fields": [
    {"name": "nombre", "type": ["null", "string"], "default": null},
    {"name": "apellido", "type": ["null", "string"], "default": null},
    {"name": "sexo", "type": ["null", "string"], "default": null},
    {"name": "edad", "type": ["null", "int"], "default": null},
    {"name": "peso", "type": ["null", "int"], "default": null},
    {"name": "altura", "type": ["null", "float"], "default": null}
  ]
}
'''

# Decode the Avro payload held in the "value" column into a struct column
# called "person".
person_column = from_avro(col("value"), avro_schema).alias("person")
df_avro_deserialized = df_kafka.select(person_column)

# Expand the struct into one top-level column per field and display the rows.
df_person = df_avro_deserialized.select("person.*")
df_person.show(truncate=False)

24/07/11 11:27:14 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 9:>                                                          (0 + 1) / 1]

+--------+--------+----+----+----+------+
|nombre  |apellido|sexo|edad|peso|altura|
+--------+--------+----+----+----+------+
|Pedro   |Pérez   |m   |30  |60  |1.7   |
|María   |Díaz    |F   |35  |55  |1.65  |
|Marcos  |Rojo    |M   |20  |62  |1.8   |
|Carolina|Martínez|f   |21  |59  |1.71  |
+--------+--------+----+----+----+------+



                                                                                