In [1]:
import os
import sys
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Locations of the Spark distribution (the pip-installed pyspark package) and
# of the JDK that Spark should use. These are machine-specific absolute paths.
SPARK_HOME = r"/root/uni-projects/bdm2/.venv/lib/python3.10/site-packages/pyspark"
JAVA_HOME = r"/root/.sdkman/candidates/java/current"

# Fail fast with a clear message if either location is missing.
if not os.path.exists(SPARK_HOME):
    raise ValueError(f"SPARK_HOME path does not exist: {SPARK_HOME}")
if not os.path.exists(JAVA_HOME):
    raise ValueError(f"JAVA_HOME path does not exist: {JAVA_HOME}")

# Export the locations to the process environment.
os.environ["SPARK_HOME"] = SPARK_HOME
os.environ["JAVA_HOME"] = JAVA_HOME

# Put Spark's bin directory at the front of PATH. Any earlier occurrence of the
# same entry is removed first, so re-running this cell does not keep growing
# PATH, and a missing PATH variable is treated as empty instead of raising
# KeyError.
spark_bin = os.path.join(SPARK_HOME, "bin")
current_path = os.environ.get("PATH", "")
other_entries = (
    [entry for entry in current_path.split(os.pathsep) if entry != spark_bin]
    if current_path
    else []
)
os.environ["PATH"] = os.pathsep.join([spark_bin, *other_entries])

# Use the interpreter running this kernel for both the driver and the workers.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Debug prints to verify environment settings
print(f"SPARK_HOME = {os.environ['SPARK_HOME']}")
print(f"JAVA_HOME = {os.environ['JAVA_HOME']}")
print(f"PATH = {os.environ['PATH']}")
print(f"PYSPARK_PYTHON = {os.environ['PYSPARK_PYTHON']}")
print(f"PYSPARK_DRIVER_PYTHON = {os.environ['PYSPARK_DRIVER_PYTHON']}")

SPARK_HOME = /root/uni-projects/bdm2/.venv/lib/python3.10/site-packages/pyspark
JAVA_HOME = /root/.sdkman/candidates/java/current
PATH = /root/uni-projects/bdm2/.venv/lib/python3.10/site-packages/pyspark/bin:/root/uni-projects/bdm2/.venv/bin:/root/.vscode-server/bin/dc96b837cf6bb4af9cd736aa3af08cf8279f7685/bin/remote-cli:/root/.tfenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/usr/lib/wsl/lib:/mnt/c/Users/Akos Schneider/.jdks/oracle-8/bin:/mnt/c/Program Files/Common Files/Oracle/Java/javapath:/mnt/c/Program Files (x86)/Common Files/Oracle/Java/javapath:/mnt/c/WINDOWS/system32:/mnt/c/WINDOWS:/mnt/c/WINDOWS/System32/Wbem:/mnt/c/WINDOWS/System32/WindowsPowerShell/v1.0/:/mnt/c/WINDOWS/System32/OpenSSH/:/mnt/c/Program Files (x86)/NVIDIA Corporation/PhysX/Common:/mnt/c/Program Files/Git/cmd:/mnt/c/Program Files/TortoiseGit/bin:/mnt/c/Users/Akos Schneider/apache-maven-3.8.6/bin:/mnt/c/Program Files/nodejs/:/mnt/c/ProgramData/chocolatey/bin:/mn

In [3]:
# Start with no session handle; the session-creation cell assigns the real
# SparkSession to this name.
spark = None

In [7]:
# Stop any session left over from a previous run before building a new one.
# getOrCreate() hands back an already-running session instead of creating a
# fresh one, so static settings such as spark.jars.packages would otherwise
# not be applied on a re-run. The handle is looked up through globals() and,
# failing that, the active session, so this cell also works on a fresh kernel
# where `spark` has not been defined yet.
previous_session = globals().get("spark") or SparkSession.getActiveSession()
if previous_session is not None:
    previous_session.stop()

# MongoDB instance used for both reads and writes.
MONGODB_URI = "mongodb://127.0.0.1:27017/"

# Create the configuration in the local machine, give a name to the
# application, and request the Avro and MongoDB connector packages.
conf = (
    SparkConf()
    .set("spark.master", "local")
    .set("spark.app.name", "Spark Dataframes Tutorial")
    .set("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.1,org.mongodb.spark:mongo-spark-connector_2.12:10.3.0")
)

# Create the session
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config("spark.mongodb.read.connection.uri", MONGODB_URI)
    .config("spark.mongodb.write.connection.uri", MONGODB_URI)
    .getOrCreate()
)

# Report the versions and the effective configuration for debugging.
print(f"Python version = {spark.sparkContext.pythonVer}")
print(f"Spark version = {spark.version}")
print(spark.sparkContext.getConf().getAll())

Python version = 3.10
Spark version = 3.5.1
[('spark.master', 'local'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'), ('spark.app.initial.jar.urls', 'sp

In [8]:

# Avro file from the persistent landing zone to load.
lookup_avro_path = "data/persistent-landing-zone/lookup/2018_qualitat_aire_estacions_49d67292d3b070494d872ad69f21f33606716bb9.avro"
df = spark.read.format("avro").load(lookup_avro_path)

# Print the first rows as a sanity check.
df.show()

# Write the rows to MongoDB (database "test", collection "test").
# NOTE: the save mode is "append", so every run of this cell writes the rows
# again on top of whatever the collection already holds.
mongo_writer = (
    df.write
    .format("mongodb")
    .mode("append")
    .option("database", "test")
    .option("collection", "test")
)
mongo_writer.save()

+--------------------+---------+---+--------+--------+-------+--------------------+--------------+--------------+----------+--------------------+------------+------------------+-------------+-------------+-------------+
|          nom_cabina|codi_dtes|zqa|codi_eoi|longitud|latitud|            Ubicacio|Codi_Districte| Nom_Districte|Codi_Barri|           Nom_Barri|Ocupacio_sol|Emissions_Properes|Contaminant_1|Contaminant_2|Contaminant_3|
+--------------------+---------+---+--------+--------+-------+--------------------+--------------+--------------+----------+--------------------+------------+------------------+-------------+-------------+-------------+
|Barcelona - Ciuta...|       IL|  1| 8019050|  2.1874|41.3864|Parc de la Ciutad...|             1|  Ciutat Vella|         4|Sant Pere, Santa ...|      Urbana|              Fons|          NO2|           O3|         NULL|
|Barcelona - Eixample|       IH|  1| 8019043|  2.1538|41.3853|Av. Roma - c/ Com...|             5|      Eixample|       

In [9]:
# Read the "test" collection of the "test" database back from MongoDB.
# show() only prints the rows and returns None, so the DataFrame is assigned
# first and displayed as a separate step; chaining .show() onto the assignment
# would leave `dataFrame` bound to None.
dataFrame = (
    spark.read
    .format("mongodb")
    .option("database", "test")
    .option("collection", "test")
    .load()
)
dataFrame.show()

+----------+--------------+-------------+-------------+-------------+------------------+--------------------+--------------+------------+--------------------+--------------------+---------+--------+-------+--------+--------------------+---+
|Codi_Barri|Codi_Districte|Contaminant_1|Contaminant_2|Contaminant_3|Emissions_Properes|           Nom_Barri| Nom_Districte|Ocupacio_sol|            Ubicacio|                 _id|codi_dtes|codi_eoi|latitud|longitud|          nom_cabina|zqa|
+----------+--------------+-------------+-------------+-------------+------------------+--------------------+--------------+------------+--------------------+--------------------+---------+--------+-------+--------+--------------------+---+
|         4|             1|          NO2|           O3|         NULL|              Fons|Sant Pere, Santa ...|  Ciutat Vella|      Urbana|Parc de la Ciutad...|6640cb7315a864518...|       IL| 8019050|41.3864|  2.1874|Barcelona - Ciuta...|  1|
|         9|             5|         