Spark Connectivity (please ensure you are running Python 3.10)

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
# Open (or reuse) a session against the Spark master of the local container.
spark = (
    SparkSession.builder
    .appName("CS5305 Spark Connection")
    .master("spark://localhost:7077")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for the cells below.
sc = spark.sparkContext

# Session basics: Spark version and the name this application runs under.
print(f"Spark Version: {spark.version}")
print(f"Application Name: {sc.appName}")

# Cluster basics: the master URL and the context's default parallelism.
print(f"Master: {sc.master}")
print(f"Default Parallelism: {sc.defaultParallelism}")

Spark Version: 4.1.1
Application Name: CS5305 Spark Connection


In [7]:
# Report the Spark environment first, then the cluster this context is attached to.
# Each section is emitted with a single print call, one entry per output line.
print(
    "=== Spark Environment Information ===",
    f"Spark Version: {spark.version}",
    # SPARK_HOME is read through the JVM gateway (Java's System.getenv).
    f"Spark Home: {sc._jvm.System.getenv('SPARK_HOME')}",
    f"Python Version: {sc.pythonVer}",
    sep="\n",
)

# The leading "\n" in the header leaves a blank line between the two sections.
print(
    "\n=== Cluster Information ===",
    f"Master URL: {sc.master}",
    f"Application ID: {sc.applicationId}",
    f"Application Name: {sc.appName}",
    f"Default Parallelism: {sc.defaultParallelism}",
    f"Default Min Partitions: {sc.defaultMinPartitions}",
    sep="\n",
)

=== Spark Environment Information ===
Spark Version: 4.1.1
Spark Home: D:\git\LUMS\cs5305\python310env\Lib\site-packages\pyspark\bin\..
Python Version: 3.10

=== Cluster Information ===
Master URL: spark://localhost:7077
Application ID: app-20260118085431-0002
Application Name: CS5305 Spark Connection
Default Parallelism: 2
Default Min Partitions: 2


In [8]:
# List every (key, value) pair held in the active SparkContext's configuration.
print("\n=== Spark Configuration ===")
for key, value in spark.sparkContext.getConf().getAll():
    print(f"{key}: {value}")


=== Spark Configuration ===
spark.rdd.compress: True
spark.hadoop.fs.s3a.vectored.read.min.seek.size: 128K
spark.sql.artifact.isolation.enabled: false
spark.app.startTime: 1768726471196
spark.executor.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-

In [5]:
# RDD smoke test: hand a small list to Spark and sum it through the RDD API.
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
total = rdd.sum()
print(f"Sum of data: {total}")

Sum of data: 15


In [3]:

# DataFrame smoke test: build a three-row (id, name) frame and print it.
rows = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
columns = ["id", "name"]
df = spark.createDataFrame(rows, columns)
df.show()


+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+

Master: spark://localhost:7077
Default Parallelism: 2
