In [32]:
# 1. Install the core PySpark library
!pip install pyspark

# 2. **CRITICAL:** Install the EXACT version of py4j that matches your Spark 3.5.7 installation.
# The Py4J version bundled with Spark 3.5.7 is 0.10.9.7
!pip install py4j==0.10.9.7

# 3. Install your supporting libraries
!pip install pandas
!pip install matplotlib



In [33]:
import os

# SPARK_HOME and PYTHONPATH are expected to come from the terminal that launched
# the notebook; JAVA_HOME is the only variable set from inside it.
# NOTE(review): machine-specific absolute path — confirm it exists on the host
# and that this JDK version is supported by the Spark release in use.
os.environ.update(JAVA_HOME="/usr/local/opt/openjdk@21")



In [34]:
# 1. Imports
# Note: Correlation is not part of pyspark.sql.stat; if it is needed, import it with `from pyspark.ml.stat import Correlation`.
from pyspark.sql.functions import col, mean, stddev, min, max, count
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession

# 2. Start the Spark session (or reuse one that is already running).
#    Every Spark call later in the notebook depends on this object existing.
spark = (
    SparkSession.builder
    .appName("EDA")
    .getOrCreate()
)

# 3. Create your DataFrame (Assuming this is defined elsewhere, like by reading a file)
# dataset1 = spark.read.csv("file.csv", header=True, inferSchema=True) 

# 4. RUN SPARK CODE ONLY AFTER SESSION IS CREATED
# summary = dataset1.select(mean(col('x1')), stddev(col('x1')), min(col('x1')), mean(col('y1')))

In [35]:
# Anscombe's quartet, one tuple per observation.
# Value order inside each tuple: x1, y1, x2, y2, x3, y3, x4, y4
anscombe_data = [
    (10.0, 8.04, 10.0, 9.14, 10.0, 7.46, 8.0, 6.58),
    (8.0, 6.95, 8.0, 8.14, 8.0, 6.77, 8.0, 5.76),
    (13.0, 7.58, 13.0, 8.74, 13.0, 12.74, 8.0, 7.71),
    (9.0, 8.81, 9.0, 8.77, 9.0, 7.11, 8.0, 8.84),
    (11.0, 8.33, 11.0, 9.26, 11.0, 7.81, 8.0, 8.47),
    (14.0, 9.96, 14.0, 8.10, 14.0, 8.84, 8.0, 7.04),
    (6.0, 7.24, 6.0, 6.13, 6.0, 6.08, 8.0, 5.25),
    (4.0, 4.26, 4.0, 3.10, 4.0, 5.39, 19.0, 12.50),
    (12.0, 10.84, 12.0, 9.13, 12.0, 8.15, 8.0, 5.56),
    (7.0, 4.82, 7.0, 7.26, 7.0, 6.42, 8.0, 7.91),
    (5.0, 5.68, 5.0, 4.74, 5.0, 5.73, 8.0, 6.89),
]

In [36]:
# Column names, in the same order as the values inside each anscombe_data tuple.
columns = [
    'x1', 'y1',
    'x2', 'y2',
    'x3', 'y3',
    'x4', 'y4',
]

# Build the Spark DataFrame from the in-memory rows and print it as a check.
anscombe_df = spark.createDataFrame(anscombe_data, schema=columns)
anscombe_df.show()

+----+-----+----+----+----+-----+----+----+
|  x1|   y1|  x2|  y2|  x3|   y3|  x4|  y4|
+----+-----+----+----+----+-----+----+----+
|10.0| 8.04|10.0|9.14|10.0| 7.46| 8.0|6.58|
| 8.0| 6.95| 8.0|8.14| 8.0| 6.77| 8.0|5.76|
|13.0| 7.58|13.0|8.74|13.0|12.74| 8.0|7.71|
| 9.0| 8.81| 9.0|8.77| 9.0| 7.11| 8.0|8.84|
|11.0| 8.33|11.0|9.26|11.0| 7.81| 8.0|8.47|
|14.0| 9.96|14.0| 8.1|14.0| 8.84| 8.0|7.04|
| 6.0| 7.24| 6.0|6.13| 6.0| 6.08| 8.0|5.25|
| 4.0| 4.26| 4.0| 3.1| 4.0| 5.39|19.0|12.5|
|12.0|10.84|12.0|9.13|12.0| 8.15| 8.0|5.56|
| 7.0| 4.82| 7.0|7.26| 7.0| 6.42| 8.0|7.91|
| 5.0| 5.68| 5.0|4.74| 5.0| 5.73| 8.0|6.89|
+----+-----+----+----+----+-----+----+----+



In [37]:
# Keep only the first x/y pair of the quartet and print it.
dataset1 = anscombe_df.select(['x1', 'y1'])
dataset1.show()

+----+-----+
|  x1|   y1|
+----+-----+
|10.0| 8.04|
| 8.0| 6.95|
|13.0| 7.58|
| 9.0| 8.81|
|11.0| 8.33|
|14.0| 9.96|
| 6.0| 7.24|
| 4.0| 4.26|
|12.0|10.84|
| 7.0| 4.82|
| 5.0| 5.68|
+----+-----+



In [38]:
# Aggregate statistics for dataset1, computed by Spark: mean, standard deviation
# and minimum of x1, plus the mean of y1.
# `mean`, `stddev` and `min` are the pyspark.sql.functions imported at the top of
# the notebook (that import shadows Python's built-in `min`); col() keeps the
# column references explicit.
summary = dataset1.select(
    mean(col('x1')),
    stddev(col('x1')),
    min(col('x1')),
    mean(col('y1')),
)

# Spark DataFrames are lazy: without an action nothing is computed or displayed,
# so print the single-row result here.
summary.show()

In [39]:
# Convert the Spark DataFrame into a pandas DataFrame for the pandas/NumPy/matplotlib steps that follow.
dataset1_pd = dataset1.toPandas()

In [40]:
# Display pandas' descriptive statistics for x1 and y1 as the cell output.
dataset1_pd.describe()

Unnamed: 0,x1,y1
count,11.0,11.0
mean,9.0,7.500909
std,3.316625,2.031568
min,4.0,4.26
25%,6.5,6.315
50%,9.0,7.58
75%,11.5,8.57
max,14.0,10.84


In [41]:
# Correlation of x1 and y1, computed with NumPy on the pandas copy.
# NOTE(review): with two 1-D inputs np.corrcoef returns a 2x2 matrix, so the
# coefficient itself is correlation[0, 1] — confirm how it is consumed later.
correlation = np.corrcoef(
    x=dataset1_pd['x1'],
    y=dataset1_pd['y1'],
)

In [None]:
# Scatter plot of y1 against x1 on a 10x10-inch figure.
plt.figure(figsize=(10, 10))
# The transparency keyword must be spelled `alpha`; matplotlib rejects unknown
# keyword arguments. alpha=0.6 makes the markers semi-transparent and s=100
# sets the marker size.
plt.scatter(dataset1_pd['x1'], dataset1_pd['y1'], alpha=0.6, s=100)
plt.xlabel('x1')
plt.ylabel('