<a href="https://colab.research.google.com/github/bhargav-joshi/pyspark/blob/main/Hands_on_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Objective:**

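By the end of this notebook, you will be able to:

*   Load the Iris dataset from CSV, JSON, and Parquet formats using both Pandas and PySpark
*   Inspect and compare the schemas and structures across the different formats
*   Understand key differences in how Pandas and Spark handle data loading
*   Run the same aggregation (average sepal width) and row filter (petal length > 1.50) in Pandas and PySpark and compare the results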

In [3]:
# Optional (Google Colab only): uncomment the two lines below to mount Google
# Drive at /content/drive. Left disabled on purpose; none of the cells shown
# in this notebook read from the Drive mount point.
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession

# Build the Spark session used by every Spark cell below
# (getOrCreate hands back the already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("Iris Data Comparison")
    .getOrCreate()
)

In [8]:
# Read the Iris dataset into Pandas once per file format (CSV, JSON, Parquet),
# straight from the hosted copies on GitHub.

import pandas as pd

iris_csv_pd = pd.read_csv(
    "https://raw.githubusercontent.com/bhargav-joshi/pyspark-datasets/refs/heads/main/Iris.csv"
)
iris_json_pd = pd.read_json(
    "https://raw.githubusercontent.com/bhargav-joshi/pyspark-datasets/refs/heads/main/Iris.json",
    lines=True,  # parse the file as line-delimited JSON (one record per line)
)
iris_parquet_pd = pd.read_parquet(
    "https://raw.githubusercontent.com/bhargav-joshi/pyspark-datasets/refs/heads/main/Iris.parquet"
)

# Preview the first rows of the CSV-backed frame
print("CSV Sample (Pandas):", iris_csv_pd.head(), sep="\n")

CSV Sample (Pandas):
   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa


In [11]:
# Load datasets using Spark
#
# Spark reads the files from the local filesystem, so the three Iris files must
# exist under /content before the reads below run. Nothing earlier in the
# notebook creates them, so any file that is missing is downloaded from the
# same GitHub repository the Pandas cell reads from. Files already present are
# left untouched, which keeps the cell cheap and safe to re-run.
import os
import urllib.request

DATASET_BASE_URL = "https://raw.githubusercontent.com/bhargav-joshi/pyspark-datasets/refs/heads/main"
LOCAL_DATA_DIR = "/content"

os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
for file_name in ("Iris.csv", "Iris.json", "Iris.parquet"):
    local_path = os.path.join(LOCAL_DATA_DIR, file_name)
    if not os.path.exists(local_path):
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated file that a later run would mistake for a good one.
        partial_path = local_path + ".part"
        urllib.request.urlretrieve(f"{DATASET_BASE_URL}/{file_name}", partial_path)
        os.replace(partial_path, local_path)

# header=True takes column names from the first CSV line; inferSchema=True asks
# Spark to infer column types instead of reading every column as a string.
iris_csv_spark = spark.read.csv(
    os.path.join(LOCAL_DATA_DIR, "Iris.csv"), header=True, inferSchema=True
)
iris_json_spark = spark.read.json(os.path.join(LOCAL_DATA_DIR, "Iris.json"))
iris_parquet_spark = spark.read.parquet(os.path.join(LOCAL_DATA_DIR, "Iris.parquet"))

# Show a few rows
print("CSV Sample (Spark):")
iris_csv_spark.show(10)

CSV Sample (Spark):
+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows


In [12]:
# Pandas exposes column types through `.dtypes`
print("Pandas CSV Schema:", iris_csv_pd.dtypes, sep="\n")

# Spark prints its schema as a tree; do so for each of the three source formats
spark_frames_by_format = (
    ("CSV", iris_csv_spark),
    ("JSON", iris_json_spark),
    ("Parquet", iris_parquet_spark),
)
for format_label, spark_frame in spark_frames_by_format:
    print(f"\nSpark {format_label} Schema:")
    spark_frame.printSchema()

Pandas CSV Schema:
sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

Spark CSV Schema:
root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)


Spark JSON Schema:
root
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- variety: string (nullable = true)


Spark Parquet Schema:
root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)



In [13]:
# List the Spark column names one per line; as the schema output shows, the
# measurement columns contain dots (e.g. "sepal.length").
for column_name in iris_csv_spark.columns:
    print(column_name)

sepal.length
sepal.width
petal.length
petal.width
variety


In [14]:
# Swap the dot in "sepal.width" for an underscore so later cells can refer to
# the column as "sepal_width".
iris_csv_spark = (
    iris_csv_spark
    .withColumnRenamed("sepal.width", "sepal_width")
)

In [15]:
# Give the remaining dotted measurement columns underscore names as well,
# in a single chained expression.
iris_csv_spark = (
    iris_csv_spark
    .withColumnRenamed("sepal.length", "sepal_length")
    .withColumnRenamed("petal.length", "petal_length")
    .withColumnRenamed("petal.width", "petal_width")
)

In [16]:
# Compare Average Sepal Width
from pyspark.sql.functions import col, avg

# Pandas: the column still carries its original dotted name
sepal_width_pd = iris_csv_pd["sepal.width"]
avg_sw_pd = sepal_width_pd.mean()
print("Average Sepal Width (Pandas):", avg_sw_pd)

# PySpark: aggregate over the whole frame (renamed column), then pull the single
# value out of the one-row result
avg_sw_row = iris_csv_spark.agg(avg(col("sepal_width"))).first()
avg_sw_spark = avg_sw_row[0]
print("Average Sepal Width (Spark - CSV):", avg_sw_spark)

Average Sepal Width (Pandas): 3.0573333333333337
Average Sepal Width (Spark - CSV): 3.057333333333334


In [17]:
# Filter Rows Where Petal Length > 1.50

# Pandas: build a boolean mask on the dotted column name and select with .loc
petal_length_mask = iris_csv_pd["petal.length"].gt(1.50)
filtered_pd = iris_csv_pd.loc[petal_length_mask]
print("Filtered Pandas Rows:", filtered_pd.head(), sep="\n")

# PySpark: same predicate on the renamed column (`where` is an alias of `filter`)
filtered_spark = iris_csv_spark.where(iris_csv_spark.petal_length > 1.50)
print("Filtered Spark Rows:")
filtered_spark.show(5)

Filtered Pandas Rows:
    sepal.length  sepal.width  petal.length  petal.width variety
5            5.4          3.9           1.7          0.4  Setosa
11           4.8          3.4           1.6          0.2  Setosa
18           5.7          3.8           1.7          0.3  Setosa
20           5.4          3.4           1.7          0.2  Setosa
23           5.1          3.3           1.7          0.5  Setosa
Filtered Spark Rows:
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.8|        3.4|         1.6|        0.2| Setosa|
|         5.7|        3.8|         1.7|        0.3| Setosa|
|         5.4|        3.4|         1.7|        0.2| Setosa|
|         5.1|        3.3|         1.7|        0.5| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows
