In [11]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, DoubleType

In [2]:
# Point PySpark at a local Spark installation. An already-configured
# SPARK_HOME is respected so the notebook is not tied to one machine's
# directory layout; the Windows path below is only the fallback.
os.environ.setdefault('SPARK_HOME', r'C:\spark\spark-3.5.4-bin-hadoop3')

# Interpreter settings consulted by PySpark: the driver front-end (and its
# options) and the Python executable name used for the workers.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
# Obtain the notebook's Spark session: name the application 'Spark' and
# let the builder hand back an existing session or create a new one.
spark = (
    SparkSession.builder
    .appName('Spark')
    .getOrCreate()
)

In [4]:
%%bash
# Peek at the raw file: the header line plus the first nine data rows.
head -n 10 ./data/products.csv

id,name,category,quantity,price
1,iPhone 12,Electronics,10,899.99
2,Nike Air Max 90,Clothing,25,119.99
3,KitchenAid Stand Mixer,Home Appliances,5,299.99
4,The Great Gatsby,Books,50,12.99
5,L'Oreal Paris Mascara,Beauty,100,9.99
6,Yoga Mat,Sports,30,29.99
7,Samsung 4K Smart TV,Electronics,8,799.99
8,Levi's Jeans,Clothing,15,49.99
9,Dyson Vacuum Cleaner,Home Appliances,3,399.99


In [5]:
# Relative path of the sample products CSV used throughout the notebook.
csv_file_path = './data/products.csv'

# First pass: treat the first line as column names and supply no schema.
df = spark.read.option('header', True).csv(csv_file_path)

In [8]:
# Show the column types of the header-only read; the output below lists
# every column as a nullable string.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)



In [10]:
# Preview the first five rows; long values are truncated in the rendered table.
df.show(5)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



In [12]:
# Explicit schema for the products CSV, built one field at a time.
# Column order follows the file header; every column is nullable.
schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('category', StringType(), True)
    .add('quantity', IntegerType(), True)
    .add('price', DoubleType(), True)
)

In [13]:
# Second pass: re-read the same file, this time applying the explicit
# schema while still treating the first line as the header.
df = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv(csv_file_path)
)

In [14]:
# Inspect the DataFrame's schema object; the output below shows the integer
# and double column types from the explicit schema.
df.schema

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('category', StringType(), True), StructField('quantity', IntegerType(), True), StructField('price', DoubleType(), True)])

In [16]:
# Preview the first five rows of the DataFrame read with the explicit schema.
df.show(5)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



In [17]:
# Third pass: let Spark infer the column types from the data instead of
# supplying a schema; the first line is still the header.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_file_path)
)

In [18]:
# Display the DataFrame's repr, which lists each column with its inferred type.
df

DataFrame[id: int, name: string, category: string, quantity: int, price: double]