In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, IntegerType, DoubleType, StructType

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [4]:
spark = SparkSession.builder.appName('Operations').getOrCreate()

In [10]:
schema = StructType([
    StructField(name='id', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
    StructField(name='category', dataType=StringType(), nullable=True),
    StructField(name='quantity', dataType=IntegerType(), nullable=True),
    StructField(name='price', dataType=DoubleType(), nullable=True),
])

In [11]:
file_path = r'./data/stocks.txt'

df = spark.read.csv(file_path, header=True, schema=schema)

In [12]:
df.schema

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('category', StringType(), True), StructField('quantity', IntegerType(), True), StructField('price', DoubleType(), True)])

In [13]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)



In [14]:
df.show(5)

+---+----------+-----------+--------+-------+
| id|      name|   category|quantity|  price|
+---+----------+-----------+--------+-------+
|  1|    iPhone|Electronics|      10| 899.99|
|  2|   Macbook|Electronics|       5|1299.99|
|  3|      iPad|Electronics|      15| 499.99|
|  4|Samsung TV|Electronics|       8| 799.99|
|  5|     LG TV|Electronics|      10| 699.99|
+---+----------+-----------+--------+-------+
only showing top 5 rows



In [16]:
select_columns = df.select(['id', 'name', 'category'])
select_columns.show(5)

+---+----------+-----------+
| id|      name|   category|
+---+----------+-----------+
|  1|    iPhone|Electronics|
|  2|   Macbook|Electronics|
|  3|      iPad|Electronics|
|  4|Samsung TV|Electronics|
|  5|     LG TV|Electronics|
+---+----------+-----------+
only showing top 5 rows



In [23]:
filtered_data = df.filter(df['quantity'] > 20)
filtered_data.show()

+---+--------------+-----------+--------+-----+
| id|          name|   category|quantity|price|
+---+--------------+-----------+--------+-----+
|  6|    Nike Shoes|   Clothing|      30|99.99|
|  7|  Adidas Shoes|   Clothing|      25|89.99|
| 12|        Apples|       Food|     100|  0.5|
| 13|       Bananas|       Food|     150| 0.25|
| 14|       Oranges|       Food|     120| 0.75|
| 15|Chicken Breast|       Food|      50| 3.99|
| 16| Salmon Fillet|       Food|      30| 5.99|
| 24|    Laptop Bag|Accessories|      25|29.99|
| 25|      Backpack|Accessories|      30|24.99|
| 28|         Jeans|   Clothing|      30|59.99|
| 29|       T-shirt|   Clothing|      50|14.99|
| 30|      Sneakers|   Clothing|      40|79.99|
+---+--------------+-----------+--------+-----+



In [25]:
grouped_data = df.groupBy('category').agg({'quantity': 'sum', 'price': 'avg'})
grouped_data.show()

+-----------+-------------+------------------+
|   category|sum(quantity)|        avg(price)|
+-----------+-------------+------------------+
|       Food|          450|2.2960000000000003|
|     Sports|           35|             34.99|
|Electronics|           98| 586.6566666666665|
|   Clothing|          200|  99.2757142857143|
|  Furniture|           41|            141.99|
|Accessories|           55|             27.49|
+-----------+-------------+------------------+



In [26]:
df2 = df.select('id', 'category').limit(10)
joined_data = df.join(df2, 'id', 'inner')
joined_data.show()

+---+----------------+-----------+--------+-------+-----------+
| id|            name|   category|quantity|  price|   category|
+---+----------------+-----------+--------+-------+-----------+
|  1|          iPhone|Electronics|      10| 899.99|Electronics|
|  2|         Macbook|Electronics|       5|1299.99|Electronics|
|  3|            iPad|Electronics|      15| 499.99|Electronics|
|  4|      Samsung TV|Electronics|       8| 799.99|Electronics|
|  5|           LG TV|Electronics|      10| 699.99|Electronics|
|  6|      Nike Shoes|   Clothing|      30|  99.99|   Clothing|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|   Clothing|
|  8| Sony Headphones|Electronics|      12| 149.99|Electronics|
|  9|Beats Headphones|Electronics|      20| 199.99|Electronics|
| 10|    Dining Table|  Furniture|      10| 249.99|  Furniture|
+---+----------------+-----------+--------+-------+-----------+



In [34]:
sorted_data = df.orderBy('price', ascending=False)
sorted_data.show(5)

+---+----------+-----------+--------+-------+
| id|      name|   category|quantity|  price|
+---+----------+-----------+--------+-------+
|  2|   Macbook|Electronics|       5|1299.99|
|  1|    iPhone|Electronics|      10| 899.99|
|  4|Samsung TV|Electronics|       8| 799.99|
|  5|     LG TV|Electronics|      10| 699.99|
| 26|    Camera|Electronics|      10| 599.99|
+---+----------+-----------+--------+-------+
only showing top 5 rows



In [35]:
from pyspark.sql.functions import col, desc
sorted_data = df.orderBy(col('price').desc(), col('id').desc())
sorted_data.show()

+---+----------------+-----------+--------+-------+
| id|            name|   category|quantity|  price|
+---+----------------+-----------+--------+-------+
|  2|         Macbook|Electronics|       5|1299.99|
|  1|          iPhone|Electronics|      10| 899.99|
|  4|      Samsung TV|Electronics|       8| 799.99|
|  5|           LG TV|Electronics|      10| 699.99|
| 26|          Camera|Electronics|      10| 599.99|
|  3|            iPad|Electronics|      15| 499.99|
| 10|    Dining Table|  Furniture|      10| 249.99|
| 17|  Leather Jacket|   Clothing|      15| 199.99|
|  9|Beats Headphones|Electronics|      20| 199.99|
| 18|     Winter Coat|   Clothing|      10| 149.99|
| 11|      Study Desk|  Furniture|       8| 149.99|
|  8| Sony Headphones|Electronics|      12| 149.99|
| 27|         Printer|Electronics|       8| 129.99|
| 21|    Coffee Table|  Furniture|       5| 129.99|
| 23|       Bookshelf|  Furniture|      10|  99.99|
|  6|      Nike Shoes|   Clothing|      30|  99.99|
|  7|    Adi

In [36]:
distinct_rows = df.select('category').distinct()
distinct_rows.show()

+-----------+
|   category|
+-----------+
|       Food|
|     Sports|
|Electronics|
|   Clothing|
|  Furniture|
|Accessories|
+-----------+



In [38]:
dropped_columns = df.drop('quantity', 'category')
dropped_columns.show()

+---+----------------+-------+
| id|            name|  price|
+---+----------------+-------+
|  1|          iPhone| 899.99|
|  2|         Macbook|1299.99|
|  3|            iPad| 499.99|
|  4|      Samsung TV| 799.99|
|  5|           LG TV| 699.99|
|  6|      Nike Shoes|  99.99|
|  7|    Adidas Shoes|  89.99|
|  8| Sony Headphones| 149.99|
|  9|Beats Headphones| 199.99|
| 10|    Dining Table| 249.99|
| 11|      Study Desk| 149.99|
| 12|          Apples|    0.5|
| 13|         Bananas|   0.25|
| 14|         Oranges|   0.75|
| 15|  Chicken Breast|   3.99|
| 16|   Salmon Fillet|   5.99|
| 17|  Leather Jacket| 199.99|
| 18|     Winter Coat| 149.99|
| 19|        Yoga Mat|  19.99|
| 20|    Dumbbell Set|  49.99|
+---+----------------+-------+
only showing top 20 rows



In [39]:
df_with_column = df.withColumn('revenue', df['quantity'] * df['price'])
df_with_column.show()

+---+----------------+-----------+--------+-------+------------------+
| id|            name|   category|quantity|  price|           revenue|
+---+----------------+-----------+--------+-------+------------------+
|  1|          iPhone|Electronics|      10| 899.99|            8999.9|
|  2|         Macbook|Electronics|       5|1299.99|           6499.95|
|  3|            iPad|Electronics|      15| 499.99|           7499.85|
|  4|      Samsung TV|Electronics|       8| 799.99|           6399.92|
|  5|           LG TV|Electronics|      10| 699.99|            6999.9|
|  6|      Nike Shoes|   Clothing|      30|  99.99|            2999.7|
|  7|    Adidas Shoes|   Clothing|      25|  89.99|           2249.75|
|  8| Sony Headphones|Electronics|      12| 149.99|           1799.88|
|  9|Beats Headphones|Electronics|      20| 199.99|            3999.8|
| 10|    Dining Table|  Furniture|      10| 249.99|            2499.9|
| 11|      Study Desk|  Furniture|       8| 149.99|           1199.92|
| 12| 

In [40]:
df_with_alias = df.withColumnRenamed('price', 'product_price')
df_with_alias.show()

+---+----------------+-----------+--------+-------------+
| id|            name|   category|quantity|product_price|
+---+----------------+-----------+--------+-------------+
|  1|          iPhone|Electronics|      10|       899.99|
|  2|         Macbook|Electronics|       5|      1299.99|
|  3|            iPad|Electronics|      15|       499.99|
|  4|      Samsung TV|Electronics|       8|       799.99|
|  5|           LG TV|Electronics|      10|       699.99|
|  6|      Nike Shoes|   Clothing|      30|        99.99|
|  7|    Adidas Shoes|   Clothing|      25|        89.99|
|  8| Sony Headphones|Electronics|      12|       149.99|
|  9|Beats Headphones|Electronics|      20|       199.99|
| 10|    Dining Table|  Furniture|      10|       249.99|
| 11|      Study Desk|  Furniture|       8|       149.99|
| 12|          Apples|       Food|     100|          0.5|
| 13|         Bananas|       Food|     150|         0.25|
| 14|         Oranges|       Food|     120|         0.75|
| 15|  Chicken

In [41]:
spark.stop()