In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.8.0-openjdk-amd64"
os.environ["SPARK_HOME"] = "/home/hadoop/work/spark-3.2.0-bin-hadoop2.7"
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.8"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.8"

In [10]:
import findspark
findspark.init()

In [11]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()
df = spark.read.option("inferSchema",True) \
                .option("delimiter",",") \
                .option("header",True)\
  .csv("D://Saurabh//Spark//resources//cereal.csv")
df.printSchema()
df.show(5)


root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Br

In [12]:
from pyspark.sql.functions import *
df.select('name', 'mfr', 'rating').show(10)
#Alternatively, you can also use where() function to filter the rows on PySpark DataFrame.
df.filter(df.calories == "100").show(5)
#Filter with Multiple Conditions
df.filter((df.calories == "100") & (df.mfr != "K") ).show(5)
#filter data by null values
df.filter(df.name.isNotNull()).show(5)
df.filter(df.name.isNull()).show()
# Filter Based on List Values
fiber_li = [1.0,2.0,3.0]
df.filter(df.fiber.isin(fiber_li)).show(5)
#not in list
df.filter(df.fiber.isin(fiber_li)==False).show(5)

+--------------------+---+---------+
|                name|mfr|   rating|
+--------------------+---+---------+
|           100% Bran|  N|68.402973|
|   100% Natural Bran|  Q|33.983679|
|            All-Bran|  K|59.425505|
|All-Bran with Ext...|  K|93.704912|
|      Almond Delight|  R|34.384843|
|Apple Cinnamon Ch...|  G|29.509541|
|         Apple Jacks|  K|33.174094|
|             Basic 4|  G|37.038562|
|           Bran Chex|  R|49.120253|
|         Bran Flakes|  P|53.313813|
+--------------------+---+---------+
only showing top 10 rows

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|         Corn Flakes|  K|   C|     100|      2|  0|   290|  1.0| 21.0|     2|    35|  

In [13]:
#Filter Based on Starts With, Ends With, Contains
df.filter(df.name.startswith("C")).show(5)
#using endswith
df.filter(df.name.endswith("n")).show()
#contains
df.filter(df.name.contains("E")).show()
#Filter like and rlike
df.filter(df.name.like("%Bran%")).show(5)
# rlike - SQL RLIKE pattern (LIKE with Regex)
#This check case insensitive
df.filter(df.name.rlike("(?i)^*bran$")).show(5)

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|        Cap'n'Crunch|  Q|   C|     120|      1|  2|   220|  0.0| 12.0|    12|    35|      25|    2|   1.0|0.75|18.042851|
|            Cheerios|  G|   C|     110|      6|  2|   290|  2.0| 17.0|     1|   105|      25|    1|   1.0|1.25|50.764999|
|Cinnamon Toast Cr...|  G|   C|     120|      1|  3|   210|  0.0| 13.0|     9|    45|      25|    2|   1.0|0.75|19.823573|
|            Clusters|  G|   C|     110|      3|  2|   140|  2.0| 13.0|     7|   105|      25|    3|   1.0| 0.5|40.400208|
|         Cocoa Puffs|  G|   C|     110|      1|  1|   180|  0.0| 12.0|    13|    55|      25|    2|   1.0| 1.0|22.736446|
+---------------

PySpark RDD/DataFrame collect() is an action operation that is used to retrieve all the elements of the dataset 
(from all nodes) to the driver node. We should use the collect() on smaller dataset usually after filter(), group() e.t.c. 
Retrieving larger datasets results in OutOfMemory error. 

**select() is a transformation that returns a new DataFrame and holds the columns that are selected whereas collect() is an action that returns the entire data set in an Array to the driver.**

In [14]:
#withColumn(): The withColumn function is used to manipulate a column or to create a new column with the existing column. 
#It is a transformation function, we can also change the datatype of any existing column.
#df.withColumn("Calories",df['calories'].cast("Integer")).printSchema()
#OR
df.withColumn("Calories",col("calories").cast("Integer"))
df.show(5)
# Create a Column from an Existing
df.select('weight').withColumn("New weight",col("weight")*2).show(5)
#Add a New Column using withColumn()
df.select('calories').withColumn(" New calories",lit("100")).show(5)
# Rename Column Name
df.withColumnRenamed("name","Full Name").show(5)

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
+---------------

In [15]:
df.groupBy("name", "calories").count().show(5)
df.orderBy("protein").show(5)

+--------------------+--------+-----+
|                name|calories|count|
+--------------------+--------+-----+
|Just Right Fruit ...|     140|    1|
|         Raisin Bran|     120|    1|
|Shredded Wheat sp...|      90|    1|
|           Corn Pops|     110|    1|
|  Honey Nut Cheerios|     110|    1|
+--------------------+--------+-----+
only showing top 5 rows

+--------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|          name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|   Cocoa Puffs|  G|   C|     110|      1|  1|   180|  0.0| 12.0|    13|    55|      25|    2|   1.0| 1.0|22.736446|
|Golden Grahams|  G|   C|     110|      1|  1|   280|  0.0| 15.0|     9|    45|      25|    2|   1.0|0.75|23.804043|
|     Corn Pops|  K|   C|     110|      1|  0|  

In [16]:
from pyspark.sql.functions import split,lit,col,when
df1 = df.withColumn('Name1', split(df['name'], " ").getItem(0)).withColumn('Name2', split(df['name'], " ").getItem(1))
df1.select("name", "Name1", "Name2").show(5)
#The lit function is used to add a new column to the dataframe that contains literals or some constant value.
df2 = df.select(col("name"),lit("75 gm").alias("intake quantity"))
df2.show(5)
df.select("name", when(df.vitamins >= "25", "rich in vitamins")).show(5)

+--------------------+--------+-------+
|                name|   Name1|  Name2|
+--------------------+--------+-------+
|           100% Bran|    100%|   Bran|
|   100% Natural Bran|    100%|Natural|
|            All-Bran|All-Bran|   null|
|All-Bran with Ext...|All-Bran|   with|
|      Almond Delight|  Almond|Delight|
+--------------------+--------+-------+
only showing top 5 rows

+--------------------+---------------+
|                name|intake quantity|
+--------------------+---------------+
|           100% Bran|          75 gm|
|   100% Natural Bran|          75 gm|
|            All-Bran|          75 gm|
|All-Bran with Ext...|          75 gm|
|      Almond Delight|          75 gm|
+--------------------+---------------+
only showing top 5 rows

+--------------------+----------------------------------------------------+
|                name|CASE WHEN (vitamins >= 25) THEN rich in vitamins END|
+--------------------+----------------------------------------------------+
|          