In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the SparkSession; it is the preferred entry point over a
# bare SparkContext, which remains reachable as spark.sparkContext.
spark = (
    SparkSession.builder
    .appName("Maps and Lazy Evaluation Example")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 17:11:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Plain Python list of song plays. Titles repeat, and "despacito" appears once
# in lower case so the effect of lower-casing is visible later.
log_of_songs = [
    "Despacito",
    "Nice for what",
    "No tears left to cry",
    "Despacito",
    "Havana",
    "In my feelings",
    "Nice for what",
    "despacito",
    "All the stars",
]

In [4]:
# Distribute the local list across the Spark workers as an RDD.
distributed_song_log_rdd = spark.sparkContext.parallelize(
    log_of_songs
)
distributed_song_log_rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

In [5]:
# Notice we do NOT use .collect() here. What is the difference between
# .collect() and .foreach()?
# .collect() pulls every element of the RDD, from all partitions, back into
# the driver's memory as a Python list; on a large dataset that is slow and
# can crash the driver with an out-of-memory error.
# .foreach() applies the function to each element where its partition lives,
# so the data stays on the independent worker nodes. The printing is done by
# the workers, so the order of the lines below is not guaranteed.

# show that the distributed data still holds the original, mixed-case titles
distributed_song_log_rdd.foreach(print)

Nice for what                                                     (0 + 12) / 12]
In my feelings
All the stars
Havana
No tears left to cry
Despacito
Despacito
despacito
Nice for what
                                                                                

In [6]:
# Derive a new RDD in which every song title is lower-cased with str.lower().
# map() is a lazy transformation: this cell only records the step and does not
# run a Spark job yet.
lower_case_songs = distributed_song_log_rdd.map(lambda title: title.lower())
lower_case_songs

PythonRDD[2] at RDD at PythonRDD.scala:53

In [7]:
# Show the lower-cased data. foreach() is an action, so this is the point at
# which Spark actually executes the map() defined earlier.
lower_case_songs.foreach(print)

in my feelings
all the stars
despacito
havana
no tears left to cry
nice for what
despacito
despacito
nice for what


In [8]:
# Show that the original input data is still mixed case: map() produced a new
# RDD (lower_case_songs) and left distributed_song_log_rdd unchanged.
distributed_song_log_rdd.foreach(print)

In my feelings
Despacito
All the stars
Despacito
No tears left to cry
Nice for what
despacito
Nice for what
Havana
