In [17]:
from pyspark.sql import SparkSession
from sklearn import datasets
import time

# Create the session for this notebook, or reuse one that already exists
# (getOrCreate); the master URL "local[*]" keeps everything on this machine.
spark = (
    SparkSession.builder
    .appName("SimModeExample")
    .master("local[*]")
    .getOrCreate()
)

In [6]:
# Print every entry of the active Spark configuration as "key = value".
conf_entries = spark.sparkContext.getConf().getAll()
for k, v in conf_entries:
    print(f"{k} = {v}")

spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false
spark.driver.port = 43483
spark.executor.id = driver
spark.sql.warehouse.dir = file:/content/spark-warehouse
spa

In [9]:
# Map Reduce Word Frequency
sample_lines = [
    "PySpark is great",
    "PySpark runs locally",
    "Word count is a classic example",
]
text_rdd = spark.sparkContext.parallelize(sample_lines)

# Word count in three steps: split each line on single spaces, pair every
# token with 1, then sum the 1s per distinct token.
tokens = text_rdd.flatMap(lambda sentence: sentence.split(" "))
token_pairs = tokens.map(lambda token: (token, 1))
word_counts = token_pairs.reduceByKey(lambda left, right: left + right)

# collect() brings the (word, count) pairs back to the driver for printing.
print("\n=== Word Count ===")
for word, count in word_counts.collect():
    print(f"{word}: {count}")


=== Word Count ===
PySpark: 2
runs: 1
Word: 1
is: 2
great: 1
locally: 1
count: 1
a: 1
classic: 1
example: 1


In [18]:
# Three-line sample document; it is written to disk below so it can be
# loaded back from a file path.
sample_text = """Apache Spark is an open-source distributed general-purpose cluster-computing framework.
It provides an interface for programming entire clusters with implicit data parallelism and fault-tolerance.
Spark is designed to cover a wide range of workloads such as batch applications, iterative algorithms, interactive queries, and streaming."""

# Write as UTF-8 explicitly instead of relying on the platform's default
# encoding: SparkContext.textFile() expects UTF-8 encoded text files.
with open("sample_text.txt", "w", encoding="utf-8") as f:
    f.write(sample_text)

# Short handle on the session's SparkContext, used for RDD operations.
sc = spark.sparkContext

# Load the sample file as an RDD of lines (one element per line of text).
text_rdd = sc.textFile(name="sample_text.txt")

def top_words_no_cache():
    """Return the 10 most frequent words in ``text_rdd`` without persisting anything.

    Each whitespace-separated token is lower-cased and stripped of leading and
    trailing ``.,!?`` characters before being counted.

    Returns:
        A list of up to 10 ``(word, count)`` tuples, highest count first.
    """
    tokens = text_rdd.flatMap(lambda line: line.split())
    pairs = tokens.map(lambda token: (token.lower().strip(".,!?"), 1))
    totals = pairs.reduceByKey(lambda left, right: left + right)
    # takeOrdered sorts ascending by key, so negate the count to get the
    # largest counts first.
    return totals.takeOrdered(10, key=lambda pair: -pair[1])

def top_words_with_cache():
    """Return the 10 most frequent words in ``text_rdd``, caching the (word, 1) pairs.

    Each whitespace-separated token is lower-cased and stripped of leading and
    trailing ``.,!?`` characters before being counted.

    Note: only one action (``takeOrdered``) runs against the cached RDD in
    this function, so the cache is filled but never read back; caching pays
    off only when the same RDD feeds several actions. The cached data is
    released before returning so repeated calls do not accumulate cached
    partitions.

    Returns:
        A list of up to 10 ``(word, count)`` tuples, highest count first.
    """
    cached_rdd = (
        text_rdd
        .flatMap(lambda line: line.split())
        .map(lambda w: (w.lower().strip(".,!?"), 1))
        .cache()  # Cache the intermediate RDD
    )
    try:
        word_counts = cached_rdd.reduceByKey(lambda a, b: a + b)
        # takeOrdered sorts ascending by key, so negate the count to get the
        # largest counts first.
        top_10 = word_counts.takeOrdered(10, key=lambda x: -x[1])
        return top_10
    finally:
        # Always free the cached partitions, even if the job above fails.
        cached_rdd.unpersist()

# Time each variant once. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed time; time.time() follows the system
# clock and can jump if that clock is adjusted.
# NOTE(review): each variant runs a single time, in a fixed order, so the
# numbers are indicative only — confirm with repeated runs before drawing
# conclusions about caching.

# Measure time without caching
start = time.perf_counter()
result_no_cache = top_words_no_cache()
time_no_cache = time.perf_counter() - start

# Measure time with caching
start = time.perf_counter()
result_with_cache = top_words_with_cache()
time_with_cache = time.perf_counter() - start

# Print results: the same report layout for both variants.
for label, result, elapsed in (
    ("without caching", result_no_cache, time_no_cache),
    ("with caching", result_with_cache, time_with_cache),
):
    print(f"\nTop 10 words {label}:")
    for word, count in result:
        print(f"{word}: {count}")

    print(f"\nTime taken {label}: {elapsed:.4f} seconds")



Top 10 words without caching:
an: 2
and: 2
spark: 2
is: 2
apache: 1
open-source: 1
distributed: 1
cluster-computing: 1
framework: 1
it: 1

Time taken without caching: 1.1633 seconds

Top 10 words with caching:
an: 2
and: 2
spark: 2
is: 2
apache: 1
open-source: 1
distributed: 1
cluster-computing: 1
framework: 1
it: 1

Time taken with caching: 1.3649 seconds


In [12]:
# Printing the result of flatMap shows the RDD's description (see the output
# below), not the split words themselves.
split_words_rdd = text_rdd.flatMap(lambda line: line.split(" "))
print(split_words_rdd)

PythonRDD[27] at RDD at PythonRDD.scala:53


In [4]:
# Load scikit-learn's iris dataset and turn its feature matrix into a Spark
# DataFrame, using the dataset's own feature names as column names.
dataset_iris = datasets.load_iris()
iris_features = dataset_iris.data
iris_columns = dataset_iris.feature_names
spdf = spark.createDataFrame(iris_features, schema=iris_columns)

In [5]:
# Peek at the first five rows; the result is displayed as a list of Row objects.
spdf.head(n=5)

[Row(sepal length (cm)=5.1, sepal width (cm)=3.5, petal length (cm)=1.4, petal width (cm)=0.2),
 Row(sepal length (cm)=4.9, sepal width (cm)=3.0, petal length (cm)=1.4, petal width (cm)=0.2),
 Row(sepal length (cm)=4.7, sepal width (cm)=3.2, petal length (cm)=1.3, petal width (cm)=0.2),
 Row(sepal length (cm)=4.6, sepal width (cm)=3.1, petal length (cm)=1.5, petal width (cm)=0.2),
 Row(sepal length (cm)=5.0, sepal width (cm)=3.6, petal length (cm)=1.4, petal width (cm)=0.2)]