In [None]:
!pip install pyspark

In [None]:
# @title Sample code to implement Spark Context
from pyspark import SparkConf, SparkContext

# SparkContext.getOrCreate() hands back the context that is already running
# (if any) instead of constructing a second one, so this cell can be re-run
# without "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster("local").setAppName("RDD Example")
sc = SparkContext.getOrCreate(spark_conf)

# Distribute a small Python list as an RDD.
numbers = [1, 2, 3, 4, 5]
rdd = sc.parallelize(numbers)

# Transformations are lazy: nothing runs until collect() below.
squared_rdd = rdd.map(lambda x: x**2)
filtered_rdd = squared_rdd.filter(lambda x: x > 10)

# collect() is the action that executes the pipeline and returns a list.
result = filtered_rdd.collect()
print(result)

In [None]:
# @title Initialise SparkSession
from pyspark.sql import SparkSession
import json
import re

# Configure the session through the builder, then rebind `sc` to the
# session's SparkContext so the RDD cells below share it.
spark = (
    SparkSession.builder
    .appName("WordCount")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# @title Creating input files
# File name -> text chunks. Chunks are written in order, exactly as given
# (no separators or trailing newlines are added).
sample_files = {
    # Word Count input
    "input.txt": ["hello world\nhello spark\nhello pyspark world"],
    # CSV files (users & purchases)
    "users.csv": ["user_id,name\n1,Alice\n2,Bob\n3,Charlie"],
    "purchases.csv": ["user_id,product,amount\n1,Book,100.0\n2,Laptop,800.5\n1,Pen,50.0\n3,Phone,200.0"],
    # JSON file (one record per line)
    "people.json": [
        '{"name": "Alice", "age": 30}\n',
        '{"name": "Bob", "age": 40}\n',
        '{"name": "Eve", "age": 29}\n',
        '{"name": "Charlie", "age": 40}\n',
    ],
    # Apache log file
    "apache.access.log": [
        '127.0.0.1 - - [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326\n',
        '127.0.0.1 - - [10/Oct/2000:13:55:56 -0700] "GET /products HTTP/1.0" 200 1234\n',
        '127.0.0.1 - - [10/Oct/2000:13:56:01 -0700] "GET /index.html HTTP/1.0" 200 2326\n',
        '127.0.0.1 - - [10/Oct/2000:13:56:16 -0700] "GET /login HTTP/1.0" 200 543\n',
        '127.0.0.1 - - [10/Oct/2000:13:56:36 -0700] "GET /index.html HTTP/1.0" 200 2326\n',
        '127.0.0.1 - - [10/Oct/2000:13:56:56 -0700] "GET /products HTTP/1.0" 200 1234\n',
    ],
}

# Each file is (re)created in the current working directory.
for file_name, chunks in sample_files.items():
    with open(file_name, "w") as out_file:
        out_file.writelines(chunks)

In [None]:
# @title 6.1 - Word Count (RDD)
print("\n=== Word Count (RDD) ===")
text_file = sc.textFile("input.txt")

# Pipeline: line -> space-separated tokens -> (token, 1) -> per-token sum.
counts = (
    text_file
    .flatMap(lambda text: text.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
print(counts.collect())

In [None]:
# @title 6.2 — CSV Join & Aggregation (RDD)
print("\n=== CSV Join & Aggregation (RDD) ===")

# purchases.csv -> (user_id, amount); the first line is the header and is dropped.
purchases = sc.textFile("purchases.csv")
header = purchases.first()
purchasesRDD = (
    purchases
    .filter(lambda row: row != header)
    .map(lambda row: row.split(","))
    .map(lambda fields: (fields[0], float(fields[2])))
)

# users.csv -> (user_id, name); same header handling.
users = sc.textFile("users.csv")
header2 = users.first()
usersRDD = (
    users
    .filter(lambda row: row != header2)
    .map(lambda row: row.split(","))
    .map(lambda fields: (fields[0], fields[1]))
)

# Inner join on user_id, then re-key by name and sum the amounts.
joinedRDD = usersRDD.join(purchasesRDD)  # (user_id, (name, amount))
totalByUser = (
    joinedRDD
    .map(lambda pair: (pair[1][0], pair[1][1]))
    .reduceByKey(lambda left, right: left + right)
)
print(totalByUser.collect())

In [None]:
# @title 6.3 - JSONL processing (RDD)
print("\n=== JSON Processing (RDD) ===")
logs = sc.textFile("people.json")

# Each line is one JSON object; count how many records share each "age".
eventsRDD = (
    logs
    .map(json.loads)
    .map(lambda person: (person["age"], 1))
    .reduceByKey(lambda left, right: left + right)
)
print(eventsRDD.collect())
# sc.stop() is intentionally left disabled: the cells below keep using `sc`.

In [None]:
# @title 6.4 - Apache Log parsing (RDD)
print("\n=== Apache Log Parsing (RDD) ===")
# Raw string so that \S and \[ reach the regex engine untouched; in a normal
# string literal they are invalid escape sequences and trigger a warning.
# Groups: host, ident, user, [timestamp], "request", status, size.
log_pattern = r'^(\S+) (\S+) (\S+) \[(.*?)\] "(.*?)" (\S+) (\S+)$'
regex = re.compile(log_pattern)


def extract_request_path(line):
    """Return the requested path from one access-log line.

    Returns None when the line does not match `log_pattern` or when the
    quoted request field has no second token (e.g. a bare "-"), so that
    malformed lines are skipped instead of raising IndexError.
    """
    match = regex.match(line)
    if match is None:
        return None
    request_parts = match.group(5).split(" ")  # e.g. ["GET", "/index.html", "HTTP/1.0"]
    if len(request_parts) < 2:
        return None
    return request_parts[1]


logs = sc.textFile("apache.access.log")
# Count hits per requested path, ignoring lines that could not be parsed.
logsRDD = (
    logs
    .map(extract_request_path)
    .filter(lambda path: path is not None)
    .map(lambda path: (path, 1))
    .reduceByKey(lambda a, b: a + b)
)
print(logsRDD.collect())

In [None]:
# @title 1. Modify WordCount to remove stopwords and show top-10 meaningful words.
print("=== Modified Word Count(RDD) ===")
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

# Constant-time membership test for the filter below; the list above is kept
# under its original name for anything else that reads it.
stopword_set = frozenset(stopwords)

# Write WordCount here
# - lower() makes the stopword match case-insensitive ("The" == "the").
# - split() with no argument splits on any whitespace run and never yields
#   empty strings, so repeated spaces are not counted as a "word".
counts_ex = (
    text_file
    .flatMap(lambda line: line.lower().split())
    .filter(lambda word: word not in stopword_set)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Highest count first; ties are broken alphabetically so the output is stable.
print(counts_ex.takeOrdered(10, key=lambda kv: (-kv[1], kv[0])))

In [None]:
# @title 2. Change join to handle missing IDs (leftOuterJoin) and show names with zero purchases.
print("=== Modified Join (RDD) ===")

# Write Join here
# purchases.csv -> (user_id, amount), header row dropped.
purchases = sc.textFile("purchases.csv")
header = purchases.first()
purchasesRDD = purchases.filter(lambda line: line != header) \
                        .map(lambda line: line.split(",")) \
                        .map(lambda cols: (cols[0], float(cols[2])))
# users.csv -> (user_id, name), header row dropped.
users = sc.textFile("users.csv")
header2 = users.first()
usersRDD = users.filter(lambda line: line != header2) \
                .map(lambda line: line.split(",")) \
                .map(lambda cols: (cols[0], cols[1]))

# Every user is kept; users without a matching purchase get None as amount.
joinedRDD = usersRDD.leftOuterJoin(purchasesRDD)  # (user_id, (name, amount or None))

# Treat a missing purchase as 0.0 so the per-name total is always a number
# (otherwise unmatched users would be reported with a total of None).
totalByUser = joinedRDD.map(lambda x: (x[1][0], x[1][1] if x[1][1] is not None else 0.0)) \
                       .reduceByKey(lambda a, b: a + b)

print(totalByUser.collect())

# Names whose user_id never appeared in purchases.csv. Selected by the missing
# join partner rather than by total == 0, so real purchases that happen to sum
# to zero are not misreported.
zeroPurchaseUsers = joinedRDD.filter(lambda x: x[1][1] is None) \
                             .map(lambda x: x[1][0])
print("Users with zero purchases:", zeroPurchaseUsers.collect())

In [None]:
# @title 3. Use mapPartitions to simulate expensive init (e.g., a fake model) and measure runtime difference.
print("=== MapPartitions (RDD) ===")

# Write MapPartitions here
import time

def process_partition(iterator):
    """Transform every element of one partition after a single simulated init.

    Args:
        iterator: iterable over the elements of one partition.

    Returns:
        list of strings "Initialized: <x * 2>", one per input element.
    """
    # Simulate an expensive initialization, paid ONCE per partition.
    # The cost is the same 0.1 s that process_element pays per element, so the
    # two timings differ only in how often the init runs. With the previous
    # 1 s here, 10 elements in one partition took about 1 s either way and the
    # comparison showed no difference.
    time.sleep(0.1)
    initialized_value = "Initialized"
    return [f"{initialized_value}: {x * 2}" for x in iterator]

# Create a simple RDD
data = sc.parallelize(range(10))

# Apply mapPartitions
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() can jump when the system clock is
# adjusted.
start_time = time.perf_counter()
processed_rdd = data.mapPartitions(process_partition)
result = processed_rdd.collect()  # collect() is the action that actually runs the job
end_time = time.perf_counter()

print(f"Result: {result}")
print(f"Runtime with mapPartitions: {end_time - start_time:.2f} seconds")

# Baseline for the comparison: with map the simulated init runs for every element
def process_element(x):
    """Return "Initialized: <x * 2>" after a 0.1 s simulated per-element init."""
    time.sleep(0.1)  # stand-in for an expensive initialization
    doubled = x * 2
    return f"Initialized: {doubled}"

# time.perf_counter() is the monotonic clock intended for elapsed-time
# measurement; time.time() can jump when the system clock is adjusted.
start_time_map = time.perf_counter()
processed_rdd_map = data.map(process_element)
result_map = processed_rdd_map.collect()  # the action that triggers execution
end_time_map = time.perf_counter()

print(f"Runtime with map: {end_time_map - start_time_map:.2f} seconds")