In [1]:
!pip install pyspark #install pyspark (force)



In [2]:
from pyspark import SparkContext, SparkConf

# Configure the Spark application: run locally, using every available CPU core.
conf = SparkConf().setAppName("Apache Spark Basics").setMaster("local[*]")

# Reuse an already-active SparkContext if there is one (e.g. the cell is re-run
# after an earlier failure); constructing a second SparkContext directly would
# raise instead. Note that `conf` only takes effect when a new context is made.
sc = SparkContext.getOrCreate(conf=conf)

try:
    # Sample video logs, one "user,video,action" record per entry.
    video_logs = [
        "user1,video1,play",
        "user2,video2,pause",
        "user1,video1,stop",
        "user3,video3,play",
        "user2,video2,play",
    ]

    # Distribute the in-memory list as an RDD.
    logs_rdd = sc.parallelize(video_logs)

    # Step 1: Parse each log line into a (user, video, action) tuple.
    parsed_logs = logs_rdd.map(lambda log: tuple(log.split(",")))

    # Step 2: Keep only the "play" events.
    play_logs = parsed_logs.filter(lambda log: log[2] == "play")

    # Step 3: Count plays per video: emit (video, 1) pairs, then sum per key.
    video_play_count = (
        play_logs.map(lambda log: (log[1], 1))
        .reduceByKey(lambda x, y: x + y)
    )

    # Step 4: Bring the (video, count) pairs back to the driver.
    result = video_play_count.collect()

    # Print the results.
    print("Video Play Counts:")
    for video, count in result:
        print(f"{video}: {count} times")
finally:
    # Always release the Spark Context, even if a step above raised, so the
    # next run of this cell does not collide with a leftover context.
    sc.stop()



Video Play Counts:
video1: 1 times
video3: 1 times
video2: 1 times
