In [12]:
# Import any necessary libraries / modules
import os
import sys
import timeit

import pandas as pd # we only use pandas as examples to compare to
import numpy as np

from pyspark.sql import SparkSession


In [None]:
# Build the relative location of the dataset and echo it for reference
DATA_PATH = os.path.join(
    "..",
    "datasets",
    "tracks_features.csv",
)
print(DATA_PATH)

# Obtain the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("SongRecommendation")
    .getOrCreate()
)

'''
The first exercise is simply to compare read times between Pandas and PySpark.

There is already a meaningful difference at a dataset size of 1.2 million rows, but remember
that Spark is running locally (on a single machine rather than a cluster) and our dataset is
still relatively small (i.e. ~350 MB vs. hundreds of GB or several TB in real-world applications).
'''
# Function to read with PySpark
def read_with_spark():
    """Read the CSV at DATA_PATH through the notebook's Spark session.

    The first row is treated as the header and column types are inferred.

    NOTE(review): Spark evaluates lazily, so timing this call presumably
    measures the schema-inference pass rather than a full in-memory load
    like the Pandas reader performs -- confirm before comparing the numbers.

    Returns:
        The Spark DataFrame produced by the read, so a caller can reuse it
        instead of reading the file a second time.
    """
    # Return the frame rather than binding it to a throwaway local.
    return spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# Function to read with Pandas
def read_with_pandas():
    """Read the CSV at DATA_PATH with Pandas using its default options.

    Returns:
        The Pandas DataFrame produced by the read, so a caller can reuse it
        instead of reading the file a second time.
    """
    # Return the frame rather than binding it to a throwaway local.
    return pd.read_csv(DATA_PATH)

# Time exactly one invocation of each reader
spark_time = timeit.Timer(read_with_spark).timeit(number=1)
pandas_time = timeit.Timer(read_with_pandas).timeit(number=1)

# Report both measurements
print(f"Time taken with PySpark: {spark_time:.4f} seconds")
print(f"Time taken with Pandas: {pandas_time:.4f} seconds")

../datasets/tracks_features.csv


                                                                                

Time taken with PySpark: 2.1501 seconds
Time taken with Pandas: 4.6297 seconds


In [None]:
# Read the dataset into a Spark DataFrame so it is ready for data processing
df = spark.read.options(header=True, inferSchema=True).csv(DATA_PATH)

In [None]:
()