In [5]:
# Step 1: Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, when
from pyspark.sql.functions import sum as spark_sum

In [7]:
# Step 2: Obtain the SparkSession for this analysis
# getOrCreate() returns the already-active session if there is one, so this
# cell can be re-run without spawning a second session.
session_builder = SparkSession.builder.appName("LaptopStoreAnalysis")
spark = session_builder.getOrCreate()

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
# Step 3: Load the laptop pricing CSV into a Spark DataFrame
# header=True takes column names from the first row; inferSchema=True lets
# Spark infer each column's type from the data.
file_path = "./laptop_pricing_dataset.csv"
csv_reader = spark.read
laptop_df = csv_reader.csv(
    file_path,
    header=True,
    inferSchema=True,
)

In [None]:
# Step 4: Build one result DataFrame per business question
# a. Mean RAM_GB for each group, largest average first
# NOTE(review): the question is "per manufacturer", but the grouping key used
# here is the "GPU" column - confirm which column was intended.
ram_by_group = laptop_df.groupBy("GPU")
ram_means = ram_by_group.agg(avg("RAM_GB").alias("avg_RAM_GB"))
ram_avg_per_manufacturer = ram_means.orderBy("avg_RAM_GB", ascending=False)

In [None]:
# b. Mean Screen_Size_cm for each group, largest average first
# NOTE(review): the question is "per Screen Type", but the grouping key used
# here is the "Category" column - confirm which column was intended.
screen_by_group = laptop_df.groupBy("Category")
screen_means = screen_by_group.agg(avg("Screen_Size_cm").alias("avg_screen_size"))
screen_size_avg_per_type = screen_means.orderBy("avg_screen_size", ascending=False)

In [None]:
# c. Profiling: per-GPU counts of IPS-panel and Full-HD laptops
# `when` is a row-level expression, not an aggregate, so it must be wrapped in
# an aggregate before it can be passed to agg(). Spark's `sum` is imported as
# `spark_sum` because the Python builtin `sum` cannot aggregate a column.
# `.otherwise(0)` makes non-matching rows contribute 0, so a group with no
# matches reports 0 instead of null.
# Totals are computed in a single pass per "GPU"; summing per
# ("GPU", "Category") first and then re-summing per "GPU" gives the same result.
# NOTE(review): the heading says "manufacturer and screen type" but the key is
# "GPU"; also confirm the "IPS_Panel" / "Screen_Resolution" columns and their
# "Yes" / "Full HD" values against the CSV header.
is_ips_panel = laptop_df["IPS_Panel"] == "Yes"
is_full_hd = laptop_df["Screen_Resolution"] == "Full HD"

profiling_results = laptop_df.groupBy("GPU").agg(
    spark_sum(when(is_ips_panel, 1).otherwise(0)).alias("IPS_Panel_count"),
    spark_sum(when(is_full_hd, 1).otherwise(0)).alias("Full_HD_count"),
)

In [None]:
# Step 6: Print each result table to the cell output
# show() triggers the actual Spark computation for each DataFrame.
for result_df in (
    ram_avg_per_manufacturer,
    screen_size_avg_per_type,
    profiling_results,
):
    result_df.show()

# Step 7: Shut down the SparkSession now that all results are displayed
spark.stop()