In [3]:
#Creating spark session and checking the version
from pyspark.sql import SparkSession
import os

# (Optional safety) clear env overrides that can break local mode
os.environ.pop("MASTER", None)
os.environ.pop("PYSPARK_SUBMIT_ARGS", None)

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("AmazonReviewsSparkAnalytics")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.ui.enabled", "false")
    .getOrCreate()
)

print("Spark version:", spark.version)



Spark version: 4.1.1


In [5]:
import os
import urllib.request

def download_if_missing(url: str, local_path: str) -> str:
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    if not os.path.exists(local_path):
        print(f"Downloading dataset from GitHub...\n{url}\n-> {local_path}")
        urllib.request.urlretrieve(url, local_path)
        print("Download complete.")
    else:
        print(f"Dataset already exists at: {local_path}")
    return local_path



In [6]:
GITHUB_CSV_URL = "https://raw.githubusercontent.com/bharathkrishna711-lab/Amazon-Review-spark-analytics/main/dataset/AmazonProductReviews.csv"

local_csv_path = download_if_missing(
    GITHUB_CSV_URL,
    os.path.join("dataset", "AmazonProductReviews.csv")
)

local_csv_path


Downloading dataset from GitHub...
https://raw.githubusercontent.com/bharathkrishna711-lab/Amazon-Review-spark-analytics/main/dataset/AmazonProductReviews.csv
-> dataset\AmazonProductReviews.csv
Download complete.


'dataset\\AmazonProductReviews.csv'

In [7]:
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(local_csv_path)
)

df_raw.printSchema()
print("Records loaded:", df_raw.count())
df_raw.show(5, truncate=True)


root
 |-- id: string (nullable = true)
 |-- asins: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- colors: string (nullable = true)
 |-- dateAdded: timestamp (nullable = true)
 |-- dateUpdated: timestamp (nullable = true)
 |-- dimension: string (nullable = true)
 |-- ean: double (nullable = true)
 |-- keys: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- manufacturerNumber: string (nullable = true)
 |-- name: string (nullable = true)
 |-- prices: string (nullable = true)
 |-- reviews.date: string (nullable = true)
 |-- reviews.doRecommend: string (nullable = true)
 |-- reviews.numHelpful: string (nullable = true)
 |-- reviews.rating: string (nullable = true)
 |-- reviews.sourceURLs: string (nullable = true)
 |-- reviews.text: string (nullable = true)
 |-- reviews.title: string (nullable = true)
 |-- reviews.userCity: string (nullable = true)
 |-- reviews.userProvince: string (nullable = true)
 |-- re

In [10]:
#select only relevant columns
from pyspark.sql.functions import col

df = df_raw.select(
    col("id"),
    col("name"),
    col("brand"),
    col("categories"),
    col("prices"),
    col("`reviews.rating`").cast("double").alias("rating"),
    col("`reviews.numHelpful`").cast("int").alias("numHelpful"),
    col("`reviews.doRecommend`").alias("doRecommend"),
    col("`reviews.date`").alias("review_date_raw"),
    col("`reviews.text`").alias("review_text"),
    col("`reviews.title`").alias("review_title"),
    col("`reviews.username`").alias("reviewer")
)

