In [72]:
# Install Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!ls
!pip install faker
import os

# Check if Spark tarball already exists and download only if it doesn't
spark_file = 'spark-3.5.1-bin-hadoop3.tgz'
if not os.path.exists(spark_file):
    !wget https://dlcdn.apache.org/spark/spark-3.5.1/{spark_file}

# Ensure the file is present
!ls -lh {spark_file}

# Extract the Spark tarball
!tar xzf {spark_file}

# Install findspark
!pip install findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

# Initialize Spark using findspark
import findspark
findspark.init()

from faker import Faker
import random
import pyspark.sql.types as T

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("Fake Data Generation").getOrCreate()
# Verify Spark is initialized
print(spark.version)











0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.ubuntu.com (91.189.91                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
sampl

In [73]:


# Shared Faker instance used by generate_data().
fake = Faker()
from faker.providers import DynamicProvider

# Seed both random sources used by generate_data() (Faker and the stdlib
# `random` module) so that a fresh Restart & Run All draws the same values.
# NOTE(review): event_time comes from fake.date_time_this_year(), which
# presumably depends on the current date, so timestamps can still differ
# between runs on different days — confirm if exact reproducibility matters.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Custom provider exposing fake.categories(), which picks one product category
# from the fixed list below.
products_category = DynamicProvider(
     provider_name="categories",
     elements=["home appliances" ,"phones","laptops", "clothing" , "pharmacy" , "garden","kids toys"],
)
fake.add_provider(products_category)
def generate_data(num_records):
    """Build a list of synthetic e-commerce event rows.

    Each row is a tuple in the order
    (user_id, product_id, event_type, price, event_time, category_code,
    brand, user_session). Values are drawn from the module-level ``fake``
    Faker instance and the stdlib ``random`` module.

    Args:
        num_records: Number of rows to generate.

    Returns:
        A list containing ``num_records`` tuples.
    """
    event_types = ['view', 'cart', 'purchase']

    def build_row():
        # Fields are evaluated in column order, one draw per field.
        user_id = fake.random_number(digits=5)
        product_id = fake.random_number(digits=7)
        event_type = random.choice(event_types)
        price = fake.random_number(digits=3)
        event_time = fake.date_time_this_year()
        category_code = fake.categories()  # served by the custom "categories" provider
        brand = fake.company()
        user_session = fake.uuid4()
        return (
            user_id,
            product_id,
            event_type,
            price,
            event_time,
            category_code,
            brand,
            user_session,
        )

    return [build_row() for _ in range(num_records)]


In [74]:

# Produce the synthetic event rows (100,000 tuples from generate_data)
data = generate_data(100000)

# Explicit schema for the events DataFrame: one nullable field per tuple
# position, in the same order generate_data() emits them.
schema = (
    T.StructType()
    .add("user_id", T.IntegerType(), True)
    .add("product_id", T.IntegerType(), True)
    .add("event_type", T.StringType(), True)
    .add("price", T.IntegerType(), True)
    .add("event_time", T.TimestampType(), True)
    .add("category_code", T.StringType(), True)
    .add("brand", T.StringType(), True)
    .add("user_session", T.StringType(), True)
)



In [75]:


# Materialise the generated rows as a Spark DataFrame using the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# Preview the first five rows, then the column types
df.show(n=5)
df.printSchema()

# Expose the DataFrame to Spark SQL under the name "events"
df.createOrReplaceTempView("events")

# Number of events per event type, most frequent first
event_type_counts_query = """
SELECT event_type, COUNT(*) as count
FROM events
GROUP BY event_type
ORDER BY count DESC
"""
result = spark.sql(event_type_counts_query)

# Display the per-type counts
result.show()


+-------+----------+----------+-----+--------------------+---------------+--------------------+--------------------+
|user_id|product_id|event_type|price|          event_time|  category_code|               brand|        user_session|
+-------+----------+----------+-----+--------------------+---------------+--------------------+--------------------+
|  80784|   7944703|  purchase|  681|2024-03-27 10:15:...|         phones|         Simpson Ltd|67312624-4c55-4a2...|
|  17710|   5451235|      cart|  491|2024-02-19 17:01:...|       pharmacy|        Phillips-Kim|68b74c67-130b-473...|
|  56769|   5769330|      cart|    3|2024-05-19 13:08:...|         garden|           Salas Ltd|5d58b8bc-cffa-456...|
|  98043|   8563157|      view|    4|2024-02-15 13:38:...|        laptops|         Scott-Booth|354a7a99-07ab-453...|
|  76699|   8770319|      view|  705|2024-01-08 03:04:...|home appliances|Pham, Cox and Bishop|cf8cc0bf-6890-49b...|
+-------+----------+----------+-----+--------------------+------

In [76]:
# Window function query for Spark SQL.
#
# DailySales counts purchase events per category and calendar day; AvgSales adds
# a trailing average of those daily counts per category; the final SELECT also
# reports how far each day is from that trailing average.
#
# The window uses a RANGE frame over event_date (a DATE, so the offset 6 means
# six days): it covers the current day and the six calendar days before it.
# A ROWS frame would instead take the previous six *rows*, which silently spans
# more than a week whenever a category has a day without purchases.
# Note that days with no purchases produce no DailySales row at all, so they are
# not counted as zero in the average.
rolling_sales_summary_query = """
WITH DailySales AS (
    SELECT
        category_code,
        DATE(event_time) AS event_date,
        COUNT(*) AS daily_sales
    FROM events
    WHERE event_type = 'purchase'
    GROUP BY category_code, DATE(event_time)
), AvgSales AS (
    SELECT
        category_code,
        event_date,
        daily_sales,
        AVG(daily_sales) OVER (PARTITION BY category_code ORDER BY event_date RANGE BETWEEN 6 PRECEDING AND CURRENT ROW) AS avg_last_7_days
    FROM DailySales
)
SELECT
    category_code,
    event_date,
    daily_sales,
    avg_last_7_days,
    (daily_sales - avg_last_7_days) AS diff_from_avg
FROM AvgSales
ORDER BY event_date, category_code;
"""

# Execute the query against the "events" temporary view and preview the result
sales_summary = spark.sql(rolling_sales_summary_query)
sales_summary.show()

# Optionally, convert to Pandas DataFrame for visualization; this collects the
# whole result to the driver.
sales_summary_pd = sales_summary.toPandas()



+---------------+----------+-----------+------------------+-------------------+
|  category_code|event_date|daily_sales|   avg_last_7_days|      diff_from_avg|
+---------------+----------+-----------+------------------+-------------------+
|       clothing|2024-01-01|         33|              33.0|                0.0|
|         garden|2024-01-01|         24|              24.0|                0.0|
|home appliances|2024-01-01|         24|              24.0|                0.0|
|      kids toys|2024-01-01|         24|              24.0|                0.0|
|        laptops|2024-01-01|         23|              23.0|                0.0|
|       pharmacy|2024-01-01|         15|              15.0|                0.0|
|         phones|2024-01-01|         20|              20.0|                0.0|
|       clothing|2024-01-02|         29|              31.0|               -2.0|
|         garden|2024-01-02|         31|              27.5|                3.5|
|home appliances|2024-01-02|         27|

In [77]:
from pyspark.sql import functions as F

# Column names and types of the rolling sales summary
print("DataFrame Schema:")
sales_summary.printSchema()

# Summary statistics of the result DataFrame, as computed by describe()
summary_stats = sales_summary.describe()
summary_stats.show()


DataFrame Schema:
root
 |-- category_code: string (nullable = true)
 |-- event_date: date (nullable = true)
 |-- daily_sales: long (nullable = false)
 |-- avg_last_7_days: double (nullable = true)
 |-- diff_from_avg: double (nullable = true)

+-------+-------------+------------------+------------------+--------------------+
|summary|category_code|       daily_sales|   avg_last_7_days|       diff_from_avg|
+-------+-------------+------------------+------------------+--------------------+
|  count|         1267|              1267|              1267|                1267|
|   mean|         NULL| 26.44672454617206| 26.49415191490958|-0.04742736873755022|
| stddev|         NULL|5.2463147305388915|1.9855891418905245|   4.854147140514679|
|    min|     clothing|                 7|              15.0| -16.285714285714285|
|    max|       phones|                43|              33.0|                16.0|
+-------+-------------+------------------+------------------+--------------------+



In [78]:
import sqlite3
import pandas as pd

# Persist the rolling sales summary to a local SQLite file and hand it to the
# browser for download.
db_path = 'ecommerce_data_spark.db'

# Connect to or create the SQLite database file
conn = sqlite3.connect(db_path)
try:
    # Write the pandas copy of the summary, replacing any existing table
    sales_summary_pd.to_sql('sales_summary', conn, if_exists='replace', index=False)
finally:
    # Always release the connection, even if the write fails
    conn.close()

# Colab-only helper that triggers a download of the file
from google.colab import files
files.download(db_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>