In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import col , lag
from pyspark.sql.types import *
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
spark = SparkSession.builder \
.master("local") \
.appName("ml_v2") \
.getOrCreate()

In [7]:
df = spark.read.csv('/mnt/group-14-pvc/processed_data', header = True, inferSchema = True)

In [37]:
df.createOrReplaceTempView("df")

In [19]:
sql_query = """
    SELECT 
        supermarket, 
        product_category,
        MONTH(date) as month, 
        YEAR(date) as year,
        city,
        SUM(price) as sales
    FROM df
    GROUP BY 
        YEAR(date),
        MONTH(date),
        city,
        product_category,
        supermarket
    ORDER BY
        YEAR(date),
        MONTH(date)
"""

agg_df = spark.sql(sql_query)

In [25]:
agg_df.show(5)

+-----------+----------------+-----+----+-----------+------------------+
|supermarket|product_category|month|year|       city|             sales|
+-----------+----------------+-----+----+-----------+------------------+
|      Sains|            pets|    1|2023|     Bangor|199.30000000000004|
|       ASDA|   food_cupboard|    1|2023|Southampton| 296.4899999999999|
|      Sains|            pets|    1|2023|    Swansea|145.35000000000002|
|      Sains|          drinks|    1|2023|  Lancaster|            590.05|
|       ASDA|      fresh_food|    1|2023|     Bangor| 631.0299999999997|
+-----------+----------------+-----+----+-----------+------------------+
only showing top 5 rows



In [21]:
output_path = '/mnt/group-14-pvc/processed_data/agg_data'
agg_df.write.csv(output_path, header=True, mode="overwrite")

In [22]:
snapshot_df = spark.read.csv('/mnt/group-14-pvc/processed_data/agg_data', header = True, inferSchema = True)

In [29]:
p_df = agg_df.toPandas()

In [31]:
p_df.to_csv('agg_csv.csv',header = True, index = False)

In [32]:
sql_query = """
    SELECT 
        supermarket, 
        MONTH(date) as month, 
        YEAR(date) as year,
        product_category,
        city,
        SUM(price) as sales
    FROM df
    GROUP BY 
        YEAR(date),
        MONTH(date),
        city,
        supermarket
    ORDER BY
        YEAR(date),
        MONTH(date)
"""

df_3 = spark.sql(sql_query)

In [33]:
df_3.show()

+-----------+-----+----+--------------+------------------+
|supermarket|month|year|          city|             sales|
+-----------+-----+----+--------------+------------------+
|      Tesco|    1|2023|    Chelmsford|3046.4399999999973|
|       ASDA|    1|2023|         Wells|3805.6999999999953|
|       ASDA|    1|2023|        Bangor| 8035.029999999986|
|      Sains|    1|2023| Milton Keynes| 3660.599999999998|
|       ASDA|    1|2023|         Derby| 3541.369999999998|
|       Aldi|    1|2023|    Chelmsford|216.12999999999994|
|       ASDA|    1|2023|     Lancaster|3517.5699999999983|
|      Sains|    1|2023|     St Albans|3587.0999999999985|
|  Morrisons|    1|2023|     Salisbury|1983.6100000000001|
|      Tesco|    1|2023|         Newry|2667.1799999999985|
|       Aldi|    1|2023|      Aberdeen| 260.4800000000001|
|       ASDA|    1|2023|         Leeds|3405.3399999999974|
|      Sains|    1|2023|        Durham| 3348.649999999998|
|       Aldi|    1|2023|       Cardiff| 248.310000000000

In [34]:
df_3.count()

4500

In [35]:
df_3 = df_3.toPandas()

In [36]:
df_3.to_csv('df_3_csv.csv',header = True, index = False)

In [66]:
sql_query = """
    SELECT 
        supermarket, 
        date,
        MONTH(date) as month, 
        YEAR(date) as year,
        product_category,
        COUNT(product_category) as item_count
    FROM df
    GROUP BY 
        supermarket, 
        date,
        MONTH(date), 
        YEAR(date),
        product_category       
    ORDER BY
        YEAR(date),
        MONTH(date)
"""

test_df = spark.sql(sql_query)

In [71]:
test_df.show(10)

+-----------+-------------------+-----+----+----------------+----------+
|supermarket|               date|month|year|product_category|item_count|
+-----------+-------------------+-----+----+----------------+----------+
|       ASDA|2023-01-02 00:00:00|    1|2023|          drinks|       162|
|      Sains|2023-01-26 00:00:00|    1|2023|   food_cupboard|       323|
|      Sains|2023-01-28 00:00:00|    1|2023|   baby_products|        39|
|      Tesco|2023-01-12 00:00:00|    1|2023|          frozen|        47|
|      Sains|2023-01-18 00:00:00|    1|2023| health_products|       332|
|  Morrisons|2023-01-07 00:00:00|    1|2023|          drinks|       154|
|  Morrisons|2023-01-25 00:00:00|    1|2023|      fresh_food|       129|
|       Aldi|2023-01-16 00:00:00|    1|2023|          bakery|        12|
|       Aldi|2023-01-06 00:00:00|    1|2023|       free-from|         5|
|  Morrisons|2023-01-02 00:00:00|    1|2023|          drinks|       117|
+-----------+-------------------+-----+----+-------

In [49]:
test_df.createOrReplaceTempView("test_df")

In [68]:
test_df.count()

19283

In [69]:
inventory_data = test_df.toPandas()

In [70]:
inventory_data.to_csv('inventory_data.csv',header = True, index = False)