In [2]:
import pandas as pd
from tqdm import tqdm
import csv
import random
import string
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

### **** RUN THESE CELLS IF YOU WANT TO GENERATE DATA ****

Alternatively, skip the generation cells and download the pre-generated data [here (Google Drive)](https://drive.google.com/file/d/1kCXnIeoPT6p9kS_ANJ0mmpxlfDwK1yio/view?usp=sharing&source=post_page-----242445b24565--------------------------------).

The lab code starts [here](#Lab).

In [2]:
# Fixed seed so the generated data set is reproducible.
random.seed(1999)

# Sampling pool for random_string: the lowercase alphabet repeated 2**10 times
# followed by ten copies of the uppercase alphabet, so lowercase dominates.
letters_upper = string.ascii_uppercase
letters = string.ascii_lowercase * (2 ** 10) + letters_upper * 10


def random_string(stringLength=10):
    """Return `stringLength` characters sampled without replacement from `letters`."""
    picked = random.sample(letters, stringLength)
    return ''.join(picked)


print("Products between {} and {}".format(1, 75000000))
# Ids 1 .. 74,999,999 (the upper bound of range() is exclusive).
# Kept as a lazy range rather than a list: it still supports len(), iteration
# and indexing - all that tqdm and random.choice require - without holding
# ~75 million int objects in memory.
product_ids = range(1, 75000000)
# The ten calendar days a sale can be dated on.
dates = ['2020-07-01', '2020-07-02', '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06', '2020-07-07', '2020-07-08',
         '2020-07-09', '2020-07-10']
# Seller ids 1 .. 9.
seller_ids = list(range(1, 10))

Products between 1 and 75000000


In [3]:
# Generate products: the fixed row for product 0, then one row per id in
# product_ids with a random price between 1 and 150 (inclusive).
products = [[0, "product_0", 22]]
products.extend(
    [pid, "product_{}".format(pid), random.randint(1, 150)]
    for pid in tqdm(product_ids, miniters=int(223265/100))
)

#   Save dataframe
df = pd.DataFrame(products, columns=["product_id", "product_name", "price"])
print("df for products created")
df.to_csv("products.csv", index=False)
print("products.csv saved")

#   Drop the references to the frame and the row list once the CSV is written
del df
print("df deleted")
del products
print("product list deleted")

100%|███████████████████████████████████████████████████████████████████| 74999999/74999999 [32:56<00:00, 37943.56it/s]


df for products created
products.csv saved
df deleted
product list deleted


In [4]:
#   Generate sellers: the fixed row for seller 0 plus one row per id in
#   seller_ids with a random daily target.
sellers = [[0, "seller_0", 2500000]]
sellers += [
    [sid, "seller_{}".format(sid), random.randint(12000, 2000000)]
    for sid in tqdm(seller_ids, miniters=int(223265/100))
]
#   Save dataframe
df = pd.DataFrame(sellers, columns=["seller_id", "seller_name", "daily_target"])
print("df for sellers created")
df.to_csv("sellers.csv", index=False)
print("sellers.csv saved")

#   Generate sales
#   Per-batch sizing: 95% of the rows go to product 0, the remainder (+1) to
#   randomly chosen products.
total_rows = 500000
prod_zero = int(total_rows * 0.95)
prod_others = total_rows - prod_zero + 1
#   Start sales.csv from scratch with just the header row.
df_array = [["order_id", "product_id", "seller_id", "date", "num_pieces_sold", "bill_raw_text"]]
with open('sales.csv', 'w', newline='') as f:
    csv.writer(f).writerows(df_array)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<?, ?it/s]

df for sellers created
sellers.csv saved





In [5]:
# Running order id shared by every batch, so ids are unique and increasing.
order_id = 0
# 40 batches. Each appends `prod_zero` rows for product 0 / seller 0, then
# `prod_others` rows with a randomly chosen product and seller.
for _batch in tqdm(range(0, 40), miniters=int(223265/100)):
    # One append-mode handle per batch. Rows are streamed straight to the
    # writer instead of being buffered in a list first, and the inner loops
    # no longer reuse the outer loop variable.
    with open('sales.csv', 'a', newline='') as f:
        csvwriter = csv.writer(f)

        for _ in range(0, prod_zero):
            order_id += 1
            csvwriter.writerow(
                [order_id, 0, 0, random.choice(dates), random.randint(1, 100), random_string(500)])

        for _ in range(0, prod_others):
            order_id += 1
            csvwriter.writerow(
                [order_id, random.choice(product_ids), random.choice(seller_ids), random.choice(dates),
                 random.randint(1, 100), random_string(500)])

print("Done")

100%|███████████████████████████████████████████████████████████████████████████████| 40/40 [3:18:10<00:00, 297.26s/it]

Done





In [2]:
# Local Spark session; a threshold of -1 turns automatic broadcast joins off.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .appName("Exercise1")
    .getOrCreate()
)

# Convert each generated CSV to Parquet. No schema is supplied to the reader.
# NOTE(review): the Parquet folders are written to the working directory,
# while the Lab section reads from ./data/ - confirm which location is intended.
products = spark.read.csv("products.csv", header=True, mode="DROPMALFORMED")
products.show()
products.write.parquet("products_parquet", mode="overwrite")

# Sales are repartitioned into 200 partitions by product_id before writing.
sales = spark.read.csv("sales.csv", header=True, mode="DROPMALFORMED")
sales.show()
sales_by_product = sales.repartition(200, col("product_id"))
sales_by_product.write.parquet("sales_parquet", mode="overwrite")

sellers = spark.read.csv("sellers.csv", header=True, mode="DROPMALFORMED")
sellers.show()
sellers.write.parquet("sellers_parquet", mode="overwrite")

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
|         5|   product_5|  128|
|         6|   product_6|   66|
|         7|   product_7|  145|
|         8|   product_8|   51|
|         9|   product_9|   44|
|        10|  product_10|   53|
|        11|  product_11|   13|
|        12|  product_12|  104|
|        13|  product_13|  102|
|        14|  product_14|   24|
|        15|  product_15|   14|
|        16|  product_16|   38|
|        17|  product_17|   72|
|        18|  product_18|   16|
|        19|  product_19|   46|
+----------+------------+-----+
only showing top 20 rows

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+----

### ****************************************************************************

## Lab

## Datasets

In [3]:
#   Create (or reuse) the local Spark session used by the lab:
#   automatic broadcast joins off, executor memory set to 500mb.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "500mb")
    .appName("Exercise1")
    .getOrCreate()
)

In [6]:
#   Load the three source tables from their Parquet folders under ./data
products_table, sales_table, sellers_table = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "./data/products_parquet",
        "./data/sales_parquet",
        "./data/sellers_parquet",
    )
)

In [7]:
# Preview the first five product rows (returned as a list of Row objects).
products_table.head(5)

[Row(product_id='0', product_name='product_0', price='22'),
 Row(product_id='1', product_name='product_1', price='30'),
 Row(product_id='2', product_name='product_2', price='91'),
 Row(product_id='3', product_name='product_3', price='37'),
 Row(product_id='4', product_name='product_4', price='145')]

In [8]:
# List the column names of the products table.
products_table.columns

['product_id', 'product_name', 'price']

In [9]:
# Preview a single sales row; head() without an argument returns one Row.
sales_table.head()

Row(order_id='1', product_id='0', seller_id='0', date='2020-07-03', num_pieces_sold='98', bill_raw_text='frlnwjcoaxsaubnattnhxdejrexrovharjhomfxchbedwmwwqpkhiwzvmbzbqyowrwmggfwzvqmwnaqeekvdyiumdrhiiaoavtkjfPxpzuhrbupyksvkUsdzhrumxjhhoanlebpohrrvdflpllbgbqelzctqxjgsgzhckzvypjvilodvpbiqjoesstbcbdofpopshstupnyaxktjtnmkqwbgbljqvkmzpganqchuwwpBztdrzyminivqgfzaykvchpfidkpygmwodhhckcsiznwuapiyuhnlktnjmuqxpzmKjcuujivfcuxuyzousufrwozcxwsbgjqqophhxjpgdsztfduzscrkkurfqqkleoazelmgnbgjbmsvkwahbhvfkkpbegxwzmwgyfourqapzoizqhsesrfukcemwkruzrdanycaelgsxiykggkzgcknbuczgplifaggbberhhepzqbcejgurnfrblkpuvgdoadxadvqojnr')

In [10]:
# List the column names of the sales table.
sales_table.columns

['order_id',
 'product_id',
 'seller_id',
 'date',
 'num_pieces_sold',
 'bill_raw_text']

In [11]:
# Preview the first five seller rows (returned as a list of Row objects).
sellers_table.head(5)

[Row(seller_id='0', seller_name='seller_0', daily_target='2500000'),
 Row(seller_id='1', seller_name='seller_1', daily_target='1375559'),
 Row(seller_id='2', seller_name='seller_2', daily_target='205349'),
 Row(seller_id='3', seller_name='seller_3', daily_target='71546'),
 Row(seller_id='4', seller_name='seller_4', daily_target='1315668')]

In [12]:
# List the column names of the sellers table.
sellers_table.columns

['seller_id', 'seller_name', 'daily_target']

## Warm-Up #1
Find out how many orders, how many products and how many sellers are in the data.
How many products have been sold at least once? Which product appears in the most orders?

In [13]:
#   Count the rows of the sales table (one row per order) and report it
order_count = sales_table.count()
print("Number of Orders: {}".format(order_count))

Number of Orders: 20000040


In [14]:
#   Count the rows of the sellers table and report it
seller_count = sellers_table.count()
print("Number of sellers: {}".format(seller_count))

Number of sellers: 10


In [15]:
#   Count the rows of the products table and report it
product_count = products_table.count()
print("Number of products: {}".format(product_count))

Number of products: 75000000


In [16]:
#   Count the distinct product ids that appear in at least one sale
print("Number of products sold at least once")
sold_products = sales_table.agg(countDistinct(col("product_id")))
sold_products.show()

Number of products sold at least once
+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                    993299|
+--------------------------+



In [17]:
#   Find the product that appears in the largest number of orders:
#   count the sales rows per product, sort descending, keep the top row.
print("Product present in more orders")
orders_per_product = sales_table.groupBy(col("product_id")).agg(count("*").alias("cnt"))
orders_per_product.orderBy(col("cnt").desc()).limit(1).show()

Product present in more orders
+----------+--------+
|product_id|     cnt|
+----------+--------+
|         0|19000000|
+----------+--------+



## Warm-Up #2
How many distinct products were sold on each day?

In [18]:
# Distinct products sold per date, listed from the highest count to the lowest.
products_per_day = sales_table.groupby(col("date")).agg(
    countDistinct(col("product_id")).alias("distinct_products_sold"))
products_per_day.orderBy(col("distinct_products_sold").desc()).show()

+----------+----------------------+
|      date|distinct_products_sold|
+----------+----------------------+
|2020-07-04|                100294|
|2020-07-03|                100224|
|2020-07-10|                100218|
|2020-07-08|                100048|
|2020-07-05|                 99991|
|2020-07-06|                 99869|
|2020-07-09|                 99801|
|2020-07-02|                 99768|
|2020-07-01|                 99755|
|2020-07-07|                 99453|
+----------+----------------------+

