In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys
from os.path import join
import pandas as pd
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

os.environ['PYSPARK_PYTHON'] = sys.executable

In [4]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

In [5]:
spark = (Session
     .builder
     .appName('mix-rank')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

In [7]:
# 1
# rank the food category popularity by store_name but crossed and rotated


# create a mix ranking number
# create a popularity_rank_score in each store_name
    # Top popular in each store_name -> score 0
    # Second popular in each store_name -> score 0.1
# Add row number in each store_name, food_category -> cat_rank
# Create category_popularity_mix_rank_score = cat_rank + popularity_rank_score
# Sort the category_popularity_mix_rank_score by store_name

data = [
    ("hotpop","Meat",3,),
    ("hotpop","Meat",3),
    ("hotpop","Meat",3),
    ("hotpop","Vegetable",2),
    ("hotpop","Vegetable",2),
    ("branch","Fried food",1),
    ("branch","Dessert",1),
  ]

columns = ["store_name","food_category","food_category_popularity"]
df = spark.createDataFrame(data = data, schema = columns)

print('before')
df.show(n=10)

################# sol #######################
store_cat_pop_rank_score = F.when(C("store_cat_pop_rank") == 1, 0)\
                            .when(C("store_cat_pop_rank") == 2, 0.1)


window_sotre_cat_pop = W.partitionBy('store_name').orderBy(C("food_category_popularity").desc())
window_sotre_cat = W.partitionBy(['store_name','food_category']).orderBy(C("food_category"))
window_sotr_cat_mix_rank = W.partitionBy(['store_name']).orderBy(C("mix_cat_pop_rank_score"))
df = (
    df.withColumn("food_cat_pop_score", 100 * C("food_category_popularity") + 20 * F.randn(seed=42))\
      .withColumn("cat_idx", F.row_number().over(window_sotre_cat))\
      .withColumn("store_cat_pop_rank", F.dense_rank().over(window_sotre_cat_pop))
      .withColumn("store_cat_pop_rank_score", store_cat_pop_rank_score)\
      .withColumn("mix_cat_pop_rank_score", C("cat_idx") + C("store_cat_pop_rank_score"))\
      .withColumn("mix_cat_pop_rank", F.row_number().over(window_sotr_cat_mix_rank))\
)

print('after')

df.toPandas()

before
+----------+-------------+------------------------+
|store_name|food_category|food_category_popularity|
+----------+-------------+------------------------+
|    hotpop|         Meat|                       3|
|    hotpop|         Meat|                       3|
|    hotpop|         Meat|                       3|
|    hotpop|    Vegetable|                       2|
|    hotpop|    Vegetable|                       2|
|    branch|   Fried food|                       1|
|    branch|      Dessert|                       1|
+----------+-------------+------------------------+

after


Unnamed: 0,store_name,food_category,food_category_popularity,food_cat_pop_score,cat_idx,store_cat_pop_rank,store_cat_pop_rank_score,mix_cat_pop_rank_score,mix_cat_pop_rank
0,hotpop,Meat,3,347.689581,1,1,0.0,1.0,1
1,hotpop,Vegetable,2,222.054109,1,2,0.1,1.1,2
2,hotpop,Meat,3,303.841868,2,1,0.0,2.0,3
3,hotpop,Vegetable,2,211.442089,2,2,0.1,2.1,4
4,hotpop,Meat,3,314.674673,3,1,0.0,3.0,5
5,branch,Dessert,1,84.324392,1,1,0.0,1.0,1
6,branch,Fried food,1,86.692775,1,1,0.0,1.0,2
