## Rank Analysis

Local Computing

#### Author: Yiran Jing
#### Date: Jan 2020

In [1]:
from dataclasses import dataclass, field
from typing import Dict, List
from pyspark.sql.functions import lit
from Rank_analysis_helperfunction import *
import warnings
warnings.filterwarnings('ignore')
import findspark
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from test_Rank_functions import *
from pyspark.sql import Row
import pyspark

In [2]:
%%time
"""
Build the SparkSession
"""
findspark.init()
# getOrCreate(): get the current Spark session or to create one if there is none running
# The cores property controls the number of concurrent tasks an executor can run. 
# Note that too high cores per executor can lead to bad I/O throughput.
# manage Spark memory limits programmatically 
# To avoid out of memory error
# quite broadcast join.
# spark.executor.cores: The number of cores to use on each executor.
spark = SparkSession.builder \
   .master("local") \
   .appName("Rank Model") \
   .config("spark.executor.cores",1) \
   .getOrCreate()
 
    
sc = spark.sparkContext

CPU times: user 2.28 ms, sys: 1.73 ms, total: 4.01 ms
Wall time: 12.3 ms


### Test functions

In [3]:
%%time
# Run the checks for the rank helper functions before touching the real data.
# The recorded run of this cell took well over a minute of wall time.
test_all()  # in test_Rank_functions.py

Running tests
🎉	test_in_W1_not_W2
🎉	test_range_expansion
🎉	test_material_change
CPU times: user 740 ms, sys: 307 ms, total: 1.05 s
Wall time: 1min 43s


### Load and clean data

In [None]:
%%time
## load and clean data
df = spark.read.csv("../data/rawData/data Ranking Report.csv", header=True) # raw data 

# create dataclass onject
df = clean_dataset(df)
dataset = Dataset(df = df, store_item_concept = get_store_item_concept_list(df, spark),
                  week = get_week_list(df), concept = get_concept_list(df))

# new column calculation
"""
Note: 
    some functions need to run in order
    
    `calculate_in_W1_not_W2` must be run before `calculate_unadressed_gap` and `calculate_material_change`
    `calculate_range_expansion` must be run before `calculate_unadressed_gap`
    
    Since some functions are calculated based on column `in_W1_not_W2` or `range_expansion`
"""
dataset.df = calculate_range_expansion(dataset)
dataset.df = calculate_in_W1_not_W2(dataset)
dataset.df = calculate_material_change(dataset)
dataset.df = calculate_unadressed_gap(dataset)
dataset.df = calculate_newcomer(dataset)

# drop duplicate row and unnecessary column
dataset.df = dataset.df.drop('avgSales_lastWeek','sumSales_oldWeek','in_W1_not_W2').dropDuplicates()
dataset.df.printSchema()

print("begin write data out\n")
dataset.df.coalesce(1).write.option("header", "true").mode('overwrite').csv("../data/output/result")
print("Finish")