## Rank Analysis

#### Libraries
- PySpark
- Amazon SageMaker
- dataclasses

#### Author: Yiran Jing
#### Date: Jan 2020

In [1]:
from dataclasses import dataclass, field
from typing import Dict, List
from pyspark.sql.functions import lit
from Rank_analysis_helperfunction import *
import warnings
warnings.filterwarnings('ignore')
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Row
import pyspark

# only needed in sagemaker
import sagemaker_pyspark

In [2]:
%%time
"""
Build the SparkSession
"""
# getOrCreate(): get the current Spark session or to create one if there is none running
# The cores property controls the number of concurrent tasks an executor can run. 
# Note that too high cores per executor can lead to bad I/O throughput.
# manage Spark memory limits programmatically 
# To avoid out of memory error
# quite broadcast join.
# spark.executor.cores: The number of cores to use on each executor.
classpath = ":".join(sagemaker_pyspark.classpath_jars())
spark = SparkSession.builder \
   .appName("Rank Model") \
   .config("spark.driver.extraClassPath", classpath) \
   .getOrCreate()
 
sc = spark.sparkContext
print(sc)

<SparkContext master=local[*] appName=Rank Model>
CPU times: user 189 ms, sys: 9.08 ms, total: 198 ms
Wall time: 3.3 s


In [3]:
# --- Configuration for SageMaker / S3 access ---
import os  # stdlib; imported in this cell so the cell is self-contained

# Security credentials.
# Read from the standard AWS environment variables rather than being pasted
# into the notebook, so secrets never end up in the saved file. Each value
# falls back to '' (the previous placeholder) when the variable is not set.
# The values themselves are listed under 'Security Credentials' in AWS.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', '')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '')

# --- Data path ---
region = "ap-southeast-2"  # Sydney specific
bucket = 'lagarderecommercial'
prefix = 'RankAnalysis'
# s3a:// is the URI scheme of the Hadoop S3 connector.
# NOTE(review): the file name contains a space; it is kept exactly as
# written — presumably it matches the object key in the bucket.
path = 's3a://{}/{}/rawData/data Ranking Report.csv'.format(bucket, prefix)

In [None]:
%%time
# Call test_all() and time it.
# NOTE(review): test_all is not defined in this notebook — presumably it is
# brought in by `from Rank_analysis_helperfunction import *` and runs that
# module's checks; confirm against the helper module.
test_all()

#### Load and clean data, derive new columns, and write the output

In [8]:
%%time
## load and clean data
#df = spark.read.format("csv").load(path)
df = spark.read.csv(path, header=True) # raw data 

# create dataclass onject
df = clean_dataset(df)
dataset = Dataset(df = df, store_item_concept = get_store_item_concept_list(df, spark),
                  week = get_week_list(df), concept = get_concept_list(df))

# new column calculation
"""
Note: 
    some functions need to run in order
    
    `calculate_in_W1_not_W2` must be run before `calculate_unadressed_gap` and `calculate_material_change`
    `calculate_range_expansion` must be run before `calculate_unadressed_gap`
    
    Since some functions are calculated based on column `in_W1_not_W2` or `range_expansion`
"""
dataset.df = calculate_range_expansion(dataset)
dataset.df = calculate_in_W1_not_W2(dataset)
dataset.df = calculate_material_change(dataset)
dataset.df = calculate_unadressed_gap(dataset)
dataset.df = calculate_newcomer(dataset)

# drop duplicate row and unnecessary column
dataset.df = dataset.df.drop('avgSales_lastWeek','sumSales_oldWeek','in_W1_not_W2').dropDuplicates()

dataset.df.printSchema()

print("Begin write data to CSV\n")
Outputpath = 's3a://{}/{}/output'.format(bucket, prefix) # Specific folder name only
dataset.df.coalesce(1).write.format('com.databricks.spark.csv').option("header", "true")\
.mode('overwrite').save(Outputpath)
# dataset.df.coalesce(1).write.option("header", "true").mode('overwrite').csv(Outputpath)

root
 |-- SKU: string (nullable = true)
 |-- Store: string (nullable = true)
 |-- Concept_NEW: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Index: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- BusinessUnit: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- NetSales: float (nullable = false)
 |-- rank: float (nullable = false)
 |-- range_expansion: string (nullable = true)
 |-- material_change: string (nullable = false)
 |-- unadressed_gap: string (nullable = false)
 |-- newcomer: string (nullable = false)

Begin write data to CSV

CPU times: user 145 ms, sys: 55.5 ms, total: 200 ms
Wall time: 1min 59s
