## Rank Analysis

Local Computing

#### Author: Yiran Jing
#### Date: Jan 2020

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot
import plotly.express as px
import plotly.graph_objects as go
from dataclasses import dataclass, field
from typing import Dict, List
from pyspark.sql.functions import lit
from Rank_analysis_helperfunction import *
import warnings
warnings.filterwarnings('ignore')
import findspark
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Row
import pyspark

In [2]:
%%time
"""
Build the SparkSession
"""
# Let findspark initialise the environment before the session is built.
findspark.init()

# Session settings:
#   - master("local") with the application name "Rank Model".
#   - spark.executor.cores = 1: the number of cores to use on each executor.
#     This property controls how many concurrent tasks an executor can run;
#     too many cores per executor can lead to bad I/O throughput.
#   - getOrCreate() returns the Spark session that is already running, or
#     creates one if there is none.
# NOTE(review): earlier notes here mentioned managing Spark memory limits
# programmatically and turning off broadcast joins to avoid out-of-memory
# errors, but no such setting is applied below -- confirm whether it is needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Rank Model")
    .config("spark.executor.cores", 1)
    .getOrCreate()
)

# Handle to the SparkContext behind the session.
sc = spark.sparkContext

CPU times: user 22.9 ms, sys: 18.1 ms, total: 41 ms
Wall time: 4.85 s


#### Load and clean data

In [4]:
%%time
## Load the raw ranking report (first line is the header) and clean it
df = clean_dataset(
    spark.read.csv("../data/rawData/data Ranking Report.csv", header=True)  # raw data
)

# Bundle the cleaned frame with its lookup lists in a Dataset dataclass object
dataset = Dataset(
    df=df,
    store_item_concept=get_store_item_concept_list(df, spark),
    week=get_week_list(df),
    concept=get_concept_list(df),
)

# New column calculation.
# The order of the steps below matters, because some of them are calculated
# from the columns `in_W1_not_W2` or `range_expansion`:
#   - `calculate_in_W1_not_W2` must run before `calculate_unadressed_gap`
#     and `calculate_material_change`
#   - `calculate_range_expansion` must run before `calculate_unadressed_gap`
for _add_column in (
    calculate_range_expansion,
    calculate_in_W1_not_W2,
    calculate_material_change,
    calculate_unadressed_gap,
    calculate_newcomer,
):
    dataset.df = _add_column(dataset)

# Drop the columns that are not needed in the output, then the duplicate rows
dataset.df = (
    dataset.df
    .drop('avgSales_lastWeek', 'sumSales_oldWeek', 'in_W1_not_W2')
    .dropDuplicates()
)

dataset.df.printSchema()

print("begin write data out\n")
# Collapse to a single partition and overwrite the previous result,
# writing the column names as a header line.
(
    dataset.df
    .coalesce(1)
    .write
    .option("header", "true")
    .mode('overwrite')
    .csv("../data/output/result")
)
print("Finish")

root
 |-- SKU: string (nullable = true)
 |-- Store: string (nullable = true)
 |-- Concept_NEW: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Index: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- BusinessUnit: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- NetSales: float (nullable = false)
 |-- rank: float (nullable = false)
 |-- range_expansion: string (nullable = true)
 |-- material_change: string (nullable = false)
 |-- unadressed_gap: string (nullable = false)
 |-- newcomer: string (nullable = false)

begin write data out
CPU times: user 166 ms, sys: 47.3 ms, total: 214 ms
Wall time: 5min 34s


In [None]:
### Test functions for the final output

def test_no_duplicates(df: pyspark.sql.dataframe.DataFrame):
    """
    Assert that `df` contains no fully duplicated rows.

    If we have duplicate rows in the output, it means our algorithm is
    incorrect or inefficient (for example, an error in a join).

    Parameters:
        df: the Spark DataFrame to check; rows are compared on all columns.

    Raises:
        AssertionError: if any combination of column values occurs more than once.
    """
    # One row per distinct combination of all column values that occurs more
    # than once. The frame is empty exactly when `df` has no duplicates.
    duplicated_groups = df.groupBy(df.columns)\
    .count()\
    .where(col('count') > 1)

    # The previous check summed the counts and then *selected* an isNotNull()
    # flag. An ungrouped aggregate always yields exactly one row (NULL when
    # nothing is duplicated) and select() keeps every row, so the row count
    # was always 1 and the assertion failed even on clean data.
    assert duplicated_groups.count() == 0, "Find some duplicates in result, might some error in the join."
    
   
    
def test_material_change():
    """
    Check the `material_change` column of the notebook-level `dataset`.

    If material_change = True, for that (SKU, Store), we should not have
    records in W2 (the last week in `dataset.week`).

    Raises:
        AssertionError: if any row flagged material_change == "True" is dated
        in the last week; the message reports how many such rows were found.

    Manual test cases:
    1.
    AULS488089
    AULS.AVV102
    should be true

    2.
    Concept_NEW  = Air N&R
    Rank_Total = top 50
    category = 05_
    should have 2 true result
    """
    # The flag is compared as text: the column holds the string "True".
    true_material_change = dataset.df.filter(col('material_change') == "True")
    filter_last_data = true_material_change.filter(col('Date') == dataset.week[-1]) # records in W2
    # Named so it does not shadow the `count` function that the notebook
    # star-imports from pyspark.sql.functions.
    w2_record_count = filter_last_data.count()
    # The message used to reference an undefined name, so a real failure
    # surfaced as NameError instead of this AssertionError.
    assert w2_record_count == 0, f'We should have 0 records in W2, but find {w2_record_count}'
    
def test_outputLength():
    """Placeholder: no check of the output length is implemented yet."""
    return None

def test_range_expansion():
    """
    Placeholder for the `range_expansion` check; nothing is asserted yet.

    Intended rule: no item sold in W2 should have a blank value.

    Manual test cases:
    1. Store AUDF.CNS113, SKU AUDF100430711
       -> should be false

    2. Store AULS.AVV102, SKU AULS126353
       -> should be true

    3. Concept_NEW = Air N&R, Rank_Total = top 50, category = 05_,
       Company = AULS or AUDF
       -> should have at least 12 true results
    """
    return None

def test_in_W1_not_W2():
    """
    AULS488089
    AULS.AVV102
    should be true
    """