# Tasking Data Processing
To pull from Splunk:

```
index="" host="" sourcetype=""
| search "Received Status Update"
| rex field=_raw "itemIds:\[(?<skuNo>\d+)\]"
| rex field=_raw "pickLocationEmptyConfidence:.*?statusCode:(?<statusCode>\d+), statusDetail:,"
| rex field=_raw "customerTaskId:(?<taskID>\w+), itemIds"
| rex field=_raw "customerTaskId:\d{3}_(?<stationID>\d+)_"
| rex field=_raw "\"suction_cup_name\":\"(?<suctionCupName>\w+)\""
| rex field=_raw "commandUtime:(?<ctime>\d+)\}"
| eval commandTime = strftime(ctime/1000000,"%Y-%m-%d %H:%M:%S")
| dedup skuNo, statusCode, taskID, ctime keepempty=true consecutive=false
| table skuNo, statusCode, stationID, taskID, suctionCupName, ctime, commandTime
```

# Process the raw tasking data
* Read the data from csv
* Get the cup configuration from the previous task
* Remove UPC error and pause state

In [1]:
import csv
import os
import pathlib
import datetime
# Exported tasking data; the file is looked up in the current working directory.
fname = 'rhr_tasks_0201_0911.csv'
fpath = pathlib.Path.cwd() / fname

In [2]:
class Task:
    """One status-update row of the tasking export.

    Instance attributes are assigned in constructor-argument order; that order
    is what ``__str__`` and ``vars()`` (and therefore any DataFrame built from
    ``vars(task)``) expose.
    """

    # Layout of the textual ``commandTime`` column.
    TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, sku, status, stationId, taskId, suctionCup, ctime, commandTime):
        """Build a task from raw column values.

        Args:
            sku, status, stationId, taskId, suctionCup: stored unchanged.
            ctime: integer timestamp or its string form; stored as ``int``.
            commandTime: either a ``datetime.datetime`` (stored as-is) or a
                string in ``TIME_FORMAT`` (parsed).

        Raises:
            ValueError: if ``ctime`` is not an integer literal or a string
                ``commandTime`` does not match ``TIME_FORMAT``.
        """
        self.sku = sku
        self.status = status
        self.stationId = stationId
        self.taskId = taskId
        self.suctionCup = suctionCup
        self.ctime = int(ctime)
        if isinstance(commandTime, datetime.datetime):
            # already parsed, e.g. when copying an existing Task
            self.commandTime = commandTime
        else:
            self.commandTime = datetime.datetime.strptime(commandTime, self.TIME_FORMAT)

    def __str__(self):
        """Return ``name:value`` pairs for every attribute, comma separated."""
        return ','.join(f'{name}:{value}' for name, value in vars(self).items())

In [3]:
# Parse every CSV row into a Task; rows that cannot be parsed are echoed and skipped.
raws = []
# newline='' lets the csv module handle line endings (and quoted newlines) itself
with open(fpath, 'r', newline='') as file:
    for row in csv.reader(file):
        # blank lines parse to [] and rows with an empty first column carry no SKU
        if not row or not row[0]:
            continue
        try:
            raws.append(Task(*row[:7]))
        except (ValueError, TypeError):
            # ValueError: non-numeric ctime / malformed commandTime (e.g. the header row)
            # TypeError: fewer than seven columns
            print(row)

['skuNo', 'statusCode', 'stationID', 'taskID', 'suctionCupName', 'ctime', 'commandTime']


In [4]:
# sort raw
def sort_raw(raw:Task):
    """Sort key: station first, then command timestamp within the station."""
    # stationId is kept exactly as read from the CSV, so it compares as a string
    station, stamp = raw.stationId, raw.ctime
    return station, stamp
raws.sort(key=sort_raw)

In [5]:
len(raws)

276465

In [6]:
# plain loop: a comprehension here would only build a throwaway list of None
for task in raws[0:5]:
    print(task)

sku:2145184,status:200399,stationId:44,taskId:173_44_14638577_19570783_1_13713437_33151_1478039_1_402546_1_1,suctionCup:,ctime:1686581226633130,commandTime:2023-06-12 10:47:06
sku:2145184,status:200302,stationId:44,taskId:173_44_14638581_19570798_1_13713437_33151_1478039_1_402547_1_1,suctionCup:swappable_vs_25_nr,ctime:1686581351901894,commandTime:2023-06-12 10:49:11
sku:2145184,status:200399,stationId:44,taskId:173_44_14638581_19570798_1_13713437_33151_1478039_1_402547_1_1,suctionCup:,ctime:1686581356220168,commandTime:2023-06-12 10:49:16
sku:24402489,status:200302,stationId:44,taskId:173_44_14638487_19569868_1_13713433_319_1471834_1_402606_1_1,suctionCup:swappable_vs_18_nr,ctime:1686581726849315,commandTime:2023-06-12 10:55:26
sku:24402489,status:200399,stationId:44,taskId:173_44_14638487_19569868_1_13713433_319_1471834_1_402606_1_1,suctionCup:,ctime:1686581738293651,commandTime:2023-06-12 10:55:38


[None, None, None, None, None]

In [7]:
# remove pause and UPC error tasks and take the suction cup from previous tasks
# Rows with these status codes are dropped from `tasks`, but they are still
# read for their suction cup name before being dropped.
EXCLUDED_STATUSES = {'200302', '200451'}

tasks = []
suctionCup = None
current_station = None
for task in raws:
    # `raws` is ordered by (stationId, ctime): when the station changes, forget
    # the cup so it does not leak from one station's history into the next.
    if task.stationId != current_station:
        current_station = task.stationId
        suctionCup = None

    # remember the most recent non-empty cup name seen on this station
    if task.suctionCup != '':
        suctionCup = task.suctionCup

    if task.status not in EXCLUDED_STATUSES:
        tasks.append(Task(
            task.sku,
            task.status,
            task.stationId,
            task.taskId,
            suctionCup,
            task.ctime,
            task.commandTime.strftime('%Y-%m-%d %H:%M:%S'),
        ))

In [8]:
len(tasks)

142456

In [9]:
# plain loop: a comprehension here would only build a throwaway list of None
for task in tasks[0:10]:
    print(task)

sku:2145184,status:200399,stationId:44,taskId:173_44_14638577_19570783_1_13713437_33151_1478039_1_402546_1_1,suctionCup:None,ctime:1686581226633130,commandTime:2023-06-12 10:47:06
sku:2145184,status:200399,stationId:44,taskId:173_44_14638581_19570798_1_13713437_33151_1478039_1_402547_1_1,suctionCup:swappable_vs_25_nr,ctime:1686581356220168,commandTime:2023-06-12 10:49:16
sku:24402489,status:200399,stationId:44,taskId:173_44_14638487_19569868_1_13713433_319_1471834_1_402606_1_1,suctionCup:swappable_vs_18_nr,ctime:1686581738293651,commandTime:2023-06-12 10:55:38
sku:24408522,status:200399,stationId:44,taskId:173_44_14638572_19570752_1_13713440_10574_1479464_1_402567_1_1,suctionCup:swappable_bgx_48,ctime:1686582213989421,commandTime:2023-06-12 11:03:33
sku:24408522,status:200399,stationId:44,taskId:173_44_14638583_19570806_1_13713440_10574_1479464_1_402568_1_1,suctionCup:swappable_bgx_48,ctime:1686582248988658,commandTime:2023-06-12 11:04:08
sku:24408522,status:200441,stationId:44,taskId:

[None, None, None, None, None, None, None, None, None, None]

# Aggregate the data into SKU/CUP level for the AB test

First, calculate the success rate for each SKU and suction-cup combination.

In [10]:
import pandas as pd

In [11]:
# one row per Task; each record is the task's attribute dict
task_records = [task.__dict__ for task in tasks]
df_tasks = pd.DataFrame(task_records)

In [12]:
df_tasks.head()

Unnamed: 0,sku,status,stationId,taskId,suctionCup,ctime,commandTime
0,2145184,200399,44,173_44_14638577_19570783_1_13713437_33151_1478...,,1686581226633130,2023-06-12 10:47:06
1,2145184,200399,44,173_44_14638581_19570798_1_13713437_33151_1478...,swappable_vs_25_nr,1686581356220168,2023-06-12 10:49:16
2,24402489,200399,44,173_44_14638487_19569868_1_13713433_319_147183...,swappable_vs_18_nr,1686581738293651,2023-06-12 10:55:38
3,24408522,200399,44,173_44_14638572_19570752_1_13713440_10574_1479...,swappable_bgx_48,1686582213989421,2023-06-12 11:03:33
4,24408522,200399,44,173_44_14638583_19570806_1_13713440_10574_1479...,swappable_bgx_48,1686582248988658,2023-06-12 11:04:08


In [13]:
# a task counts as a success only when its status is exactly '200399'
df_tasks['success'] = df_tasks['status'] == '200399'

In [14]:
df_tasks.sample(10)

Unnamed: 0,sku,status,stationId,taskId,suctionCup,ctime,commandTime,success
55082,823967,200399,44,173_44_16119459_21408300_1_15158854_21809_1631...,swappable_bgx_48,1692716449733085,2023-08-22 11:00:49,True
33646,79685,200399,44,173_44_15612019_20820000_1_14664428_10752_1558...,swappable_bgx_48,1690839247743414,2023-07-31 17:34:07,True
109122,589162,200399,45,174_45_15672102_20910269_1_14728774_18349_1444...,swappable_vs_18_nr,1690999147895769,2023-08-02 13:59:07,True
24708,860124,200440,44,173_44_15362243_20537653_1_14421142_4340_15456...,swappable_bgx_48,1689861825478015,2023-07-20 10:03:45,False
115713,24398946,200422,45,174_45_15835285_21097630_1_14890375_16178_1589...,swappable_bgx_48,1691603109262941,2023-08-09 13:45:09,False
21978,24421991,200399,44,173_44_15282272_20425881_1_14335511_14644_1319...,swappable_vs_25_nr,1689634848797678,2023-07-17 19:00:48,True
37138,271674,200399,44,173_44_15683582_20914390_1_14737415_21714_1568...,swappable_bgx_48,1691017079533534,2023-08-02 18:57:59,True
42578,365997,200450,44,173_44_15825030_21083452_1_14882151_291_160251...,swappable_bgx_48,1691585523961286,2023-08-09 08:52:03,False
69673,793167,200399,44,173_44_16484173_21810681_1_15526668_33810_8230...,swappable_bgx_48,1694000751794026,2023-09-06 07:45:51,True
42899,24474008,200399,44,173_44_15826902_21087793_1_14883656_9281_15296...,swappable_bgx_48,1691594544959047,2023-08-09 11:22:24,True


In [15]:
# p = success rate, n = number of attempts for every SKU / suction-cup pair,
# ordered from the most-attempted pair downwards
df = (
    df_tasks
    .groupby(['sku', 'suctionCup'], sort=True)
    .agg(p=('success', 'mean'), n=('success', 'count'))
    .reset_index()
    .sort_values(by='n', ascending=False)
)

In [16]:
df.head(20)

Unnamed: 0,sku,suctionCup,p,n
1154,2030289,swappable_bgx_48,0.644594,1933
2109,24388084,swappable_bgx_48,0.901363,1541
5643,512663,swappable_vs_25_nr,0.997188,1067
2116,24388284,swappable_vs_25_nr,0.875648,965
1360,2145184,swappable_vs_25_nr,0.933893,953
8070,917881,swappable_bgx_48,0.923341,874
610,1611435,swappable_vs_25_nr,0.638539,794
1266,209882,swappable_vsa_63_nr,0.745718,759
1358,2145184,swappable_bgx_48,0.968839,706
2115,24388284,swappable_bgx_48,0.94876,605


## Need to find the SKUs picked with more than one cup 

In [48]:
# keep only SKU/cup pairs with a success rate above zero
df_filtered = df.loc[df['p'].gt(0)]

In [49]:
# keep SKUs that appear on more than one row (i.e. with more than one cup), ordered by SKU
sku_repeats = df_filtered['sku'].duplicated(keep=False)
df_filtered = df_filtered[sku_repeats].sort_values(by='sku', ascending=False)

In [50]:
# same multi-row-SKU mask as before, this time ordering by attempt count
sku_repeats = df_filtered['sku'].duplicated(keep=False)
df_filtered = df_filtered[sku_repeats].sort_values(by='n', ascending=False)

In [51]:
df_filtered[0:30]

Unnamed: 0,sku,suctionCup,p,n
5643,512663,swappable_vs_25_nr,0.997188,1067
2116,24388284,swappable_vs_25_nr,0.875648,965
1360,2145184,swappable_vs_25_nr,0.933893,953
610,1611435,swappable_vs_25_nr,0.638539,794
1266,209882,swappable_vsa_63_nr,0.745718,759
1358,2145184,swappable_bgx_48,0.968839,706
2115,24388284,swappable_bgx_48,0.94876,605
2615,24430099,swappable_vs_25_nr,0.761194,536
2609,24430096,swappable_vs_25_nr,0.878505,535
5334,487908,swappable_bgx_48,0.943867,481


### SKU 24430099
* baseline = 0.855932
* dmin = baseline*0.15
* alpha = 0.05
* power = 0.8

Size = 121

In [52]:
df_filtered[df_filtered['sku'] == '24430099']

Unnamed: 0,sku,suctionCup,p,n
2615,24430099,swappable_vs_25_nr,0.761194,536
2613,24430099,swappable_bgx_48,0.554545,220
2614,24430099,swappable_vs_18_nr,0.681818,44


### SKU 1611435
* baseline = 0.787136
* dmin = baseline*0.15
* alpha = 0.05
* power = 0.8

Size = 196

In [53]:
df_filtered[df_filtered['sku'] == '1611435']

Unnamed: 0,sku,suctionCup,p,n
610,1611435,swappable_vs_25_nr,0.638539,794
608,1611435,swappable_bgx_48,0.555985,259
609,1611435,swappable_vs_18_nr,0.670455,88


## Find SKUs that have enough sample points to be compared in AB test.

In [74]:
from ABTestToolKit import get_sample_size


alpha = 0.05
power = 0.8
# One tuple per (SKU, experiment cup) that has enough attempts on both sides:
#   (sku, n_ctl, n_exp, baseline, p_exp, dmin)
# NOTE(review): the tuple does not record which cups were compared.
eligible_combo = []


for sku in df_filtered['sku'].unique():
    df_sku = df_filtered[df_filtered['sku'] == sku]
    p_values = df_sku['p'].values
    n_values = df_sku['n'].values
    if len(p_values) < 2:
        # a single cup has nothing to be compared against
        continue

    # Rank rows by success rate, best first. Python's sort is stable, so cups
    # with identical rates keep their current order and every row is used
    # exactly once; selecting rows by `p == value` would pick the same row
    # twice (or the control itself) whenever two cups tie.
    ranking = sorted(range(len(p_values)), key=lambda pos: -p_values[pos])
    ctl_pos, exp_positions = ranking[0], ranking[1:]

    # control = cup with the highest success rate
    n_ctl = n_values[ctl_pos]
    baseline = p_values[ctl_pos]
    dmin = baseline*0.15

    # the required sample size depends only on the control, so compute it once per SKU
    try:
        n = get_sample_size(alpha, power, dmin, baseline)
    except Exception:
        # the exception types raised by ABTestToolKit are not visible here, hence the broad catch
        print(f"sku {sku} couldn't be measured with bl {baseline} and dmin {dmin}")
        continue

    # every other cup is an experiment candidate, from second-best downwards
    for exp_pos in exp_positions:
        n_exp = n_values[exp_pos]
        p_exp = p_values[exp_pos]

        # add eligible skus to the list
        if n_ctl >= n and n_exp >= n:
            print(f"sku {sku} can be measured with bl {baseline}, dmin {dmin} and n {n} ")
            eligible_combo.append((sku, n_ctl, n_exp, baseline, p_exp, dmin))

sku 512663 can be measured with bl 1.0, dmin 0.15 and n 38 
sku 24388284 can be measured with bl 0.9487603305785124, dmin 0.14231404958677685 and n 65 
sku 24430099 can be measured with bl 0.7611940298507462, dmin 0.11417910447761193 and n 197 
sku 24430096 can be measured with bl 0.9690721649484536, dmin 0.14536082474226802 and n 54 
sku 737194 can be measured with bl 0.9858156028368794, dmin 0.1478723404255319 and n 45 
sku 2498462 can be measured with bl 0.9701897018970189, dmin 0.14552845528455283 and n 53 
sku 24460346 can be measured with bl 0.9465408805031447, dmin 0.1419811320754717 and n 66 
sku 487129 can be measured with bl 0.828125, dmin 0.12421874999999999 and n 143 
sku 633858 can be measured with bl 0.9629629629629629, dmin 0.14444444444444443 and n 57 
sku 918408 can be measured with bl 0.9866666666666667, dmin 0.148 and n 44 
sku 483018 can be measured with bl 0.9512195121951219, dmin 0.14268292682926828 and n 64 
sku 938560 can be measured with bl 0.8823529411764706, 

In [75]:
eligible_combo

[('512663', 300, 1067, 1.0, 0.9971883786316776, 0.15),
 ('24388284',
  605,
  965,
  0.9487603305785124,
  0.8756476683937824,
  0.14231404958677685),
 ('24430099',
  536,
  220,
  0.7611940298507462,
  0.5545454545454546,
  0.11417910447761193),
 ('24430096',
  97,
  535,
  0.9690721649484536,
  0.8785046728971962,
  0.14536082474226802),
 ('737194',
  141,
  375,
  0.9858156028368794,
  0.9733333333333334,
  0.1478723404255319),
 ('2498462',
  369,
  74,
  0.9701897018970189,
  0.9054054054054054,
  0.14552845528455283),
 ('24460346',
  318,
  81,
  0.9465408805031447,
  0.9382716049382716,
  0.1419811320754717),
 ('487129', 192, 194, 0.828125, 0.8195876288659794, 0.12421874999999999),
 ('633858',
  189,
  74,
  0.9629629629629629,
  0.8378378378378378,
  0.14444444444444443),
 ('918408', 75, 181, 0.9866666666666667, 0.9668508287292817, 0.148),
 ('483018',
  123,
  170,
  0.9512195121951219,
  0.9176470588235294,
  0.14268292682926828),
 ('938560',
  170,
  170,
  0.8823529411764706,

In [78]:
# show all eligible skus:
df_filtered[df_filtered['sku'].isin([x[0] for x in eligible_combo])]

Unnamed: 0,sku,suctionCup,p,n
5643,512663,swappable_vs_25_nr,0.997188,1067
2116,24388284,swappable_vs_25_nr,0.875648,965
2115,24388284,swappable_bgx_48,0.948760,605
2615,24430099,swappable_vs_25_nr,0.761194,536
2609,24430096,swappable_vs_25_nr,0.878505,535
...,...,...,...,...
3169,24517292,swappable_bgx_48,1.000000,2
1906,24376605,swappable_bgx_48,1.000000,2
4271,332865,swappable_bgx_48,1.000000,1
3689,2681262,swappable_bgx_48,1.000000,1


## Perform AB test on the SKUs with enough data points.


* Take the suction cup with higher success rate as the control group
* Take the suction cup with lower success rate as the experiment group
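* Perform a two-sample Z-test (alpha = 0.05) on the two success rates; the 15% relative difference is the minimum detectable effect that was used to size the samples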
* If the result is significant, check how many failed attempts can be converted into success.

In [80]:
from ABTestToolKit import get_two_sample_Z_test

# (sku, n_exp, n_convert) for every combination whose Z-test p-value is below alpha
significants = []
alpha = 0.05

for combo in eligible_combo:
    # tuple layout: (sku, n_ctl, n_exp, baseline, p_exp, dmin); dmin is not needed here
    sku, n_ctl, n_exp, baseline, p_exp = combo[:5]

    # The rates were computed as successes / attempts, so n * rate is a whole
    # number up to float noise; rounding recovers the exact success counts.
    x_ctl = int(round(n_ctl * baseline))
    x_exp = int(round(n_exp * p_exp))

    # ab test
    # NOTE(review): the recorded output shows a warning at `Z = d/se_pool`;
    # presumably the pooled standard error is zero for some pairs — confirm in ABTestToolKit.
    _, p, _, _, _ = get_two_sample_Z_test(n_ctl, x_ctl, n_exp, x_exp, alpha)

    if p < alpha:
        # Extra successes the experiment cup would have had at the control's
        # rate: floor(n_exp * x_ctl / n_ctl) - x_exp, in exact integer
        # arithmetic so float noise cannot shift the result by one.
        n_convert = int(n_exp) * x_ctl // int(n_ctl) - x_exp
        significants.append((sku, n_exp, n_convert))
        print(f"significant sku {sku}, can convert {n_convert} from failure to success")

control mean: 1.0
experiment mean: 0.9971883786316776
The confidence interval for the difference is [-0.0022, 0.0028, 0.0078]
statistics: 0.919424594997516
The p value: 0.3578735297045885
control mean: 0.9487603305785124
experiment mean: 0.8756476683937824
The confidence interval for the difference is [0.0480, 0.0731, 0.0983]
statistics: 4.781939713261551
The p value: 1.7361175121966e-06
significant sku 24388284, can convert 70 from failure to success
control mean: 0.7611940298507462
experiment mean: 0.5545454545454546
The confidence interval for the difference is [0.1464, 0.2066, 0.2669]
statistics: 5.637608251046453
The p value: 1.7242821392926544e-08
significant sku 24430099, can convert 45 from failure to success
control mean: 0.9690721649484536
experiment mean: 0.8785046728971962
The confidence interval for the difference is [0.0343, 0.0906, 0.1468]
statistics: 2.6484978857889914
The p value: 0.00808503549455164
significant sku 24430096, can convert 48 from failure to success
cont

  Z = d/se_pool


In [81]:
significants

[('24388284', 965, 70),
 ('24430099', 220, 45),
 ('24430096', 535, 48),
 ('2498462', 74, 4),
 ('633858', 74, 9),
 ('220061', 126, 28),
 ('123729', 84, 8),
 ('867590', 45, 4),
 ('24402489', 69, 7),
 ('807969', 56, 9),
 ('2126870', 48, 10)]

In [83]:
df_filtered[df_filtered['sku'] == '220061']

Unnamed: 0,sku,suctionCup,p,n
1391,220061,swappable_vs_18_nr,0.714286,126
1390,220061,swappable_bgx_48,0.942308,104
