# Tasking Data Processing
To pull from Splunk:

```
index="" host="" sourcetype=""
| search "Received Status Update"
| rex field=_raw "itemIds:\[(?<skuNo>\d+)\]"
| rex field=_raw "pickLocationEmptyConfidence:.*?statusCode:(?<statusCode>\d+), statusDetail:,"
| rex field=_raw "customerTaskId:(?<taskID>\w+), itemIds"
| rex field=_raw "customerTaskId:\d{3}_(?<stationID>\d+)_"
| rex field=_raw "\"suction_cup_name\":\"(?<suctionCupName>\w+)\""
| rex field=_raw "commandUtime:(?<ctime>\d+)\}"
| eval commandTime = strftime(ctime/1000000,"%Y-%m-%d %H:%M:%S")
| dedup skuNo, statusCode, taskID, ctime keepempty=true consecutive=false
| table skuNo, statusCode, stationID, taskID, suctionCupName, ctime, commandTime
```

# Process the raw tasking data
* Read the data from csv
* When a record has no suction cup, fill it in from the most recent earlier record that reported one
* Remove the records in the UPC error and pause states

In [107]:
import csv
import os
import pathlib
import datetime
# CSV file holding the raw tasking data; it is looked up in the current working directory.
fname = 'rhr_tasks_0201_0727.csv'
fpath = pathlib.Path(os.getcwd()).joinpath(fname)

In [108]:
class Task:
    """One status-update record for a task, built from one row of the tasking CSV.

    Attributes:
        sku: SKU number, stored as given.
        status: status code, stored as given.
        stationId: station identifier, stored as given.
        taskId: customer task identifier, stored as given.
        suctionCup: suction cup name, stored as given (may be '' or None
            when the record does not carry one).
        ctime: command timestamp converted to int. The Splunk query divides
            it by 1,000,000 to obtain seconds.
        commandTime: ``datetime.datetime`` parsed from a
            'YYYY-MM-DD HH:MM:SS' string.
    """

    def __init__(self, sku, status, stationId, taskId, suctionCup, ctime, commandTime):
        """Store the fields, converting ``ctime`` to int and ``commandTime`` to datetime.

        Raises:
            ValueError: if ``ctime`` is not an integer literal or
                ``commandTime`` does not match '%Y-%m-%d %H:%M:%S'.
        """
        self.sku = sku
        self.status = status
        self.stationId = stationId
        self.taskId = taskId
        self.suctionCup = suctionCup
        self.ctime = int(ctime)
        self.commandTime = datetime.datetime.strptime(commandTime, '%Y-%m-%d %H:%M:%S')

    def __str__(self):
        """Return all attributes as comma-separated ``name:value`` pairs."""
        return ','.join([k+':'+str(v) for k, v in vars(self).items()])

In [109]:
# Load every CSV row into a Task. Rows that cannot be parsed are printed and skipped.
raws = []
# newline='' is what the csv module expects so it can handle line endings itself.
with open(fpath, 'r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        # csv.reader yields [] for a blank line; also skip rows whose first column (SKU) is empty.
        if not row or row[0] == '':
            continue
        try:
            raws.append(Task(row[0], row[1], row[2], row[3], row[4], row[5], row[6]))
        except (IndexError, ValueError):
            # IndexError: fewer than 7 columns. ValueError: ctime / commandTime not parseable
            # (this is also how the header row gets rejected). Show the row so it can be inspected.
            print(row)

['skuNo', 'statusCode', 'stationID', 'taskID', 'suctionCupName', 'ctime', 'commandTime']


In [110]:
# sort raw
def sort_raw(raw:Task):
    """Return the sort key of a task: its station id first, then its ctime."""
    station, timestamp = raw.stationId, raw.ctime
    return station, timestamp
# Sort a copy by that key and rebind the name to the sorted copy.
raws = list(raws)
raws.sort(key=sort_raw)

In [111]:
len(raws)

246381

In [112]:
[print(x) for x in raws[0:5]]

sku:24422012,status:200302,stationId:44,taskId:173_44_13749414_18302461_1_12846565_23225_1375703_10_362756_10_10,suctionCup:,ctime:1682515004386425,commandTime:2023-04-26 09:16:44
sku:24422012,status:200399,stationId:44,taskId:173_44_13749414_18302461_1_12846565_23225_1375703_10_362756_10_10,suctionCup:,ctime:1682515020261225,commandTime:2023-04-26 09:17:00
sku:24422012,status:200302,stationId:44,taskId:173_44_13749282_18302462_1_12846565_23225_1375703_10_362744_1_1,suctionCup:swappable_bgx_48,ctime:1682515030259747,commandTime:2023-04-26 09:17:10
sku:24422012,status:200399,stationId:44,taskId:173_44_13749282_18302462_1_12846565_23225_1375703_10_362744_1_1,suctionCup:,ctime:1682515034696349,commandTime:2023-04-26 09:17:14
sku:922552,status:200399,stationId:44,taskId:173_44_13751067_18303682_1_12847839_32996_1344239_1_362848_1_1,suctionCup:,ctime:1682517156754523,commandTime:2023-04-26 09:52:36


[None, None, None, None, None]

In [113]:
# remove pause and UPC error tasks and take the suction cup from previous tasks
# Status codes that are dropped from the analysis (the pause / UPC-error updates).
EXCLUDED_STATUSES = {'200302', '200451'}
tasks = []
suctionCup = None
# Last cup reported by each station. Tracking it per station keeps a cup seen on one
# station from being attributed to the first tasks of another station.
lastCupByStation = {}
for task in raws:
    # Every record (including the excluded ones) may report the cup currently in use.
    if task.suctionCup != '':
        lastCupByStation[task.stationId] = task.suctionCup
    if task.status not in EXCLUDED_STATUSES:
        # None when this station has not reported any cup yet.
        suctionCup = lastCupByStation.get(task.stationId)
        tasks.append(Task(task.sku, task.status, task.stationId, task.taskId, suctionCup, task.ctime, task.commandTime.strftime('%Y-%m-%d %H:%M:%S')))

In [114]:
len(tasks)

126664

In [115]:
[print(x) for x in tasks[0:10]]

sku:24422012,status:200399,stationId:44,taskId:173_44_13749414_18302461_1_12846565_23225_1375703_10_362756_10_10,suctionCup:None,ctime:1682515020261225,commandTime:2023-04-26 09:17:00
sku:24422012,status:200399,stationId:44,taskId:173_44_13749282_18302462_1_12846565_23225_1375703_10_362744_1_1,suctionCup:swappable_bgx_48,ctime:1682515034696349,commandTime:2023-04-26 09:17:14
sku:922552,status:200399,stationId:44,taskId:173_44_13751067_18303682_1_12847839_32996_1344239_1_362848_1_1,suctionCup:swappable_bgx_48,ctime:1682517156754523,commandTime:2023-04-26 09:52:36
sku:24337416,status:200399,stationId:44,taskId:173_44_13752600_18302627_1_12848946_9948_1351515_5_362931_2_2,suctionCup:swappable_vs_25_nr,ctime:1682517802553269,commandTime:2023-04-26 10:03:22
sku:24384000,status:200399,stationId:44,taskId:173_44_13751838_18296475_1_12848394_16248_1359033_1_362888_1_1,suctionCup:swappable_vs_25_nr,ctime:1682518046631026,commandTime:2023-04-26 10:07:26
sku:2145184,status:200399,stationId:44,tas

[None, None, None, None, None, None, None, None, None, None]

# Aggregate the data into SKU/CUP level for the AB test

First, calculate the success rate for each SKU and suction cup combination.

In [116]:
import pandas as pd

In [117]:
# One row per kept task; each Task attribute becomes a column.
df_tasks = pd.DataFrame([task.__dict__ for task in tasks])

In [118]:
df_tasks.head()

Unnamed: 0,sku,status,stationId,taskId,suctionCup,ctime,commandTime
0,24422012,200399,44,173_44_13749414_18302461_1_12846565_23225_1375...,,1682515020261225,2023-04-26 09:17:00
1,24422012,200399,44,173_44_13749282_18302462_1_12846565_23225_1375...,swappable_bgx_48,1682515034696349,2023-04-26 09:17:14
2,922552,200399,44,173_44_13751067_18303682_1_12847839_32996_1344...,swappable_bgx_48,1682517156754523,2023-04-26 09:52:36
3,24337416,200399,44,173_44_13752600_18302627_1_12848946_9948_13515...,swappable_vs_25_nr,1682517802553269,2023-04-26 10:03:22
4,24384000,200399,44,173_44_13751838_18296475_1_12848394_16248_1359...,swappable_vs_25_nr,1682518046631026,2023-04-26 10:07:26


In [119]:
# A task counts as a success when its status code is '200399' (vectorized comparison, no per-row lambda).
df_tasks['success'] = df_tasks['status'] == '200399'

In [120]:
df_tasks.sample(10)

Unnamed: 0,sku,status,stationId,taskId,suctionCup,ctime,commandTime,success
49380,24376646,200399,44,173_44_15138495_20257125_1_14203939_2066_14971...,swappable_vs_25_nr,1689078621447404,2023-07-11 08:30:21,True
22293,1480018,200399,44,173_44_14382397_19213733_1_13460429_10256_1359...,swappable_vs_25_nr,1685447085362748,2023-05-30 07:44:45,True
121801,808016,200450,45,174_45_15406300_20585229_1_14462912_13991_1547...,swappable_bgx_48,1689957154118237,2023-07-21 12:32:34,False
3160,166850,200399,44,173_44_13858877_18469489_1_12951234_27311_1387...,swappable_vs_25_nr,1683050051260560,2023-05-02 13:54:11,True
3710,646942,200399,44,173_44_13932963_18565080_1_13016560_6558_13904...,swappable_vs_25_nr,1683233440527625,2023-05-04 16:50:40,True
37920,2030289,200420,44,173_44_14806715_19815134_1_13881819_22936_1493...,swappable_bgx_48,1687351947648269,2023-06-21 08:52:27,False
112504,1182804,200399,45,174_45_15081057_20180735_1_14149767_18277_1514...,swappable_vs_25_nr,1688744839696555,2023-07-07 11:47:19,True
7085,24546238,200399,44,173_44_14036407_18716008_1_13121014_30893_1105...,swappable_bgx_48,1683736187328805,2023-05-10 12:29:47,True
93130,1981033,200399,45,174_45_14552570_19463591_1_13629246_31198_1463...,swappable_vs_25_nr,1686141707145008,2023-06-07 08:41:47,True
17679,917883,200422,44,173_44_14284598_19071806_1_13358458_4423_14349...,swappable_bgx_48,1684859155235337,2023-05-23 12:25:55,False


In [121]:
# Per (sku, suctionCup): p = share of successful tasks, n = number of tasks; largest groups first.
# 'mean' of the boolean success column is the success rate, computed by pandas' built-in
# aggregation instead of a Python lambda called once per group.
# Note: groupby leaves out rows whose suctionCup is missing (dropna defaults to True).
df = (
    df_tasks
    .groupby(['sku', 'suctionCup'], sort=True)
    .agg(
        p=pd.NamedAgg(column='success', aggfunc='mean'),
        n=pd.NamedAgg(column='success', aggfunc='count'),
    )
    .reset_index()
    .sort_values(by='n', ascending=False)
)

In [122]:
df.head(20)

Unnamed: 0,sku,suctionCup,p,n
1637,24388284,swappable_vs_25_nr,0.901372,1531
1074,2145184,swappable_vs_25_nr,0.940319,1441
919,2030289,swappable_bgx_48,0.886727,1439
1631,24388084,swappable_bgx_48,0.929608,1378
4967,650499,swappable_vs_25_nr,0.830797,1117
6052,867474,swappable_bgx_48,0.520188,1065
4373,512663,swappable_vs_25_nr,0.997183,1065
6287,917881,swappable_bgx_48,0.929746,669
505,1611435,swappable_vs_25_nr,0.787136,653
6050,867473,swappable_bgx_48,0.687601,621


## Find the SKUs that were picked with more than one cup

In [123]:
# Keep only the SKU/cup combinations that have at least 20 tasks.
df_filtered = df.loc[df['n'] >= 20]

In [124]:
# Keep the SKUs that appear on more than one row (i.e. with more than one cup), ordered by SKU descending.
df_filtered = df_filtered.loc[df_filtered['sku'].duplicated(keep=False)].sort_values(by='sku', ascending=False)

In [125]:
# Keep the SKUs that appear on more than one row, this time ordered by task count, largest first.
df_filtered = df_filtered.loc[df_filtered['sku'].duplicated(keep=False)].sort_values(by='n', ascending=False)

In [126]:
df_filtered[0:30]

Unnamed: 0,sku,suctionCup,p,n
4373,512663,swappable_vs_25_nr,0.997183,1065
505,1611435,swappable_vs_25_nr,0.787136,653
2017,24430099,swappable_vs_25_nr,0.855932,472
2012,24430096,swappable_vs_25_nr,0.872941,425
264,1182803,swappable_vs_25_nr,0.842444,311
4372,512663,swappable_bgx_48,1.0,300
504,1611435,swappable_vs_18_nr,0.789256,242
4856,634797,swappable_vs_25_nr,0.982833,233
6006,863057,swappable_vs_25_nr,0.827907,215
2375,24502578,swappable_vs_25_nr,0.874396,207


### SKU 24430099
* baseline = 0.855932
* dmin = baseline*0.15
* alpha = 0.05
* power = 0.8

Size = 121

In [141]:
df_filtered[df_filtered['sku'] == '24430099']

Unnamed: 0,sku,suctionCup,p,n
2017,24430099,swappable_vs_25_nr,0.855932,472
2016,24430099,swappable_vs_18_nr,0.781955,133


### SKU 1611435
* baseline = 0.787136
* dmin = baseline*0.15
* alpha = 0.05
* power = 0.8

Size = 196

In [144]:
df_filtered[df_filtered['sku'] == '1611435']

Unnamed: 0,sku,suctionCup,p,n
505,1611435,swappable_vs_25_nr,0.787136,653
504,1611435,swappable_vs_18_nr,0.789256,242
