# **KPI ANALYSIS - "PICK_DATA"**

## Packages

### Installing packages

In [None]:
# Optional one-off setup: uncomment the lines below to install the third-party
# packages this notebook imports (kept commented so a normal run does not reinstall).
# !pip3 install numpy
# !pip3 install --upgrade pip
# !pip3 install pandas
# !pip3 install matplotlib
# !pip3 install seaborn

### Importing packages

In [1]:
import time
import random
import numpy as np # type: ignore
import pandas as pd # type: ignore
import seaborn as sns # type: ignore
from datetime import datetime
import matplotlib.pyplot as plt # type: ignore
from itertools import combinations
from IPython.display import clear_output # type: ignore

## Functions

Function that returns an overview of the **pick_data** based on all columns

In [2]:
def general_overview(pick_data):
    """Print a column-by-column overview of the pick data.

    The report contains the row count, the number of distinct values of the
    identifier-like columns, the min/max of ``pick_volume`` and the first/last
    ``date_time``.

    Distinct values are counted with ``Series.nunique(dropna=False)``: a missing
    value counts as one distinct value. Building a Python ``set`` from the raw
    values would count every NaN of a float column separately and is far slower
    on large frames.

    Parameters
    ----------
    pick_data : pandas.DataFrame
        Must provide the columns ``product_id``, ``category``,
        ``source_order_id``, ``order_number``, ``position_in_order``,
        ``pick_volume``, ``quantity_unit`` and ``date_time``.

    Returns
    -------
    None
        The overview is written to standard output.
    """
    # columns reported before pick_volume, in display order
    leading_columns = (
        "product_id",
        "category",
        "source_order_id",
        "order_number",
        "position_in_order",
    )

    # get the number of distinct values for each reported column
    unique_counts = {
        column: pick_data[column].nunique(dropna=False)
        for column in leading_columns + ("quantity_unit",)
    }

    # print the results for a better understanding
    print("The length of the data is: ", len(pick_data))
    print("-----")
    print("-----")
    for column in leading_columns:
        print(column)
        print(f"The number of unique {column} is: ", unique_counts[column])
        print("-----")
    print("pick_volume")
    print("The min value of pick_volume is: ", pick_data["pick_volume"].min())
    print("The max value of pick_volume is: ", pick_data["pick_volume"].max())
    print("-----")
    print("quantity_unit")
    print("The number of unique quantity_unit is: ", unique_counts["quantity_unit"])
    print("-----")
    print("date_time")
    print("The first date_time of the picks_data is: ", pick_data["date_time"].min())
    print("The last date_time of the picks_data is: ", pick_data["date_time"].max())

Function that returns every row matching a given row on a selection of columns (i.e. the row together with its duplicates)

In [3]:
def check_duplicates_row_columns(pick_data, index_to_check, columns_to_evaluate):
    """Return every row that equals a reference row on the selected columns.

    Missing values are treated as equal to each other (the same convention as
    ``DataFrame.duplicated``), so a reference row containing NaN/NaT is still
    returned together with its duplicates instead of matching nothing.

    Parameters
    ----------
    pick_data : pandas.DataFrame
        Frame to search.
    index_to_check : label
        Index label of the reference row (looked up with ``.loc``).
    columns_to_evaluate : str or list-like of str
        Column(s) compared against the reference row. A single column name is
        accepted as well as a list of names.

    Returns
    -------
    pandas.DataFrame
        The rows of ``pick_data`` (all columns, reference row included) whose
        values agree with the reference row on every evaluated column.
    """
    # normalise to a list so the selection below is always a DataFrame
    if isinstance(columns_to_evaluate, str):
        columns_to_evaluate = [columns_to_evaluate]
    else:
        columns_to_evaluate = list(columns_to_evaluate)

    candidate_values = pick_data[columns_to_evaluate]
    reference_row = pick_data.loc[index_to_check, columns_to_evaluate]

    # plain equality is False for NaN == NaN, so missing-vs-missing is matched separately
    same_value = candidate_values.eq(reference_row)
    both_missing = candidate_values.isna() & reference_row.isna()

    matching_rows = pick_data[(same_value | both_missing).all(axis=1)]

    return matching_rows

Function that checks if integer list is sequential

In [4]:
def is_sequential(int_list):
    """Tell whether the values form an unbroken run of consecutive integers.

    The input is sorted first, so its original order is irrelevant. An empty
    or single-element input is sequential; repeated values are not.

    Parameters
    ----------
    int_list : iterable of int
        Values to inspect.

    Returns
    -------
    bool
        True when each sorted value is exactly one more than the previous one.
    """
    ordered = sorted(int_list)

    # pair every value with its successor and require a step of exactly 1
    return all(
        following == current + 1
        for current, following in zip(ordered, ordered[1:])
    )

Function that counts the gaps in an integer list, i.e. how many neighbouring values (after sorting) do not differ by exactly 1

In [5]:
def count_non_sequential(int_list):
    """Count the breaks in a sequence of integers.

    The input is sorted first; every neighbouring pair whose difference is not
    exactly 1 (a gap or a repeated value) adds one to the count.

    Parameters
    ----------
    int_list : iterable of int
        Values to inspect.

    Returns
    -------
    int
        Number of neighbouring pairs that are not consecutive; 0 for an empty
        or single-element input.
    """
    ordered = sorted(int_list)

    # one hit per neighbouring pair that is not a +1 step
    return sum(
        1
        for current, following in zip(ordered, ordered[1:])
        if following != current + 1
    )

## Importing data

### Importing data

In [6]:
# Read the pick_data CSV file.
# product_id is forced to text: on a file this large pandas infers dtypes chunk by
# chunk, which can leave the column holding a mix of ints and strings (and raises a
# DtypeWarning), so the same id could be counted as two different products.
# NOTE(review): this is the same path the "Upload" step writes to at the end of the
# notebook, so a rerun loads the already-processed output - confirm the raw input file.
pick_data = pd.read_csv("pick_data_upload_2.csv", dtype={"product_id": str})

  pick_data = pd.read_csv("pick_data_upload_2.csv")


Head of data

In [7]:
# Preview the first rows to confirm the columns were parsed as expected
pick_data.head()

Unnamed: 0,product_id,category,source_order_id,order_number,position_in_order,pick_volume,quantity_unit,date_time,flag_6,flag_7
0,7,AKL,48,20188105006,2,4,St,2018-01-16 03:05:21,0,0
1,10,HRL,48,20188197426,1,24,St,2018-01-30 10:52:31,0,0
2,10,HRL,48,20188197426,1,24,St,2018-01-30 10:58:08,0,0
3,10,HRL,48,20188197426,1,12,St,2018-01-30 12:03:18,0,0
4,10,HRL,48,20188197426,1,24,St,2018-01-30 12:06:39,0,0


In [8]:
# Summarise row count, column dtypes and memory footprint of the loaded frame
pick_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10118254 entries, 0 to 10118253
Data columns (total 10 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   product_id         object
 1   category           object
 2   source_order_id    int64 
 3   order_number       int64 
 4   position_in_order  int64 
 5   pick_volume        int64 
 6   quantity_unit      object
 7   date_time          object
 8   flag_6             int64 
 9   flag_7             int64 
dtypes: int64(6), object(4)
memory usage: 772.0+ MB


## Formatting columns

### Flag Columns

Flag 8 - Outlier column

In [33]:
# Cast flag_8 to an integer dtype before the flag filters below compare it with 0/1.
# NOTE(review): flag_8 is not among the columns listed by the info() output of the
# load cell above (only flag_6 and flag_7 appear) - confirm the input file provides it.
pick_data["flag_8"] = pick_data["flag_8"].astype(int)

In [None]:
# Preview the frame after casting flag_8
pick_data.head()

In [None]:
# flag_8 should now be listed with an integer dtype
pick_data.info()

### Convert `date_time` to datetime format

In [36]:
# Parse the date_time strings (loaded as object dtype) into pandas datetimes;
# the year extraction and the date filter further down rely on this conversion.
pick_data['date_time'] = pd.to_datetime(pick_data['date_time'])

In [None]:
# date_time should now be reported as a datetime dtype instead of object
pick_data.info()

### Order number

Add the year to the order number column

In [41]:
# Build the new order_number as text: year of the pick followed by the old number.
# NOTE: not idempotent - running this cell twice prepends the year a second time.
pick_data["order_number"] = (
    pick_data["date_time"].dt.year.astype(str)
    + pick_data["order_number"].astype(str)
)

In [None]:
# order_number should now carry the year of date_time as a prefix
pick_data.head()

## Filtering Dataframe

### Filtering years

We only need picks from 2018 onwards (`date_time` >= 2018-01-01)

In [43]:
# Keep only the picks dated 2018-01-01 or later
pick_data = pick_data.loc[pick_data["date_time"].ge("2018-01-01")]

In [None]:
# Row count after keeping only picks dated 2018-01-01 or later
pick_data.info()

### Flag filtering

#### Flag (2): one single order_number has different source_order_id values

In [None]:
# Number of picks marked with flag_2 (an order_number with several source_order_id values)
int(pick_data["flag_2"].eq(1).sum())

In [45]:
# Keep only the picks that are not marked with flag_2
pick_data = pick_data.loc[pick_data["flag_2"].eq(0)]

In [None]:
# Row count after removing the flag_2 picks
pick_data.info()

> No picks dropped

#### Flag (3): Some product_id values are measured in different quantity_unit values

In [None]:
# Number of picks marked with flag_3 (product_id measured in different quantity_unit values)
int(pick_data["flag_3"].eq(1).sum())

In [49]:
# Keep only the picks that are not marked with flag_3
pick_data = pick_data.loc[pick_data["flag_3"].eq(0)]

In [None]:
# Row count after removing the flag_3 picks
pick_data.info()

> 2,685 picks dropped

#### Flag (4): Duplicates taking into account all columns (perfect duplicates)

In [None]:
# Number of picks marked with flag_4 (duplicates across all columns)
int(pick_data["flag_4"].eq(1).sum())

In [53]:
# Keep only the picks that are not marked with flag_4
pick_data = pick_data.loc[pick_data["flag_4"].eq(0)]

In [None]:
# Row count after removing the flag_4 picks
pick_data.info()

> 3,131 picks dropped

#### Flag (8): Outliers in pick_volume

In [None]:
# Number of picks marked with flag_8 (outliers in pick_volume)
int(pick_data["flag_8"].eq(1).sum())

In [56]:
# Keep only the picks that are not marked with flag_8
pick_data = pick_data.loc[pick_data["flag_8"].eq(0)]

In [None]:
# Row count after removing the flag_8 picks
pick_data.info()

> 194,419 picks dropped

### Format Dataframe

#### Reset index rows

In [58]:
# Renumber the rows 0..n-1 after the filters above; the old index is discarded
pick_data = pick_data.reset_index(drop=True)

#### Drop columns

In [59]:
# Remove the flag columns listed below; any other column is kept
pick_data = pick_data.drop(
    ["flag_1", "flag_2", "flag_3", "flag_4", "flag_5", "flag_8"],
    axis="columns",
)

In [None]:
# Confirm the dropped flag columns no longer appear
pick_data.info()

## Upload

Write csv file

In [61]:
# Persist the cleaned picks without the index column.
# NOTE(review): this overwrites the very file the "Importing data" cell reads, so a
# Restart & Run All starts from already-filtered data (flag columns dropped, year
# already prefixed to order_number) - consider a distinct output filename.
pick_data.to_csv("pick_data_upload_2.csv", index=False)

Check that the CSV file was written correctly

In [None]:
# Read the exported file back and verify the round trip before previewing it.
# dtypes are re-inferred on read (e.g. date_time comes back as text), so only the
# number of rows and columns is compared with the in-memory frame.
pick_dataest = pd.read_csv("pick_data_upload_2.csv")
assert pick_dataest.shape == pick_data.shape, "exported CSV does not match pick_data in rows/columns"
pick_dataest.head()