In [None]:
# default_exp data.clean_rows

# Clean dataset rows
> Drop rows without a `CustomerID`, remove duplicate rows, and reconcile order cancellations with their original purchases.

In [None]:
#hide
# Development convenience: load IPython's autoreload extension and use mode 2,
# which re-imports all modules before each cell is executed so that edits to
# project source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# export
import logging
import pandas as pd

In [None]:
# hide
# Configure the root logger at INFO level for this notebook session.
# NOTE(review): the cell is flagged `hide`, not `export`, so presumably this
# configuration is not part of the exported module — confirm.
logging.basicConfig(level=logging.INFO)

In [None]:
# export

def clean_drop_na(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return `df` without the rows whose `CustomerID` is missing.

    Only the `CustomerID` column is inspected; missing values in other
    columns do not cause a row to be dropped. The input frame is not modified.
    """
    required_columns = ['CustomerID']
    return df.dropna(axis=0, subset=required_columns)

def clean_drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return `df` with fully duplicated rows removed.

    Rows are compared across all columns and the first occurrence of each
    duplicate is kept. The input frame is not modified.
    """
    return df.drop_duplicates()

In [None]:
# export

def clean_remove_bad_order_cancellations(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reconcile cancellation rows with the purchases they cancel.

    A row is treated as a cancellation when its `Quantity` is not positive and
    its `Description` is not 'Discount'. For each cancellation, the candidate
    counterparts are the purchases (`Quantity` > 0) with the same `CustomerID`
    and `StockCode` and a strictly earlier `InvoiceDate`:

      * no candidate: the cancellation row is dropped;
      * exactly one candidate: the cancelled quantity is written to that
        purchase's `QuantityCanceled` (without checking that the purchase is
        large enough) and the cancellation row is dropped;
      * several candidates: they are scanned from the highest index label to
        the lowest, and the first one whose `Quantity` is at least the
        cancelled quantity receives it; the cancellation row is dropped.
        If no candidate is large enough, the cancellation row is left in
        the result unchanged.

    Returns a copy of `df` (the input is not modified) with two extra columns:
    `QuantityCanceled` and
    `TotalPrice = UnitPrice * (Quantity - QuantityCanceled)`.

    NOTE: this is not linear time; every cancellation is compared against all
    purchases, so the cost grows with (cancellations x purchases).
    NOTE: rows are matched and removed by index label.
    NOTE(review): when several cancellations resolve to the same purchase,
    `QuantityCanceled` is overwritten rather than accumulated — confirm that
    this is intended.
    """
    df_cleaned = df.copy(deep=True)
    df_cleaned['QuantityCanceled'] = 0

    entry_to_remove = []
    doubtfull_entry = []

    # Build the loop-invariant selections once instead of once per row.
    is_purchase = df['Quantity'] > 0
    purchases = df[is_purchase]
    cancellations = df[~(is_purchase | (df['Description'] == 'Discount'))]

    for index, col in cancellations.iterrows():
        canceled_quantity = -col['Quantity']
        df_test = purchases[(purchases['CustomerID'] == col['CustomerID']) &
                            (purchases['StockCode'] == col['StockCode']) &
                            (purchases['InvoiceDate'] < col['InvoiceDate'])]
        nb_counterparts = df_test.shape[0]

        # Cancellation WITHOUT counterpart
        if nb_counterparts == 0:
            doubtfull_entry.append(index)

        # Cancellation WITH a single counterpart
        elif nb_counterparts == 1:
            df_cleaned.loc[df_test.index[0], 'QuantityCanceled'] = canceled_quantity
            entry_to_remove.append(index)

        # Several counterparts: use the one with the highest index label
        # that is large enough to absorb the cancelled quantity.
        else:
            for ind, val in df_test.sort_index(axis=0, ascending=False).iterrows():
                if val['Quantity'] < canceled_quantity:
                    continue
                df_cleaned.loc[ind, 'QuantityCanceled'] = canceled_quantity
                entry_to_remove.append(index)
                break

    df_cleaned = df_cleaned.drop(entry_to_remove, axis=0)
    df_cleaned = df_cleaned.drop(doubtfull_entry, axis=0)

    df_cleaned['TotalPrice'] = df_cleaned['UnitPrice'] * (df_cleaned['Quantity'] - df_cleaned['QuantityCanceled'])

    return df_cleaned

In [None]:
# export

def clean_data_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full row-cleaning pipeline on `df` and return the result.

    The steps are applied in this order:
      1. `clean_drop_na`
      2. `clean_drop_duplicates`
      3. `clean_remove_bad_order_cancellations`
    """
    cleaning_steps = (
        clean_drop_na,
        clean_drop_duplicates,
        clean_remove_bad_order_cancellations,
    )
    cleaned = df
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

In [None]:
from featurologists.data.load_split import load_data_csv

# Try the cleaning pipeline on the first 1000 rows of the loaded CSV;
# the cleaned frame is the cell's displayed output.
df = load_data_csv('../data/output/01_data_split_offline_online/no_live_data.csv')
sample_rows = df[:1000]
clean_data_rows(sample_rows)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,QuantityCanceled,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,0,15.30
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,0,22.00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0,20.34
...,...,...,...,...,...,...,...,...,...,...
995,536520,22469,HEART OF WICKER SMALL,1,2010-12-01 12:43:00,1.65,14729,United Kingdom,0,1.65
996,536520,22100,SKULLS SQUARE TISSUE BOX,1,2010-12-01 12:43:00,1.25,14729,United Kingdom,0,1.25
997,536520,22096,PINK PAISLEY SQUARE TISSUE BOX,1,2010-12-01 12:43:00,1.25,14729,United Kingdom,0,1.25
998,536520,22583,PACK OF 6 HANDBAG GIFT BOXES,1,2010-12-01 12:43:00,2.55,14729,United Kingdom,0,2.55
