### Data Cleaning

### Steps: 
1. depot_delivered_qty_on_day remove where negative
2. closing_inventory_on_day remove where negative
3. waste_value_on_day remove where negative
4. Remove rows where (instore['stock_out_ind_on_day'] + instore['ranging_indicator_on_day']) != 1

In [1]:
import pandas as pd

In [2]:
fore = pd.read_csv('../data/Case Study - Forecast Data.txt', sep='\t', encoding='utf-16',
                  parse_dates=['calendar_date'])

In [3]:
depot = pd.read_csv('../data/Case Study - Depot Data.txt', sep='\t', encoding='utf-16',
                   parse_dates=['calendar_date'])

In [4]:
instore = pd.read_csv('../data/Case Study - In Store Data.txt', sep='\t', encoding='utf-16',
                     parse_dates=['calendar_date'])

In [5]:
cinv = pd.read_csv('../data/Case Study - Closing Inventory.txt', sep='\t', encoding='utf-16',
                   parse_dates=['calendar_date'])

#### 1. depot_delivered_qty_on_day remove where negative

In [6]:
depot_c = depot[~(depot['depot_delivered_qty_on_day'] < 0)]

#### 2. closing_inventory_on_day remove where negative


In [7]:
instore_c = instore[~(instore['closing_inventory_on_day'] < 0)]

#### 3. waste_value_on_day remove where negative

In [8]:
instore_c = instore_c[~(instore_c['waste_value_on_day'] < 0)]

#### 4. Remove rows where (instore['stock_out_ind_on_day'] + instore['ranging_indicator_on_day']) != 1

In [9]:
#(instore['stock_out_ind_on_day'] + instore['ranging_indicator_on_day']).isna().value_counts()

In [10]:
# Use instore_c
#instore_c = instore_c[(instore_c['stock_out_ind_on_day'] + instore_c['ranging_indicator_on_day'] == 1)]

#### 5. **Added:** Remove rows with too much missing data: stock_out_ind_on_day, ranging_indicator_on_day, closing_inventory_on_day

In [16]:
instore_c = instore_c[~(instore_c['stock_out_ind_on_day'].isna() | instore_c['ranging_indicator_on_day'].isna())]

In [17]:
fore = fore[~fore['forecast_demand_on_day'].isna()]

## Join data

**Note:** No need to use `Closing Inventory`

**Check mutual columns before joining to ensure clean joins**

In [18]:
# Are the consistent columns the same in depot and instore
mut_cols = list(depot.columns[depot.columns.isin(instore_c.columns)])
(depot[mut_cols] == instore[mut_cols]).apply(lambda x: x.value_counts()).transpose()
# or all(depot[mut_cols] == instore[mut_cols])
# yes

Unnamed: 0,True
upc,2648539
calendar_date,2648539
calendar_id,2648539
store_id,2648539
geography_id,2648539
shelf_life,2648539
units_per_tray,2648539


In [19]:
depot_cols = ['upc', 'calendar_date', 'store_id', 'geography_id', 'depot_delivered_qty_on_day',
       'depot_delivered_qty_over_minus_2_day',
       'depot_delivered_qty_over_shelf_life_plus_1',
       'depot_lvl_required_qty_over_supplier_lead_time',
       'depot_lvl_target_inventory_on_day', 'depot_ordered_qty_on_day',
       'depot_ordered_qty_over_minus_2_day',
       'depot_ordered_qty_over_shelf_life_plus_1',
       'depot_ordered_qty_over_supplier_lead_time', 'depot_store_id']

In [20]:
instorec_dep = instore_c.merge(depot[depot_cols], on=['upc', 'calendar_date', 'store_id', 'geography_id'],
                              how='inner')

In [21]:
instorec_dep.shape

(1756624, 36)

In [22]:
# Are the consistent columns the same in depot and instore
mut_cols = list(fore.columns[fore.columns.isin(instorec_dep.columns)])
#(fore[mut_cols] == instorec_dep[mut_cols]).apply(lambda x: x.value_counts()).transpose()
all(depot[mut_cols] == instore[mut_cols])
# yes - no oddities around shelf life and tray size

True

In [23]:
joined_data = instorec_dep.merge(fore.drop(columns=['calendar_id', 'shelf_life', 'units_per_tray'], axis=1),
                    on=['upc', 'calendar_date', 'store_id', 'geography_id'])

In [24]:
joined_data = joined_data.drop(['calendar_id'], axis=1)

### Write out joined data

In [25]:
assert joined_data.shape[0] < fore.shape[0]

In [27]:
joined_data.shape

(1750865, 62)

In [28]:
joined_data.to_csv('../data/cleaned_prep.csv', index=False)

**Note:** there may be some more to clean up...

In [30]:
joined_data['forecast_demand_on_day'].isna().value_counts()

False    1750865
Name: forecast_demand_on_day, dtype: int64