In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
# Load the tidy train/test tables; the first CSV column is used as the row index.
train = pd.read_csv("../tidy_data/train.csv", index_col=0)
test = pd.read_csv("../tidy_data/test.csv", index_col=0)

  mask |= (ar1 == a)


# From tidy data to clean data - train and validation

In [16]:
# Class balance of the target column.
train["went_on_backorder"].value_counts()

0    1521323
1      10396
Name: went_on_backorder, dtype: int64

In [17]:
# Fraction of missing values per column.
train.isna().mean()

national_inv         0.000000
lead_time            0.059976
in_transit_qty       0.000000
min_bank             0.000000
potential_issue      0.000000
pieces_past_due      0.000000
local_bo_qty         0.000000
deck_risk            0.000000
oe_constraint        0.000000
ppap_risk            0.000000
stop_auto_buy        0.000000
rev_stop             0.000000
forecast_3_month     0.000000
forecast_6_month     0.042786
forecast_9_month     0.064662
perf_12_month_avg    0.044718
perf_6_month_avg     0.033780
sales_1_month        0.004986
sales_3_month        0.042786
sales_6_month        0.054690
sales_9_month        0.009972
went_on_backorder    0.000000
dtype: float64

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
train.describe()

Unnamed: 0,national_inv,lead_time,in_transit_qty,min_bank,potential_issue,pieces_past_due,local_bo_qty,deck_risk,oe_constraint,ppap_risk,...,forecast_3_month,forecast_6_month,forecast_9_month,perf_12_month_avg,perf_6_month_avg,sales_1_month,sales_3_month,sales_6_month,sales_9_month,went_on_backorder
count,1531719.0,1439852.0,1531719.0,1531719.0,1531719.0,1531719.0,1531719.0,1531719.0,1531719.0,1531719.0,...,1531719.0,1466183.0,1432675.0,1463223.0,1479977.0,1524082.0,1466183.0,1447949.0,1516445.0,1531719.0
mean,501.9623,7.877446,43.31803,53.02931,0.0005157604,2.031273,0.6386348,0.2256876,0.000155381,0.120498,...,178.6599,347.5876,507.1983,-6.456384,-6.901722,55.7932,174.4999,342.6361,523.8428,0.006787146
std,30857.72,7.047404,1359.49,1329.703,0.02270451,234.2047,33.93928,0.4180345,0.01246423,0.325543,...,5291.142,10275.04,14815.04,25.87365,26.60285,1980.1,5265.509,9826.218,14919.68,0.08210411
min,-27256.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.66,0.63,0.0,0.0,0.0,0.0,0.0
50%,15.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.81,0.82,0.0,1.0,2.0,4.0,0.0
75%,80.0,9.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,12.0,20.0,0.95,0.96,4.0,15.0,31.0,46.0,0.0
max,12334400.0,52.0,489408.0,313319.0,1.0,146496.0,10024.0,1.0,1.0,1.0,...,1510592.0,2461360.0,3777304.0,1.0,1.0,741774.0,1104181.0,2146625.0,3201035.0,1.0


The data, although tidy, has three problems: 
1. It is strongly imbalanced: roughly 150 negative rows for every positive one
2. Some columns contain NaNs
3. The values are not standardized (this will be taken care of in the sklearn pipelines)

This is my proposed solution for the train set: 
1. Split into stratified train and validation sets
2. Impute missing values with the median
3. Undersample the most common class, keeping about a third of it
4. Oversample the least common class with SMOTE until both classes are balanced

In [19]:
# Step 1. Split into stratified train and validation sets
from sklearn.model_selection import train_test_split

# Use the `columns=` keyword: passing the axis positionally to DataFrame.drop
# was deprecated in pandas 1.x and raises a TypeError from pandas 2.0 on.
X = train.drop(columns="went_on_backorder")
y = train["went_on_backorder"]

# Stratify on the target so both splits keep the same backorder rate;
# the fixed random_state makes the 70/30 split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
y_train.value_counts()

0    1064926
1       7277
Name: went_on_backorder, dtype: int64

In [20]:
# Step 2. Impute missing values with the per-column medians of the training split.
# The medians are learned from X_train only and then reused for X_val, so both
# splits receive the same preprocessing and no validation statistics leak into
# the evaluation.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

In [21]:
# Step 3. Undersample the most common class, keeping ~1/3 of it (only on train)
# Re-attach the target so rows can be filtered by class.
tmp = pd.concat([X_train, y_train], axis=1)

# Keep every positive row; keep a reproducible 33% sample of the negative rows.
tr1 = tmp[tmp["went_on_backorder"] == 1]
tr2 = tmp[tmp["went_on_backorder"] == 0].sample(frac=0.33, random_state=42)

reduced = pd.concat([tr1, tr2], axis=0)

# Use the `columns=` keyword: passing the axis positionally to DataFrame.drop
# was deprecated in pandas 1.x and raises a TypeError from pandas 2.0 on.
X_train_under = reduced.drop(columns="went_on_backorder")
y_train_under = reduced["went_on_backorder"]

y_train_under.value_counts()

0    351426
1      7277
Name: went_on_backorder, dtype: int64

In [22]:
# Step 4. Oversample the least common class with SMOTE until both classes are balanced
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
# `fit_resample` replaces `fit_sample`, which was deprecated and later removed
# from imbalanced-learn.
X_train_under_over, y_train_under_over = sm.fit_resample(X_train_under, y_train_under)

# Older imbalanced-learn versions return NumPy arrays; wrapping restores the
# column labels and the target name (a no-op relabel when pandas objects are
# already returned).
X_train_under_over = pd.DataFrame(X_train_under_over, columns=X_train_under.columns)
# Name the Series exactly like the target column used everywhere else.
y_train_under_over = pd.Series(y_train_under_over, name="went_on_backorder")

y_train_under_over.value_counts()

1    351426
0    351426
Name: went_on_back_order, dtype: int64

In [28]:
# Shape of X_train (the imputed split; the resampled data lives in X_train_under_over).
X_train.shape

(1072203, 21)

In [29]:
# Shape of y_train (the resampled target lives in y_train_under_over).
y_train.shape

(1072203,)

We now have a balanced training set (`X_train_under_over`, `y_train_under_over`) with no remaining `NaN`s.

# From tidy data to clean data - test

In [23]:
# Number of missing values per column in the test set.
test.isna().sum()

national_inv           0
lead_time            280
in_transit_qty         0
min_bank               0
potential_issue        0
pieces_past_due        0
local_bo_qty           0
deck_risk              0
oe_constraint          0
ppap_risk              0
stop_auto_buy          0
rev_stop               0
forecast_3_month       0
forecast_6_month       0
forecast_9_month       0
perf_12_month_avg      0
perf_6_month_avg       0
sales_1_month          0
sales_3_month          0
sales_6_month          0
sales_9_month          0
dtype: int64

In [24]:
# Distribution of the only test column that has missing values.
test["lead_time"].describe()

count    4720.000000
mean        7.712076
std         6.547360
min         0.000000
25%         4.000000
50%         8.000000
75%         9.000000
max        52.000000
Name: lead_time, dtype: float64

We can see that `lead_time` is missing in 280 of the 5,000 test rows (about 5.6%). The `describe()` output also shows a long right tail: the 75th percentile is 9 while the maximum is 52. Because the mean is sensitive to such extreme values, I chose to impute the missing values with the median rather than the mean.

In [25]:
# Fill the gaps in lead_time with the median of the observed test values.
lead_time_median = test["lead_time"].median()
test["lead_time"] = test["lead_time"].fillna(lead_time_median)

# Export the clean dataset

In [26]:
# Write every cleaned table to ../clean_data/, in the same order as before.
# NOTE(review): this exports X_train / y_train (imputed only). The resampled
# X_train_under_over / y_train_under_over are not written anywhere -- confirm
# that this is intended.
exports = [
    (X_train, "../clean_data/X_train.csv"),
    (X_val, "../clean_data/X_val.csv"),
    (y_train, "../clean_data/y_train.csv"),
    (y_val, "../clean_data/y_val.csv"),
    (test, "../clean_data/test.csv"),
]
for table, destination in exports:
    table.to_csv(destination)