In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data Ingestion

In [2]:
df = pd.read_csv('/Users/iarv/Documents/Codes/backorder-prediction-old/data/raw/Kaggle_Training_Dataset_v2.csv')
print(df.shape)
df.head()

  df = pd.read_csv('/Users/iarv/Documents/Codes/backorder-prediction-old/data/raw/Kaggle_Training_Dataset_v2.csv')


(1687861, 23)


Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,1026827,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-99.0,-99.0,0.0,No,No,No,Yes,No,No
1,1043384,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.99,0.99,0.0,No,No,No,Yes,No,No
2,1043696,2.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-99.0,-99.0,0.0,Yes,No,No,Yes,No,No
3,1043852,7.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1,0.13,0.0,No,No,No,Yes,No,No
4,1044048,8.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-99.0,-99.0,0.0,Yes,No,No,Yes,No,No


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1687861 entries, 0 to 1687860
Data columns (total 23 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   sku                1687861 non-null  object 
 1   national_inv       1687860 non-null  float64
 2   lead_time          1586967 non-null  float64
 3   in_transit_qty     1687860 non-null  float64
 4   forecast_3_month   1687860 non-null  float64
 5   forecast_6_month   1687860 non-null  float64
 6   forecast_9_month   1687860 non-null  float64
 7   sales_1_month      1687860 non-null  float64
 8   sales_3_month      1687860 non-null  float64
 9   sales_6_month      1687860 non-null  float64
 10  sales_9_month      1687860 non-null  float64
 11  min_bank           1687860 non-null  float64
 12  potential_issue    1687860 non-null  object 
 13  pieces_past_due    1687860 non-null  float64
 14  perf_6_month_avg   1687860 non-null  float64
 15  perf_12_month_avg  1687860 non-n

## Data Cleaning

In [4]:
# sku column is irrelevent so it need to be dropped
df.drop(columns=['sku'], inplace=True)

In [5]:
df.isnull().sum()

national_inv              1
lead_time            100894
in_transit_qty            1
forecast_3_month          1
forecast_6_month          1
forecast_9_month          1
sales_1_month             1
sales_3_month             1
sales_6_month             1
sales_9_month             1
min_bank                  1
potential_issue           1
pieces_past_due           1
perf_6_month_avg          1
perf_12_month_avg         1
local_bo_qty              1
deck_risk                 1
oe_constraint             1
ppap_risk                 1
stop_auto_buy             1
rev_stop                  1
went_on_backorder         1
dtype: int64

In [6]:
# Dataset has 1 abnormal row : last row
df = df[:-1]

In [7]:
df.isnull().sum()

national_inv              0
lead_time            100893
in_transit_qty            0
forecast_3_month          0
forecast_6_month          0
forecast_9_month          0
sales_1_month             0
sales_3_month             0
sales_6_month             0
sales_9_month             0
min_bank                  0
potential_issue           0
pieces_past_due           0
perf_6_month_avg          0
perf_12_month_avg         0
local_bo_qty              0
deck_risk                 0
oe_constraint             0
ppap_risk                 0
stop_auto_buy             0
rev_stop                  0
went_on_backorder         0
dtype: int64

## Feature Engineering

In [8]:
target_name = 'went_on_backorder'
X, y = df.drop(target_name, axis=1), df[target_name]

obj_cols = X.select_dtypes('object').columns
num_cols = X.select_dtypes(['float64', 'int64']).columns

obj_cols, num_cols

(Index(['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk',
        'stop_auto_buy', 'rev_stop'],
       dtype='object'),
 Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
        'forecast_6_month', 'forecast_9_month', 'sales_1_month',
        'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
        'pieces_past_due', 'perf_6_month_avg', 'perf_12_month_avg',
        'local_bo_qty'],
       dtype='object'))

In [9]:
# plt.figure(figsize=(10, 10))
# sns.heatmap(df[num_cols].corr(), annot=True, fmt='.2f', square=True, cbar=False)

### Observation

- forcast_3_month, forcast_6_month, forcast_9_month are highly correlated.
- sales_3_month, sales_6_month, sales_9_month are highly correlated.

#### Take only 1 from each of the highly correlated features.

In [10]:
df['went_on_backorder'].value_counts(True) * 100

went_on_backorder
No     99.330928
Yes     0.669072
Name: proportion, dtype: float64

##### This is highly imbalanced dataset because it has only **0.67%** of data as labeled as **YES** and other **99.3%** data labeled as **NO**.

> I need to perform data upsampling or downsampling method to work with this dataset

## Requirements for model building for backorder dataset

0. Do all this with sklearn `Pipeline` and `ColumnTransformer` and then transform the data with this.
1. Impute the numerical values with `SimpleImputer(strategy='median')` and scale the numerical values with `MinMaxScaler()`.
2. Encode the categorical values with `OrdinalEncoder()` all the categorical feature has `['Yes', 'No']` category values only.
3. Peform **Up-Sampling** or **Down-Sampling** on the `went_on_backorder` target feature to balance the dataset.
4. Split the data with `train_test_split` and then transform the data with `ColumnTransformer` object which you previously build.
5. Perform `PCA()` on the dataset and select the **best 6-15 feature** of the dataset.
6. Build the model with ML Algorithms.

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

def backorder_model(df):
    # Create the pipeline for numerical features
    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    # Create the pipeline for categorical features
    categorical_transformer = Pipeline([
        ('encoder', OrdinalEncoder())
    ])

    # Combine the numerical and categorical pipelines
    # preprocessor = ColumnTransformer([
    #     ('numerical', numerical_transformer, ['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3m', 'forecast_6m', 'forecast_9m', 'sales_1m', 'sales_3m', 'sales_6m', 'sales_9m']),
    #     ('categorical', categorical_transformer, ['product_category', 'subcategory', 'brand'])
    # ])
    preprocessor = ColumnTransformer([
        ('numerical', numerical_transformer, num_cols),
        ('categorical', categorical_transformer, obj_cols)
    ])

    # Upsample the minority class
    majority = df[df['went_on_backorder'] == 'No']
    minority = df[df['went_on_backorder'] == 'Yes']
    upsampled_minority = resample(minority, replace=True, n_samples=len(majority), random_state=0)
    df = pd.concat([majority, upsampled_minority])

    # Tranform the data with preprocessor pipeline
    df = preprocessor.fit_transform(df)

    # Perform PCA and select the best 6-15 features
    pca = PCA(n_components=6)
    df = pca.fit_transform(df)

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(df, df['went_on_backorder'], test_size=0.25, random_state=0)

    # Build the logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Evaluate the model
    accuracy = accuracy_score(y_test, model.predict(X_test))
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print('Accuracy:', accuracy)
    print('ROC AUC:', roc_auc)

    return model
# model = backorder_model(df)


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Step 1: Create the pipeline
def get_transformer():
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['No', 'Yes']]*len(obj_cols)))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, obj_cols)
        ])
    return preprocessor

# Step 2: Upsample or Downsample the target variable
def balance_dataset(X, y):
    X_majority = X[y == 'No']
    X_minority = X[y == 'Yes']

    y_majority = y[y == 'No']
    y_minority = y[y == 'Yes']

    X_minority_upsampled, y_minority_upsampled = resample(X_minority, y_minority,
                                                           replace=True, n_samples=X_majority.shape[0])
    X_upsampled = np.concatenate((X_majority, X_minority_upsampled))
    y_upsampled = np.concatenate((y_majority, y_minority_upsampled))
    return X_upsampled, y_upsampled

# Step 3: Perform Upsampling or Downsampling on the target feature
# X_upsampled, y_upsampled = balance_dataset(X, y)

# # Step 4: Transform the data using the preprocessor pipeline
# X = pd.DataFrame(X_upsampled, columns=df.columns[:-1])
# X_transformed = get_transformer().fit_transform(X)

# # Step 5: Perform PCA and select best 6-15 features
# pca = PCA(n_components=15)
# X_pca = pca.fit_transform(X_transformed)

# # Step 6: Build the model with ML Algorithms
# X_train, X_test, y_train, y_test = train_test_split(X_pca, y_upsampled, test_size=0.2, random_state=42)

# model = SVC()
# model.fit(X_train, y_train)

# # Evaluate the model
# y_pred = model.predict(X_test)


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Step 1: Create the pipeline
def get_transformer():
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['No', 'Yes']]*len(obj_cols)))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, obj_cols)
        ])
    return preprocessor

# Step 2: Upsample or Downsample the target variable
def balance_dataset(X, y):
    X_majority = X[y == 'No']
    X_minority = X[y == 'Yes']

    y_majority = y[y == 'No']
    y_minority = y[y == 'Yes']

    X_minority_upsampled, y_minority_upsampled = resample(X_minority, y_minority,
                                                        replace=True, n_samples=X_majority.shape[0])
    X_upsampled = np.concatenate((X_majority, X_minority_upsampled))
    y_upsampled = np.concatenate((y_majority, y_minority_upsampled))
    return X_upsampled, y_upsampled


In [14]:
X[y == 'No'].shape, y[y == 'No'].shape

((1676567, 21), (1676567,))

In [15]:
# Step 3: Perform Upsampling or Downsampling on the target feature
X_upsampled, y_upsampled = balance_dataset(X, y)

In [17]:
len(X_upsampled), len(y_upsampled)

(3353134, 3353134)

In [19]:
pd.DataFrame(y_upsampled).value_counts(normalize=True)

No     0.5
Yes    0.5
Name: proportion, dtype: float64

In [20]:
# Step 4: Transform the data using the preprocessor pipeline
X = pd.DataFrame(X_upsampled, columns=df.columns[:-1])
X_transformed = get_transformer().fit_transform(X)

In [21]:
# Step 5: Perform PCA and select best 6-15 features
pca = PCA(n_components=15)
X_pca = pca.fit_transform(X_transformed)

In [22]:
# Step 6: Build the model with ML Algorithms
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_upsampled, test_size=0.5, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_upsampled, test_size=0.5, random_state=42)

In [23]:
X_train.shape, y_train.shape

((1676567, 15), (1676567,))

In [35]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# Evaluate the model
first = 100
y_pred = model.predict(X_test[:first])

In [37]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test[:first], y_pred)
acc

0.64

In [31]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train[-200000:], y_train[-200000:])

In [33]:
# Evaluate the model
first = 100
y_pred = model.predict(X_test[:first])

In [34]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test[:first], y_pred)
acc

1.0

### Score:

- `RandomForest:` 90% - 95%
- `LogisticRegression:` 50% - 65%