In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import optuna

In [None]:

!gdown https://drive.google.com/uc?id=1GweUxUAZJhhUVgKHnhL0Hwd6qGZ25BCe
!gdown https://drive.google.com/uc?id=1HOSsnY0tUWlCjvIoxZPXkgx3J1tI_vgD

In [None]:

# Read the training and test CSVs from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("Kaggle_Training_Dataset_v2.csv", "Kaggle_Test_Dataset_v2.csv")
)

In [None]:

# Separate the features from the 'went_on_backorder' target.
# The last row of each file is excluded from both features and target.
train_rows = train_data.iloc[:-1]
test_rows = test_data.iloc[:-1]
X_train = train_rows.drop(columns='went_on_backorder')
Y_train = train_rows['went_on_backorder']
X_test = test_rows.drop(columns='went_on_backorder')
Y_test = test_rows['went_on_backorder']

In [None]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numeric. Both lists keep the column order.
columns = X_train.columns
object_mask = (X_train.dtypes == 'O').to_numpy()
categorical_features = columns[object_mask].tolist()
numeric_features = [name for name in columns if name not in categorical_features]

In [None]:

# Remove the 'sku' column from both feature frames.
X_train, X_test = X_train.drop(columns='sku'), X_test.drop(columns='sku')

In [None]:

# Encode the 'Yes'/'No' target labels as 1/0.
yes_no_codes = {'Yes':1,'No':0}
Y_train, Y_test = (labels.map(yes_no_codes) for labels in (Y_train, Y_test))

In [None]:


# Encode every object-typed feature except 'sku' from 'Yes'/'No' to 1/0,
# in both the training and the test frame.
flag_codes = {'Yes':1,'No':0}
flag_columns = [name for name in categorical_features if name != 'sku']
for frame in (X_train, X_test):
    for column in flag_columns:
        frame[column] = frame[column].map(flag_codes)

In [None]:

# Convert the -99 placeholder in the performance columns to NaN so the
# imputers downstream can fill it.
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# a column selection acts on an intermediate object, which emits a warning on
# recent pandas and leaves the frame unchanged when Copy-on-Write is enabled.
perf_columns = ['perf_6_month_avg', 'perf_12_month_avg']
X_train[perf_columns] = X_train[perf_columns].replace(-99, np.nan)
X_test[perf_columns] = X_test[perf_columns].replace(-99, np.nan)

In [None]:

# Candidate classifiers, keyed by the name used when building the search space.
# Values are the classes themselves (not instances); iteration order is the
# order in which they are tuned.
models = dict([
    ('XGBoost', XGBClassifier),
    #('CatBoost', CatBoostClassifier),
    ('LightGBM', LGBMClassifier),
    ('BalancedRandomForest', BalancedRandomForestClassifier),
    ('SVC', SVC),
    ('RandomForest', RandomForestClassifier),
])

In [None]:

# Optuna objective function

# Model names for which `objective` knows how to build a search space.
_TUNABLE_MODELS = ('RandomForest', 'BalancedRandomForest', 'SVC', 'XGBoost', 'CatBoost', 'LightGBM')


def _suggest_imputer(trial):
    """Sample the imputer type and its fill strategy for this trial."""
    imputer_name = trial.suggest_categorical('imputer', ['SimpleImputer', 'IterativeImputer'])
    # One shared parameter name: it is the fill strategy for SimpleImputer and
    # the initial strategy for IterativeImputer.
    strategy = trial.suggest_categorical('imputer__strategy', ['mean', 'median'])
    if imputer_name == 'SimpleImputer':
        return SimpleImputer(strategy=strategy)
    return IterativeImputer(initial_strategy=strategy)


def _suggest_model(trial, model_name, model_class):
    """Sample hyperparameters for `model_name` and return an unfitted `model_class` instance."""
    if model_name in ('RandomForest', 'BalancedRandomForest'):
        return model_class(
            n_estimators=trial.suggest_int('n_estimators', 20, 70),
            max_depth=trial.suggest_categorical('max_depth', [None, 10, 20, 30]),
        )
    if model_name == 'SVC':
        return model_class(
            C=trial.suggest_float('C', 0.1, 10, log=True),
            gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
            kernel=trial.suggest_categorical('kernel', ['rbf', 'linear']),
        )
    if model_name == 'XGBoost':
        return model_class(
            n_estimators=trial.suggest_int('n_estimators', 20, 70),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            max_depth=trial.suggest_int('max_depth', 3, 9),
            # The target is binary, so use the binary log-loss metric rather
            # than the multiclass 'mlogloss'.
            eval_metric='logloss',
        )
    if model_name == 'CatBoost':
        return model_class(
            iterations=trial.suggest_int('iterations', 100, 300),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            depth=trial.suggest_int('depth', 3, 9),
            verbose=0,
        )
    if model_name == 'LightGBM':
        return model_class(
            n_estimators=trial.suggest_int('n_estimators', 20, 70),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            num_leaves=trial.suggest_int('num_leaves', 31, 60),
        )
    raise ValueError(f"No search space defined for model {model_name!r}")


def objective(trial, model_name, model_class):
    """Score one Optuna trial for a given model family.

    Samples an imputer and model hyperparameters from `trial`, builds an
    impute -> scale -> SMOTE -> model pipeline over `numeric_features`, and
    evaluates it on `X_train` / `Y_train` with 3-fold cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_name : str
        One of the names in `_TUNABLE_MODELS`; selects the search space.
    model_class : type
        Estimator class instantiated with the sampled hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC across the cross-validation folds.

    Raises
    ------
    ValueError
        If `model_name` has no search space defined.
    """
    # Fail fast with a clear message instead of hitting an unbound `model` later.
    if model_name not in _TUNABLE_MODELS:
        raise ValueError(
            f"No search space defined for model {model_name!r}; "
            f"expected one of {_TUNABLE_MODELS}"
        )

    imputer = _suggest_imputer(trial)
    model = _suggest_model(trial, model_name, model_class)

    # Define preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', imputer),
        ('scaler', StandardScaler())
    ])

    # Only the numeric columns are passed through; ColumnTransformer drops the
    # remaining columns by default.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)
        ]
    )

    # The imblearn pipeline applies SMOTE only while fitting, so each
    # validation fold is scored on un-resampled data.
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('sampler', SMOTE(random_state=42)),
        ('model', model)
    ])

    # Cross-validation
    score = cross_val_score(pipeline, X_train, Y_train, cv=3, scoring='roc_auc').mean()
    return score

# Per-model result stores, keyed by model name: the best estimator, the best
# Optuna parameters, and the best cross-validation score.
best_estimators, best_params, best_scores = {}, {}, {}

In [None]:



# Optimize each model
for model_name, model_class in models.items():
    print(f"Optimizing {model_name} with Optuna...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name, model_class), n_trials=10,show_progress_bar=True,n_jobs=4)

    # Rebuild the estimator from the tuned hyperparameters. The study records
    # model hyperparameters under their plain names (e.g. 'n_estimators'), so
    # every parameter that is not an imputer setting belongs to the model.
    # An optional 'model__' prefix is stripped if present.
    model_kwargs = {}
    for param_name, param_value in study.best_params.items():
        if param_name == 'imputer' or param_name.startswith('imputer__'):
            continue
        if param_name.startswith('model__'):
            param_name = param_name[len('model__'):]
        model_kwargs[param_name] = param_value

    best_estimators[model_name] = model_class(**model_kwargs)
    best_params[model_name] = study.best_params
    best_scores[model_name] = study.best_value
    print(f"Best Params for {model_name}: {study.best_params}")
    print(f"Best CV Score for {model_name}: {study.best_value}\n")


In [None]:
# Evaluate best models on test set
for model_name in best_estimators.keys():
    imputer_name = best_params[model_name]['imputer']
    imputer_params = {
        k[len('imputer__'):]: v
        for k, v in best_params[model_name].items()
        if k.startswith('imputer__')
    }
    if imputer_name == 'SimpleImputer':
        imputer = SimpleImputer(**imputer_params)
    elif imputer_name == 'IterativeImputer':
        # The tuned value is stored as 'strategy', but IterativeImputer takes
        # it as `initial_strategy` and has no `strategy` argument.
        if 'strategy' in imputer_params:
            imputer_params['initial_strategy'] = imputer_params.pop('strategy')
        imputer = IterativeImputer(**imputer_params)
    else:
        raise ValueError(f"Unknown imputer {imputer_name!r} for model {model_name}")

    # Define preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', imputer),
        ('scaler', StandardScaler())
    ])

    # Use the numeric columns by name (the same ones used during tuning)
    # rather than hard-coded positional indices.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)
        ]
    )

    # Create final pipeline
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('sampler', SMOTE(random_state=42)),
        ('model', best_estimators[model_name])
    ])
    # The targets are named Y_train / Y_test (capital Y) in this notebook.
    pipeline.fit(X_train, Y_train)
    y_pred = pipeline.predict(X_test)
    # NOTE(review): tuning optimises ROC AUC but this reports accuracy, which
    # can look high on an imbalanced target -- consider reporting ROC AUC too.
    test_accuracy = accuracy_score(Y_test, y_pred)
    print(f"Test Accuracy for {model_name}: {test_accuracy}")

In [None]:

# Summary of results: one row per tuned model with its best cross-validation
# score and the parameters that produced it.
summary_columns = {
    'Model': [*best_scores.keys()],
    'Best CV Score': [*best_scores.values()],
    'Best Params': [*best_params.values()],
}
summary = pd.DataFrame(summary_columns)

print("\nSummary of Optuna Results:")
print(summary)