In [29]:
import pandas as pd
import numpy as np
import re as re
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as mno

from sklearn.feature_selection import SelectKBest , f_classif 
from sklearn.linear_model import LinearRegression , LogisticRegression, SGDClassifier, Ridge
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import RepeatedStratifiedKFold ,cross_val_score
from sklearn.model_selection import  train_test_split , RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score , make_scorer
from scipy.stats import loguniform
from imblearn.under_sampling import RandomUnderSampler

import lightgbm as lgb
from scipy.stats import chi2_contingency
from scipy.stats import chi2 , pointbiserialr

from prettytable import PrettyTable

import gc

import warnings
warnings.filterwarnings('ignore')

In [68]:
train_df = pd.read_csv('final_train.csv')
train_df.columns
train_df.drop(['Unnamed: 0'],axis=1,inplace=True)
train_df.head()

Unnamed: 0,lead_time,in_transit_qty,min_bank,potential_issue,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,went_on_backorder
0,8.0,0.0,0.0,0.0,0.95,0.0,0.0,0.0,0.0,1.0,0.0
1,8.0,2.0,0.0,0.0,0.95,0.0,0.0,0.0,0.0,1.0,0.0
2,8.0,0.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0,1.0,0.0
3,9.0,0.0,0.0,0.0,0.91,0.0,1.0,0.0,0.0,1.0,0.0
4,2.0,3.0,1.0,0.0,0.9,0.0,1.0,0.0,0.0,1.0,0.0


In [69]:
test_df = pd.read_csv('final_test.csv')
test_df.columns
test_df.drop(['Unnamed: 0'],axis=1,inplace=True)
test_df.head()

Unnamed: 0,lead_time,in_transit_qty,min_bank,potential_issue,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,went_on_backorder
0,4.0,0.0,1.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,
1,12.0,0.0,0.0,0.0,0.63,0.0,0.0,0.0,0.0,1.0,
2,9.0,41.0,22.0,0.0,0.85,0.0,0.0,0.0,0.0,1.0,
3,8.0,0.0,17.0,0.0,0.58,0.0,0.0,0.0,1.0,1.0,
4,8.0,0.0,1.0,0.0,0.85,0.0,0.0,0.0,1.0,1.0,


#### 1.Split the train data into two equal parts 

In [70]:
X_df = train_df.iloc[:train_df.shape[0]//2] 
val_df = train_df.iloc[train_df.shape[0]//2:] 

In [83]:
print(X_df.shape)
print(val_df.shape)
print(test_df.shape)

(1257425, 11)
(1257425, 11)
(421965, 11)


In [72]:
val_df.head()

Unnamed: 0,lead_time,in_transit_qty,min_bank,potential_issue,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,went_on_backorder
1257425,4.0,0.0,0.0,0.0,0.95,0.0,1.0,0.0,1.0,1.0,0.0
1257426,8.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,1.0,1.0,0.0
1257427,8.0,0.0,2.0,0.0,0.9,0.0,0.0,0.0,0.0,1.0,0.0
1257428,12.0,0.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0
1257429,8.0,0.0,0.0,0.0,0.68,0.0,0.0,0.0,0.0,1.0,0.0


#### 2.Considering K base learners so using K samples with replacement from X_df

###### 2.1 Already have done hyper parameter optimization for Base Learners.

In [73]:
lr = LogisticRegression(solver='liblinear', penalty='l2', C = 0.01)
dt = DecisionTreeClassifier(min_samples_leaf = 10, max_depth = 20, criterion = 'gini')
xgboost1 = GradientBoostingClassifier(subsample= 0.7, n_estimators= 1000, max_depth= 9, learning_rate=0.1)
lgb_clf = lgb.LGBMClassifier(subsample= 0.5, random_state = 501, num_leaves= 200, 
                   max_depth = 7, learning_rate = 0.1, colsample_bytree = 0.5, boosting_type = 'gbdt')
models = {'LogReg':lr,'DecisionTree':dt,'XGBoost':xgboost1,'LGBM':lgb_clf}

In [87]:
def train_models(models,df):
    for model_name , model_obj in models.items():
        for itr in range(len(models)):
            #Sample with replacement
            data = df.sample(n=df.shape[0]//len(models),random_state=42,replace=True)
            y = data['went_on_backorder']
            X = data.drop(['went_on_backorder'],axis = 1)
            model_obj.fit(X,y)
    return models

In [88]:
trained_models = train_models(models,X_df)

##### 2.2 Create training data for meta learner using predictions from already trained base learners

In [89]:
def data_for_stacked_model(models,data):
    preds = {}
    y = data['went_on_backorder'].values
    X = data.drop(['went_on_backorder'],axis = 1)
    for model_name , model_obj in models.items():
        preds[model_name] = list(model_obj.predict(X))
    
    train_stacking_data = pd.DataFrame(preds)
    train_stacking_data['went_on_backorder'] = y
    return train_stacking_data

In [90]:
train_stacking_df = data_for_stacked_model(trained_models,val_df)

In [91]:
test_stacking_df = data_for_stacked_model(trained_models,test_df)

In [92]:
train_stacking_df['went_on_backorder'] = train_stacking_df['went_on_backorder'].astype(int)
train_stacking_df.head()

Unnamed: 0,LogReg,DecisionTree,XGBoost,LGBM,went_on_backorder
0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0


In [93]:
test_stacking_df.shape

(1257425, 5)

In [94]:
test_stacking_df['went_on_backorder'] = train_stacking_df['went_on_backorder'].astype(int)
test_stacking_df.head()

Unnamed: 0,LogReg,DecisionTree,XGBoost,LGBM,went_on_backorder
0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0


##### 2.3 Create meta model

In [96]:
def train_metalearner(train_data,test_data,meta_model):
    y_train = train_data['went_on_backorder'].values
    X_train = train_data.drop(['went_on_backorder'],axis = 1)
    meta_model.fit(X_train,y_train)
    
    y = test_data['went_on_backorder'].values
    X = test_data.drop(['went_on_backorder'],axis = 1)
    
    y_pred = meta_model.predict(X)
    
    return f1_score(y,y_pred,average='macro')

In [98]:
f1_score = train_metalearner(train_stacking_df,test_stacking_df,lr)
print(f1_score)

0.49496297484300067
