In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_orig = pd.read_csv("train_FE.csv")
test_orig = pd.read_csv("test_FE.csv")

In [3]:
train = train_orig.copy()
test = test_orig.copy()

In [4]:
def submission_df(predicted_test):
    submission = test_orig[["Item_Identifier","Outlet_Identifier"]]
    submission["Item_Outlet_Sales"]=predicted_test
    return submission

In [6]:
from sklearn.model_selection import KFold,cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error

def apply_model(model,train,test,target_feature,k_splits):
   
    #define predictor and response variables
    x_train=train.drop([target_feature],axis=1)
    y_train = train[[target_feature]]

    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
   
    #define cross-validation method to use
    cv = KFold(n_splits=k_splits, random_state=1, shuffle=True) 
    #get the mean square error cv scores
    scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error',cv=cv, n_jobs=-1)
    # get the square root of mean square error cv scores
    scores = np.sqrt(np.abs(scores))
    
    print(f"Training RMSE {np.sqrt(mean_squared_error(y_train,y_train_pred))}")
    print(f"CV  mean = {np.mean(scores)} max = {np.max(scores)} min = {np.min(scores)} std = {np.std(scores)}")
    
    y_test_pred = model.predict(test)
   
    print("Number of negative values predicted for training: {},test :{}".format((y_train_pred<0).sum(),(y_test_pred<0).sum()))
    
    final_pred_df = submission_df(y_test_pred)
   
    return final_pred_df

In [7]:

from xgboost import XGBRegressor

train_df = train.drop(["Item_Identifier","Outlet_Identifier"],axis=1)
test_df = test.drop(["Item_Identifier","Outlet_Identifier"],axis=1)

models={"xgb":XGBRegressor(random_state=1)}
df={"xgb":None}

for model in models.keys():
    print(model)
   
    df[model] = apply_model(models[model],train_df,test_df,"Item_Outlet_Sales",5)
    

xgb
Training RMSE 711.9920295518928
CV  mean = 1159.60877605948 max = 1199.9321617531295 min = 1088.8961589276746 std = 38.40269487895007
Number of negative values predicted for training: 11,test :46


In [9]:
from sklearn.model_selection import RandomizedSearchCV
class Tuning():
    def random_search_cv(self,model,params,train,test,target_feature):
        
        #define predictor and response variables
        x_train=train.drop([target_feature],axis=1)
        y_train = train[[target_feature]]
        
        randomcv=RandomizedSearchCV(estimator = model,param_distributions = params,scoring= 'neg_root_mean_squared_error',n_iter = 100,cv = 3,verbose = 2,
                               random_state=100,n_jobs=-1)
        randomcv.fit(x_train,y_train)
        print(randomcv.best_params_)
        
        #best_model = randomcv.best_estimator_
        #best_model.fit(x_train,y_train)
        #y_pred_test = best_model
        print("best random score" ,randomcv.best_score_)
        return randomcv.best_estimator_
    
    
    
    def apply_model(self,model,train,test,target_feature,k_splits):
   
        #define predictor and response variables
        x_train=train.drop([target_feature],axis=1)
        y_train = train[[target_feature]]

        model.fit(x_train,y_train)
        y_train_pred = model.predict(x_train)
   
        #define cross-validation method to use
        cv = KFold(n_splits=k_splits, random_state=1, shuffle=True) 
        #get the mean square error cv scores
        scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error',cv=cv, n_jobs=-1)
        # get the square root of mean square error cv scores
        scores = np.sqrt(np.abs(scores))
    
        print(f"Training RMSE {np.sqrt(mean_squared_error(y_train,y_train_pred))}")
        print(f"CV  mean = {np.mean(scores)} max = {np.max(scores)} min = {np.min(scores)} std = {np.std(scores)}")
    
        y_test_pred = model.predict(test)
   
        print("Number of negative values predicted for training: {},test :{}".format((y_train_pred<0).sum(),(y_test_pred<0).sum()))
    
        final_pred_df = submission_df(y_test_pred)
   
        return final_pred_df

In [10]:
xgb_params = {"eta": [x for x in np.linspace(0.001,0.2,10)],
             "min_child_weight" : [int(x) for x in np.linspace(1,100,10)],
            "max_depth" : [int(x) for x in np.linspace(3,10,8)],
            "sub_sample" : [x for x in np.linspace(0.5,1,5)],
             "col_sample":[x for x in np.linspace(0.5,1,5)],
            "min_samples_leaf":[int(x) for x in np.linspace(1,10,5)],
            "max_leaf_nodes":[int(x) for x in np.linspace(1,10,5)],
            "max_samples":[x for x in np.linspace(1,10,5)]}

models_with_params = [("xgboost",XGBRegressor(random_state=1),xgb_params)]

In [11]:
train_df = train.drop(["Item_Identifier","Outlet_Identifier"],axis=1)
test_df = test.drop(["Item_Identifier","Outlet_Identifier"],axis=1)

for model in models_with_params:
    print(model[0])
    tuning_obj = Tuning()
    best_estimator = tuning_obj.random_search_cv(model[1],model[2],train_df,test_df,"Item_Outlet_Sales")
    print(best_estimator)
    df["XGBWithTuning"]=tuning_obj.apply_model(best_estimator,train_df,test_df,"Item_Outlet_Sales",5)

xgboost
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Parameters: { "col_sample", "max_leaf_nodes", "max_samples", "min_samples_leaf", "sub_sample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'sub_sample': 0.625, 'min_samples_leaf': 1, 'min_child_weight': 100, 'max_samples': 10.0, 'max_leaf_nodes': 7, 'max_depth': 4, 'eta': 0.045222222222222226, 'col_sample': 0.75}
best random score -1080.908634918749
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None, col_sample=0.75,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eta=0.045222222222222226, eval_metric=None, gamma=0, gpu_id=-1,
             grow_policy='depthwi

In [12]:
#df["XGBWithTuning"].to_csv("XGBTuning.csv",index=False)
#Your score for this submission is : 1154.1352887420287.