In [4]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, OneHotEncoder, OrdinalEncoder

In [12]:
class DataProcessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = MinMaxScaler()
        self.standard_scaler = StandardScaler()
        self.yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
        self.one_hot_encoder = None
        self.ordinal_encoder = None
        self.loo_encodings = {}
        self.median_visibility = None
        self.item_weight_medians = {}
        self.outlet_size_modes = {}

    def fit(self, X, y=None):
        # Handle missing Item_Weight
        self.item_weight_medians = X.groupby('Item_Type')['Item_Weight'].median().to_dict()

        # Handle missing Outlet_Size
        self.outlet_size_modes = X.groupby('Outlet_Type')['Outlet_Size'].agg(
            lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown'
        ).to_dict()

        # Replace zero visibility with median visibility
        non_zero_visibility = X.loc[X['Item_Visibility'] > 0, 'Item_Visibility']
        self.median_visibility = non_zero_visibility.median()

        # Fit scalers for numerical columns
        self.scaler.fit(X[['Item_MRP']])
        self.standard_scaler.fit(X[['Item_Weight', 'Item_Visibility']])

        # Fit Yeo-Johnson for skewed features
        skewed_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
        self.yeo_johnson.fit(X[skewed_columns])

        # Fit one-hot encoder for low-cardinality categorical features
        low_cardinality_features = ['Outlet_Type']
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.one_hot_encoder.fit(X[low_cardinality_features])

        # Fit ordinal encoder for features with hierarchy
        ordinal_features = ['Outlet_Size', 'Outlet_Location_Type']
        self.ordinal_encoder = OrdinalEncoder(categories=[
            ['Small', 'Medium', 'High'],  # Outlet_Size
            ['Tier 3', 'Tier 2', 'Tier 1']  # Outlet_Location_Type
        ], handle_unknown='use_encoded_value', unknown_value=-1)
        self.ordinal_encoder.fit(X[ordinal_features])

        # Calculate LOO encoding for high cardinality features, excluding 'Item_Type'
        if y is not None:
            y = pd.Series(y, index=X.index)
            high_cardinality_features = ['Item_Identifier', 'Outlet_Identifier']
            for feature in high_cardinality_features:
                loo_encoding = X.groupby(feature).apply(lambda group: y.loc[group.index].mean()).to_dict()
                self.loo_encodings[feature] = loo_encoding

        return self

    def transform(self, X, y=None):
        X = X.copy()

        # Check required columns
        required_columns = ['Item_Type', 'Item_Weight', 'Item_Visibility', 'Item_MRP', 
                            'Outlet_Type', 'Outlet_Location_Type', 'Outlet_Size']
        for col in required_columns:
            if col not in X.columns:
                raise KeyError(f"The column '{col}' is missing from the input data.")

        # Handle missing values and preprocessing
        X['Item_Weight'] = X['Item_Weight'].fillna(X['Item_Type'].map(self.item_weight_medians))
        X['Outlet_Size'] = X['Outlet_Size'].fillna(X['Outlet_Type'].map(self.outlet_size_modes))
        X['Item_Visibility'] = X['Item_Visibility'].replace(0, self.median_visibility)

        # Normalize and map Item_Fat_Content
        X['Item_Fat_Content'] = X['Item_Fat_Content'].str.strip().str.lower()
        fat_content_map = {'low fat': 'Low Fat', 'lf': 'Low Fat', 'regular': 'Regular', 'reg': 'Regular'}
        X['Item_Fat_Content'] = X['Item_Fat_Content'].map(fat_content_map)

        # One-hot encode Item_Fat_Content
        one_hot = pd.get_dummies(X['Item_Fat_Content'], prefix='Item_Fat_Content')
        X = pd.concat([X.drop(columns=['Item_Fat_Content']), one_hot], axis=1)

        # Outlier capping
        continuous_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
        z_threshold = 3
        for col in continuous_columns:
            upper_bound = X[col].mean() + z_threshold * X[col].std()
            lower_bound = X[col].mean() - z_threshold * X[col].std()
            X[col] = np.clip(X[col], lower_bound, upper_bound)

        # Apply transformations
        skewed_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
        X[skewed_columns] = self.yeo_johnson.transform(X[skewed_columns])
        X[['Item_Weight', 'Item_Visibility']] = self.standard_scaler.transform(X[['Item_Weight', 'Item_Visibility']])
        X[['Item_MRP']] = self.scaler.transform(X[['Item_MRP']])

        # Encode Outlet_Type
        low_cardinality_features = ['Outlet_Type']
        encoded_features = self.one_hot_encoder.transform(X[low_cardinality_features])
        encoded_features = pd.DataFrame(
            encoded_features,
            columns=self.one_hot_encoder.get_feature_names_out(low_cardinality_features),
            index=X.index
        )
        X = pd.concat([X, encoded_features], axis=1).drop(columns=low_cardinality_features)

        # Encode hierarchical features
        ordinal_features = ['Outlet_Size', 'Outlet_Location_Type']
        X[ordinal_features] = self.ordinal_encoder.transform(X[ordinal_features])

        # One-hot encode 'Item_Type'
        one_hot_item_type = pd.get_dummies(X['Item_Type'], prefix='Item_Type')
        X = pd.concat([X.drop(columns=['Item_Type']), one_hot_item_type], axis=1)

        # LOO encoding for high cardinality, excluding 'Item_Type'
        high_cardinality_features = ['Outlet_Identifier']
        for feature in high_cardinality_features:
            if feature in X.columns:
                X[f'{feature}_LOO'] = X[feature].map(self.loo_encodings.get(feature, {})).fillna(0)

        X.drop(columns=high_cardinality_features, inplace=True)
        

        # Add new feature engineering
        X['Outlet_Age'] = 2024 - X['Outlet_Establishment_Year']
        X['Visibility_Percentage'] = X['Item_Visibility'] / (X['Item_Visibility'].sum() + 1e-5)
        X['Price_Per_Weight'] = X['Item_MRP'] / (X['Item_Weight'] + 1e-5)
        X['Visibility_to_MRP_Ratio'] = X['Item_Visibility'] / (X['Item_MRP'] + 1e-5)
        X['Discount_Potential'] = X['Item_MRP'] / (X['Item_Visibility'] + 1e-5)

        # Remove spaces in column names
        X.columns = X.columns.str.replace(' ', '_')
        X.drop(columns=['Item_Identifier','Outlet_Establishment_Year'], inplace=True)

        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

# Example usage:
data = pd.read_csv('/kaggle/input/master-train/Train.csv')
target_variable = 'Item_Outlet_Sales'
X = data.drop(columns=[target_variable])
y = data[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

processor = DataProcessor()
X_train = processor.fit_transform(X_train, y_train)
X_test = processor.transform(X_test)

print("Training data missing values:\n", X_train.isnull().sum())
print("Testing data missing values:\n", X_test.isnull().sum())


Training data missing values:
 Item_Weight                        0
Item_Visibility                    0
Item_MRP                           0
Outlet_Size                        0
Outlet_Location_Type               0
Item_Fat_Content_Low_Fat           0
Item_Fat_Content_Regular           0
Outlet_Type_Grocery_Store          0
Outlet_Type_Supermarket_Type1      0
Outlet_Type_Supermarket_Type2      0
Outlet_Type_Supermarket_Type3      0
Item_Type_Baking_Goods             0
Item_Type_Breads                   0
Item_Type_Breakfast                0
Item_Type_Canned                   0
Item_Type_Dairy                    0
Item_Type_Frozen_Foods             0
Item_Type_Fruits_and_Vegetables    0
Item_Type_Hard_Drinks              0
Item_Type_Health_and_Hygiene       0
Item_Type_Household                0
Item_Type_Meat                     0
Item_Type_Others                   0
Item_Type_Seafood                  0
Item_Type_Snack_Foods              0
Item_Type_Soft_Drinks              0
Item_Ty

  loo_encoding = X.groupby(feature).apply(lambda group: y.loc[group.index].mean()).to_dict()
  loo_encoding = X.groupby(feature).apply(lambda group: y.loc[group.index].mean()).to_dict()


In [13]:
X_train

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Size,Outlet_Location_Type,Item_Fat_Content_Low_Fat,Item_Fat_Content_Regular,Outlet_Type_Grocery_Store,Outlet_Type_Supermarket_Type1,Outlet_Type_Supermarket_Type2,...,Item_Type_Seafood,Item_Type_Snack_Foods,Item_Type_Soft_Drinks,Item_Type_Starchy_Foods,Outlet_Identifier_LOO,Outlet_Age,Visibility_Percentage,Price_Per_Weight,Visibility_to_MRP_Ratio,Discount_Potential
549,-2.922617,-11.862680,-0.130628,1.0,2.0,False,True,0.0,1.0,0.0,...,False,False,False,False,2382.120509,25,-0.002056,0.044696,90.819604,0.011012
7757,-2.541340,-5.387939,-0.130678,0.0,1.0,True,False,0.0,1.0,0.0,...,False,False,False,False,2228.023712,22,-0.000934,0.051421,41.233656,0.024254
764,-2.557465,7.188818,-0.134592,0.0,2.0,False,True,0.0,1.0,0.0,...,False,False,False,False,2287.769449,27,0.001246,0.052627,-53.415971,-0.018722
6867,-2.984081,-14.931097,-0.140198,0.0,1.0,True,False,0.0,1.0,0.0,...,False,False,False,False,2228.023712,22,-0.002588,0.046982,106.507479,0.009390
2716,-2.761189,25.396834,-0.131632,0.0,2.0,True,False,0.0,1.0,0.0,...,False,True,False,False,2287.769449,27,0.004401,0.047673,-192.952145,-0.005183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,-2.927992,38.456476,-0.132706,0.0,0.0,False,True,1.0,0.0,0.0,...,False,False,False,False,348.495544,26,0.006664,0.045323,-289.808598,-0.003451
5191,-2.640309,20.552619,-0.137285,0.0,1.0,True,False,0.0,1.0,0.0,...,False,False,False,False,2322.109687,17,0.003562,0.051996,-149.718261,-0.006680
5390,-2.557465,-21.665548,-0.126680,0.0,1.0,True,False,0.0,1.0,0.0,...,False,False,False,False,2228.023712,22,-0.003755,0.049534,171.039233,0.005847
860,-2.449252,-2.058856,-0.134154,0.0,1.0,True,False,0.0,1.0,0.0,...,False,True,False,False,2322.109687,17,-0.000357,0.054774,15.348140,0.065160


In [14]:
# Importing libraries
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

# Global verbosity toggle
verbose = False

# Define models with improved hyperparameters
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42, verbosity=0),
    'LightGBM': lgb.LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42, force_row_wise=True),
    'CatBoost': CatBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42, verbose=0 if not verbose else 1),
    'ExtraTrees': ExtraTreesRegressor(n_estimators=200, max_depth=10, random_state=42),
    'MLP Regressor': MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', 
                                   max_iter=1000, random_state=42)
}

# Train and evaluate models
r2_scores = {'Model': [], 'Training R²': [], 'Testing R²': []}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    
    # Predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    
    # R² Scores
    r2_scores['Model'].append(model_name)
    r2_scores['Training R²'].append(r2_score(y_train, train_preds))
    r2_scores['Testing R²'].append(r2_score(y_test, test_preds))

# Create a DataFrame for R² scores
r2_scores_df = pd.DataFrame(r2_scores)

# Display results
print("\nModel Performance (R² Scores):")
print(r2_scores_df)


Training Linear Regression...
Training Random Forest...
Training Gradient Boosting...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Total Bins 1832
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 33
[LightGBM] [Info] Start training from score 2202.365232
Training CatBoost...
Training ExtraTrees...
Training MLP Regressor...

Model Performance (R² Scores):
               Model  Training R²  Testing R²
0  Linear Regression     0.562106    0.578638
1      Random Forest     0.723045    0.606711
2  Gradient Boosting     0.670230    0.597786
3            XGBoost     0.813836    0.572002
4           LightGBM     0.790080    0.581720
5           CatBoost     0.686281    0.607138
6         ExtraTrees     0.691958    0.611993
7      MLP Regressor     0.233134    0.261973


In [15]:
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# Define parameter grids
param_grids = {
    'ExtraTrees': {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
    },
    'CatBoost': {
        'iterations': [100, 200, 300, 500],
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'l2_leaf_reg': [1, 3, 5, 7],
    }
}

# Define models
models = {
    'ExtraTrees': ExtraTreesRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
}

# Perform hyperparameter tuning for each model
best_models = {}
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grids[model_name],
        n_iter=50,
        scoring='r2',
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=2
    )
    search.fit(X_train, y_train)
    best_models[model_name] = search.best_estimator_
    print(f"Best Params for {model_name}: {search.best_params_}")
    print(f"Best CV R² for {model_name}: {search.best_score_:.4f}\n")

# Evaluate all tuned models
final_results = []
for model_name, model in best_models.items():
    train_r2 = model.score(X_train, y_train)
    test_r2 = model.score(X_test, y_test)
    final_results.append((model_name, train_r2, test_r2))

# Create a DataFrame for comparison
import pandas as pd
results_df = pd.DataFrame(final_results, columns=['Model', 'Training R²', 'Testing R²'])
print("\nFinal Model Performance:")
print(results_df)

# Finalize the best model
best_model_name = results_df.iloc[results_df['Testing R²'].idxmax()]['Model']
print(f"\nFinalized Model: {best_model_name}")


Tuning ExtraTrees...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END max_depth=30, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  16.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  16.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_sampl

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Define Gradient Boosting hyperparameter grid
gbr_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
}

# Initialize GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=42)

# Perform RandomizedSearchCV
gbr_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=gbr_param_grid,
    n_iter=50,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

print("Tuning Gradient Boosting...")
gbr_search.fit(X_train, y_train)

# Best Gradient Boosting model
best_gbr = gbr_search.best_estimator_
print(f"Best Params for Gradient Boosting: {gbr_search.best_params_}")
print(f"Best CV R² for Gradient Boosting: {gbr_search.best_score_:.4f}")

# Evaluate Gradient Boosting on train and test sets
gbr_train_r2 = best_gbr.score(X_train, y_train)
gbr_test_r2 = best_gbr.score(X_test, y_test)
print(f"Gradient Boosting Training R²: {gbr_train_r2:.4f}")
print(f"Gradient Boosting Testing R²: {gbr_test_r2:.4f}")


Tuning Gradient Boosting...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END learning_rate=0.01, max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=500, subsample=0.8; total time=  11.7s
[CV] END learning_rate=0.01, max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=500, subsample=0.8; total time=  11.3s
[CV] END learning_rate=0.01, max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=500, subsample=0.8; total time=  11.1s
[CV] END learning_rate=0.01, max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=500, subsample=0.8; total time=  11.5s
[CV] END learning_rate=0.01, max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=500, subsample=0.8; total time=  11.1s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=200, subsample=0.8; total time=   4.5s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=200, s

In [18]:
from xgboost import XGBRegressor

# Define XGBoost hyperparameter grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 1, 5],
    'reg_lambda': [1, 5, 10],
}

# Initialize XGBRegressor
xgb = XGBRegressor(random_state=42, verbosity=0)

# Perform RandomizedSearchCV
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_param_grid,
    n_iter=50,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

print("Tuning XGBoost...")
xgb_search.fit(X_train, y_train)

# Best XGBoost model
best_xgb = xgb_search.best_estimator_
print(f"Best Params for XGBoost: {xgb_search.best_params_}")
print(f"Best CV R² for XGBoost: {xgb_search.best_score_:.4f}")

# Evaluate XGBoost on train and test sets
xgb_train_r2 = best_xgb.score(X_train, y_train)
xgb_test_r2 = best_xgb.score(X_test, y_test)
print(f"XGBoost Training R²: {xgb_train_r2:.4f}")
print(f"XGBoost Testing R²: {xgb_test_r2:.4f}")


Tuning XGBoost...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=500, reg_alpha=5, reg_lambda=5, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=500, reg_alpha=5, reg_lambda=5, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=500, reg_alpha=5, reg_lambda=5, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=500, reg_alpha=5, reg_lambda=5, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=500, reg_alpha=5, reg_lambda=5, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.1, max_depth=5, n_estimators=500, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   1.0s
[CV] END colsa

In [None]:
# ExtraTrees     0.682545    0.615493
# RandomForest     0.718167    0.609860
# CatBoost     0.620842    0.616522
# Gradient Boosting Training R²: 0.6221
# Gradient Boosting Testing R²: 0.6148
# XGBoost Training R²: 0.6297
# XGBoost Testing R²: 0.6205