In [33]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV
import optuna
from optuna.integration import OptunaSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict

In [34]:
data = pd.read_csv('named_train.csv')

In [35]:
X = data.drop(columns=['Y'])

In [36]:
Y = data['Y']

In [37]:
class ItemWeightImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.item_weight_mode = defaultdict(lambda: None)  # Stores mode of Item_Weight per Item_Identifier
        self.global_mean = None  # Fallback global mean for Item_Weight

    def fit(self, X, y=None):
        X = X.copy()

        # Ensure the required columns exist
        if 'Item_Identifier' not in X.columns or 'Item_Weight' not in X.columns:
            raise ValueError("Both 'Item_Identifier' and 'Item_Weight' columns must be present in the dataset.")

        # Ensure no None values in Item_Identifier
        if X['Item_Identifier'].isnull().any():
            raise ValueError("Item_Identifier column contains None values.")

        # Calculate mode Item_Weight for each Item_Identifier
        item_weight_mode = X.groupby('Item_Identifier')['Item_Weight'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
        self.item_weight_mode.update(item_weight_mode.to_dict())

        # Calculate global mean for the Item_Weight column
        if X['Item_Weight'].notnull().any():
            self.global_mean = X['Item_Weight'].mean()
        else:
            raise ValueError("Item_Weight column contains only NaN values.")

        return self

    def transform(self, X):
        X = X.copy()

        # Ensure the required columns exist
        if 'Item_Identifier' not in X.columns or 'Item_Weight' not in X.columns:
            raise ValueError("Both 'Item_Identifier' and 'Item_Weight' columns must be present in the dataset.")

        # Ensure no None values in Item_Identifier
        if X['Item_Identifier'].isnull().any():
            raise ValueError("Item_Identifier column contains None values.")

        # Safely impute NaN Item_Weights based on Item_Identifier or global mean
        X['Item_Weight'] = X.apply(
            lambda row: self.item_weight_mode.get(row['Item_Identifier'], self.global_mean)
            if pd.isnull(row['Item_Weight'])
            else row['Item_Weight'],
            axis=1
        )

        X['Item_Weight'] = X['Item_Weight'].fillna(self.global_mean)

        return X[['Item_Weight']]

In [38]:
X["age"] = 2024 - X["Outlet_Establishment_Year"]

In [39]:
X.drop(columns=['Outlet_Establishment_Year'],inplace=True)

In [40]:
X

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,age
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,25
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,15
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,Medium,Tier 1,Supermarket Type1,25
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,,Tier 3,Grocery Store,26
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,37
...,...,...,...,...,...,...,...,...,...,...,...
5995,FDB32,20.600,Low Fat,0.023586,Fruits and Vegetables,94.7778,OUT017,,Tier 2,Supermarket Type1,17
5996,FDJ16,9.195,Low Fat,0.115064,Frozen Foods,58.6246,OUT049,Medium,Tier 1,Supermarket Type1,25
5997,FDJ32,10.695,Low Fat,0.057910,Fruits and Vegetables,60.4536,OUT045,,Tier 2,Supermarket Type1,22
5998,FDO12,15.750,Low Fat,0.054920,Baking Goods,195.8452,OUT035,Small,Tier 2,Supermarket Type1,20


In [42]:
X['Category'] = X['Item_Identifier'].str[:2]

In [43]:
X["Item_Fat_Content"] = X["Item_Fat_Content"].replace({"low fat": "LF", "Low Fat": "LF", "Regular": "REG", "reg": "REG"})
X.loc[X['Category'] == 'NC', 'Item_Fat_Content'] = 'nofat'

In [45]:
impute_sizes = {
    "OUT010": "Small",
    "OUT017": "Small",
    "OUT045": "Medium"
}
X['Outlet_Size'] = X.apply(
    lambda row: impute_sizes[row['Outlet_Identifier']] if pd.isnull(row['Outlet_Size']) else row['Outlet_Size'], axis=1
)

In [46]:
X['MRP_cluster']=pd.cut(X["Item_MRP"],bins=[25,69,137,203,270],labels=['very low','low','high','very high'],right=True)

In [47]:
weightimputer = ItemWeightImputer()

In [48]:
X["Item_Weight"] = weightimputer.fit_transform(X)

In [50]:
X["Weight_per_Unit_MRP"] = X["Item_Weight"]/X["Item_MRP"]

In [52]:
#target_encoder_cols = ['Item_Type']
numerical_cols = ["Item_Weight","Weight_per_Unit_MRP","Item_Visibility"]
# ordinal_cols =["Outlet_Size"]
one_hot_columns = ["Outlet_Location_Type","Category","Item_Fat_Content","Outlet_Size","Outlet_Type"]
target_encoder_cols = ["Item_Identifier",'Item_Type','Outlet_Identifier']
ordinal_categories = [['very low', 'low', 'high', 'very high']]
# Numerical pipeline
# numerical_pipeline = Pipeline([
#     ("group_mean_imputer", FillNaWithGroupMode(group_col="Item_Identifier", target_col="Item_Weight")),
#     ("mean_imputer", SimpleImputer(strategy="mean"))
#     # ("scaler", StandardScaler())
# ])
numerical_pipeline = Pipeline([
    # ("group_mode_imputer", FillNaWithGroupMode(group_col=group_col, target_col='Item_Weight')),
    ("mean_imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler())
])

#Ordinal pipeline
# ordinal_pipeline = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("ordinal", OrdinalEncoder(categories=ordinal_categories))
# ])
target_encoding_pipeline = Pipeline([
    ("target_encoder", TargetEncoder(cols=target_encoder_cols , smoothing=0.5) )
])
# One-hot encoding pipeline
onehot_transform_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore",sparse_output=False))
])
ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder(categories=ordinal_categories))
])

# Target encoding pipeline
# target_encoding_pipeline = Pipeline([
#     ("target_encoder", TargetEncoder(cols=target_encoder_cols))
# ])

# Combine all pipelines into a ColumnTransformer
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_cols),
    ("target_encoder", target_encoding_pipeline, target_encoder_cols),
    ("onehot", onehot_transform_pipeline, one_hot_columns),
    ("ordinal", ordinal_pipeline, ["MRP_cluster"])

])


# Final pipeline
final_pipeline = Pipeline([
    ("preprocessor", preprocessor),
])

In [54]:
import optuna
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

def objective_linear_regression(trial):
    # LinearRegression does not have many hyperparameters to tune, but we can still create a pipeline
    param = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False])
    }
    
    # Create a pipeline with the final transformer and LinearRegression
    model = LinearRegression(**param)
    pipeline = Pipeline([
        ('final_transform', final_pipeline),
        ('linear_regression', model)
    ])

    # Perform cross-validation with error handling
    try:
        scores = cross_val_score(pipeline, X, Y, cv=5, scoring='neg_mean_absolute_error', error_score='raise')
        mae = -scores.mean()
        print(f"Trial MAE: {mae}")
    except Exception as e:
        print(f"Error during cross-validation: {e}")
        mae = float('inf')  # Assign a high error value if an exception occurs

    return mae

study_linear_regression = optuna.create_study(direction='minimize')
study_linear_regression.optimize(objective_linear_regression, n_trials=50)

print("Linear Regression Best Parameters:", study_linear_regression.best_params)
print("Linear Regression Best MAE:", study_linear_regression.best_value)

# Extract the best model
best_params = study_linear_regression.best_params
best_model = LinearRegression(**best_params)
best_pipeline = Pipeline([
    ('final_transform', final_pipeline),
    ('linear_regression', best_model)
])

# Fit the best pipeline on the entire dataset
best_pipeline.fit(X, Y)

# Now you can use best_pipeline for predictions
predictions = best_pipeline.predict(X)
print(predictions)

[I 2024-12-28 01:33:30,068] A new study created in memory with name: no-name-f76cec0f-ae22-4d58-bf82-3a77293f70d9
[I 2024-12-28 01:33:30,340] Trial 0 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:30,583] Trial 1 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:30,883] Trial 2 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:31,153] Trial 3 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:31,399] Trial 4 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:31,652] Trial 5 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:31,938] Trial 6 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:32,223] Trial 7 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:32,472] Trial 8 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:32,711] Trial 9 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:32,936] Trial 10 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:33,165] Trial 11 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:33,482] Trial 12 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:33,711] Trial 13 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:33,974] Trial 14 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:34,197] Trial 15 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:34,438] Trial 16 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:34,672] Trial 17 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:34,902] Trial 18 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:35,130] Trial 19 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:35,354] Trial 20 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:35,579] Trial 21 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:35,817] Trial 22 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:36,054] Trial 23 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:36,278] Trial 24 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:36,511] Trial 25 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:36,752] Trial 26 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:37,044] Trial 27 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:37,268] Trial 28 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:37,507] Trial 29 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:37,730] Trial 30 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:37,957] Trial 31 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:38,184] Trial 32 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:38,410] Trial 33 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:38,650] Trial 34 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:38,877] Trial 35 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:39,103] Trial 36 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:39,340] Trial 37 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:39,567] Trial 38 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:39,803] Trial 39 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:40,115] Trial 40 finished with value: 0.4147404040527344 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.4147404040527344


[I 2024-12-28 01:33:40,367] Trial 41 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:40,739] Trial 42 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:40,995] Trial 43 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:41,236] Trial 44 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:41,481] Trial 45 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:41,728] Trial 46 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:41,977] Trial 47 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:42,232] Trial 48 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376


[I 2024-12-28 01:33:42,469] Trial 49 finished with value: 0.41413494811376 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.41413494811376.


Trial MAE: 0.41413494811376
Linear Regression Best Parameters: {'fit_intercept': False}
Linear Regression Best MAE: 0.41413494811376
[8.14742559 6.62476349 7.77681919 ... 6.72557449 7.86332508 7.2187166 ]


In [55]:
# Get the feature names from the preprocessor
feature_names = final_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Get the coefficients from the linear regression model
coefficients = best_model.coef_

# Create a DataFrame to hold feature names and their corresponding coefficients
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort the DataFrame by the absolute value of the coefficients in descending order
feature_importance['Absolute_Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values(by='Absolute_Coefficient', ascending=False)

# Print the top 10 most important features
print(feature_importance.head(10))

                                Feature   Coefficient  Absolute_Coefficient
8   onehot__Outlet_Location_Type_Tier 3 -8.764891e+09          8.764891e+09
7   onehot__Outlet_Location_Type_Tier 2 -8.764891e+09          8.764891e+09
6   onehot__Outlet_Location_Type_Tier 1 -8.764891e+09          8.764891e+09
12          onehot__Item_Fat_Content_LF -8.764735e+09          8.764735e+09
13         onehot__Item_Fat_Content_REG -8.764735e+09          8.764735e+09
11                  onehot__Category_NC -8.764199e+09          8.764199e+09
14       onehot__Item_Fat_Content_nofat -8.763889e+09          8.763889e+09
10                  onehot__Category_FD -8.763352e+09          8.763352e+09
9                   onehot__Category_DR -8.763352e+09          8.763352e+09
17            onehot__Outlet_Size_Small -8.762929e+09          8.762929e+09
