In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('gs://wmt-mlp-p-intlctlg-export-bucket/AE/GenAI/Experiment/train_v9rqX0R.csv')

In [4]:
def preprocess_data(df):
    """
    Enhanced data preprocessing with advanced feature engineering
    """
    # Create a copy to avoid modifying original data
    df = df.copy()
    
    # Handle missing values with more sophisticated approaches
    df['Item_Weight'] = df['Item_Weight'].fillna(df.groupby('Item_Type')['Item_Weight'].transform('median'))
    
    # Fix the Item_Visibility replacement
    mask = df['Item_Visibility'] == 0
    df.loc[mask, 'Item_Visibility'] = df.groupby('Item_Type')['Item_Visibility'].transform('median')
    
    # Fill missing Outlet_Size based on Outlet_Type and Location
    outlet_size_mapping = {
        'Grocery Store': 'Small',
        'Supermarket Type1': 'Small',
        'Supermarket Type2': 'Medium',
        'Supermarket Type3': 'Medium'
    }
    
    for outlet_type, size in outlet_size_mapping.items():
        df.loc[(df['Outlet_Type'] == outlet_type) & df['Outlet_Size'].isnull(), 'Outlet_Size'] = size
    
    # Feature Engineering
    # 1. Item Category Features
    df['Item_Category'] = df['Item_Identifier'].str[:2]
    df['Is_Food'] = df['Item_Category'].isin(['FD', 'DR']).astype(int)
    
    # 2. Time-based Features
    current_year = 2025
    df['Outlet_Age'] = current_year - df['Outlet_Establishment_Year']
    df['Outlet_Age_Squared'] = df['Outlet_Age'] ** 2
    df['Is_Old_Store'] = (df['Outlet_Age'] > df['Outlet_Age'].median()).astype(int)
    
    # 3. Price-based Features
    df['Price_per_Weight'] = df['Item_MRP'] / df['Item_Weight']
    #df['Price_Level'] = pd.qcut(df['Item_MRP'], q=4, labels=[1,2,3,4])
    df['Price_Relative_To_Type'] = df['Item_MRP'] / df.groupby('Item_Type')['Item_MRP'].transform('mean')
    
    # 4. Visibility Features
    df['Visibility_MRP'] = df['Item_Visibility'] * df['Item_MRP']
    df['Visibility_Type_Mean'] = df.groupby('Item_Type')['Item_Visibility'].transform('mean')
    df['Visibility_Ratio'] = df['Item_Visibility'] / (df['Visibility_Type_Mean'] + 1e-6)  # Avoid division by zero
    
    # 5. Store Features
    df['Store_Age_Size'] = df['Outlet_Age'] * pd.Categorical(df['Outlet_Size']).codes
    df['Items_per_Store'] = df.groupby('Outlet_Identifier')['Item_Identifier'].transform('count')
    df['Store_Avg_Price'] = df.groupby('Outlet_Identifier')['Item_MRP'].transform('mean')
    
    # 6. Log Transformations for Skewed Features
    numeric_features = ['Item_Weight', 'Item_Visibility', 'Price_per_Weight', 'Visibility_MRP']
    for feature in numeric_features:
        df[feature] = np.log1p(df[feature])
    
    # Encode categorical variables
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        oe = OrdinalEncoder()
        df[col] = oe.fit_transform(df[[col]])
    
    return df

In [5]:
data = preprocess_data(data)

In [6]:
string_cols = []
for col in data.columns:
    if pd.api.types.is_string_dtype(data[col]):
        string_cols.append(col)
string_cols

[]

In [7]:
X = data.drop('Item_Outlet_Sales',axis=1)
y = data['Item_Outlet_Sales']

In [None]:
def train_top_features_model(X, y, n_features=20):
    """
    Train XGBoost model with feature selection based on importance thresholds
    and hyperparameter tuning using Mean Squared Error (MSE) as the evaluation metric
    
    Parameters:
    -----------
    X : pandas DataFrame
        Input features
    y : pandas Series
        Target variable
    n_features : int
        number of features
    
    Returns:
    --------
        best_model, selected_features, feature_importance, best_score
    """
    # Import necessary libraries and metrics
    from sklearn.metrics import mean_squared_error
    from xgboost import XGBRegressor
    from sklearn.model_selection import ParameterSampler
    import numpy as np
    
    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initial feature importance calculation using XGBoost
    initial_model = XGBRegressor(n_estimators=100, random_state=42)
    initial_model.fit(X_train, y_train)
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': initial_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Calculate cumulative importance and normalized importance
    total_importance = feature_importance['importance'].sum()
    feature_importance['importance_normalized'] = feature_importance['importance'] / total_importance
    feature_importance['cumulative_importance'] = feature_importance['importance_normalized'].cumsum()
    
    print("Feature importance distribution:")
    print(feature_importance[['feature', 'importance_normalized', 'cumulative_importance']].head(10))
    
    # Define importance thresholds to try instead of fixed feature counts
    # These thresholds represent the minimum normalized importance for a feature to be included
    importance_thresholds = [0.0001, 0.001, 0.005, 0.01, 0.02, 0.03]
    cumulative_thresholds = [0.75, 0.80, 0.85, 0.90, 0.95, 0.99]
    
    # For MSE, lower is better, so initialize with infinity
    best_val_mse = float('inf')
    best_params = None
    best_features = None
    best_model = None
    best_threshold_type = None
    best_threshold_value = None
    
    # Define hyperparameter search space for XGBoost
    param_grid = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2]
    }
    
    # Sample a limited number of hyperparameter combinations
    param_list = list(ParameterSampler(
        param_grid, 
        n_iter=10,
        random_state=42
    ))
    
    print("Starting importance threshold and hyperparameter grid search on validation set...")
    
    # Try feature selection based on minimum importance threshold
    for threshold in importance_thresholds:
        # Select features with importance above threshold
        selected_features = feature_importance[feature_importance['importance_normalized'] > threshold]['feature'].tolist()
        
        if len(selected_features) < 5:  # Ensure we have at least 5 features
            continue
            
        print(f"Testing minimum importance threshold {threshold}: selected {len(selected_features)} features")
        
        # Prepare data with selected features
        X_train_selected = X_train[selected_features]
        X_val_selected = X_val[selected_features]
        
        # Try different hyperparameter combinations
        for params in param_list:
            # Create and train XGBoost model with current hyperparameters
            model = XGBRegressor(random_state=42, **params)
            model.fit(X_train_selected, y_train)
            
            # Evaluate on validation set using MSE
            val_predictions = model.predict(X_val_selected)
            val_mse = mean_squared_error(y_val, val_predictions)
            
            # Update best configuration if this performs better
            if val_mse < best_val_mse:
                best_val_mse = val_mse
                best_params = params
                best_features = selected_features
                best_model = model
                best_threshold_type = "min_importance"
                best_threshold_value = threshold
                
                print(f"New best: min_importance {threshold}, {len(selected_features)} features, validation MSE: {val_mse:.4f}")
    
    # Try feature selection based on cumulative importance threshold
    for threshold in cumulative_thresholds:
        # Select features up to the cumulative importance threshold
        selected_features = feature_importance[feature_importance['cumulative_importance'] <= threshold]['feature'].tolist()
        
        if len(selected_features) < 5:  # Ensure we have at least 5 features
            continue
            
        print(f"Testing cumulative importance threshold {threshold}: selected {len(selected_features)} features")
        
        # Prepare data with selected features
        X_train_selected = X_train[selected_features]
        X_val_selected = X_val[selected_features]
        
        # Try different hyperparameter combinations
        for params in param_list:
            # Create and train XGBoost model with current hyperparameters
            model = XGBRegressor(random_state=42, **params)
            model.fit(X_train_selected, y_train)
            
            # Evaluate on validation set using MSE
            val_predictions = model.predict(X_val_selected)
            val_mse = mean_squared_error(y_val, val_predictions)
            
            # Update best configuration if this performs better
            if val_mse < best_val_mse:
                best_val_mse = val_mse
                best_params = params
                best_features = selected_features
                best_model = model
                best_threshold_type = "cumulative_importance"
                best_threshold_value = threshold
                
                print(f"New best: cumulative_importance {threshold}, {len(selected_features)} features, validation MSE: {val_mse:.4f}")
    
    # Get final feature importance from the best model
    final_importance = pd.DataFrame({
        'feature': best_features,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Print results
    print("\nFinal Model Selected:")
    print(f"Feature selection method: {best_threshold_type} with threshold {best_threshold_value}")
    print(f"Number of features: {len(best_features)}")
    print(f"Validation MSE: {best_val_mse:.4f}")
    print("\nBest Parameters:")
    for param, value in best_params.items():
        print(f"{param}: {value}")
    
    print("\nTop 10 Most Important Features:")
    print(final_importance.head(10))
    
    # Return best model, features, importance, and MSE score
    return best_model, best_features, feature_importance, best_val_mse

In [11]:
%%time
final_model, features, feature_importance, mean_r2_score= train_top_features_model(X, y, n_features=10)

Feature importance distribution:
                      feature  importance_normalized  cumulative_importance
10                Outlet_Type               0.604685               0.604685
5                    Item_MRP               0.106427               0.711112
7   Outlet_Establishment_Year               0.086716               0.797828
17     Price_Relative_To_Type               0.026675               0.824503
9        Outlet_Location_Type               0.016469               0.840972
22            Items_per_Store               0.014953               0.855925
21             Store_Age_Size               0.014019               0.869944
16           Price_per_Weight               0.013288               0.883233
20           Visibility_Ratio               0.012862               0.896094
19       Visibility_Type_Mean               0.012257               0.908351
Starting importance threshold and hyperparameter grid search on validation set...
Testing minimum importance threshold 0.0001: sele

In [12]:
mean_r2_score

1056177.8458175422

In [16]:
features

['Outlet_Type',
 'Item_MRP',
 'Outlet_Establishment_Year',
 'Price_Relative_To_Type',
 'Outlet_Location_Type',
 'Items_per_Store',
 'Store_Age_Size',
 'Price_per_Weight',
 'Visibility_Ratio']

In [43]:
test_data = pd.read_csv('gs://wmt-mlp-p-intlctlg-export-bucket/AE/GenAI/Experiment/test_AbJTz2l.csv')
test_data2 = test_data.copy()

In [None]:
test_data2 = preprocess_data(test_data2)
test_data2 = test_data2[features]

In [None]:
test_result = final_model.predict(test_data2)

In [46]:
test_data['Item_Outlet_Sales'] = test_result

In [47]:
test_data = test_data[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]

In [48]:
test_data.to_csv('gs://wmt-mlp-p-intlctlg-export-bucket/AE/GenAI/Experiment/submissionv9.csv', index=False)