In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('train_v9rqX0R.csv')

In [4]:
def preprocess_data(data):
    
    # Handle missing values
    data['Item_Weight'] = data['Item_Weight'].interpolate(method="linear")
    data['Item_Visibility'] = data['Item_Visibility'].replace(0, np.nan).interpolate(method='linear')
    
    # Fill missing Outlet_Size based on Outlet_Type
    outlet_size_mapping = {
        'Grocery Store': 'Small',
        'Supermarket Type1': 'Small',
        'Supermarket Type2': 'Medium',
        'Supermarket Type3': 'Medium'
    }
    
    for outlet_type, size in outlet_size_mapping.items():
        data.loc[(data['Outlet_Type'] == outlet_type) & data['Outlet_Size'].isnull(), 'Outlet_Size'] = size
    
    # Standardize Item_Fat_Content
    fat_content_mapping = {
        'Low Fat': 'Low Fat',
        'LF': 'Low Fat',
        'low fat': 'Low Fat',
        'Regular': 'Regular',
        'reg': 'Regular'
    }
    data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(fat_content_mapping)
    
    # Extract first two characters from Item_Identifier
    data['Item_Identifier'] = data['Item_Identifier'].apply(lambda x: x[:2])
    
    # Calculate outlet age
    current_year = 2025
    data['Outlet_Establishment_Year'] = current_year - data['Outlet_Establishment_Year']
    
    data['Price_per_Weight'] = data['Item_MRP'] / data['Item_Weight']
    data['Store_Age_Size'] = data['Outlet_Establishment_Year'] * data['Outlet_Size']
    data['Visibility_MRP'] = data['Item_Visibility'] * data['Item_MRP']
    
    # Encode categorical variables
    cat_cols = data.select_dtypes(include=['object']).columns
    for col in cat_cols:
        oe = OrdinalEncoder()
        data[col] = oe.fit_transform(data[[col]])
    
    return data

In [5]:
data = preprocess_data(data)

In [6]:
X = data.drop('Item_Outlet_Sales',axis=1)
y = data['Item_Outlet_Sales']

In [9]:
def train_top_features_model(X, y, n_features=20, cv_folds=5):
    
    xg_all = XGBRFRegressor(n_estimators=100, random_state=42)
    xg_all.fit(X, y)
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': xg_all.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Select top n features
    top_features = feature_importance.head(n_features)
    X_selected = X[top_features['feature'].tolist()]
    
    # Perform cross-validation
    model = XGBRFRegressor(n_estimators=100, random_state=42)
    cv_scores = cross_val_score(
        model, 
        X_selected, 
        y, 
        cv=cv_folds, 
        scoring='r2',
        n_jobs=-1
    )
    
    # Train final model with selected features
    final_model = XGBRFRegressor(n_estimators=100, random_state=42)
    final_model.fit(X_selected, y)
    
    # Calculate mean R2 score
    mean_r2_score = cv_scores.mean()
    
    print(f"\nCross-validation R2 scores: {cv_scores}")
    print(f"Mean R2 score: {mean_r2_score:.4f} (+/- {cv_scores.std() * 2:.4f})")
    
    return final_model, top_features['feature'].tolist(), feature_importance, mean_r2_score

In [80]:
final_model, features, feature_importance, mean_r2_score= train_top_features_model(X, y, n_features=6)


Cross-validation R2 scores: [0.60535215 0.58353803 0.57464397 0.60874792 0.60634988]
Mean R2 score: 0.5957 (+/- 0.0278)


In [81]:
test_data = pd.read_csv('test_AbJTz2l.csv')
test_data2 = test_data.copy()

In [82]:
test_data2 = preprocess_data(test_data2)
test_data2 = test_data2[features]

In [83]:
test_result = final_model.predict(test_data2)

In [84]:
test_data['Item_Outlet_Sales'] = test_result

In [85]:
test_data = test_data[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]

In [86]:
test_data.to_csv('submissionv8.csv', index=False)

In [87]:
feature_importance

Unnamed: 0,feature,importance
10,Outlet_Type,0.367534
12,Store_Age_Size,0.349663
5,Item_MRP,0.088537
7,Outlet_Establishment_Year,0.082485
6,Outlet_Identifier,0.064045
8,Outlet_Size,0.022419
11,Price_per_Weight,0.01469
13,Visibility_MRP,0.002617
3,Item_Visibility,0.002021
9,Outlet_Location_Type,0.001878
