# ML Zoomcamp 2024 Competition
Link: https://www.kaggle.com/competitions/ml-zoomcamp-2024-competition

# 1. All in One Cell

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from datetime import datetime

# Directory holding the competition CSV files; every read below builds its
# file path from this prefix.
DATA_PATH = '/kaggle/input/ml-zoomcamp-2024-competition'

def load_and_check_data():
    """Load the sales and test CSV files and print a short summary of each.

    Reads ``sales.csv`` (default comma separator) and ``test.csv``
    (semicolon-separated) from ``DATA_PATH`` and prints the ``info()`` report
    and ``head()`` of both frames.

    Returns:
        tuple: ``(sales_df, test_df)`` exactly as loaded, with no processing.
    """
    sales_df = pd.read_csv(f'{DATA_PATH}/sales.csv')

    # test.csv uses ';' as the field separator, unlike sales.csv
    test_df = pd.read_csv(f'{DATA_PATH}/test.csv', sep=';')

    print("Sales DataFrame Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    sales_df.info()
    print("\nSales DataFrame Head:")
    print(sales_df.head())

    print("\nTest DataFrame Info:")
    test_df.info()
    print("\nTest DataFrame Head:")
    print(test_df.head())

    return sales_df, test_df

def prepare_features(df, is_train=True):
    """Build calendar and sales-history features.

    Args:
        df: Frame with a ``date`` column. When ``is_train`` is true it must
            also contain ``item_id``, ``store_id`` and ``quantity``.
        is_train: If true, ``date`` is parsed with pandas' default inference
            and lag / rolling features are derived from ``quantity``. If
            false, ``date`` is parsed as ``%d.%m.%Y`` and the history
            features are set to 0.

    Returns:
        A new DataFrame with ``year``, ``month``, ``day``, ``day_of_week``,
        ``is_weekend``, ``lag_1``, ``lag_7``, ``rolling_mean_7`` and
        ``rolling_mean_30`` added and every NaN replaced by 0. Row order and
        index are those of the input; the input frame itself is not modified.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    if is_train:
        df['date'] = pd.to_datetime(df['date'])
    else:
        # The test file stores dates as day.month.year
        df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

    # Calendar features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    if is_train:
        # Lags are only meaningful in chronological order, so compute them on
        # a date-sorted view. The stable sort keeps the file order for rows
        # sharing a date.
        order = np.argsort(df['date'].to_numpy(), kind='stable')
        ordered = (
            df[['item_id', 'store_id', 'quantity']]
            .iloc[order]
            .reset_index(drop=True)
        )
        grouped = ordered.groupby(['item_id', 'store_id'])['quantity']

        # The rolling means are taken over *previous* rows only (shift(1)
        # first). Averaging a window that contains the current row would put
        # the target itself into the features.
        history = {
            'lag_1': grouped.shift(1),
            'lag_7': grouped.shift(7),
            'rolling_mean_7': grouped.transform(
                lambda x: x.shift(1).rolling(window=7, min_periods=1).mean()),
            'rolling_mean_30': grouped.transform(
                lambda x: x.shift(1).rolling(window=30, min_periods=1).mean()),
        }

        # Scatter the values back to the original row positions; this is
        # positional, so it does not depend on the index being unique.
        for column, values in history.items():
            restored = np.empty(len(df), dtype=float)
            restored[order] = values.to_numpy(dtype=float)
            df[column] = restored
    else:
        # No sales history is available here, so the columns are zero-filled.
        # NOTE(review): a model trained on real lag values will see only
        # zeros at prediction time -- consider deriving these from the tail
        # of the training history instead.
        df['lag_1'] = 0
        df['lag_7'] = 0
        df['rolling_mean_7'] = 0
        df['rolling_mean_30'] = 0

    # Rows without enough history have NaN lags; replace them with 0
    df = df.fillna(0)

    return df

def prepare_training_data(df):
    """Select the model inputs and target, then split off a hold-out set.

    Args:
        df: Frame that already contains the engineered feature columns and
            the ``quantity`` target.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, feature_columns)`` where
        the hold-out part is 20% of the rows (``random_state=42``).
    """
    calendar_features = ['year', 'month', 'day', 'day_of_week', 'is_weekend']
    history_features = ['lag_1', 'lag_7', 'rolling_mean_7', 'rolling_mean_30']
    # store_id is passed to the model as an additional plain feature
    feature_columns = calendar_features + history_features + ['store_id']

    features = df[feature_columns]
    target = df['quantity']

    splits = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    return (*splits, feature_columns)

def train_model(X_train, y_train):
    """Fit an XGBoost regressor with fixed hyperparameters.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    hyperparameters = dict(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=7,
        random_state=42,
    )
    regressor = xgb.XGBRegressor(**hyperparameters)
    regressor.fit(X_train, y_train)
    return regressor

def evaluate_model(y_true, y_pred, model_name):
    """Compute RMSE, MAE and R2 for a set of predictions and print them.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        model_name: Label shown in the printed report.

    Returns:
        dict: Metric values keyed by ``'RMSE'``, ``'MAE'`` and ``'R2'``.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    metrics = {'RMSE': rmse, 'MAE': mae, 'R2': r2}

    print(f"\nModel: {model_name}")
    # One line per metric, four decimals each
    print("\n".join(f"{metric}: {value:.4f}" for metric, value in metrics.items()))

    return metrics

def generate_submission(model, test_df, feature_columns, output_path='submission.csv'):
    """Predict on the test frame and write the submission CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_df: Test frame containing ``row_id`` and every column listed in
            ``feature_columns``.
        feature_columns: Columns passed to ``model.predict``, in the order
            the model expects them.
        output_path: Destination of the CSV file. Defaults to
            ``'submission.csv'`` in the working directory, which is where the
            file was always written before this parameter existed.
    """
    predictions = model.predict(test_df[feature_columns])

    # Two-column layout: row identifier and predicted quantity
    submission = pd.DataFrame({
        'row_id': test_df['row_id'],
        'quantity': predictions
    })

    submission.to_csv(output_path, index=False)
    print("Submission file created successfully!")

def main():
    """Run the pipeline end to end: load, featurize, train, evaluate, submit.

    Returns:
        tuple: ``(model, metrics)`` -- the fitted model and the metric dict
        computed on the hold-out split.
    """
    print("Loading and checking data...")
    sales, test = load_and_check_data()

    print("Preparing features...")
    sales = prepare_features(sales, is_train=True)
    test = prepare_features(test, is_train=False)

    print("Preparing training data...")
    X_fit, X_holdout, y_fit, y_holdout, feature_columns = prepare_training_data(sales)

    print("Training model...")
    model = train_model(X_fit, y_fit)

    print("Evaluating model...")
    holdout_predictions = model.predict(X_holdout)
    metrics = evaluate_model(y_holdout, holdout_predictions, "XGBoost")

    print("Generating submission...")
    generate_submission(model, test, feature_columns)

    return model, metrics


# Run the pipeline only when executed as a script / notebook cell
if __name__ == "__main__":
    main()

Loading and checking data...
Sales DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7432685 entries, 0 to 7432684
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   date        object 
 2   item_id     object 
 3   quantity    float64
 4   price_base  float64
 5   sum_total   float64
 6   store_id    int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 396.9+ MB
None

Sales DataFrame Head:
   Unnamed: 0        date       item_id  quantity  price_base  sum_total  \
0           0  2023-08-04  293375605257     1.000       47.86      47.86   
1           1  2023-08-04  a66fdf2c0ae7     3.000       49.60     148.80   
2           2  2023-08-04  daa46ef49b7a     0.822      379.00     311.54   
3           3  2023-08-04  a3b49c1bf758     1.000      129.00     129.00   
4           4  2023-08-04  ab611c5cef62     7.000       79.90     559.30   

   store_id  
0         1  
1         1  
2         1  
3      

# 2. EDA

In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime

# Directory holding the competition CSV files
DATA_PATH = '/kaggle/input/ml-zoomcamp-2024-competition'

# Read the full sales history into memory; the EDA below runs on this frame
print("Loading data...")
train_df = pd.read_csv(f'{DATA_PATH}/sales.csv')

# Now perform EDA
def perform_eda(sales_df):
    """Print an exploratory summary of the sales data and return key tables.

    Args:
        sales_df: Sales frame with at least ``date``, ``item_id``,
            ``store_id`` and ``quantity`` columns. It is only read; the
            ``date`` column is parsed into a local series rather than being
            overwritten in the caller's frame.

    Returns:
        dict with the keys

        * ``'monthly_sales'``: total quantity per calendar month,
        * ``'store_stats'``: mean and row count of quantity per store,
        * ``'item_stats'``: mean and row count of quantity per item,
        * ``'correlations'``: correlation matrix of the numeric columns.
    """
    print("=== Basic Statistics ===")
    print("\nDescriptive Statistics for Numerical Columns:")
    print(sales_df.describe())

    print("\n=== Missing Values Analysis ===")
    missing_values = sales_df.isnull().sum()
    # Only columns that actually have missing entries are listed
    print(missing_values[missing_values > 0])

    print("\n=== Target Variable Analysis ===")
    print("\nQuantity Statistics:")
    print(f"Mean quantity: {sales_df['quantity'].mean():.2f}")
    print(f"Median quantity: {sales_df['quantity'].median():.2f}")
    print(f"Min quantity: {sales_df['quantity'].min():.2f}")
    print(f"Max quantity: {sales_df['quantity'].max():.2f}")

    # Time-based analysis. The parsed dates stay in a local series so that
    # calling this function does not change the dtype of sales_df['date'].
    dates = pd.to_datetime(sales_df['date'])
    print("\n=== Temporal Analysis ===")
    monthly_sales = sales_df.groupby(dates.dt.to_period('M'))['quantity'].sum()
    print("\nMonthly Sales Trends:")
    print(monthly_sales)

    # Store and item analysis
    print("\n=== Store Analysis ===")
    store_stats = sales_df.groupby('store_id')['quantity'].agg(['mean', 'count'])
    print("\nStore-wise Statistics:")
    print(store_stats)

    print("\n=== Item Analysis ===")
    item_stats = sales_df.groupby('item_id')['quantity'].agg(['mean', 'count'])
    print("\nTop 5 Items by Average Quantity:")
    print(item_stats.sort_values('mean', ascending=False).head())

    # Correlations between the numeric columns, reported against quantity
    numeric_cols = sales_df.select_dtypes(include=[np.number]).columns
    correlations = sales_df[numeric_cols].corr()
    print("\n=== Feature Correlations ===")
    print(correlations['quantity'].sort_values(ascending=False))

    return {
        'monthly_sales': monthly_sales,
        'store_stats': store_stats,
        'item_stats': item_stats,
        'correlations': correlations
    }

# Run the analysis on the loaded sales frame and keep the returned summary
# tables (monthly totals, per-store / per-item stats, correlations)
print("Performing EDA...")
eda_results = perform_eda(train_df)

Loading data...
Performing EDA...
=== Basic Statistics ===

Descriptive Statistics for Numerical Columns:
         Unnamed: 0      quantity    price_base     sum_total      store_id
count  7.432685e+06  7.432685e+06  7.432685e+06  7.432685e+06  7.432685e+06
mean   1.235202e+07  5.642398e+00  2.075824e+02  7.612796e+02  2.041405e+00
std    7.625496e+06  2.740466e+01  3.372886e+02  4.789478e+03  1.206728e+00
min    0.000000e+00 -5.000000e+02 -2.167667e+04 -2.041080e+04  1.000000e+00
25%    1.858171e+06  1.000000e+00  6.000000e+01  1.279000e+02  1.000000e+00
50%    1.344877e+07  2.000000e+00  1.099000e+02  2.596000e+02  2.000000e+00
75%    1.988142e+07  4.672000e+00  1.999000e+02  5.990000e+02  3.000000e+00
max    2.173959e+07  4.952000e+03  2.899990e+04  6.865242e+05  4.000000e+00

=== Missing Values Analysis ===
Series([], dtype: int64)

=== Target Variable Analysis ===

Quantity Statistics:
Mean quantity: 5.64
Median quantity: 2.00
Min quantity: -500.00
Max quantity: 4952.00

=== Tempo

# 3. New Approaches

In [16]:
# Import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb
from lightgbm import LGBMRegressor
from datetime import datetime

# Directory holding the competition CSV files
DATA_PATH = '/kaggle/input/ml-zoomcamp-2024-competition'

# Read the full sales history; features are derived from it further down
print("Loading and preparing data...")
train_df = pd.read_csv(f'{DATA_PATH}/sales.csv')

def prepare_features(df):
    """Build calendar and sales-history features from the sales frame.

    Args:
        df: Frame with ``date``, ``item_id``, ``store_id`` and ``quantity``
            columns.

    Returns:
        A new DataFrame with ``year``, ``month``, ``day``, ``day_of_week``,
        ``is_weekend``, ``lag_1``, ``lag_7``, ``rolling_mean_7`` and
        ``rolling_mean_30`` added and every NaN replaced by 0. Row order and
        index are those of the input; the input frame itself is not modified.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    # Convert date and create date-based features
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Lags are only meaningful in chronological order, so compute them on a
    # date-sorted view. The stable sort keeps the file order for rows sharing
    # a date.
    order = np.argsort(df['date'].to_numpy(), kind='stable')
    ordered = (
        df[['item_id', 'store_id', 'quantity']]
        .iloc[order]
        .reset_index(drop=True)
    )
    grouped = ordered.groupby(['item_id', 'store_id'])['quantity']

    # The rolling means are taken over *previous* rows only (shift(1) first).
    # Averaging a window that contains the current row would put the target
    # itself into the features.
    history = {
        'lag_1': grouped.shift(1),
        'lag_7': grouped.shift(7),
        'rolling_mean_7': grouped.transform(
            lambda x: x.shift(1).rolling(window=7, min_periods=1).mean()),
        'rolling_mean_30': grouped.transform(
            lambda x: x.shift(1).rolling(window=30, min_periods=1).mean()),
    }

    # Scatter the values back to the original row positions; this is
    # positional, so it does not depend on the index being unique.
    for column, values in history.items():
        restored = np.empty(len(df), dtype=float)
        restored[order] = values.to_numpy(dtype=float)
        df[column] = restored

    # Rows without enough history have NaN lags; replace them with 0
    df = df.fillna(0)

    return df

# Add the engineered columns; train_df is rebound to the frame returned by
# prepare_features
train_df = prepare_features(train_df)

# Model inputs: calendar fields, the lag / rolling-mean history columns and
# the store identifier (item_id is not part of the feature set)
feature_columns = [
    'year', 'month', 'day', 'day_of_week', 'is_weekend',
    'lag_1', 'lag_7', 'rolling_mean_7', 'rolling_mean_30',
    'store_id'
]

# Feature matrix and regression target
X = train_df[feature_columns]
y = train_df['quantity']

# Hold out 20% of the rows for evaluation.
# NOTE(review): train_test_split shuffles by default, so hold-out rows are
# interleaved in time with training rows -- consider a chronological split
# for this dated sales data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def _fit_grid_search(estimator, param_grid, X_train, y_train):
    """Run a 5-fold, RMSE-scored grid search for one estimator and return it fitted."""
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search


def train_multiple_models(X_train, X_test, y_train, y_test):
    """Grid-search four regressors and compare them on the hold-out set.

    XGBoost, LightGBM, Random Forest and Ridge are each tuned with a 5-fold
    ``GridSearchCV`` scored by RMSE; the best estimator of every search is
    then evaluated on ``X_test`` / ``y_test`` and a report is printed.

    Args:
        X_train: Training feature matrix.
        X_test: Hold-out feature matrix.
        y_train: Training target values.
        y_test: Hold-out target values.

    Returns:
        tuple: ``(models, results)``. ``models`` maps ``'xgboost'``,
        ``'lightgbm'``, ``'random_forest'`` and ``'ridge'`` to the refitted
        best estimator. ``results`` maps the same keys to a dict with
        ``'best_params'``, ``'best_score'`` (cross-validated RMSE, sign
        flipped back to positive), ``'test_rmse'``, ``'test_mae'`` and
        ``'test_r2'``.
    """
    # (result key, name shown in the log, estimator, parameter grid)
    candidates = [
        (
            'xgboost',
            'XGBoost',
            xgb.XGBRegressor(random_state=42),
            {
                'n_estimators': [100, 200],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.01, 0.1],
                'subsample': [0.8, 1.0],
            },
        ),
        (
            'lightgbm',
            'LightGBM',
            # LightGBM only applies `subsample` when bagging is switched on
            # via subsample_freq > 0; with the default of 0 both grid values
            # train the same model and half of the fits are wasted.
            LGBMRegressor(random_state=42, subsample_freq=1),
            {
                'n_estimators': [100, 200],
                'num_leaves': [31, 63],
                'learning_rate': [0.01, 0.1],
                'subsample': [0.8, 1.0],
            },
        ),
        (
            'random_forest',
            'Random Forest',
            RandomForestRegressor(random_state=42),
            {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5],
            },
        ),
        (
            'ridge',
            'Ridge Regression',
            Ridge(random_state=42),
            {
                'alpha': [0.1, 1.0, 10.0],
                'solver': ['auto', 'svd'],
            },
        ),
    ]

    models = {}
    results = {}

    for key, title, estimator, param_grid in candidates:
        print(f"\n=== Training {title} with GridSearch ===")
        search = _fit_grid_search(estimator, param_grid, X_train, y_train)
        models[key] = search.best_estimator_
        results[key] = {
            'best_params': search.best_params_,
            # The scorer is negated RMSE; flip the sign to report plain RMSE
            'best_score': -search.best_score_
        }

    # Evaluate all models on test set
    print("\n=== Model Evaluation on Test Set ===")
    for name, model in models.items():
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results[name].update({
            'test_rmse': rmse,
            'test_mae': mae,
            'test_r2': r2
        })

        print(f"\nModel: {name}")
        print(f"Test RMSE: {rmse:.4f}")
        print(f"Test MAE: {mae:.4f}")
        print(f"Test R2: {r2:.4f}")
        print(f"Best Parameters: {results[name]['best_params']}")

    # Report the model with the lowest hold-out RMSE
    best_name, best_result = min(results.items(), key=lambda x: x[1]['test_rmse'])
    print(f"\nBest Model: {best_name} with RMSE: {best_result['test_rmse']:.4f}")

    return models, results

# Tune and evaluate every candidate model on the split created above
print("Training multiple models with hyperparameter tuning...")
models, results = train_multiple_models(X_train, X_test, y_train, y_test)

# Pick the model with the lowest hold-out RMSE and keep a handle to it
# (nothing is written to disk here)
best_model_name = min(results.items(), key=lambda x: x[1]['test_rmse'])[0]
best_model = models[best_model_name]