In [None]:
# Install the notebook's third-party dependencies with the explicit %pip magic,
# so the command does not depend on IPython automagic being enabled.
%pip install requests pandas numpy matplotlib seaborn scipy requests-cache retry-requests

In [2]:
# ======== IMPORTS ========
import os
import sys
import time
import math
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats

In [3]:
# Step 1: Correlation filtering for DataCo + Weather dataset (Train/Validation only).
#
# Reads the lagged dataset, keeps the 2015–2016 rows, reports each numeric
# column's correlation with the target, and removes features that are redundant
# with an earlier feature (|r| > threshold). The target is neither a removal
# candidate nor a redundancy partner, so it is always present in the saved file
# and no feature is discarded merely for being predictive of it.

import pandas as pd
import numpy as np
import os

# === Config ===
INPUT_FILE = "./final_datasets/DataCo_Weather_Lagged.csv"  # Dataset with lags
OUTPUT_FILE = "./final_datasets/Feature_Selected_Step1.csv"
TARGET = "Sales"

# Load dataset
print("Loading dataset...")
df = pd.read_csv(INPUT_FILE)

# Use only Train/Validation (2015–2016).
# Unparseable dates are coerced to NaT; their year matches neither 2015 nor
# 2016, so those rows are excluded by the filter below.
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')
df_train = df[df["date_only"].dt.year.isin([2015, 2016])].copy()

# Select numeric columns; coordinates are carried through separately and are
# not subject to correlation filtering.
numeric_cols = df_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ["latitude", "longitude"]]
if TARGET not in numeric_cols:
    raise KeyError(f"Target column {TARGET!r} is missing or not numeric in {INPUT_FILE}")

# Compute correlation matrix (target included so it can be reported below)
corr_matrix = df_train[numeric_cols].corr()

# Report correlation with target
print("\n=== Correlation with Target (Sales) ===")
target_corr = corr_matrix[TARGET].sort_values(ascending=False)
print(target_corr)
# NOTE(review): the recorded output shows "Sales per customer" and
# "Order Item Total" at ~0.996 with Sales — possibly target leakage; confirm
# whether they should be excluded before modelling.

# Drop highly correlated features (> threshold).
# Only feature-vs-feature pairs are examined: the target is left out of the
# matrix so it can never be dropped and never causes a feature to be dropped.
threshold = 0.9
feature_cols = [col for col in numeric_cols if col != TARGET]
abs_corr = corr_matrix.loc[feature_cols, feature_cols].abs()
# Keep entries strictly above the diagonal: column c then holds only its
# correlations with the features that precede it, so the first member of each
# correlated group survives and the later ones are flagged.
earlier_only = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
is_redundant = (abs_corr.where(earlier_only) > threshold).any(axis=0)
to_drop = set(is_redundant[is_redundant].index)

print(f"\nDropping {len(to_drop)} highly correlated features (>{threshold}): {to_drop}")

# Keep the rest (original column order; the target is always retained)
selected_cols = [col for col in numeric_cols if col not in to_drop]

# Save filtered dataset (2015–2016 rows only)
df_train[selected_cols + ["latitude", "longitude", "date_only"]].to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved Step 1 feature-filtered dataset: {OUTPUT_FILE}")

Loading dataset...

=== Correlation with Target (Sales) ===
Sales                            1.000000
Sales per customer               0.996306
Order Item Total                 0.996306
Sales_lag_1                      0.937114
Department Id                    0.901650
Order Item Product Price         0.894086
Product Price                    0.894086
Product Category Id              0.871668
Category Id                      0.871668
Order Item Cardprod Id           0.869906
Product Card Id                  0.869906
Order Item Quantity              0.822369
order_hour                       0.792463
Order Item Discount              0.792436
Order Item Discount Rate         0.704495
shipping_hour                    0.694716
Days for shipping (real)         0.680200
Customer Id                      0.654647
Order Customer Id                0.654647
Sales_lag_7                      0.606214
Days for shipment (scheduled)    0.542588
Late_delivery_risk               0.528297
Customer Zipcode

In [7]:
# feature_selection_step1_correlation_fixed.py
# Step 1: Correlation filtering for DataCo + Weather dataset (keeps Sales).
#
# Reads the lagged dataset, keeps the 2015–2016 rows, reports each numeric
# column's correlation with the target, and removes features that are redundant
# with an earlier feature (|r| > threshold). The target takes no part in the
# redundancy check: it cannot be dropped, and a feature is never dropped merely
# because it correlates strongly with the target.

import pandas as pd
import numpy as np
import os

# === Config ===
INPUT_FILE = "./final_datasets/DataCo_Weather_Lagged.csv"  # Dataset with lags
OUTPUT_FILE = "./final_datasets/Feature_Selected_Step1_new.csv"
TARGET = "Sales"

# Load dataset
print("Loading dataset...")
df = pd.read_csv(INPUT_FILE)

# Use only Train/Validation (2015–2016) for feature selection.
# Unparseable dates are coerced to NaT; their year matches neither 2015 nor
# 2016, so those rows are excluded by the filter below.
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')
df_train = df[df["date_only"].dt.year.isin([2015, 2016])].copy()

# Select numeric columns; coordinates stay separate and are not filtered.
numeric_cols = df_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ["latitude", "longitude"]]
if TARGET not in numeric_cols:
    raise KeyError(f"Target column {TARGET!r} is missing or not numeric in {INPUT_FILE}")

# Compute correlation matrix (target included so it can be reported below)
corr_matrix = df_train[numeric_cols].corr()

# Show correlation with target
print("\n=== Correlation with Target (Sales) ===")
target_corr = corr_matrix[TARGET].sort_values(ascending=False)
print(target_corr)
# NOTE(review): the recorded output shows "Sales per customer" and
# "Order Item Total" at ~0.996 with Sales — possibly target leakage; confirm
# whether they should be excluded before modelling.

# Drop redundant features (correlated with other features > threshold).
# The target is removed from the matrix before the pairwise check; previously a
# feature positioned after the target could be discarded just for correlating
# with it, which made the result depend on column order.
threshold = 0.9
feature_cols = [col for col in numeric_cols if col != TARGET]
abs_corr = corr_matrix.loc[feature_cols, feature_cols].abs()
# Keep entries strictly above the diagonal: column c then holds only its
# correlations with the features that precede it, so the first member of each
# correlated group survives and the later ones are flagged.
earlier_only = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
is_redundant = (abs_corr.where(earlier_only) > threshold).any(axis=0)
to_drop = set(is_redundant[is_redundant].index)

print(f"\nDropping {len(to_drop)} highly correlated features (>{threshold}, not target): {to_drop}")

# Keep selected features + target + coords + date
selected_cols = [col for col in numeric_cols if col not in to_drop]

# Always include Sales in the final output, as the first column
final_cols = [TARGET] + [col for col in selected_cols if col != TARGET] + ["latitude", "longitude", "date_only"]

# Save the filtered dataset
df_train[final_cols].to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved Step 1 feature-filtered dataset (with target): {OUTPUT_FILE}")


Loading dataset...

=== Correlation with Target (Sales) ===
Sales                            1.000000
Sales per customer               0.996306
Order Item Total                 0.996306
Sales_lag_1                      0.937114
Department Id                    0.901650
Order Item Product Price         0.894086
Product Price                    0.894086
Product Category Id              0.871668
Category Id                      0.871668
Order Item Cardprod Id           0.869906
Product Card Id                  0.869906
Order Item Quantity              0.822369
order_hour                       0.792463
Order Item Discount              0.792436
Order Item Discount Rate         0.704495
shipping_hour                    0.694716
Days for shipping (real)         0.680200
Customer Id                      0.654647
Order Customer Id                0.654647
Sales_lag_7                      0.606214
Days for shipment (scheduled)    0.542588
Late_delivery_risk               0.528297
Customer Zipcode

In [None]:
# scikit-learn is required by the Random Forest / RFE cells; explicit %pip
# avoids relying on IPython automagic.
%pip install scikit-learn

In [8]:
# Step 2: Random Forest importance ranking (nonlinear) of the features kept by
# the Step 1 correlation filter. Writes the full ranking to CSV and prints the
# TOP_N highest-ranked rows.

import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# === Config ===
TARGET = "Sales"
TOP_N = 20  # Number of top-ranked features printed at the end
INPUT_FILE = "./final_datasets/Feature_Selected_Step1_new.csv"  # Output from Step 1
OUTPUT_FILE = "./final_datasets/Feature_Importance_Ranked.csv"

# --- Load ---
print("Loading correlation-filtered dataset...")
df = pd.read_csv(INPUT_FILE)
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')  # parse dates; bad values become NaT

# --- Assemble the design matrix ---
# Date, coordinates and the target itself are not used as predictors.
non_feature_cols = ("date_only", "latitude", "longitude", TARGET)
features = [name for name in df.columns if name not in non_feature_cols]
X, y = df[features], df[TARGET]

# Hold out 20% of rows; only the training part is used to fit the ranking model.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Fit the forest ---
print("Training Random Forest for feature ranking...")
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# --- Rank features, most important first ---
# NOTE(review): the recorded output gives "Sales per customer" an importance of
# ~0.99 — worth confirming it is not leaking the target.
importances = rf.feature_importances_
ranking_unsorted = pd.DataFrame({"Feature": features, "Importance": importances})
feature_ranking = ranking_unsorted.sort_values(by="Importance", ascending=False)

# --- Persist the complete ranking ---
feature_ranking.to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved ranked feature importance: {OUTPUT_FILE}")

# --- Show the head of the ranking ---
print(f"\n=== Top {TOP_N} Features ===")
print(feature_ranking.head(TOP_N))

Loading correlation-filtered dataset...
Training Random Forest for feature ranking...

Saved ranked feature importance: ./final_datasets/Feature_Importance_Ranked.csv

=== Top 20 Features ===
                          Feature  Importance
3              Sales per customer    0.991365
9             Order Item Discount    0.004262
13            Order Item Quantity    0.000697
10       Order Item Discount Rate    0.000478
4              Late_delivery_risk    0.000436
5                     Category Id    0.000385
20             shipping_dayofweek    0.000283
8                        Order Id    0.000221
11       Order Item Product Price    0.000209
16                     order_week    0.000165
21               shipping_weekend    0.000141
26                      windspeed    0.000129
17                order_dayofweek    0.000114
0        Days for shipping (real)    0.000100
6                     Customer Id    0.000094
15                     order_hour    0.000091
7                Customer 

In [11]:
# Step 3: Final feature selection using Recursive Feature Elimination (RFE).
# RFE wraps a Random Forest to keep TOP_N features; the forest is then refit on
# just those features and scored (RMSE) on the held-out split. The 2015–2016
# rows, reduced to the chosen columns, are written to OUTPUT_FILE.

import os
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# === Config ===
TARGET = "Sales"   # Target column
TOP_N = 20         # Number of features RFE should retain
DATA_FILE = "./final_datasets/Feature_Selected_Step1_new.csv"  # Dataset after correlation filtering
OUTPUT_FILE = "./final_datasets/Final_Selected_Features.csv"

# --- Load ---
print("Loading Step 1 dataset...")
df = pd.read_csv(DATA_FILE)
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')  # parse dates; bad values become NaT

# --- Restrict to the Train/Validation period (2015–2016) ---
in_train_period = df["date_only"].dt.year.isin([2015, 2016])
df_train = df[in_train_period].copy()

# --- Predictors: every column except the date, the coordinates and the target ---
non_feature_cols = ("date_only", "latitude", "longitude", TARGET)
features = [name for name in df_train.columns if name not in non_feature_cols]
X, y = df_train[features], df_train[TARGET]

# --- 80/20 train/validation split ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Base estimator ---
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)

# --- Recursive Feature Elimination, removing one feature per iteration ---
print(f"\nRunning RFE to select top {TOP_N} features...")
selector = RFE(rf, n_features_to_select=TOP_N, step=1)
selector.fit(X_train, y_train)

# --- Names of the surviving features, in their original column order ---
selected_features = X_train.columns[selector.support_].tolist()
print(f"\nSelected Top {TOP_N} Features:")
for feat in selected_features:
    print(f" - {feat}")

# --- Refit on the selected features only and score on the validation split ---
rf.fit(X_train[selected_features], y_train)
X_val_selected = X_val[selected_features]
y_pred = rf.predict(X_val_selected)

rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f"\nValidation RMSE with top {TOP_N} features: {rmse:.4f}")

# --- Save the 2015–2016 rows with target, selected features, coords and date ---
final_cols = [TARGET, *selected_features, "latitude", "longitude", "date_only"]
df_train[final_cols].to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved Final Feature-Selected Train Dataset: {OUTPUT_FILE}")

Loading Step 1 dataset...

Running RFE to select top 20 features...

Selected Top 20 Features:
 - Days for shipping (real)
 - Sales per customer
 - Late_delivery_risk
 - Category Id
 - Customer Id
 - Customer Zipcode
 - Order Id
 - Order Item Discount
 - Order Item Discount Rate
 - Order Item Product Price
 - Order Item Quantity
 - order_hour
 - order_week
 - order_dayofweek
 - shipping_hour
 - shipping_dayofweek
 - shipping_weekend
 - humidity
 - windspeed
 - cloudcover

Validation RMSE with top 20 features: 60.1250

Saved Final Feature-Selected Train Dataset: ./final_datasets/Final_Selected_Features.csv


In [None]:
# Force-reinstall NumPy constrained to <=2.2 (presumably for shap/numba
# compatibility — confirm). Explicit %pip avoids relying on automagic.
%pip install "numpy<=2.2" --upgrade --force-reinstall

In [None]:
# Install shap and xgboost; neither is imported in the cells shown here.
%pip install shap xgboost

In [None]:
# Upgrade shap and force a clean reinstall of it and its dependencies.
%pip install shap --upgrade --force-reinstall

In [None]:
# Remove shap and numba without prompting (-y).
%pip uninstall shap numba -y

In [None]:
# Re-apply the NumPy <=2.2 constraint (same command as the earlier pin cell;
# presumably needed after the shap/numba removal — confirm).
%pip install "numpy<=2.2" --upgrade --force-reinstall

In [None]:
# Install shap into the kernel's environment via the explicit %pip magic.
%pip install shap