In [None]:
pip install requests pandas numpy matplotlib seaborn scipy requests-cache retry-requests

In [3]:
# ======== IMPORTS ========
import os
import sys
import time
import math
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats

In [4]:
# feature_selection_step1_correlation_fixed.py
# Step 1: Correlation filtering for DataCo + Weather dataset (keeps Sales).

import pandas as pd
import numpy as np
import os

# === Config ===
INPUT_FILE = "./final_datasets/DataCo_Weather_Lagged_new.csv"  # Dataset with lags
OUTPUT_FILE = "./final_datasets/Feature_Selected_Step1_new2.csv"
TARGET = "Sales"

# Load dataset
print("Loading dataset...")
df = pd.read_csv(INPUT_FILE)

# Use only Train/Validation (2015–2016) for feature selection.
# errors='coerce' turns unparseable dates into NaT; such rows fail the year filter
# below and are therefore excluded from df_train.
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')
df_train = df[df["date_only"].dt.year.isin([2015, 2016])].copy()

# Select numeric features (exclude coords and date)
numeric_cols = df_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ["latitude", "longitude"]]  # coords stay separate

# Fail early with a descriptive message instead of an opaque KeyError further down.
if TARGET not in numeric_cols:
    raise KeyError(f"Target column '{TARGET}' is missing or non-numeric in {INPUT_FILE}")

# Compute correlation matrix
corr_matrix = df_train[numeric_cols].corr()

# Show correlation with target
print(f"\n=== Correlation with Target ({TARGET}) ===")
target_corr = corr_matrix[TARGET].sort_values(ascending=False)
print(target_corr)

# Drop redundant features: a column is dropped when its absolute correlation with any
# EARLIER column (in corr_matrix column order) exceeds the threshold. The target itself
# is never dropped. NaN correlations compare as False and so never trigger a drop.
# NOTE(review): because only earlier columns are compared, a feature that nearly
# duplicates the target (e.g. "Order Item Total" at r≈0.995 in the printed output) is
# dropped only if the target precedes it in column order — confirm this is intended.
threshold = 0.9
abs_corr = corr_matrix.abs().to_numpy()
below_diagonal = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)  # entries with j < i
is_redundant = ((abs_corr > threshold) & below_diagonal).any(axis=1)
to_drop = {
    col
    for col, redundant in zip(corr_matrix.columns, is_redundant)
    if redundant and col != TARGET  # Never drop the target
}

# Sorted so the printed list is stable across kernel restarts (set order is not).
print(f"\nDropping {len(to_drop)} highly correlated features (>{threshold}, not target): {sorted(to_drop)}")

# Keep selected features + target + coords + date
selected_cols = [col for col in numeric_cols if col not in to_drop]

# Always include the target first in the final output
final_cols = [TARGET] + [col for col in selected_cols if col != TARGET] + ["latitude", "longitude", "date_only"]

# Save the filtered dataset, creating the output directory if it does not exist yet
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_train[final_cols].to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved Step 1 feature-filtered dataset (with target): {OUTPUT_FILE}")


Loading dataset...

=== Correlation with Target (Sales) ===
Sales                               1.000000
Order Item Total                    0.995462
Sales_lag_1                         0.816597
Product Card Id                     0.748084
Order Item Quantity                 0.697607
Sales_lag_7                         0.441269
Order Region_Other                  0.425769
Shipping Mode_Standard Class        0.376684
Delivery Status_Late delivery       0.345775
Customer Segment_Consumer           0.340839
Order Item Discount                 0.273520
Type_DEBIT                          0.268366
Order Status_COMPLETE               0.240383
Order Status_Other                  0.225871
Customer Segment_Corporate          0.213241
Type_TRANSFER                       0.210913
Market_Europe                       0.205641
Market_Pacific Asia                 0.196546
Market_LATAM                        0.195985
Market_Other                        0.192092
Delivery Status_Advance shipping    0.18

In [None]:
pip install scikit-learn

In [6]:
# Step 2: Rank features by importance using Random Forest (nonlinear).
# Uses filtered dataset from Step 1 (correlation-filtered).

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import os

# === Config ===
INPUT_FILE = "./final_datasets/Feature_Selected_Step1_new2.csv"  # Written by Step 1
OUTPUT_FILE = "./final_datasets/Feature_Importance_Ranked_new.csv"
TARGET = "Sales"
TOP_N = 20  # Number of top-ranked features printed at the end of this cell

# Read the Step 1 output and parse the date column (unparseable values become NaT)
print("Loading correlation-filtered dataset...")
df = pd.read_csv(INPUT_FILE)
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')

# Everything except the date, the coordinates and the target is a model input;
# column order of the file is preserved.
non_feature_cols = {"date_only", "latitude", "longitude", TARGET}
features = [name for name in df.columns if name not in non_feature_cols]
X, y = df[features], df[TARGET]

# Random 80/20 split; the forest below is fitted on the 80% portion only
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the forest whose impurity-based importances provide the ranking
print("Training Random Forest for feature ranking...")
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# One row per feature, most important first (original positional index is retained)
importances = rf.feature_importances_
feature_ranking = (
    pd.DataFrame({"Feature": features, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
)

# Persist the complete ranking, then show the head of it
feature_ranking.to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved ranked feature importance: {OUTPUT_FILE}")

print(f"\n=== Top {TOP_N} Features ===")
print(feature_ranking.head(TOP_N))

Loading correlation-filtered dataset...
Training Random Forest for feature ranking...

Saved ranked feature importance: ./final_datasets/Feature_Importance_Ranked_new.csv

=== Top 20 Features ===
                             Feature  Importance
10                   Product Card Id    0.501609
48                       Sales_lag_1    0.360786
0             Order Profit Per Order    0.053678
2                Order Item Discount    0.015913
53                       temp_lag_30    0.015269
24                Order Region_Other    0.013469
52                        temp_lag_7    0.009126
1                Order Item Quantity    0.007305
4            Order Item Profit Ratio    0.006771
3           Order Item Discount Rate    0.002846
51                        temp_lag_1    0.002389
30      Shipping Mode_Standard Class    0.002260
16     Delivery Status_Late delivery    0.001440
31         Customer Segment_Consumer    0.000929
20                      Market_LATAM    0.000630
37                  

In [7]:
# Step 3: Final feature selection using Recursive Feature Elimination (RFE)
# Selects top N features with Random Forest, evaluates RMSE.

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import os

# === Config ===
DATA_FILE = "./final_datasets/Feature_Selected_Step1_new2.csv"  # Written by Step 1
OUTPUT_FILE = "./final_datasets/Final_Selected_Features_new.csv"
TARGET = "Sales"   # Column being predicted
TOP_N = 20         # Number of features RFE is asked to retain

# Read the Step 1 output and parse the date column (unparseable values become NaT)
print("Loading Step 1 dataset...")
df = pd.read_csv(DATA_FILE)
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')

# Restrict to rows dated 2015 or 2016 (the Train/Validation period)
in_train_period = df["date_only"].dt.year.isin([2015, 2016])
df_train = df[in_train_period].copy()

# Model inputs: every column except the date, the coordinates and the target
excluded_cols = {"date_only", "latitude", "longitude", TARGET}
features = [name for name in df_train.columns if name not in excluded_cols]
X, y = df_train[features], df_train[TARGET]

# Random 80/20 split: RFE is fitted on the 80%, RMSE is measured on the 20%
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base estimator handed to RFE and reused afterwards for the RMSE check
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)

# Recursive Feature Elimination, removing one feature per iteration (step=1)
print(f"\nRunning RFE to select top {TOP_N} features...")
selector = RFE(rf, n_features_to_select=TOP_N, step=1)
selector.fit(X_train, y_train)

# support_ is a boolean mask aligned with the training columns
selected_features = X_train.columns[selector.support_].tolist()
print(f"\nSelected Top {TOP_N} Features:")
for feature_name in selected_features:
    print(f" - {feature_name}")

# Refit on the retained columns only and score the held-out 20%
rf.fit(X_train[selected_features], y_train)
X_val_selected = X_val[selected_features]
y_pred = rf.predict(X_val_selected)

mse = mean_squared_error(y_val, y_pred)
rmse = sqrt(mse)
print(f"\nValidation RMSE with top {TOP_N} features: {rmse:.4f}")

# Write the 2015–2016 rows with target, retained features, coordinates and date
final_cols = [TARGET, *selected_features, "latitude", "longitude", "date_only"]
df_train[final_cols].to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved Final Feature-Selected Train Dataset: {OUTPUT_FILE}")

Loading Step 1 dataset...

Running RFE to select top 20 features...

Selected Top 20 Features:
 - Order Profit Per Order
 - Order Item Quantity
 - Order Item Discount
 - Order Item Discount Rate
 - Order Item Profit Ratio
 - Product Card Id
 - Type_DEBIT
 - Delivery Status_Late delivery
 - Market_LATAM
 - Order Region_Other
 - Shipping Mode_Second Class
 - Shipping Mode_Standard Class
 - Customer Segment_Consumer
 - tempmax
 - humidity
 - solarradiation
 - Sales_lag_1
 - temp_lag_1
 - temp_lag_7
 - temp_lag_30

Validation RMSE with top 20 features: 81.6737

Saved Final Feature-Selected Train Dataset: ./final_datasets/Final_Selected_Features_new.csv
