<a href="https://colab.research.google.com/github/apoorvsinghal20042004/EXECUTE3.0/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import TimeSeriesSplit
import pickle

# --- Load and prepare the sales data -----------------------------------------
# Produces `data`: a fully numeric frame with calendar features, one-step lag
# features, one-hot encoded categoricals and log1p-transformed skewed columns.

# Load data
data = pd.read_csv('sales.csv')

# Convert 'Date' to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Put the rows in chronological order before deriving lag features: shift(1)
# means "previous row", which is only "previous observation" once the frame is
# sorted by date. A stable sort keeps the file order of rows sharing a date, and
# the index is reset so positional and label-based access agree afterwards.
data = data.sort_values('Date', kind='mergesort').reset_index(drop=True)

# Extract features from 'Date'
data['Day'] = data['Date'].dt.day
data['WeekOfYear'] = data['Date'].dt.isocalendar().week
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Create Lag Features (the first row has no predecessor and is filled with 0).
# These are computed before the log transform below, so they stay on the raw scale.
# NOTE(review): the lag is taken over the whole table, not per ProductID --
# confirm that is the intended definition of "previous" sales.
data['SalesVolume_Lag1'] = data['SalesVolume'].shift(1).fillna(0)
data['Revenue_Lag1'] = data['Revenue'].shift(1).fillna(0)

# 'Date' has been fully decomposed into numeric features; drop the raw column.
data = data.drop(columns=['Date'])

# Convert 'CustomerIncome' to an ordinal integer code.
# `map` is used instead of `replace(...).astype(int)`: it avoids the implicit
# downcasting in `replace` that newer pandas deprecates, and lets unknown labels
# be reported explicitly instead of surfacing as an opaque cast error.
income_levels = {'Low': 1, 'Medium': 2, 'High': 3}
income_codes = data['CustomerIncome'].map(income_levels)
if income_codes.isna().any():
    unexpected = data.loc[income_codes.isna(), 'CustomerIncome'].unique().tolist()
    raise ValueError(f"Unexpected CustomerIncome values: {unexpected}")
data['CustomerIncome'] = income_codes.astype(int)

# Convert categorical columns to category type
categorical_columns = ['DayOfWeek', 'Season', 'Event', 'ProductID', 'ProductCategory',
                       'CustomerGender', 'CustomerLocation', 'CustomerSegment']
data = data.astype({col: 'category' for col in categorical_columns})

# One-Hot Encoding for Categorical Variables (first level dropped as the baseline)
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Log Transformation of skewed features. 'SalesVolume' and 'Revenue' are the
# model targets, so everything trained on them predicts on the log1p scale.
for skewed_col in ('Revenue', 'MarketingSpend', 'SalesVolume'):
    data[skewed_col] = np.log1p(data[skewed_col])

# --- Feature selection and train/test split ----------------------------------

# Define features and target variables
X = data.drop(columns=['SalesVolume', 'Revenue'])
y_sales = data['SalesVolume']
y_revenue = data['Revenue']

# Save the full feature names before feature selection
full_feature_names = X.columns.tolist()

# Hold out the last 20% of rows BEFORE fitting anything, without shuffling.
# The features include one-step lags of the targets, so a shuffled split would
# place a test row's target inside a neighbouring training row's features.
# A chronological split also matches the TimeSeriesSplit used for tuning.
# Assumes the rows of `data` are in time order -- the lag features already
# rely on that.
(X_train_all, X_test_all,
 y_train_sales, y_test_sales,
 y_train_revenue, y_test_revenue) = train_test_split(
    X, y_sales, y_revenue, test_size=0.2, shuffle=False
)

# Feature selection using feature importances from an XGBoost regressor.
# The selector is fitted on the training rows only, so the held-out rows cannot
# influence which features are kept. Features at or above the median importance
# are retained. Importance is measured against SalesVolume; the same feature
# subset is then reused for the Revenue model.
feature_selector = SelectFromModel(
    xgb.XGBRegressor(n_estimators=100, random_state=42),
    threshold='median',
)
feature_selector.fit(X_train_all, y_train_sales)

# Get selected feature names based on the feature selector
selected_features = np.array(full_feature_names)[feature_selector.get_support()].tolist()


def _keep_selected(frame):
    """Return `frame` reduced to the selected columns, as a labelled DataFrame."""
    return pd.DataFrame(
        feature_selector.transform(frame),
        columns=selected_features,
        index=frame.index,
    )


# Full selected-feature matrix, kept under its original name for later cells.
X_selected = _keep_selected(X)

# Both targets share the same rows and the same selected features.
X_train_sales = _keep_selected(X_train_all)
X_test_sales = _keep_selected(X_test_all)
X_train_revenue = X_train_sales.copy()
X_test_revenue = X_test_sales.copy()

# Save the selected feature names to be used during prediction
with open('selected_features.pkl', 'wb') as f:
    pickle.dump(selected_features, f)

# --- Hyperparameter tuning, evaluation and persistence -----------------------

# Seed shared by the base model and the randomized search so that the sampled
# candidates (and therefore the chosen model) are the same on every run.
RANDOM_STATE = 42

# Define the base XGBoost model; the search below overrides the tuned parameters.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
)

# Search space sampled by RandomizedSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8]
}


def _tune_xgb(X_train, y_train, n_iter=10, n_splits=3):
    """Run a randomized hyperparameter search for one target.

    Parameters
    ----------
    X_train, y_train : training features and target.
    n_iter : number of parameter settings sampled from `param_grid`.
    n_splits : number of TimeSeriesSplit folds used for cross-validation.

    Returns
    -------
    The fitted RandomizedSearchCV. With the default ``refit=True`` its
    ``best_estimator_`` is already trained on all of ``X_train``/``y_train``,
    so no additional ``fit`` call is needed afterwards.
    """
    search = RandomizedSearchCV(
        estimator=xgb_model,  # cloned internally, so it is safe to share
        param_distributions=param_grid,
        scoring='neg_mean_squared_error',
        cv=TimeSeriesSplit(n_splits=n_splits),
        verbose=1,
        n_jobs=-1,
        n_iter=n_iter,
        random_state=RANDOM_STATE,
    )
    search.fit(X_train, y_train)
    return search


# Hyperparameter tuning for Sales Volume model
random_search_sales = _tune_xgb(X_train_sales, y_train_sales)
best_xgb_sales_model = random_search_sales.best_estimator_

# Hyperparameter tuning for Revenue model
random_search_revenue = _tune_xgb(X_train_revenue, y_train_revenue)
best_xgb_revenue_model = random_search_revenue.best_estimator_

# Evaluate models on the held-out rows.
# The targets were log1p-transformed earlier, so these MSE values are on the
# log scale, not in units of sales volume or revenue.
sales_predictions = best_xgb_sales_model.predict(X_test_sales)
revenue_predictions = best_xgb_revenue_model.predict(X_test_revenue)
print(f"Sales Volume MSE: {mean_squared_error(y_test_sales, sales_predictions)}")
print(f"Revenue MSE: {mean_squared_error(y_test_revenue, revenue_predictions)}")

# Save the improved models as pickle files
for model_path, fitted_model in (
    ('optimized_sales_volume_model.pkl', best_xgb_sales_model),
    ('optimized_revenue_model.pkl', best_xgb_revenue_model),
):
    with open(model_path, 'wb') as f:
        pickle.dump(fitted_model, f)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Sales Volume MSE: 0.019403468206711304
Revenue MSE: 0.018395438150712918
