# Evaluation - Feature Importance 

### Objectives:
- Uncover the factors that significantly impact sales
- Build a machine learning model with a 90% accuracy rating (less than 10% average error)

---
## <u>Functions Used</u>

In [1]:
def grab_file():
    app = QApplication([])
    file_path, _ = QFileDialog.getOpenFileName(None, 
                                               "Select a CSV or XLSX file", "", 
                                               "Files (*.xlsx *.csv)")
    
    if not file_path:  # If no file is selected
        print("No file selected.")
        return None
    
    if file_path.endswith('.xlsx'):
        print("XLSX file selected. Converting to CSV.")
        df = pd.read_excel(file_path)
        csv_path = file_path.rsplit('.', 1)[0] + '.csv'
        df.to_csv(csv_path, index=False)
        print(f"CSV file created: {csv_path}")
    elif file_path.endswith('.csv'):
        print("CSV file selected. Reading file.")
        csv_path = file_path
        df = pd.read_csv(csv_path)
    else:
        raise ValueError("Unsupported file type selected.")
    return df

In [2]:
def data_cleaning(df):
    
    # Define the columns needed for analysis to simplify the dataset.
    cols_needed = ['Date','Category', 'Style', 'Size', 'SKU', 'Qty', 'Amount', 'promotion-ids', 'B2B']
    
    # Remove duplicates based on 'Order ID' to ensure unique transactions and select only the necessary columns.
    df = df.drop_duplicates(subset='Order ID')[cols_needed]
    
    # Standardize column names by renaming them to maintain consistent naming conventions across the dataset.
    df.rename(columns={'Qty': 'Quantity', 'promotion-ids': 'Promotions'}, inplace=True)
    
    # Fill missing values in 'Promotions' column with "No Promotion" to indicate transactions without any promotions.
    df['Promotions'].fillna("No Promotion", inplace=True)
    
    # Remove transactions with missing 'Amount' values as these represent incomplete data points.
    df.dropna(subset=['Amount'], inplace=True)
    df = df[(df['Quantity'] > 0) & (df['Amount'] > 0)]
    
    # Convert 'Amount' from INR to USD for standardization, assuming a fixed exchange rate (1 INR = 0.012 USD).
    df['Amount'] = df['Amount'] * 0.012
    
    # Simplify 'Promotions' column
    # Extract relevant information from Promotions
    promotions = ["Free-Financing", "Free Shipping", "Duplicated", "Coupon"]
    df['Promotions'] = df['Promotions'].apply(lambda x: next((promo for promo in promotions if promo in str(x)), x))
    
    # Convert "B2B" boolean to integer
    df['B2B'] = df['B2B'].astype(int)
    
    # Adjust 'Size' values
    df['Size'] = df['Size'].replace({'Free': 'Any'})

    # Convert the 'Date' column from string to DateTime objects for easier manipulation and analysis.
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Remove numbers from 'Style'
    df['Style'] = df['Style'].str.replace('\d+', '', regex=True)
    
    # Sort by 'Date'
    df = df.sort_values(by='Date', ascending=True)
    
    # Return the cleaned dataset.
    return df

In [3]:
def feature_engineering(df):
    
    # Extract colors from 'SKU'
    color_names = ['MUSTARD', 'BLUE', 'NAVY', 'WHITE', 'BROWN', 'GREEN', 'BIEGE', 
                   'RED', 'PURPLE', 'MAROON', 'PINK', 'CHIKU', 'BLACK', 'GOLD', 
                   'BEIGE', 'ORANGE', 'YELLOW']
    
    # Extract the Colors from the column SKU
    df['SKU_Color'] = df['SKU'].apply(lambda sku: next((color for color in color_names if color in sku), 'NOT LISTED'))

    # Extracting Date (Year, Month, and Days)
#     df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df = df.drop('Date', axis=1).reset_index(drop=True)

    # Dropping features that have extracted information
    df = df.drop('SKU', axis=1)
        
    # Return the engineered dataset.
    return df

In [4]:
def split_dataset(df):
    X = df.drop('Amount', axis=1)
    y = df['Amount']

    return train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def preprocess_features(X_train, X_test):
    # Columns for one-hot encoding and label encoding
    ohe_cols = X_train.select_dtypes(include='object').columns.to_list()
    scale_cols = X_train.select_dtypes(exclude='object').columns.to_list()
    scale_cols.remove('B2B')
    # Preprocessor for transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'), ohe_cols),
            ('scale', MinMaxScaler(), scale_cols)
        ], remainder='passthrough')  # 'B2B' remains unchanged
    
    # Fit and transform training data
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    X_train_transformed = pd.DataFrame(X_train_transformed, columns = preprocessor.get_feature_names_out())
    X_test_transformed = pd.DataFrame(X_test_transformed, columns = preprocessor.get_feature_names_out())
    
    return X_train_transformed, X_test_transformed, preprocessor

In [6]:
def apply_transformation(target):
    """Transform the target variable to reduce skewness."""
    transformations = {
        "sqrt": np.sqrt(target.clip(lower=0)),
        "p1_3": target**(1/3),
        "p1_4": target**(1/4),
        "p1_5": target**(1/5),
        "log1p": np.log1p(target)
    }
    skewness = {key: val.skew() for key, val in transformations.items()}
    best_transformation = min(skewness, key=skewness.get)
    print(f'Best transformation is "{best_transformation}" with skewness {skewness[best_transformation]}')
    return transformations[best_transformation], best_transformation

In [7]:
def inverse_transformation(predictions, transformation):
    if  transformation == 'log1p':
        return np.expm1(predictions)
    elif  transformation == 'sqrt':
        return np.power(predictions, 2)
    else:
        print('No transformation needed!')
        return predictions

In [8]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model_name = model.__class__.__name__
    print(f"Scoring {model_name}")
    
    # Fit the model
    start = time.time()
    model.fit(X_train, y_train)
    endtime = time.time() - start
    
    # Predicting
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Inverse transform the predictions and true values only once
    y_train, y_test = inverse_transformation(y_train, 'log1p'), inverse_transformation(y_test, 'log1p')
    y_pred_train, y_pred_test = inverse_transformation(y_pred_train, 'log1p'), inverse_transformation(y_pred_test, 'log1p')
    
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'RMSE Train Set': np.sqrt(mean_squared_error(y_train, y_pred_train)),
        'RMSE Test Set': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        
        'MAE Train Set': mean_absolute_error(y_train, y_pred_train),
        'MAE Test Set': mean_absolute_error(y_test, y_pred_test),
        
        'R2 Train Set': r2_score(y_train, y_pred_train),
        'R2 Test Set': r2_score(y_test, y_pred_test),
        'Time(sec)': endtime
    }
    return metrics

In [9]:
def grid_search_for_best_model(grid_models, param_dict, scoring_dict, X_train, y_train):

    # Container for grid search results
    grid_results = []

    # Running grid search for each model
    for model_name, model in grid_models.items():
        print(f"Running GridSearchCV for {model_name}...")
        start = time.time()
        grid = GridSearchCV(model, param_grid=param_dict[model_name], cv=5, verbose=False,
                            scoring=scoring_dict, refit='MAE')
        
        grid.fit(X_train, y_train)
        fit_time = time.time() - start
        
        # Extracting best scores
        best_score_rmse = np.sqrt(-grid.cv_results_['mean_test_RMSE'].max())
        best_score_mae = -grid.cv_results_['mean_test_MAE'].max()
        best_score_r2 = grid.cv_results_['mean_test_R2'].max()
        
        # Appending results
        grid_results.append({
            'Model': model_name,
            'Best Parameters': grid.best_params_,
            'RMSE': best_score_rmse,
            'MAE': best_score_mae,
            'R2': best_score_r2,
            'Time(sec)': fit_time
        })

    # Converting results to DataFrame
    results_df = pd.DataFrame(grid_results)
    
    # Extracting the best model based on RMSE
    best_model_info = results_df.loc[results_df['RMSE'].idxmin()]
    best_model_name = best_model_info['Model']
    best_model = grid_models[best_model_name].set_params(**best_model_info['Best Parameters'])
    
    print(f"Best Model: {best_model_name} with parameters {best_model_info['Best Parameters']}")
    
    return best_model, results_df

In [10]:
def get_importances_expanded(preprocessor, best_model):
        """Extracts expanded feature importances from a trained model."""
        if not hasattr(best_model, 'feature_importances_'):
            print("The provided model does not have feature_importances_ attribute.")
            return None

        imp_list = []
        feature_names = preprocessor.get_feature_names_out()
        feature_importances = best_model.feature_importances_

        # Normalize the feature importances
        feature_importances_normalized = feature_importances / sum(feature_importances) * 100

        # Populate imp_list with feature names and their importances
        for name, importance in zip(feature_names, feature_importances_normalized):
            imp_list.append({'Importance Name': name, 'Importance(%)': round(importance, 2)})
        
        return pd.DataFrame(imp_list)

In [11]:
class ModelFeatureImportance:
    def __init__(self, feature_names, n_repeats=10, random_state=42):
        self.feature_names = feature_names
        self.n_repeats = n_repeats
        self.random_state = random_state
        self.feature_importances_data = []

    def aggregate_and_normalize_importances(self, importances):
        aggregated_importances = defaultdict(float)
        for feature, importance in zip(self.feature_names, importances):
            base_name = feature.split("__")[1] if "__" in feature else feature
            base_name = base_name.rsplit("_", 1)[0]
            aggregated_importances[base_name] += np.abs(importance)
        total_importance = sum(aggregated_importances.values())
        return {k: v / total_importance * 100 for k, v in aggregated_importances.items()}

    def process_model(self, model, X_test, y_test):
        model_name = type(model).__name__
        if hasattr(model, 'coef_'):
            coefs = model.coef_.flatten()
            importances = np.abs(coefs)
            importance_type = "Coefficient Importance"
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            importance_type = "Feature Importance"
        else:
            result = permutation_importance(model, X_test, y_test, n_repeats=self.n_repeats, random_state=self.random_state)
            importances = np.abs(result.importances_mean)
            importance_type = "Permutation Importance"
        
        normalized_importances = self.aggregate_and_normalize_importances(importances)
        for feature, importance in normalized_importances.items():
            self.feature_importances_data.append((model_name, feature, importance, importance_type))

    def display_importances(self):
        for data in self.feature_importances_data:
            model_name, feature, importance, importance_type = data
            print(f"{model_name} - {importance_type} for {feature}: {importance:.2f}%")

In [12]:
def deploy_best_model(model, X_train, y_train, X_test, y_test):
    
    # Fit the best model to understand the feature importance
    model.fit(X_train, y_train)

    y_train = np.expm1(y_train)
    y_test = np.expm1(y_test)

    # Predicting
    y_pred_train_log = model.predict(X_train)
    y_pred_test_log = model.predict(X_test)

    # Inverse the log transformation with expm1 (exp(x) - 1) to get predictions in the original scale
    y_pred_train = np.expm1(y_pred_train_log)
    y_pred_test = np.expm1(y_pred_test_log)

    # Combine the actual and predicted values for both training and test sets
    actual_combined = np.concatenate((y_train, y_test), axis=0)
    predicted_combined = np.concatenate((y_pred_train, y_pred_test), axis=0)

    # Create a DataFrame for the combined actual and predicted values
    final_output = pd.DataFrame({
        'Actual(USD)': actual_combined,
        'Predicted(USD)': predicted_combined
    })
    
    # Calculate errors and differences
    final_output['Dollar Difference(USD)'] = round(abs(final_output['Actual(USD)'] - final_output['Predicted(USD)']), 2)
    final_output['Error(%)'] = round(100 * final_output['Dollar Difference(USD)'] / final_output['Actual(USD)'], 2)
    
    # Calculate evaluation metrics for the combined dataset
    mape = final_output['Error(%)'].mean()
    rmse = np.sqrt(mean_squared_error(final_output['Actual(USD)'], final_output['Predicted(USD)']))
    mae = mean_absolute_error(final_output['Actual(USD)'], final_output['Predicted(USD)'])
    
    return mape, rmse, mae, final_output


---
## <u>Installations

In [13]:
# INSTALLATIONS
# !pip install pandas
# !pip install openpyxl
# !pip install numpy
# !pip install xgboost
# !pip install lightgbm
# !pip install category_encoders
# !pip install mlxtend
# !pip install shap

---
## <u>Imports of Libraries</u>
- imports separated for each step along the way

In [14]:
# CLEANING & BASIC IMPORTS
import pandas as pd
import numpy as np
import os
import time
from PyQt5.QtWidgets import QApplication, QFileDialog
from collections import defaultdict

# PREPROCESSING
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, PolynomialFeatures, LabelEncoder
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# MODELING
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR, LinearSVR

# EVALUATING
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import shap

---
## <u>Reading File Into Project</u>
- taking into account for both 'xlsx' and 'csv' files
- display the full dataset rows and columns

In [15]:
df = grab_file()

CSV file selected. Reading file.


In [16]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

---
## <u>Data Cleaning</u>
- #### Only Keeping Necessary Columns for Analysis
- #### Removing Duplicates with Order ID the Same
    - False-Positive Transactions
- #### Renaming Columns for Similar Naming Conventions
- #### Filling missing values in 'promotion-ids' with "No Promotion"
- #### Drop missing values to disclude any potential biases
    - Since 'Amount' is crucial for analysis
    - There is only 1 type of currency wich is INR (Indian Rupee) * 0.012 for USD conversion
#### OPTIONAL:
- #### Extracting Valid Purchases from Orders
    - Ordered, Cancelled, Returned



In [17]:
df_cleaned = data_cleaning(df)

In [18]:
df_cleaned.head(2)

Unnamed: 0,Date,Category,Style,Size,SKU,Quantity,Amount,Promotions,B2B
48972,2022-03-31,Set,J,XS,J0127-SKD-XS,1,14.388,Free Shipping,0
49017,2022-03-31,kurta,JNE,XS,JNE3633-KR-XS,1,5.436,No Promotion,0


---
## <u>Feature Engineering

In [19]:
df_engineered = feature_engineering(df_cleaned)
df_engineered.head(2)

Unnamed: 0,Category,Style,Size,Quantity,Amount,Promotions,B2B,SKU_Color,Month,Day
0,Set,J,XS,1,14.388,Free Shipping,0,NOT LISTED,3,31
1,kurta,JNE,XS,1,5.436,No Promotion,0,NOT LISTED,3,31


---
## <u>Splitting Dataset</u>
- #### Train Test Split my Dataset
    - transfer target variable to split

In [20]:
X_train, X_test, y_train, y_test = split_dataset(df_engineered)
X_train.shape, X_test.shape

((84720, 9), (21180, 9))

---
## <u>Preprocessing (Encoding & Scaling)</u>

In [21]:
X_train, X_test, preprocessor = preprocess_features(X_train.copy(), X_test.copy())
X_train.shape, X_test.shape

Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros


((84720, 54), (21180, 54))

---
## <u>Addressing Skewness</u>
- #### Introducing functions to handle skewness of dataset
    
    - transforming target variable column and present a more evenly distributed data to analyze
    - will need to convert back to original form later

In [22]:
y_train.skew(), y_test.skew()

(1.1304304311033697, 1.001637164520419)

In [23]:
# Assuming apply_transformation is your function that applies the best transformation
y_train, _ = apply_transformation(y_train.copy())
y_test, _ = apply_transformation(y_test.copy())
y_train.skew(), y_test.skew()

Best transformation is "log1p" with skewness 0.2456782846199202
Best transformation is "log1p" with skewness 0.22925266829601365


(0.2456782846199202, 0.22925266829601365)

---
## <u>Developing a Baseline Model to Beat (DecisionTreeRegressor)

- #### Initializing baseline model(DecisionTreeRegressor) & models to beat the baseline

In [24]:
models = [
    DecisionTreeRegressor(random_state=42),
    XGBRegressor(),
    LGBMRegressor(force_col_wise=True, verbosity=-1),
    GradientBoostingRegressor(),
    RandomForestRegressor()
]

- #### Runnning models against baseline

In [25]:
model_scores = []
    
for model in models:
    # Evaluate the model
    metrics = evaluate_model(model, 
                             X_train, 
                             y_train, 
                             X_test, 
                             y_test)
    
    # Append the scoring with metrics dictionary
    model_scores.append(metrics)

Scoring DecisionTreeRegressor
Scoring XGBRegressor
Scoring LGBMRegressor
Scoring GradientBoostingRegressor
Scoring RandomForestRegressor


In [26]:
model_scores_df = pd.DataFrame(model_scores)
model_scores_df

Unnamed: 0,Model,RMSE Train Set,RMSE Test Set,MAE Train Set,MAE Test Set,R2 Train Set,R2 Test Set,Time(sec)
0,DecisionTreeRegressor,1.775249,2.180926,1.149388,1.464993,0.695641,0.549574,0.182679
1,XGBRegressor,1.964069,2.019429,1.344556,1.383259,0.627453,0.613812,0.270806
2,LGBMRegressor,2.003221,2.028818,1.375129,1.391457,0.612452,0.610213,0.187455
3,GradientBoostingRegressor,2.059488,2.069364,1.428004,1.43365,0.590375,0.594477,5.375488
4,RandomForestRegressor,1.79266,2.116969,1.190632,1.434552,0.689642,0.575605,12.063902


---
## <u>Grid Search Scoring

#### Setting up the Grid Search for the top 2 performing models (LGBMRegressor & XGBRegressor)

- The goal is to narrow down the best model to deploy

In [27]:
# Initialize Scoring Dictionary
scoring_dict = {
    'RMSE': make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), 
                        greater_is_better=False),
    'MAE': 'neg_mean_absolute_error',
    'R2': 'r2'
}

# Initialize Grid Models
grid_models = {
    'XGBoost': XGBRegressor(enable_categorical=True),
    'LightGBM': LGBMRegressor(force_col_wise=True, verbosity=-1),
}

# Setting parameters for grid search
param_dict = {

    'XGBoost': {'n_estimators': [470],
                'max_depth': [5]},
#                 'learning_rate': [0.1],
#                 'reg_alpha': [0.01, 0.1],
#                 'reg_lambda': [2.4, 2.5, 2.6]},

    'LightGBM': {'num_leaves': [57, 58, 59, 60]},
                 'n_estimators': [740, 745, 750],
                 'max_depth': [6, 7, 8],
#                  'learning_rate': [0.1],
#                  'reg_alpha': [0.01, 0.1],
#                  'reg_lambda': [0.3, 0.4, 0.5]},
}

In [28]:
best_model, model_performance_df = grid_search_for_best_model(grid_models,
                                                              param_dict,
                                                              scoring_dict,
                                                              X_train, 
                                                              y_train)
print('The best model is:',best_model.__class__.__name__)

Running GridSearchCV for XGBoost...
Running GridSearchCV for LightGBM...
Best Model: LightGBM with parameters {'num_leaves': 60}
The best model is: LGBMRegressor


In [29]:
model_performance_df

Unnamed: 0,Model,Best Parameters,RMSE,MAE,R2,Time(sec)
0,XGBoost,"{'max_depth': 5, 'n_estimators': 470}",0.446259,0.149922,0.656269,4.384387
1,LightGBM,{'num_leaves': 60},0.445058,0.148989,0.659947,3.736452


---
## <u> Deploy Best Model on Test Data

In [30]:
mape, rmse, mae, final_output = deploy_best_model(best_model, X_train, y_train, X_test, y_test)

In [31]:
print(f"MAPE: {mape}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}\n\n")

final_output.head(3)

MAPE: 17.03741709159585
RMSE: 1.987643470363519
MAE: 1.3615095031566964




Unnamed: 0,Actual(USD),Predicted(USD),Dollar Difference(USD),Error(%)
0,5.988,5.349686,0.64,10.69
1,8.76,9.340797,0.58,6.62
2,4.02,5.606002,1.59,39.55


---
## <u>Feature Importance</u>
#### Percentage Explainer
- Aggregating the transformed columns back to percentages

In [32]:
# Initialize the class with feature names
feature_importance_processor = ModelFeatureImportance(feature_names=preprocessor.get_feature_names_out())

# Process feature importances for each model
feature_importance_processor.process_model(best_model, X_test, y_test)

# Display the calculated feature importances
feature_importance_processor.display_importances()
importances_df = get_importances_expanded(preprocessor, best_model)
importances_df.sort_values(by='Importance(%)',ascending=False)

LGBMRegressor - Feature Importance for Category: 12.61%
LGBMRegressor - Feature Importance for Style: 11.53%
LGBMRegressor - Feature Importance for Size: 15.63%
LGBMRegressor - Feature Importance for Promotions: 11.80%
LGBMRegressor - Feature Importance for SKU_Color: 1.42%
LGBMRegressor - Feature Importance for Quantity: 2.63%
LGBMRegressor - Feature Importance for Month: 12.56%
LGBMRegressor - Feature Importance for Day: 30.98%
LGBMRegressor - Feature Importance for B2B: 0.85%


Unnamed: 0,Importance Name,Importance(%)
52,scale__Day,30.98
51,scale__Month,12.56
31,onehot__Promotions_Free-Financing,4.03
30,onehot__Promotions_Free Shipping,3.73
10,onehot__Style_J,3.73
3,onehot__Category_Set,3.46
32,onehot__Promotions_No Promotion,3.07
25,onehot__Size_S,2.86
27,onehot__Size_XS,2.66
23,onehot__Size_L,2.63
