In [4]:
# Model Training and Evaluation



## 1. Import Libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

# Set up visualizations
sns.set(style="whitegrid")


In [7]:
import sys
import os
sys.path.append(os.path.abspath('scripts'))


In [11]:
scripts_path = os.path.abspath('scripts')
sys.path.append(scripts_path)


In [12]:
def import_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module


In [14]:
print(scripts_path)


c:\Users\nikhi\electronics_supply_chain\notebooks\scripts


In [15]:
print(os.path.join(scripts_path, "data_preprocessing.py"))


c:\Users\nikhi\electronics_supply_chain\notebooks\scripts\data_preprocessing.py


In [19]:
import importlib
import data_preprocessing
importlib.reload(data_preprocessing)


['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__']


In [23]:
print("data_preprocessing module loaded")


data_preprocessing module loaded


In [24]:
import importlib
import data_preprocessing

importlib.reload(data_preprocessing)


<module 'data_preprocessing' from 'C:\\Users\\nikhi\\electronics_supply_chain\\scripts\\data_preprocessing.py'>

In [25]:
import data_preprocessing

print(dir(data_preprocessing))  # Should include 'preprocess_data'


['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'pd', 'preprocess_data']


In [27]:
import importlib
import data_preprocessing

# Reload the module if necessary
importlib.reload(data_preprocessing)

# Access the function
preprocess_data_func = data_preprocessing.preprocess_data

# Define the path to your sample file
sample_file_path = 'C:\\Users\\nikhi\\electronics_supply_chain\\data\\electronics_supply_chain_data_500.csv'

# Test the function
try:
    X, y = preprocess_data_func(sample_file_path)
    print("Features:")
    print(X.head())  # Display the first few rows of the feature DataFrame
    print("Target:")
    print(y.head())  # Display the first few rows of the target Series
except Exception as e:
    print(f"Error: {e}")


Features:
         date  product_id  product_name product_category  supplier_id  \
0  2023-10-09         107             0      electronics            5   
1  2023-08-17          21             1      electronics           10   
2  2023-04-15         122             2      electronics           10   
3  2023-03-31         117             3      electronics            2   
4  2024-03-08         104             4      electronics            9   

   supplier_name  supplier_location  supplier_reliability  quantity_ordered  \
0              0                  0                  0.90               430   
1              1                  1                  0.77                63   
2              2                  0                  0.90               280   
3              3                  1                  0.94              1816   
4              1                  2                  0.90              1218   

   demand_forecast  days_to_availability  
0              891               

In [29]:
print(X.dtypes)


date                     object
product_id                int64
product_name              int64
product_category         object
supplier_id               int64
supplier_name             int64
supplier_location         int64
supplier_reliability    float64
quantity_ordered          int64
demand_forecast           int64
days_to_availability      int64
dtype: object


In [32]:
import pandas as pd

def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    
    # Convert date columns to numeric features
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day

    # Drop the original date column
    df = df.drop('date', axis=1)
    
    # Convert categorical columns
    df['product_name'] = pd.factorize(df['product_name'])[0]
    df['supplier_name'] = pd.factorize(df['supplier_name'])[0]
    df['supplier_location'] = pd.factorize(df['supplier_location'])[0]
    df['product_category'] = pd.factorize(df['product_category'])[0]
    
    # Ensure all columns are numeric
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = pd.to_numeric(df[column], errors='coerce')
    
    # Drop any remaining non-numeric columns
    df = df.dropna()  # Drop rows with NaN values after conversion
    
    # Separate features and target
    X = df.drop('quantity_available', axis=1)
    y = df['quantity_available']
    
    return X, y


In [33]:
# Example of using the preprocess_data function
X, y = preprocess_data('C:\\Users\\nikhi\\electronics_supply_chain\\data\\electronics_supply_chain_data_500.csv')


In [34]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    print(df.head())  # Debugging line to check the initial DataFrame
    
    # Convert date columns to numeric features
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day

    # Drop the original date column
    df = df.drop('date', axis=1)
    
    # Convert categorical columns
    df['product_name'] = pd.factorize(df['product_name'])[0]
    df['supplier_name'] = pd.factorize(df['supplier_name'])[0]
    df['supplier_location'] = pd.factorize(df['supplier_location'])[0]
    df['product_category'] = pd.factorize(df['product_category'])[0]
    
    # Ensure all columns are numeric
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = pd.to_numeric(df[column], errors='coerce')
    
    # Drop any remaining non-numeric columns
    df = df.dropna()  # Drop rows with NaN values after conversion
    
    # Separate features and target
    X = df.drop('quantity_available', axis=1)
    y = df['quantity_available']
    
    return X, y


In [36]:
X, y = preprocess_data(file_path)
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


         date  product_id              product_name product_category  \
0  2023-10-09         107   Microsoft Surface Pro 9      electronics   
1  2023-08-17          21        Logitech G502 Hero      electronics   
2  2023-04-15         122  Microsoft Surface Book 3      electronics   
3  2023-03-31         117     Epson EcoTank ET-2760      electronics   
4  2024-03-08         104   NVIDIA GeForce RTX 3080      electronics   

   order_date availability_date  supplier_id       supplier_name  \
0  2023-06-05        2023-11-03            5     TechSource Inc.   
1  2023-02-07        2023-05-13           10           GadgetPro   
2  2024-05-29        2024-06-03           10          FutureTech   
3  2023-04-13        2023-05-23            2  PremiumElectronics   
4  2023-04-13        2024-07-09            9           GadgetPro   

  supplier_location  supplier_reliability  quantity_ordered  \
0             Japan                  0.90               430   
1                UK             

In [37]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    print("Original Data Shape:", df.shape)

    # Drop rows with missing values
    df = df.dropna()
    print("Data Shape After Dropna:", df.shape)
    
    # Convert categorical variables to numeric
    df['product_name'] = pd.factorize(df['product_name'])[0]
    df['supplier_name'] = pd.factorize(df['supplier_name'])[0]
    df['supplier_location'] = pd.factorize(df['supplier_location'])[0]
    
    # Convert date columns
    df['order_date'] = pd.to_datetime(df['order_date'])
    df['availability_date'] = pd.to_datetime(df['availability_date'])
    df['days_to_availability'] = (df['availability_date'] - df['order_date']).dt.days
    
    # Drop original date columns
    df = df.drop(['order_date', 'availability_date'], axis=1)
    
    # Separate features and target
    X = df.drop('quantity_available', axis=1)
    y = df['quantity_available']
    
    print("Features Shape:", X.shape)
    print("Target Shape:", y.shape)
    
    return X, y


In [41]:
file_path = 'electronics_supply_chain_data_500.csv'


In [43]:
file_path = 'C:\\Users\\nikhi\\electronics_supply_chain\\data\\electronics_supply_chain_data_500.csv'


In [44]:
X, y = preprocess_data(file_path)


Original Data Shape: (257, 13)
         date  product_id              product_name product_category  \
0  2023-10-09         107   Microsoft Surface Pro 9      electronics   
1  2023-08-17          21        Logitech G502 Hero      electronics   
2  2023-04-15         122  Microsoft Surface Book 3      electronics   
3  2023-03-31         117     Epson EcoTank ET-2760      electronics   
4  2024-03-08         104   NVIDIA GeForce RTX 3080      electronics   

   order_date availability_date  supplier_id       supplier_name  \
0  2023-06-05        2023-11-03            5     TechSource Inc.   
1  2023-02-07        2023-05-13           10           GadgetPro   
2  2024-05-29        2024-06-03           10          FutureTech   
3  2023-04-13        2023-05-23            2  PremiumElectronics   
4  2023-04-13        2024-07-09            9           GadgetPro   

  supplier_location  supplier_reliability  quantity_ordered  \
0             Japan                  0.90               430   
1

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# File path
file_path = 'C:\\Users\\nikhi\\electronics_supply_chain\\data\\electronics_supply_chain_data_500.csv'

# Load and preprocess the data
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    
    # Print original shape and first few rows
    print("Original Data Shape:", df.shape)
    print(df.head())
    
    # Drop rows with missing values
    df = df.dropna()
    print("Data Shape After Dropna:", df.shape)
    
    # Convert date columns to datetime
    df['date'] = pd.to_datetime(df['date'])
    df['order_date'] = pd.to_datetime(df['order_date'])
    df['availability_date'] = pd.to_datetime(df['availability_date'])
    
    # Feature engineering: create new features from dates
    df['days_to_availability'] = (df['availability_date'] - df['order_date']).dt.days
    df['days_since_date'] = (df['date'] - df['order_date']).dt.days
    
    # Drop original date columns
    df = df.drop(columns=['date', 'order_date', 'availability_date'])
    
    # Convert categorical features to numeric
    df['product_name'] = df['product_name'].astype('category').cat.codes
    df['product_category'] = df['product_category'].astype('category').cat.codes
    df['supplier_name'] = df['supplier_name'].astype('category').cat.codes
    df['supplier_location'] = df['supplier_location'].astype('category').cat.codes
    
    return df

df = preprocess_data(file_path)

# Define features and target
X = df.drop(columns=['days_to_availability'])
y = df['days_to_availability']

# Print shapes
print("Features Shape:", X.shape)
print("Target Shape:", y.shape)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
score = model.score(X_test, y_test)
print("Model R^2 Score:", score)


Original Data Shape: (257, 13)
         date  product_id              product_name product_category  \
0  2023-10-09         107   Microsoft Surface Pro 9      electronics   
1  2023-08-17          21        Logitech G502 Hero      electronics   
2  2023-04-15         122  Microsoft Surface Book 3      electronics   
3  2023-03-31         117     Epson EcoTank ET-2760      electronics   
4  2024-03-08         104   NVIDIA GeForce RTX 3080      electronics   

   order_date availability_date  supplier_id       supplier_name  \
0  2023-06-05        2023-11-03            5     TechSource Inc.   
1  2023-02-07        2023-05-13           10           GadgetPro   
2  2024-05-29        2024-06-03           10          FutureTech   
3  2023-04-13        2023-05-23            2  PremiumElectronics   
4  2023-04-13        2024-07-09            9           GadgetPro   

  supplier_location  supplier_reliability  quantity_ordered  \
0             Japan                  0.90               430   
1

In [48]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Feature Importance
feature_importances = model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(importance_df)


Mean Absolute Error: 136.43115384615388
Mean Squared Error: 28469.681561538462
R^2 Score: -0.11522288339094655
Feature Importances:
                 Feature  Importance
10       days_since_date    0.228860
9        demand_forecast    0.125729
8     quantity_available    0.108571
7       quantity_ordered    0.101153
1           product_name    0.092396
4          supplier_name    0.081824
0             product_id    0.081075
6   supplier_reliability    0.080276
3            supplier_id    0.060495
5      supplier_location    0.039620
2       product_category    0.000000


In [49]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: -0.1016873784959564


In [50]:
import joblib

# Save model
joblib.dump(model, 'random_forest_model.pkl')

# Load model
model = joblib.load('random_forest_model.pkl')


In [1]:
pip install pandas numpy statsmodels


Collecting statsmodels
  Downloading statsmodels-0.14.2-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.2-cp312-cp312-win_amd64.whl (9.8 MB)
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/9.8 MB 6.3 MB/s eta 0:00:02
   ---------- ----------------------------- 2.6/9.8 MB 6.9 MB/s eta 0:00:02
   ---------------- ----------------------- 3.9/9.8 MB 6.7 MB/s eta 0:00:01
   -------------------- ------------------- 5.0/9.8 MB 6.4 MB/s eta 0:00:01
   -------------------------- ------------- 6.6/9.8 MB 6.5 MB/s eta 0:00:01
   --------------------------------- ------ 8.1/9.8 MB 6.6 MB/s eta 0:00:01
   -------------------------------------- - 9.4/9.8 MB 6.5 MB/s eta 0:00:01
   ---------------------------------------- 9.8/9.8 MB 6.3 MB/s eta 0:00:00
Downloading patsy-0.5.6-py2.py3-none-any.whl (233

In [5]:
import pandas as pd

# Load historical order data
df = pd.read_csv('C:/Users/nikhi/electronics_supply_chain/data/electronics_supply_chain_data_500.csv')

# Print the column names
print("Columns in DataFrame:", df.columns)


Columns in DataFrame: Index(['date', 'product_id', 'product_name', 'product_category', 'order_date',
       'availability_date', 'supplier_id', 'supplier_name',
       'supplier_location', 'supplier_reliability', 'quantity_ordered',
       'quantity_available', 'demand_forecast'],
      dtype='object')


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def feature_engineering(X_train, X_test):
    # Identify non-numeric columns
    non_numeric_cols = X_train.select_dtypes(include=['object']).columns
    
    # Convert non-numeric columns to numeric using one-hot encoding
    X_train = pd.get_dummies(X_train, columns=non_numeric_cols, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=non_numeric_cols, drop_first=True)
    
    # Align columns between train and test datasets
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)
    
    # Check if all features are numeric
    if not X_train.applymap(pd.api.types.is_numeric_dtype).all().all():
        raise ValueError("Feature engineering expects all features to be numeric.")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled


In [11]:
from sklearn.model_selection import GridSearchCV

def train_models():
    # Load and preprocess data
    X, y = preprocess_data('C:/Users/nikhi/electronics_supply_chain/data/electronics_supply_chain_data_500.csv')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature engineering
    X_train_scaled, X_test_scaled = feature_engineering(X_train, X_test)
    
    # Hyperparameter tuning for Random Forest
    param_grid_rf = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    rf_model = RandomForestRegressor(random_state=42)
    grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search_rf.fit(X_train_scaled, y_train)
    best_rf_model = grid_search_rf.best_estimator_
    
    # Train other models
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
    lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
    
    # Fit models
    best_rf_model.fit(X_train_scaled, y_train)
    xgb_model.fit(X_train_scaled, y_train)
    lgb_model.fit(X_train_scaled, y_train)
    
    # Save models
    joblib.dump(best_rf_model, 'C:/Users/nikhi/electronics_supply_chain/models/random_forest_model.pkl')
    joblib.dump(xgb_model, 'C:/Users/nikhi/electronics_supply_chain/models/xgboost_model.pkl')
    joblib.dump(lgb_model, 'C:/Users/nikhi/electronics_supply_chain/models/lightgbm_model.pkl')
    
    # Evaluate models
    rf_preds = best_rf_model.predict(X_test_scaled)
    xgb_preds = xgb_model.predict(X_test_scaled)
    lgb_preds = lgb_model.predict(X_test_scaled)
    
    rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)
    xgb_rmse = mean_squared_error(y_test, xgb_preds, squared=False)
    lgb_rmse = mean_squared_error(y_test, lgb_preds, squared=False)
    
    rf_r2 = r2_score(y_test, rf_preds)
    xgb_r2 = r2_score(y_test, xgb_preds)
    lgb_r2 = r2_score(y_test, lgb_preds)
    
    print(f"Random Forest RMSE: {rf_rmse}")
    print(f"XGBoost RMSE: {xgb_rmse}")
    print(f"LightGBM RMSE: {lgb_rmse}")
    
    print(f"Random Forest R²: {rf_r2}")
    print(f"XGBoost R²: {xgb_r2}")
    print(f"LightGBM R²: {lgb_r2}")


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Train and evaluate each model
mse_results = {}
rmse_results = {}
r2_results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Calculate MSE
    mse = mean_squared_error(y_test, predictions)
    mse_results[name] = mse

    # Calculate RMSE
    rmse = np.sqrt(mse)
    rmse_results[name] = rmse

    # Calculate R2 score
    r2 = r2_score(y_test, predictions)
    r2_results[name] = r2

    print(f"{name} - MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2: {r2:.2f}")

NameError: name 'models' is not defined