In [2]:
%pip install xgboost seaborn 

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hUsing cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost, seaborn
Successfully installed nvidia-nccl-cu12-2.25.1 seaborn-0.13.2 xgboost-2.1.4
No

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load Data
data = pd.read_csv("/home/usman/Downloads/Projects/btp_code/data/gg.csv")

# Feature Engineering
# Lag Features for Sales and Stock Prices
for lag in range(1, 4):  # Lag features for the last 3 months
    data[f'Sales_Lag_{lag}'] = data['Quantity'].shift(lag)
    data[f'M&M_Stock_Lag_{lag}'] = data['M&M Stock Price'].shift(lag)
    data[f'Escorts_Stock_Lag_{lag}'] = data['Escorts Kuboto Stock Price'].shift(lag)
    data[f'Swaraj_Stock_Lag_{lag}'] = data['Swaraj Engines Stock Price'].shift(lag)

data.dropna(inplace=True)  # Remove missing values caused by lagging

# One-Hot Encoding for Categorical Features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_features = ['PartNo', 'Description']
encoded_features = pd.DataFrame(encoder.fit_transform(data[categorical_features]))
encoded_features.columns = encoder.get_feature_names_out(categorical_features)
data = pd.concat([data, encoded_features], axis=1).drop(columns=categorical_features)

# Feature Scaling
scaler = StandardScaler()
scaled_features = ['Sales Price', 'Inflation', 'Total Tractor Sales In india',
                   'M&M Stock Price', 'Escorts Kuboto Stock Price', 'Swaraj Engines Stock Price']
data[scaled_features] = scaler.fit_transform(data[scaled_features])

# Train-Test Split
X = data.drop(columns=['Quantity'])
y = data['Quantity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training & Evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    return rmse, mae

# Linear Regression
lr_model = LinearRegression()
print(X_train.isna().sum())

lr_rmse, lr_mae = evaluate_model(lr_model, X_train, X_test, y_train, y_test)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_rmse, rf_mae = evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_rmse, xgb_mae = evaluate_model(xgb_model, X_train, X_test, y_train, y_test)

# LSTM Model
X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)),
    LSTM(50),
    Dense(25),
    Dense(1)
])

lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
lstm_model.fit(X_train_lstm, y_train, epochs=20, batch_size=16, verbose=1)
lstm_predictions = lstm_model.predict(X_test_lstm)
lstm_rmse = np.sqrt(mean_squared_error(y_test, lstm_predictions))
lstm_mae = mean_absolute_error(y_test, lstm_predictions)

# Model Comparison
results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "XGBoost", "LSTM"],
    "RMSE": [lr_rmse, rf_rmse, xgb_rmse, lstm_rmse],
    "MAE": [lr_mae, rf_mae, xgb_mae, lstm_mae]
})

print(results.sort_values(by='RMSE'))

# Visualization
plt.figure(figsize=(10, 5))
sns.barplot(x='Model', y='RMSE', data=results)
plt.title("Model Comparison - RMSE")
plt.show()


Inflation                                         1
Month                                             1
Year                                              1
pmi                                               1
Sales Price                                       1
                                                 ..
Description_Valve Housing HFB 64                  3
Description_Valve Housing casting                 3
Description_Valve Housing casting-C03             3
Description_Valve housing casting  E01 (HFB82)    3
Description_Wheel - Drive                         3
Length: 361, dtype: int64


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## with shap analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load Data
data = pd.read_csv("sales_data.csv")

# Feature Engineering
# Lag Features for Sales and Stock Prices
for lag in range(1, 4):  # Lag features for the last 3 months
    data[f'Sales_Lag_{lag}'] = data['Quantity'].shift(lag)
    data[f'M&M_Stock_Lag_{lag}'] = data['M&M Stock Price'].shift(lag)
    data[f'Escorts_Stock_Lag_{lag}'] = data['Escorts Kuboto Stock Price'].shift(lag)
    data[f'Swaraj_Stock_Lag_{lag}'] = data['Swaraj Engines Stock Price'].shift(lag)

data.dropna(inplace=True)  # Remove missing values caused by lagging

# One-Hot Encoding for Categorical Features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_features = ['PartNo', 'Description']
encoded_features = pd.DataFrame(encoder.fit_transform(data[categorical_features]))
encoded_features.columns = encoder.get_feature_names_out(categorical_features)
data = pd.concat([data, encoded_features], axis=1).drop(columns=categorical_features)

# Feature Scaling
scaler = StandardScaler()
scaled_features = ['Sales Price', 'Inflation', 'Total Tractor Sales In india',
                   'M&M Stock Price', 'Escorts Kuboto Stock Price', 'Swaraj Engines Stock Price']
data[scaled_features] = scaler.fit_transform(data[scaled_features])

# Train-Test Split
X = data.drop(columns=['Quantity'])
y = data['Quantity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training & Evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    return rmse, mae

# Linear Regression
lr_model = LinearRegression()
lr_rmse, lr_mae = evaluate_model(lr_model, X_train, X_test, y_train, y_test)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_rmse, rf_mae = evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_rmse, xgb_mae = evaluate_model(xgb_model, X_train, X_test, y_train, y_test)

# LSTM Model
X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)),
    LSTM(50),
    Dense(25),
    Dense(1)
])

lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
lstm_model.fit(X_train_lstm, y_train, epochs=20, batch_size=16, verbose=1)
lstm_predictions = lstm_model.predict(X_test_lstm)
lstm_rmse = np.sqrt(mean_squared_error(y_test, lstm_predictions))
lstm_mae = mean_absolute_error(y_test, lstm_predictions)

# Model Comparison
results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "XGBoost", "LSTM"],
    "RMSE": [lr_rmse, rf_rmse, xgb_rmse, lstm_rmse],
    "MAE": [lr_mae, rf_mae, xgb_mae, lstm_mae]
})

print(results.sort_values(by='RMSE'))

# Visualization
plt.figure(figsize=(10, 5))
sns.barplot(x='Model', y='RMSE', data=results)
plt.title("Model Comparison - RMSE")
plt.show()

# SHAP Analysis for XGBoost
explainer = shap.Explainer(xgb_model, X_train)
shap_values = explainer(X_test)

# Summary Plot
shap.summary_plot(shap_values, X_test)

# Feature Importance
shap.plots.bar(shap_values)
