In [4]:
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import warnings

# Silence warnings for cleaner output.
# NOTE(review): with no category or module argument this filter applies to
# every warning raised after this point, including deprecation warnings from
# the libraries imported above — consider narrowing it when debugging.
warnings.filterwarnings('ignore')

# --- DATA LOADING AND PREPROCESSING ---

# 2. Load the Dataset
# The CSV is looked up relative to the current working directory; adjust the
# path below if the file lives elsewhere.
try:
    df = pd.read_csv('garments_worker_productivity.csv')
except FileNotFoundError:
    print("❌ Error: 'garments_worker_productivity.csv' not found.")
    print("Please ensure the dataset file is in the correct directory.")
    # Stop with a non-zero status so callers can detect the failure.
    # `raise SystemExit` is used instead of the `exit()` helper because that
    # helper is injected by the `site` module for interactive use and is not
    # guaranteed to be defined (for example under `python -S`).
    raise SystemExit(1)
else:
    # Only reached when read_csv completed without raising.
    print("✅ Dataset loaded successfully!")

# 3. Handle Missing Values
# Missing 'wip' entries are imputed with the column mean rather than dropping
# the column, so the feature remains available to the models.
if 'wip' in df.columns and df['wip'].isnull().any():
    # Assign the filled Series back to the column. Calling
    # fillna(inplace=True) on the `df['wip']` selection is a chained in-place
    # update: pandas 2.x warns about it, and with Copy-on-Write enabled it
    # modifies a temporary object and leaves `df` unchanged.
    df['wip'] = df['wip'].fillna(df['wip'].mean())
    print("✅ Missing values in 'wip' column handled by filling with the mean.")

# 4. Feature Engineering from Date
# Parse the raw 'date' values once, keep only the calendar month as a new
# 'month' column, and discard the original 'date' column afterwards.
parsed_dates = pd.to_datetime(df['date'])
df['month'] = parsed_dates.dt.month
df = df.drop(columns='date')
print("✅ 'month' feature created and 'date' column dropped.")

# 5. Clean Categorical Data
# Normalise the 'department' labels: trim surrounding whitespace, lower-case,
# then correct the known misspelling 'sweing'. Variants that differ only by
# surrounding whitespace (such as 'finishing ') are already unified by the
# strip step, so they need no separate replacement entry.
df['department'] = (
    df['department']
    .str.strip()
    .str.lower()
    .replace({'sweing': 'sewing'})
)
print("✅ 'department' column cleaned and standardized.")

# 6. One-Hot Encode the Categorical Columns
# These columns are treated as nominal, so each is expanded into indicator
# columns instead of integer labels (which would suggest an ordering).
# drop_first=True omits the first level of every encoded column.
categorical_cols = [
    'quarter',
    'department',
    'day',
]
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)
print("✅ Categorical features converted to numerical using one-hot encoding.")


# --- MODEL BUILDING AND EVALUATION ---

# 7. Splitting Data into Training and Testing Sets
# 'actual_productivity' is the target (y); every remaining column is a
# feature (X).
y = df['actual_productivity']
X = df.drop(columns='actual_productivity')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\n📊 Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

# 8. Initialize and Train Models
print("\n🚀 Training models...")

# Build the three regressors up front. The two ensemble models use 100
# estimators and a fixed seed.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42,
)

# Fit each model on the training split, in the same order as they were
# created, reporting progress after every fit.
training_plan = (
    (lr_model, "   - Linear Regression trained."),
    (rf_model, "   - Random Forest trained."),
    (xgb_model, "   - XGBoost trained."),
)
for estimator, done_message in training_plan:
    estimator.fit(X_train, y_train)
    print(done_message)

# 9. Evaluate Models
print("\n📈 Evaluating model performance...")
models = {
    "Linear Regression": lr_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model
}

# Metric column label -> scoring function, in the order the columns should
# appear in the comparison table.
metric_fns = {
    "MAE": mean_absolute_error,
    "MSE": mean_squared_error,
    "R² Score": r2_score,
}

# One row per model: its display name followed by every metric computed on
# the held-out test split.
performance_data = []
for name, model in models.items():
    predictions = model.predict(X_test)
    row = {"Model": name}
    for label, score in metric_fns.items():
        row[label] = score(y_test, predictions)
    performance_data.append(row)

# Tabulate the rows and print them between a header and a footer rule.
performance_df = pd.DataFrame(performance_data)
print(
    "\n--- Model Performance Comparison ---",
    performance_df.to_string(index=False),
    "------------------------------------",
    sep="\n",
)


# --- SAVE THE BEST MODEL ---

# 10. Identify the Best Model based on R² Score
# Find the row with the highest R² Score, then look up the fitted estimator
# registered under that row's model name.
best_row_label = performance_df['R² Score'].idxmax()
best_model_name = performance_df.loc[best_row_label, 'Model']
best_model_obj = models[best_model_name]

print(f"\n🏆 Best performing model is: {best_model_name} (based on R² Score)")

# 11. Save the model to a .pkl file
# The estimator is pickled into the current working directory.
model_filename = "best_productivity_model.pkl"
with open(model_filename, mode='wb') as model_file:
    pickle.dump(best_model_obj, model_file)

print(f"💾 Best model has been saved to '{model_filename}'.")

# Optional: Verify the model can be loaded
# Best-effort round-trip check: a failure is reported but does not stop the
# script. Only unpickle files you trust — pickle can execute arbitrary code
# while loading.
try:
    # The context manager guarantees the file handle is closed even when
    # unpickling fails part-way through.
    with open(model_filename, 'rb') as saved_model:
        loaded_model = pickle.load(saved_model)
except Exception as e:
    # Deliberately broad: unpickling can raise many exception types
    # (OSError, EOFError, AttributeError, ImportError, ...).
    print(f"❌ Error loading saved model: {e}")
else:
    print("✅ Model successfully loaded for verification.")



✅ Dataset loaded successfully!
✅ Missing values in 'wip' column handled by filling with the mean.
✅ 'month' feature created and 'date' column dropped.
✅ 'department' column cleaned and standardized.
✅ Categorical features converted to numerical using one-hot encoding.

📊 Data split into 957 training samples and 240 testing samples.

🚀 Training models...
   - Linear Regression trained.
   - Random Forest trained.
   - XGBoost trained.

📈 Evaluating model performance...

--- Model Performance Comparison ---
            Model      MAE      MSE  R² Score
Linear Regression 0.108824 0.021921  0.174430
    Random Forest 0.071849 0.013704  0.483890
          XGBoost 0.077149 0.017219  0.351514
------------------------------------

🏆 Best performing model is: Random Forest (based on R² Score)
💾 Best model has been saved to 'best_productivity_model.pkl'.
✅ Model successfully loaded for verification.
