In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import MultiColumnLabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import pickle
import numpy as np
import io
import base64
from datetime import datetime

In [None]:
# Read the garment-worker productivity CSV and preview the first rows
csv_path = '/content/garments_worker_productivity.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Correlate the numeric columns only. At this point the frame still contains
# text columns (e.g. date and department are converted in later cells), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corrMatrix = data.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(15, 15))
# annot=True prints each coefficient inside its heatmap cell
sns.heatmap(corrMatrix, annot=True, linewidths=0.5, ax=ax)
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

In [None]:
# Size of the frame as a (rows, columns) tuple
data.shape

In [None]:
# Column names, dtypes and non-null counts
data.info()

In [None]:
# Number of missing values in each column
data.isnull().sum()

In [None]:
# Remove the 'wip' column from the feature set.
# NOTE(review): presumably dropped because of the missing values counted in the previous cell - confirm.
# Rebinding with errors='ignore' (rather than inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=['wip'], errors='ignore')

In [None]:
# Convert the 'date' column to pandas datetimes, then display it
parsed_dates = pd.to_datetime(data['date'])
data = data.assign(date=parsed_dates)
data['date']

In [None]:
# Derive the calendar month from 'date', then drop the raw date column.
# The guard makes the cell idempotent: once 'date' has been removed, a re-run
# keeps the existing 'month' column instead of failing with KeyError.
if 'date' in data.columns:
    # assign() appends 'month' as the last column, matching the original column order
    data = data.assign(month=data['date'].dt.month).drop(columns=['date'])
data['month']

In [None]:
# Frequency of each raw department label (before the labels are normalised in the next cell)
data['department'].value_counts()

In [None]:
def _normalise_department(raw_label):
    """Map a raw department label to 'finishing' (spaces ignored) or, otherwise, 'sweing'."""
    if raw_label.replace(" ", "") == 'finishing':
        return 'finishing'
    return 'sweing'


# Collapse the department column to exactly two labels and re-count them
data['department'] = data['department'].map(_normalise_department)
data['department'].value_counts()

In [None]:
# Encode the frame with the project-local MultiColumnLabelEncoder helper.
# NOTE(review): presumably label-encodes the remaining text columns into integers - confirm against the helper module.
# NOTE(review): the fitted encoder (Mcle) is not saved with the models in the pickle cell below,
# so any inference code has to reproduce its category-to-integer mapping by hand.
Mcle=MultiColumnLabelEncoder.MultiColumnLabelEncoder()
data=Mcle.fit_transform(data)

In [None]:
# Target: the measured productivity; predictors: every other column
y = data['actual_productivity']
x = data.drop(columns=['actual_productivity'])
# Plain ndarray of the predictors, in the frame's column order
X = x.to_numpy()
X

In [None]:
# Split data into training and testing sets
# 20% of the rows are held out for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the linear-regression baseline and score it on the held-out rows
model_lr = LinearRegression().fit(x_train, y_train)
pred_test = model_lr.predict(x_test)

lr_mse = mean_squared_error(y_test, pred_test)
lr_mae = mean_absolute_error(y_test, pred_test)
lr_r2 = r2_score(y_test, pred_test)

print("test_MSE:", lr_mse)
print("test_MAE:", lr_mae)
print("R2_Score:{}".format(lr_r2))

In [None]:
# Train Random Forest Regressor
# random_state pins the forest's internal randomness so the metrics printed
# below, and the model pickled later, are the same on every Restart & Run All.
model_rf = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=42)
model_rf.fit(x_train, y_train)
pred = model_rf.predict(x_test)
# Held-out error metrics for comparison with the other two models
print("test_MSE:", mean_squared_error(y_test, pred))
print("test_MAE:", mean_absolute_error(y_test, pred))
print("R2_Score:{}".format(r2_score(y_test, pred)))

In [None]:
# Fit the gradient-boosted model and score it on the held-out rows
xgb_params = dict(n_estimators=200, max_depth=5, learning_rate=0.1)
model_xgb = xgb.XGBRegressor(**xgb_params)
model_xgb.fit(x_train, y_train)
pred3 = model_xgb.predict(x_test)

xgb_mse = mean_squared_error(y_test, pred3)
xgb_mae = mean_absolute_error(y_test, pred3)
xgb_r2 = r2_score(y_test, pred3)

print("test_MSE:", xgb_mse)
print("test_MAE:", xgb_mae)
print("R2_Score:{}".format(xgb_r2))

In [None]:
# Persist each fitted model to its own pickle file in the working directory
fitted_models = (
    ('model_lr.pkl', model_lr),
    ('model_rf.pkl', model_rf),
    ('model_xgb.pkl', model_xgb),
)
for pickle_name, fitted_model in fitted_models:
    with open(pickle_name, 'wb') as f:
        pickle.dump(fitted_model, f)

print("Models saved successfully!")

# REST API Implementation

The machine learning models trained in this notebook are used in our Flask-based REST API. The API provides endpoints for both single predictions and batch processing of employee productivity data.

## API Endpoints

1. **Health Check**: `GET /health`
2. **Single Prediction**: `POST /api/predict`
3. **Batch Processing**: `POST /api/batch` and `GET /api/batch/{id}`
4. **Metadata**: `GET /api/meta/departments` and `GET /api/meta/teams`

In [None]:
# Example function for making predictions
def _parse_team(team):
    """Return the team number from "Team 3", "3" or 3; fall back to 1 when none is found."""
    if isinstance(team, bool):
        # bool is a subclass of int but is never a meaningful team number
        return 1
    if isinstance(team, int):
        return team
    if isinstance(team, str):
        # First all-digit token, so "Team 3" and a bare "3" both yield 3
        for token in team.split():
            if token.isdigit():
                return int(token)
    return 1


def predict_productivity(input_data):
    """
    Make a prediction using the saved model

    Args:
        input_data (dict): Dictionary containing employee productivity features.
            Keys read: 'date' (datetime), 'department' ("Sewing"/"Finishing"),
            'team' ("Team N", "N" or an int), 'incentive_level'
            ("None"/"Low"/"Standard"/"High") and the numeric fields
            'targeted_productivity', 'smv_minutes', 'over_time_hours',
            'idle_time_minutes', 'idle_men_count', 'style_change_count',
            'worker_count'.

    Returns:
        dict: Prediction results including productivity value and category

    Raises:
        FileNotFoundError: if 'model_rf.pkl' is not in the working directory.
        TypeError: if one of the numeric fields is missing (None).
        ValueError: if one of the numeric fields cannot be converted to a number.
    """
    # Load the best model (Random Forest in this case)
    # SECURITY: pickle.load can execute arbitrary code - only load a model file
    # that was written by this notebook, never one from an untrusted source.
    with open('model_rf.pkl', 'rb') as f:
        model = pickle.load(f)

    # Date-derived features; every one falls back to 1 when no datetime is given.
    # NOTE(review): training encoded the categorical columns with
    # MultiColumnLabelEncoder; the hand-written encodings in this function are
    # assumed to match that mapping - confirm against the fitted encoder.
    date = input_data.get('date')
    if isinstance(date, datetime):
        month = date.month
        # Calendar quarter 1-4. (month - 1) keeps March in Q1 and December in Q4;
        # the earlier month // 3 + 1 pushed them into Q2 and a non-existent Q5.
        quarter = (month - 1) // 3 + 1
        # Day of week (1=Monday, 7=Sunday)
        day = date.weekday() + 1
    else:
        quarter = month = day = 1

    # Map department to numeric value; unknown labels are treated as Sewing
    department_map = {"Sewing": 1, "Finishing": 0}
    department = department_map.get(input_data.get('department'), 1)

    # Extract team number
    team = _parse_team(input_data.get('team'))

    # Map incentive level; unknown labels are treated as "Standard"
    incentive_map = {"None": 0, "Low": 1, "Standard": 2, "High": 3}
    incentive = incentive_map.get(input_data.get('incentive_level'), 2)

    # Prepare model input - the order follows the training frame's columns
    # (month was appended last during feature engineering)
    model_input = [
        quarter,
        department,
        day,
        team,
        float(input_data.get('targeted_productivity')),
        float(input_data.get('smv_minutes')),
        int(input_data.get('over_time_hours')),
        incentive,
        float(input_data.get('idle_time_minutes')),
        int(input_data.get('idle_men_count')),
        int(input_data.get('style_change_count')),
        int(input_data.get('worker_count')),
        month
    ]

    # Make prediction (a single row in, so take the first result)
    prediction = model.predict([model_input])[0]

    # Get category
    if prediction <= 0.3:
        category = "Below Average Productivity"
    elif prediction <= 0.8:
        category = "Medium Productivity"
    else:
        category = "High Productivity"

    return {
        "actual_productivity": float(prediction),
        "category": category
    }

In [None]:
# Build one example request payload and run it through the prediction helper
sample_input = dict(
    date=datetime(2023, 5, 15),
    department="Sewing",
    team="Team 3",
    targeted_productivity=75,
    smv_minutes=2.5,
    over_time_hours=1,
    incentive_level="Standard",
    idle_time_minutes=30,
    idle_men_count=1,
    style_change_count=2,
    worker_count=50,
)

prediction_result = predict_productivity(sample_input)
print(prediction_result)

In [None]:
# Generate example visualization
def generate_sample_visualization(data):
    """
    Show four views of the headline metrics in ``data`` on a 2x2 grid.

    Reads "targeted_productivity", "smv_minutes", "over_time_hours" and
    "idle_time_minutes" with ``data.get`` (a missing key counts as 0) and draws
    them as a bar chart, scatter plot, line plot and pie chart. The figure is
    displayed with ``plt.show()``; nothing is returned.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (bar_ax, scatter_ax), (line_ax, pie_ax) = axes

    # Metric keys paired with the labels shown on the charts
    metric_keys = ("targeted_productivity", "smv_minutes", "over_time_hours", "idle_time_minutes")
    categories = ['Targeted Productivity', 'SMV', 'Over Time', 'Idle Time']
    values = [data.get(key, 0) for key in metric_keys]
    positions = [1, 2, 3, 4]

    def _label(ax, title):
        # Axis captions shared by the bar, scatter and line panels
        ax.set_xlabel('Parameters')
        ax.set_ylabel('Values')
        ax.set_title(title)

    def _name_ticks(ax):
        # Replace the numeric x positions with the metric names
        ax.set_xticks(positions)
        ax.set_xticklabels(categories)

    # Top left: bar chart
    bar_ax.bar(categories, values, color=['blue', 'green', 'red', 'orange'])
    _label(bar_ax, 'Employee Productivity Parameters')

    # Top right: scatter plot
    scatter_ax.scatter(positions, values, s=100)
    _name_ticks(scatter_ax)
    _label(scatter_ax, 'Scatter Plot of Parameters')

    # Bottom left: line plot
    line_ax.plot(positions, values, marker='o', linestyle='-', linewidth=2)
    _name_ticks(line_ax)
    _label(line_ax, 'Line Plot of Parameters')

    # Bottom right: pie chart, drawn with an equal aspect ratio so it stays circular
    pie_ax.pie(values, labels=categories, autopct='%1.1f%%', startangle=90)
    pie_ax.axis('equal')
    pie_ax.set_title('Distribution of Parameters')

    plt.tight_layout()
    plt.show()

# Generate sample visualization
# Reuses the example request dict; only its four plotted metric keys are read
generate_sample_visualization(sample_input)

# Conclusion

This notebook demonstrates:

1. Data preprocessing and feature engineering for employee productivity prediction
2. Training and evaluation of three machine learning models:
   - Linear Regression
   - Random Forest Regressor
   - XGBoost Regressor
3. Model saving for use in the REST API
4. Example prediction code that mimics the API implementation

The best-performing model (Random Forest in the example above) is used in our Flask-based REST API to provide productivity predictions to the frontend application.