In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
import pickle


# -----------------------------
# MultiColumnLabelEncoder (FIX)
# -----------------------------
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : iterable of column labels, optional
        Columns to encode. When ``None`` (the default), every object-dtype
        column of the frame passed to ``fit`` is encoded.

    Attributes
    ----------
    encoders : dict
        Maps each fitted column label to its fitted ``LabelEncoder``.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per target column of ``X`` and return ``self``.

        ``y`` is accepted only for scikit-learn-style call compatibility and
        is ignored.
        """
        # Iterating ``None`` used to raise TypeError even though ``None`` is the
        # declared default, so fall back to the object-dtype columns of X.
        # ``is None`` (not truthiness) because ``columns`` may be a pandas
        # Index, whose truth value is ambiguous.
        if self.columns is None:
            target_cols = X.select_dtypes(include="object").columns
        else:
            target_cols = self.columns

        # Build into a fresh dict so a re-fit never keeps encoders for columns
        # that are no longer targeted.
        fitted = {}
        for col in target_cols:
            le = LabelEncoder()
            le.fit(X[col])
            fitted[col] = le
        self.encoders = fitted
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose fitted columns hold integer codes.

        ``X`` itself is not modified. Columns without a fitted encoder are
        passed through unchanged.
        """
        X = X.copy()
        for col, le in self.encoders.items():
            X[col] = le.transform(X[col])
        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded copy; ``y`` is ignored."""
        return self.fit(X).transform(X)


In [None]:
import pandas as pd

# Raw garment-worker productivity CSV.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
csv_path = r"C:\Users\hp\Desktop\SmartinternZ\Dataset\garments_worker_productivity.csv"
data = pd.read_csv(csv_path)

# Quick sanity check: first rows, then (rows, columns).
for preview in (data.head(), data.shape):
    print(preview)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation is computed on the numeric columns only.
numeric_only = data.select_dtypes(include=['number'])
corrMatrix = numeric_only.corr()

# Large square canvas so the annotated coefficients have room.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrMatrix, ax=ax, annot=True, linewidths=.5)
plt.show()


In [None]:
# Plain-text summary statistics of the frame.
print(data.describe())

In [None]:
# Summary statistics again, shown as the cell's output value.
data.describe()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column names, non-null counts and dtypes. The method must be called;
# a bare `data.info` only displays the bound-method repr.
data.info()

In [None]:
# Missing-value count per column.
data.isna().sum()

In [None]:
# Remove the `wip` column.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['wip'], errors='ignore')

In [None]:
# Convert the `date` column to datetime so the `.dt` accessors are available.
parsed_dates = pd.to_datetime(data["date"])
data["date"] = parsed_dates

In [None]:
# Inspect the parsed date column.
data["date"]

In [None]:
# Replace the full date with its month number (1-12).
# The guard keeps the cell re-runnable: once `date` has been consumed, a second
# execution leaves the frame as-is instead of raising KeyError.
if 'date' in data.columns:
    data['month'] = data['date'].dt.month
    data = data.drop(columns=['date'])


In [None]:
# Inspect the derived month feature.
data["month"]

In [None]:
# Frequency of each department label as it appears before the cleanup cell below.
data['department'].value_counts()

In [None]:
def _canonical_department(raw_label):
    """Return 'finishing' when the label equals it once spaces are removed, else 'sweing'."""
    if raw_label.replace(" ", "") == 'finishing':
        return 'finishing'
    # NOTE(review): every other value, whatever it is, collapses to 'sweing';
    # the spelling presumably mirrors the raw data - confirm before changing it.
    return 'sweing'


data['department'] = data['department'].apply(_canonical_department)

In [None]:
# Frequency of each department label after the cleanup cell above.
data['department'].value_counts()

In [None]:
# Integer-encode every object-dtype column with MultiColumnLabelEncoder.
categorical_cols = data.select_dtypes(include='object').columns

# Fit first, then transform; `Mcle` keeps the fitted per-column encoders.
# NOTE(review): none of the cells visible here persist `Mcle`; anything that
# loads the pickled model will need the same encodings - confirm.
Mcle = MultiColumnLabelEncoder(columns=categorical_cols).fit(data)
data = Mcle.transform(data)


In [None]:
# Column dtypes after encoding.
data.dtypes


In [None]:
# Target is `actual_productivity`; every other column is a feature.
target_col = 'actual_productivity'
y = data[target_col]
x = data.drop(columns=[target_col])


In [None]:

# Convert the feature frame to a plain NumPy array.
# The guard makes the cell re-runnable: once `x` is already an ndarray it has no
# `.to_numpy()`, so an unguarded second run raises AttributeError.
if hasattr(x, "to_numpy"):
    x = x.to_numpy()

In [None]:

# Display the feature matrix produced by the conversion cell above.
x

In [None]:
# Splitting the data
from sklearn.model_selection import train_test_split

# 80 % train / 20 % test, with a fixed seed so the split is reproducible.
split_parts = train_test_split(x, y, train_size=0.8, random_state=0)
x_train, x_test, y_train, y_test = split_parts


In [None]:
from sklearn.linear_model import LinearRegression

# Linear-regression model with default settings (fitted in a later cell).
model_lr = LinearRegression()


In [None]:
# Upper-case aliases for the split arrays; the remaining cells use these names.
X_train = x_train
X_test = x_test

# Train the linear model on the training split.
model_lr.fit(X_train, y_train)


In [None]:
# Held-out predictions of the linear model.
pred_test = model_lr.predict(X_test)

# Compute the three test metrics first, then report them.
lr_mse = mean_squared_error(y_test, pred_test)
lr_mae = mean_absolute_error(y_test, pred_test)
lr_r2 = r2_score(y_test, pred_test)

print("test_MSE:", lr_mse)
print("test_MAE:", lr_mae)
print(f"R2_score:{lr_r2}")


In [None]:
from sklearn.ensemble import RandomForestRegressor

# 200 shallow trees (depth <= 5).
# random_state pins the forest's bootstrap/feature sampling so the reported
# metrics are reproducible across runs; 0 matches the seed used for the split.
model_rf = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=0)


In [None]:
# Train the random forest on the training split.
model_rf.fit(X_train, y_train)


In [None]:
# Held-out predictions of the random forest.
pred = model_rf.predict(X_test)

# Error metrics share one print format; R2 keeps its own.
for label, metric_fn in (
    ("test_MSE:", mean_squared_error),
    ("test_MAE:", mean_absolute_error),
):
    print(label, metric_fn(y_test, pred))
print(f"R2_score:{r2_score(y_test, pred)}")


In [None]:
import xgboost as xgb

# Gradient-boosted trees: 200 rounds, depth-5 trees, learning rate 0.1.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 5,
    "learning_rate": 0.1,
}
model_xgb = xgb.XGBRegressor(**xgb_params)


In [None]:
# Train the XGBoost regressor on the training split.
model_xgb.fit(X_train, y_train)


In [None]:
# Print the estimator's text representation.
print(model_xgb)


In [None]:
# Held-out predictions of the XGBoost model.
pred3 = model_xgb.predict(X_test)

# Compute the three test metrics first, then report them.
xgb_mse = mean_squared_error(y_test, pred3)
xgb_mae = mean_absolute_error(y_test, pred3)
xgb_r2 = r2_score(y_test, pred3)

print("test_MSE:", xgb_mse)
print("test_MAE:", xgb_mae)
print(f"R2_score:{xgb_r2}")


In [None]:
# Recap: linear-regression test metrics (repeats the earlier evaluation cell).
pred_test = model_lr.predict(X_test)

for label, metric_fn in (
    ("test_MSE:", mean_squared_error),
    ("test_MAE:", mean_absolute_error),
):
    print(label, metric_fn(y_test, pred_test))
print(f"R2_score:{r2_score(y_test, pred_test)}")


In [None]:
# Recap: random-forest test metrics (repeats the earlier evaluation cell).
pred = model_rf.predict(X_test)

rf_mse = mean_squared_error(y_test, pred)
rf_mae = mean_absolute_error(y_test, pred)
rf_r2 = r2_score(y_test, pred)

print("test_MSE:", rf_mse)
print("test_MAE:", rf_mae)
print(f"R2_score:{rf_r2}")


In [None]:
# Random-forest test metrics once more.
# NOTE(review): this cell is identical to the one directly above it.
pred = model_rf.predict(X_test)

for label, metric_fn in (
    ("test_MSE:", mean_squared_error),
    ("test_MAE:", mean_absolute_error),
):
    print(label, metric_fn(y_test, pred))
print(f"R2_score:{r2_score(y_test, pred)}")


In [None]:
# Recap: XGBoost test metrics (repeats the earlier evaluation cell).
pred3 = model_xgb.predict(X_test)

xgb_mse = mean_squared_error(y_test, pred3)
xgb_mae = mean_absolute_error(y_test, pred3)
xgb_r2 = r2_score(y_test, pred3)

print("test_MSE:", xgb_mse)
print("test_MAE:", xgb_mae)
print(f"R2_score:{xgb_r2}")


In [None]:
import pickle
import os

# Show where the relative path below will resolve.
print("Current working directory:", os.getcwd())

# Serialise the trained XGBoost model into the working directory.
with open("gwp.pkl", "wb") as model_file:
    pickle.dump(model_xgb, model_file)

print("Model saved successfully")


In [None]:
# Display the kernel's current working directory.
os.getcwd()


In [None]:
import pickle
import os

# Second copy of the model, written into the Flask project tree.
# NOTE(review): absolute, machine-specific path, and it targets the `templates`
# folder - confirm this is where the app actually loads the model from.
save_path = r"C:\Users\hp\Desktop\SmartinternZ\Flask\templates\gwp.pkl"

print("Saving model to:", save_path)
print("Model type:", type(model_xgb))

with open(save_path, "wb") as model_file:
    pickle.dump(model_xgb, model_file)
    # Push Python's buffer to the OS, then ask the OS to commit it to disk
    # before the file is closed.
    model_file.flush()
    file_descriptor = model_file.fileno()
    os.fsync(file_descriptor)

print("Save completed")
