# STEP 1: Import Libraries

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import joblib


# Step 2: Load the Preprocessed Dataset

In [16]:
df = pd.read_csv("../data/processed/cleaned_manufacturing_data.csv")

In [17]:
df.head()

Unnamed: 0,Injection_Temperature,Injection_Pressure,Cycle_Time,Cooling_Time,Material_Viscosity,Ambient_Temperature,Machine_Age,Operator_Experience,Maintenance_Hours,Temperature_Pressure_Ratio,...,Machine_Type_Type_C,Material_Grade_Premium,Material_Grade_Standard,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday,Parts_Per_Hour
0,0.47409,1.359149,-0.856562,0.72805,1.706789,1.843286,-1.040282,-0.715348,0.838407,-0.951369,...,-0.606621,-0.51558,-1.202076,-0.396746,-0.405147,-0.355142,2.380476,-0.400116,-0.42829,36.5
1,-0.168139,0.874835,-0.161894,0.901726,-0.49371,-0.124333,-0.270824,-0.894229,0.46356,-0.841954,...,-0.606621,-0.51558,0.831894,-0.396746,-0.405147,-0.355142,-0.420084,-0.400116,2.334869,29.9
2,0.624221,-0.011937,-1.910541,-1.05213,0.762931,0.859476,-0.937688,-0.773758,-0.223658,0.131838,...,-0.606621,-0.51558,0.831894,2.520504,-0.405147,-0.355142,-0.420084,-0.400116,-0.42829,56.9
3,1.499987,-0.734998,0.401027,0.510955,-1.568468,1.114538,0.344743,-0.810265,-0.09871,1.200456,...,-0.606621,1.939563,-1.202076,-0.396746,2.468238,-0.355142,-0.420084,-0.400116,-0.42829,31.0
4,-0.259886,0.64291,1.095695,-0.878454,0.641676,0.240041,-0.424715,-0.284574,-0.09871,-0.710657,...,-0.606621,1.939563,-1.202076,2.520504,-0.405147,-0.355142,-0.420084,-0.400116,-0.42829,15.0


# Step 3: Separate Features and Target

In [18]:
X = df.drop('Parts_Per_Hour', axis=1)
y = df['Parts_Per_Hour']

# Step 4: Train-Test Split

In [22]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# STEP 5: Feature Scaling

In [23]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


# Step 6: Train the Linear Regression Model

In [24]:
model = LinearRegression()
model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


# STEP 7: Evaluate on Test Data

In [25]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)


MSE: 12.28033510079426
RMSE: 3.5043309062921355
R2: 0.9058926590665994


# STEP 8: Feature Importance

In [26]:
coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_
}).sort_values(by="Coefficient", ascending=False)

coef_df


Unnamed: 0,Feature,Coefficient
11,Efficiency_Score,2.958891
3,Cooling_Time,2.666475
0,Injection_Temperature,2.29179
17,Material_Grade_Premium,1.290145
1,Injection_Pressure,1.198371
7,Operator_Experience,0.93518
18,Material_Grade_Standard,0.813022
24,Day_of_Week_Wednesday,0.623184
22,Day_of_Week_Thursday,0.340296
23,Day_of_Week_Tuesday,0.257527


# STEP 9: Save Model Artifacts

In [27]:
# Save model & scaler
joblib.dump(model, "../models/linear_regression_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

# Save feature order
joblib.dump(X.columns.tolist(), "../models/feature_columns.pkl")

# Save RAW test data for Notebook-4
X_test_raw.to_csv("../data/processed/X_test.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)
