# Step 1: Import Required Libraries

In [1]:
import pandas as pd 
import numpy as np

# Step 2: Load Raw Dataset

In [2]:
# Read the raw manufacturing samples into the working DataFrame.
raw_csv_path = "../data/raw/manufacturing_dataset_1000_samples.csv"
df = pd.read_csv(raw_csv_path)

# Step 3: Drop Unnecessary Columns

In [3]:
# Remove the Timestamp column, which this notebook treats as unnecessary.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# the cell safe to re-run: a second execution no longer raises KeyError
# because the column has already been removed.
df = df.drop(columns=['Timestamp'], errors='ignore')

In [5]:
# Preview the first five rows of the frame after the column drop.
df.head(n=5)

Unnamed: 0,Injection_Temperature,Injection_Pressure,Cycle_Time,Cooling_Time,Material_Viscosity,Ambient_Temperature,Machine_Age,Operator_Experience,Maintenance_Hours,Shift,Machine_Type,Material_Grade,Day_of_Week,Temperature_Pressure_Ratio,Total_Cycle_Time,Efficiency_Score,Machine_Utilization,Parts_Per_Hour
0,221.0,136.0,28.7,13.6,375.5,28.0,3.8,11.2,64,Evening,Type_B,Economy,Thursday,1.625,42.3,0.063,0.51,36.5
1,213.3,128.9,34.5,14.0,215.8,22.6,6.8,6.3,58,Night,Type_A,Standard,Wednesday,1.655,48.5,0.037,0.389,29.9
2,222.8,115.9,19.9,9.5,307.0,25.3,4.2,9.6,47,Day,Type_A,Standard,Monday,1.922,29.4,0.061,0.551,56.9
3,233.3,105.3,39.2,13.1,137.8,26.0,9.2,8.6,49,Evening,Type_A,Premium,Saturday,2.215,52.3,0.054,0.293,31.0
4,212.2,125.5,45.0,9.9,298.2,23.6,6.2,23.0,49,Night,Type_B,Premium,Monday,1.691,54.9,0.145,0.443,15.0


# Step 4: Handle Missing Values

In [6]:
# Fill gaps in the three numeric columns below with that column's mean.
# The result is assigned back to df because calling fillna(inplace=True) on
# df[col] acts on an intermediate Series: pandas 2.x emits a FutureWarning
# for that pattern and, with Copy-on-Write enabled, df itself is left unchanged.
mean_imputed_columns = ['Material_Viscosity', 'Ambient_Temperature', 'Operator_Experience']
column_means = df[mean_imputed_columns].mean()
# Passing a Series to fillna fills only the columns named in its index,
# so every other column is left untouched.
df = df.fillna(column_means)


# Step 5: Verify Missing Values Are Handled

In [9]:
# Count the remaining missing entries per column; every count should be 0.
df.isna().sum()

Injection_Temperature         0
Injection_Pressure            0
Cycle_Time                    0
Cooling_Time                  0
Material_Viscosity            0
Ambient_Temperature           0
Machine_Age                   0
Operator_Experience           0
Maintenance_Hours             0
Shift                         0
Machine_Type                  0
Material_Grade                0
Day_of_Week                   0
Temperature_Pressure_Ratio    0
Total_Cycle_Time              0
Efficiency_Score              0
Machine_Utilization           0
Parts_Per_Hour                0
dtype: int64

# Step 6: Encode Categorical Variables
- Convert categorical columns into indicator (dummy) columns using one-hot encoding, dropping the first level of each category.

In [10]:
# One-hot encode the categorical columns (e.g. Machine_Type, Material_Grade,
# Day_of_Week). drop_first=True omits the first level of each category.
df = pd.get_dummies(data=df, drop_first=True)


In [12]:
# Preview the first five rows to inspect the new indicator columns.
df.head(n=5)

Unnamed: 0,Injection_Temperature,Injection_Pressure,Cycle_Time,Cooling_Time,Material_Viscosity,Ambient_Temperature,Machine_Age,Operator_Experience,Maintenance_Hours,Temperature_Pressure_Ratio,...,Machine_Type_Type_B,Machine_Type_Type_C,Material_Grade_Premium,Material_Grade_Standard,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday
0,221.0,136.0,28.7,13.6,375.5,28.0,3.8,11.2,64,1.625,...,True,False,False,False,False,False,False,True,False,False
1,213.3,128.9,34.5,14.0,215.8,22.6,6.8,6.3,58,1.655,...,False,False,False,True,False,False,False,False,False,True
2,222.8,115.9,19.9,9.5,307.0,25.3,4.2,9.6,47,1.922,...,False,False,False,True,True,False,False,False,False,False
3,233.3,105.3,39.2,13.1,137.8,26.0,9.2,8.6,49,2.215,...,False,False,True,False,False,True,False,False,False,False
4,212.2,125.5,45.0,9.9,298.2,23.6,6.2,23.0,49,1.691,...,True,False,True,False,True,False,False,False,False,False


# Step 7: Define Features and Target Variable

In [13]:
# Parts_Per_Hour is the regression target; every other column is a feature.
target_column = 'Parts_Per_Hour'
y = df[target_column]
X = df.drop(columns=target_column)


# Step 8: Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardize all feature columns; `scaler` is kept so the same
# transformation can be saved and reused later.
# NOTE(review): the scaler is fitted on every row before the train/test split
# in Step 9, so test rows influence the scaling statistics. Consider fitting
# on the training rows only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


# Step 9: Train-Test Split

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# Step 10: Save Processed Dataset

In [16]:
# Rebuild a labelled frame from the scaled feature array, append the
# (unscaled) target with a fresh 0..n-1 index so the rows line up, and
# write the result to the processed-data folder.
features_df = pd.DataFrame(X_scaled, columns=X.columns)
target_series = y.reset_index(drop=True)
cleaned_df = pd.concat([features_df, target_series], axis=1)
cleaned_df.to_csv("../data/processed/cleaned_manufacturing_data.csv", index=False)

# Outcomes of Notebook 2
In Notebook 2, I preprocessed the data by dropping the unneeded Timestamp column, handling missing values, one-hot encoding the categorical variables, applying feature scaling, and saving the processed dataset so it is ready for regression modeling.

In [18]:
import joblib

# Persist the fitted scaler so the same scaling can be applied elsewhere.
scaler_path = "../models/scaler.pkl"
joblib.dump(scaler, scaler_path)

['../models/scaler.pkl']