# Importing Libraries

In [None]:
import pandas as pd 
import numpy as np
import os

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Directory that is scanned with os.listdir for the raw parquet files to load.
train_folder = 'train'

# Loading Data

In [None]:
# Load every incoming-run parquet file found in train_folder into one frame.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the row order of the concatenated frame reproducible between runs.
incoming_run_files = sorted(f for f in os.listdir(train_folder) if 'incoming_run_data' in f)
incoming_dataframes = []
previous_run_id_list = []
seen_run_ids = set()  # set mirror of previous_run_id_list for O(1) duplicate lookups

for files in incoming_run_files:
    temp_df = pd.read_parquet(os.path.join(train_folder, files))
    print(temp_df["Tool ID"].unique())

    run_id_list = temp_df["Run ID"].unique().tolist()

    # A Run ID already seen in an earlier file is only reported; its rows are still kept.
    if not seen_run_ids.isdisjoint(run_id_list):
        print("Duplicate Run ID found in previous_run_id_list")

    seen_run_ids.update(run_id_list)
    previous_run_id_list.extend(run_id_list)
    incoming_dataframes.append(temp_df)

incoming_df = pd.concat(incoming_dataframes, ignore_index=True)

# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs('dataset', exist_ok=True)
incoming_df.to_parquet('dataset/incoming_run_data.parquet', index=False)

incoming_df

In [None]:
# Load every run-data parquet file (excluding the "incoming" files) found in
# train_folder into one frame. os.listdir returns entries in arbitrary order,
# so the names are sorted to make the concatenated row order reproducible.
run_files = sorted(f for f in os.listdir(train_folder) if 'run_data' in f and 'incoming' not in f)
run_dataframes = []
previous_run_id_list = []
seen_run_ids = set()  # set mirror of previous_run_id_list for O(1) duplicate lookups

for files in run_files:
    temp_df = pd.read_parquet(os.path.join(train_folder, files))
    print(temp_df["Tool ID"].unique())

    run_id_list = temp_df["Run ID"].unique().tolist()

    # A Run ID already seen in an earlier file is only reported; its rows are still kept.
    if not seen_run_ids.isdisjoint(run_id_list):
        print("Duplicate Run ID found in previous_run_id_list")

    seen_run_ids.update(run_id_list)
    previous_run_id_list.extend(run_id_list)
    run_dataframes.append(temp_df)

run_df = pd.concat(run_dataframes, ignore_index=True)

# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs('dataset', exist_ok=True)
run_df.to_parquet('dataset/run_data.parquet', index=False)

run_df

In [None]:
# Collect the metrology parquet files. os.listdir returns entries in arbitrary
# order, so the names are sorted to make the concatenated row order reproducible.
metrology_files = sorted(f for f in os.listdir(train_folder) if 'metrology_data' in f)

# Read and concatenate all parquet files
metrology_dataframes = [pd.read_parquet(os.path.join(train_folder, file)) for file in metrology_files]
metrology_df = pd.concat(metrology_dataframes, ignore_index=True)

# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs('dataset', exist_ok=True)
metrology_df.to_parquet('dataset/metrology_df.parquet', index=False)

metrology_df

# Data Processing

## 1. Incoming Run Data

### Finding Average Sensor Values

In [None]:
# Collapse the incoming-run time series to one row per
# (Tool ID, Run ID, Step ID, Sensor Name) group holding the mean sensor value
# and the elapsed time between the group's first and last time stamp.
group_keys = ["Tool ID", "Run ID", "Step ID", "Sensor Name"]
tmp_list = []

# Iterate the groupby lazily: building a list of every group frame first would
# hold a second copy of the whole dataset in memory.
for (temp_tool_id, temp_run_id, temp_step_id, temp_sensor_name), temp_df in incoming_df.groupby(by=group_keys):
    # Run-level attributes are read from the first row of the group.
    first_row = temp_df.iloc[0]
    temp_run_start_time = first_row["Run Start Time"]
    temp_run_end_time = first_row["Run End Time"]
    temp_process_step = first_row["Process Step"]

    sensor_avgs = temp_df["Sensor Value"].mean()

    # Span between earliest and latest sample, rounded to whole seconds.
    temp_time_taken = (temp_df["Time Stamp"].max() - temp_df["Time Stamp"].min()).total_seconds()
    temp_time_taken = int(round(temp_time_taken))

    tmp_list.append([
        temp_process_step,
        temp_tool_id,
        temp_run_id,
        temp_run_start_time,
        temp_run_end_time,
        temp_time_taken,
        temp_step_id,
        temp_sensor_name,
        sensor_avgs
    ])

columns = [
    "Process Step",
    "Tool ID",
    "Run ID",
    "Run Start Time",
    "Run End Time",
    "Time Taken (Seconds)",
    "Step ID",
    "Sensor Name",
    "Sensor Value (Average)"
]

incoming_avg_df = pd.DataFrame(tmp_list, columns=columns)

# Sort by the integer that follows the underscore so that e.g. "_10" sorts after "_2".
# NOTE(review): the key is applied to both sort columns, so this assumes every
# Step ID and Sensor Name value contains "_<digits>"; otherwise astype(int) fails - confirm.
incoming_avg_df = incoming_avg_df.sort_values(
    by=["Step ID", "Sensor Name"],
    key=lambda x: x.str.extract(r'_(\d+)')[0].astype(int)
)

# "Test" combines sensor and step into a single label ("<sensor>;<step>").
incoming_avg_df["Test"] = incoming_avg_df["Sensor Name"] + ";" + incoming_avg_df["Step ID"]
incoming_avg_df = incoming_avg_df.drop(columns=["Step ID", "Sensor Name"])

# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs('processed', exist_ok=True)
incoming_avg_df.to_parquet('processed/incoming_avg_df.parquet', index=False)

incoming_avg_df

### Pivot Dataframe

In [None]:
# Reshape the long table to wide form: one row per unique combination of the
# index columns and one column per "Test" label, filled with the averaged value.
incoming_pivot_index = ["Process Step", "Tool ID", "Run ID", "Run Start Time", "Run End Time"]
incoming_wide = incoming_avg_df.pivot(
    index=incoming_pivot_index,
    columns="Test",
    values="Sensor Value (Average)",
)
# Turn the index levels back into ordinary columns.
dataframe_incoming_pivot = incoming_wide.reset_index()

dataframe_incoming_pivot.to_parquet('processed/dataframe_incoming_pivot.parquet', index=False)

# Display only: the sorted copy is shown, the stored frame keeps its order.
dataframe_incoming_pivot.sort_values(by=["Run End Time"])

## 2. Run Data

### Finding Average Sensor Values

In [None]:
# Collapse the run time series to one row per
# (Tool ID, Run ID, Step ID, Sensor Name) group holding the mean sensor value,
# the group's first "Consumable Life" value and the elapsed time between the
# group's first and last time stamp.
group_keys = ["Tool ID", "Run ID", "Step ID", "Sensor Name"]
tmp_list = []

# Iterate the groupby lazily: building a list of every group frame first would
# hold a second copy of the whole dataset in memory.
for (temp_tool_id, temp_run_id, temp_step_id, temp_sensor_name), temp_df in run_df.groupby(by=group_keys):
    # Run-level attributes are read from the first row of the group.
    first_row = temp_df.iloc[0]
    temp_run_start_time = first_row["Run Start Time"]
    temp_run_end_time = first_row["Run End Time"]
    temp_process_step = first_row["Process Step"]
    temp_consumable_life = first_row["Consumable Life"]

    sensor_avgs = temp_df["Sensor Value"].mean()

    # Span between earliest and latest sample, rounded to whole seconds.
    temp_time_taken = (temp_df["Time Stamp"].max() - temp_df["Time Stamp"].min()).total_seconds()
    temp_time_taken = int(round(temp_time_taken))

    tmp_list.append([
        temp_process_step,
        temp_tool_id,
        temp_run_id,
        temp_run_start_time,
        temp_run_end_time,
        temp_time_taken,
        temp_consumable_life,
        temp_step_id,
        temp_sensor_name,
        sensor_avgs
    ])

columns = [
    "Process Step",
    "Tool ID",
    "Run ID",
    "Run Start Time",
    "Run End Time",
    "Time Taken (Seconds)",
    "Consumable Life",
    "Step ID",
    "Sensor Name",
    "Sensor Value (Average)"
]

run_avg_df = pd.DataFrame(tmp_list, columns=columns)

# Sort by the integer step number so that "Step_10" sorts after "Step_2".
run_avg_df = run_avg_df.sort_values(
    by=["Step ID"],
    key=lambda x: x.str.extract(r'Step_(\d+)')[0].astype(int)
)

# "Test" combines sensor and step into a single label ("<sensor>;<step>").
run_avg_df["Test"] = run_avg_df["Sensor Name"] + ";" + run_avg_df["Step ID"]
run_avg_df = run_avg_df.drop(columns=["Step ID", "Sensor Name"])

# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs('processed', exist_ok=True)
run_avg_df.to_parquet('processed/run_avg_df.parquet', index=False)

run_avg_df

### Pivot Dataframe

In [None]:
# Reshape the long table to wide form: one row per unique combination of the
# index columns and one column per "Test" label, filled with the averaged value.
run_pivot_index = [
    "Process Step",
    "Tool ID",
    "Run ID",
    "Run Start Time",
    "Run End Time",
    "Consumable Life",
]
run_wide = run_avg_df.pivot(
    index=run_pivot_index,
    columns="Test",
    values="Sensor Value (Average)",
)
# Turn the index levels back into ordinary columns.
dataframe_run_pivot = run_wide.reset_index()

dataframe_run_pivot.to_parquet('processed/dataframe_run_pivot.parquet', index=False)

# Display only: the sorted copy is shown, the stored frame keeps its order.
dataframe_run_pivot.sort_values(by=["Run End Time"])

In [7]:
# Reload the pivoted run data from its parquet file (replaces any in-memory
# frame of the same name) and display it ordered by tool, then run end time.
run_pivot_path = 'processed/dataframe_run_pivot.parquet'
dataframe_run_pivot = pd.read_parquet(run_pivot_path)
dataframe_run_pivot.sort_values(by=["Tool ID", "Run End Time"])

Unnamed: 0,Process Step,Tool ID,Run ID,Run Start Time,Run End Time,Consumable Life,Sensor_A;Step_0,Sensor_A;Step_1,Sensor_A;Step_10,Sensor_A;Step_11,...,Sensor_O;Step_11,Sensor_O;Step_12,Sensor_O;Step_2,Sensor_O;Step_3,Sensor_O;Step_4,Sensor_O;Step_5,Sensor_O;Step_6,Sensor_O;Step_7,Sensor_O;Step_8,Sensor_O;Step_9
128,22c8716b-1c14-525a-8c0a-0473ec7aa99e,08e35986-3a0a-5274-8990-4ad303be9d5f,86d0f032-9b57-55d9-9f79-c7989487892c,2024-01-01 00:00:00,2024-01-01 00:12:35,386.472595,-0.040958,0.006972,-0.012798,0.003125,...,968.705078,796.152527,1069.617310,1111.568604,1176.203979,1174.169067,1175.103516,1172.235107,1177.629517,1125.265503
172,22c8716b-1c14-525a-8c0a-0473ec7aa99e,08e35986-3a0a-5274-8990-4ad303be9d5f,b62dae2c-96ee-5452-9a99-40f2f5bde939,2024-01-01 00:17:35,2024-01-01 00:30:10,396.996674,0.003778,0.004490,-0.008127,-0.003675,...,996.557190,833.455383,1102.396851,1154.035156,1216.873657,1216.618164,1214.511719,1217.432251,1214.991333,1173.361206
38,22c8716b-1c14-525a-8c0a-0473ec7aa99e,08e35986-3a0a-5274-8990-4ad303be9d5f,2e42873c-6f98-5a00-a3b8-f26bd47dd4f2,2024-01-01 00:35:10,2024-01-01 00:47:45,62.887051,-0.040664,0.006137,0.030027,-0.005485,...,977.835327,816.467773,1081.037476,1129.236572,1190.228149,1189.015625,1188.604736,1187.006836,1189.207397,1144.375122
70,22c8716b-1c14-525a-8c0a-0473ec7aa99e,08e35986-3a0a-5274-8990-4ad303be9d5f,51ebf217-bb23-5a3c-be5a-db7b409cbcf1,2024-01-01 01:10:20,2024-01-01 01:22:55,289.107483,-0.023646,0.005866,0.016745,0.007834,...,1012.733398,818.252625,1128.815430,1160.368652,1222.948486,1228.408447,1225.203857,1228.098633,1224.922485,1179.987183
215,22c8716b-1c14-525a-8c0a-0473ec7aa99e,08e35986-3a0a-5274-8990-4ad303be9d5f,f55e5f9a-25b6-58ac-8a6c-38cf9e8498cc,2024-01-01 01:27:55,2024-01-01 01:40:30,388.592682,0.033246,0.013838,0.010992,0.003847,...,901.558655,732.328186,990.509277,1034.204224,1097.134277,1089.176880,1087.932983,1094.075195,1090.421875,1049.355225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4062,22c8716b-1c14-525a-8c0a-0473ec7aa99e,e4838ac1-3788-50bf-bbca-04cf339ec369,a947787c-f0f1-576b-89bc-9b0e0259ac5c,2024-01-04 03:10:20,2024-01-04 03:22:55,326.794495,-0.008041,-0.017980,0.009752,0.010693,...,974.690125,825.583557,1061.758179,1123.687256,1185.352539,1185.476440,1185.598267,1185.295166,1185.523071,1140.474854
3937,22c8716b-1c14-525a-8c0a-0473ec7aa99e,e4838ac1-3788-50bf-bbca-04cf339ec369,14c91ad0-9961-5a5e-9aea-670402590ff8,2024-01-04 03:27:55,2024-01-04 03:40:30,241.537689,0.025563,-0.000443,0.005672,0.009067,...,961.108643,781.569397,1069.119263,1108.110474,1169.453003,1169.316895,1169.719971,1168.900879,1169.577515,1125.297241
4098,22c8716b-1c14-525a-8c0a-0473ec7aa99e,e4838ac1-3788-50bf-bbca-04cf339ec369,d295c1a0-bb64-5f4b-be2f-4026b7181772,2024-01-04 04:03:05,2024-01-04 04:15:40,408.532959,-0.046212,-0.004864,0.032343,0.000110,...,910.844482,739.432800,1014.096130,1050.317749,1108.215576,1108.217041,1107.992432,1108.000244,1108.060303,1066.401367
3982,22c8716b-1c14-525a-8c0a-0473ec7aa99e,e4838ac1-3788-50bf-bbca-04cf339ec369,4d56c4c2-a386-5e81-bb8e-44c6a6957a0e,2024-01-04 04:20:40,2024-01-04 04:33:15,151.902420,0.046696,0.000920,-0.002972,0.006100,...,1034.175659,837.819092,1152.473755,1193.076904,1258.339478,1258.333130,1258.595459,1258.417969,1258.269287,1210.724731


## 3. Merging Data

### Merge metrology and run data

In [None]:
# Attach the pivoted run features to every metrology row whose Run ID exists in
# the run data. A single left merge replaces the former per-row merge + concat
# loop, which was quadratic in the number of rows and degraded column dtypes by
# rebuilding each row from an object Series.
merge_keys = ["Run ID", "Run Start Time", "Run End Time"]
unique_run_ids = metrology_df["Run ID"].unique().tolist()
known_run_ids = set(dataframe_run_pivot["Run ID"])

# Runs present in metrology but absent from the run data are reported and skipped.
for run_id in unique_run_ids:
    if run_id not in known_run_ids:
        print(f"Run ID {run_id} not found in run data.")

matched_metrology_df = metrology_df[metrology_df["Run ID"].isin(known_run_ids)]

# Keep the row order of the former loop: rows grouped by Run ID in order of first
# appearance, original order within each run (stable sort on the factorize codes).
run_order = np.argsort(pd.factorize(matched_metrology_df["Run ID"])[0], kind="stable")
matched_metrology_df = matched_metrology_df.iloc[run_order]

# Left merge: a metrology row whose start/end times do not match its run keeps
# its own values and gets NaN for the run columns.
merge_1 = pd.merge(
    matched_metrology_df,
    dataframe_run_pivot,
    how="left",
    on=merge_keys
)

merge_1

In [None]:
# Attach the pivoted incoming-run features to every row of merge_1 whose Run ID
# exists in the incoming data. A single left merge replaces the former per-row
# merge + concat loop, which was quadratic in the number of rows.

# The run times are already carried by merge_1, so drop them from the incoming side.
dataframe_incoming_pivot_temp = dataframe_incoming_pivot.drop(columns=["Run Start Time", "Run End Time"])

unique_run_ids = merge_1["Run ID"].unique().tolist()
known_incoming_run_ids = set(dataframe_incoming_pivot_temp["Run ID"])

# Runs without incoming data are reported and skipped. The message now names
# the incoming run data, which is the table actually being searched here.
for run_id in unique_run_ids:
    if run_id not in known_incoming_run_ids:
        print(f"Run ID {run_id} not found in incoming run data.")

matched_merge_1 = merge_1[merge_1["Run ID"].isin(known_incoming_run_ids)]

# Keep the row order of the former loop: rows grouped by Run ID in order of first
# appearance, original order within each run (stable sort on the factorize codes).
run_order = np.argsort(pd.factorize(matched_merge_1["Run ID"])[0], kind="stable")
matched_merge_1 = matched_merge_1.iloc[run_order]

final_df = pd.merge(
    matched_merge_1,
    dataframe_incoming_pivot_temp,
    how="left",
    on="Run ID"
)

# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs('processed', exist_ok=True)
final_df.to_parquet('processed/final_df.parquet', index=False)

final_df

# Data Splitting

In [None]:
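# TODO: Split the data by Tool ID. The model cells below use X_train, X_test and y_train, which are not defined in any cell above.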

# Model Building and Training

## 1. Linear Regression

In [None]:
# Fit a linear-regression model on X_train / y_train, then predict on X_test.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

## 2. Extra Trees Regressor

In [None]:
# Hyper-parameters for the Extra Trees regressor.
model_config = {
    'n_estimators': 300,
    'max_depth': 20,
    # 'gini' / 'entropy' are classification criteria and are rejected by
    # ExtraTreesRegressor; use a regression criterion instead.
    # (scikit-learn releases before 1.0 spell 'squared_error' as 'mse'.)
    'criterion': 'squared_error',  # Alternatives: 'absolute_error', 'friedman_mse', 'poisson'
    'random_state': 42,
    'bootstrap': True
}

# Initialize the model with the specified configuration
et_model = ExtraTreesRegressor(**model_config)

# Fit on X_train / y_train, then predict on X_test.
et_model.fit(X_train, y_train)
y_pred_et = et_model.predict(X_test)