# **Importing Libraries**

In [254]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# **Importing Dataset**

In [255]:
def import_dataset(path: str ,nrows: int | None = None) -> pd.DataFrame:
    train_df = pd.read_csv(path, nrows=nrows)

    machine_area_df = pd.read_csv("dataset/Machine-Area.csv")
    machine_list_df = pd.read_csv("dataset/Machine-List.csv")
    area_list_df = pd.read_csv("dataset/Area-List.csv")

    machine_area_df['Last Maintenance'] = pd.to_datetime(machine_area_df['Last Maintenance'])
    machine_area_filtered_df = machine_area_df.sort_values(by=['ID_Area', 'ID_Mesin', 'Last Maintenance'], ascending=[True, True, False])
    machine_area_filtered_df = machine_area_filtered_df.drop_duplicates(subset=['ID_Area', 'ID_Mesin'], keep='first')

    combined_df = pd.merge(train_df, area_list_df, on='ID_Area', how='left')
    combined_df = pd.merge(train_df, machine_list_df, left_on='Machine', right_on='ID_Mesin', how='left')
    combined_df = pd.merge(train_df, machine_area_filtered_df, left_on=['Machine', 'ID_Area'], right_on=['ID_Mesin', 'ID_Area'], how='left')

    return combined_df
    

In [256]:
train_df = import_dataset(
    path="dataset/train.csv", 
    nrows=200_000)

# **Data Cleaning**

## Identify Missing Data

In [257]:
missing = train_df.isnull().sum()
missing_percentage = missing / len(train_df) * 100
print(missing_percentage)

ID_Transaction               0.0000
temperature_10H_max (°C)     2.8675
temperature_10H_min (°C)    19.1205
temperature-1                2.9615
temperature-2               11.4110
temperature-3                3.5470
apparent_temperature_max    21.2770
apparent_temperature_min    20.1760
ID_Area                     17.3180
Machine                     16.8720
timestamp                    0.0000
humidity                    19.4545
Voltage-L                    9.7585
Voltage-R                    6.8485
Voltage-M                    3.4415
Current-M                    1.0965
Current-R                    1.3795
Current-T                   22.6750
RPM                         11.0755
RPM-1                       13.3295
RPM-2                       18.6955
RPM-3                       18.3175
Vibration-1                  5.3790
Vibration-2                 15.0745
Power                       12.9790
Power_Backup                 3.3195
Status                       0.0000
Breakdown Category          

## Handling Missing Data

In [258]:
columns_to_impute = missing_percentage[missing_percentage < 5].index
for col in columns_to_impute:
    if train_df[col].dtype in ['float64', 'int64']:
        train_df[col] = train_df[col].fillna(train_df[col].median())

In [259]:
def random_sample_impute(series):
    non_missing = series.dropna()
    return series.apply(lambda x: np.random.choice(non_missing) if pd.isnull(x) else x)

In [260]:
from sklearn.impute import KNNImputer
def knn_impute_single_column(series, n_neighbors=2):
    series_df = series.to_frame()
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_array = knn_imputer.fit_transform(series_df)
    return pd.Series(imputed_array.ravel(), index=series.index, name=series.name)

In [271]:
moderate_missingness_cols = missing_percentage[(missing_percentage >= 5) & (missing_percentage <= 23)].index
numerical_cols = [col for col in moderate_missingness_cols if train_df[col].dtype in ['float64', 'int64']]

for col in numerical_cols:
    train_df[col] = random_sample_impute(train_df[col])
    print(f"Imputed {col}")

In [272]:
missing = train_df.isnull().sum()
missing_percentage = missing / len(train_df) * 100
print(missing_percentage)

ID_Transaction               0.0000
temperature_10H_max (°C)     0.0000
temperature_10H_min (°C)     0.0000
temperature-1                0.0000
temperature-2                0.0000
temperature-3                0.0000
apparent_temperature_max     0.0000
apparent_temperature_min     0.0000
ID_Area                     17.3180
Machine                     16.8720
timestamp                    0.0000
humidity                     0.0000
Voltage-L                    0.0000
Voltage-R                    0.0000
Voltage-M                    0.0000
Current-M                    0.0000
Current-R                    0.0000
Current-T                    0.0000
RPM                          0.0000
RPM-1                        0.0000
RPM-2                        0.0000
RPM-3                        0.0000
Vibration-1                  0.0000
Vibration-2                  0.0000
Power                        0.0000
Power_Backup                 3.3195
Status                       0.0000
Breakdown Category          

In [273]:
train_df['Age'] = train_df['Age'].fillna(-1)

## Dealing with Outliers

## Remove Duplicates

## Feature Engineering

# **Data Preprocessing**

## Feature Scaling

In [274]:
numerical_cols = [col for col in train_df.columns if train_df[col].dtype in ['float64', 'int64']]
scaler = StandardScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])


## Handling Skewness

In [264]:
skewness = train_df[numerical_cols].skew()
print(skewness)

temperature_10H_max (°C)     4.254057
temperature_10H_min (°C)    -2.460294
temperature-1                0.831635
temperature-2                0.592190
temperature-3               15.932065
apparent_temperature_max     0.597290
apparent_temperature_min    -0.599373
humidity                     0.960080
Voltage-L                   -0.965031
Voltage-R                    0.970451
Voltage-M                   -1.014719
Current-M                   -0.859843
Current-R                    0.859755
Current-T                    0.919939
RPM                          0.979914
RPM-1                        0.970137
RPM-2                        0.983558
RPM-3                        0.991424
Vibration-1                 -0.965764
Vibration-2                 -0.971846
Power                        0.975809
Age                          0.068949
dtype: float64


In [265]:
def skewness_transform(df):
    transformed_data = {}
    numerical_cols = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
    for col in numerical_cols:
        if skewness[col] < -0.5:
            max_val = df[col].max()
            reflected = max_val - df[col]
            transformed_data[col] = np.log1p(reflected) 
        elif skewness[col] > 0.5:  
            transformed_data[col] = np.log1p(df[col])
        else:
            transformed_data[col] = df[col]
    transformed_df = pd.DataFrame(transformed_data)
    return transformed_df

In [266]:
# transformed_df = skewness_transform(train_df)
# skewness = transformed_df.skew()
# print(skewness)

In [267]:
# train_df[numerical_cols] = transformed_df

## Feature Encoding

## Handling Imbalanced Dataset

## Dimensionality Reduction

### bisa pake feature hasher buat categorical

In [323]:
from tensorflow.keras.utils import to_categorical
X = train_df[numerical_cols]
y = train_df['Status'].map({'Normal': 0, 'Warning': 1, 'Breakdown': 2})

# y_one_hot = to_categorical(y, num_classes=3)


In [324]:
# from sklearn.feature_selection import SelectKBest, f_classif

# selector = SelectKBest(score_func=f_classif, k=3)
# X_reduced = selector.fit_transform(X, y)


In [325]:
# X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

In [347]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Define an autoencoder
input_dim = X.shape[1]
encoding_dim = 4  # Desired reduced dimensionality

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile and train
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X, X, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)

# Extract the encoder model
encoder_model = Model(inputs=input_layer, outputs=encoder)
X_reduced = encoder_model.predict(X)


Epoch 1/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 1.0014 - val_loss: 0.9091
Epoch 2/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.8904 - val_loss: 0.8989
Epoch 3/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.8844 - val_loss: 0.8948
Epoch 4/50
[1m2249/5000[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m2s[0m 854us/step - loss: 0.8761

KeyboardInterrupt: 

In [342]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# **Modelling & Validation**

In [343]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix
def precision_recall_f1(tp, fp, fn):
    # Calculate Precision
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    # Calculate Recall
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # Calculate F1 Score
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76     24679
           1       0.13      0.00      0.00      7679
           2       0.25      0.00      0.00      7642

    accuracy                           0.62     40000
   macro avg       0.33      0.33      0.25     40000
weighted avg       0.45      0.62      0.47     40000



In [346]:
report = classification_report(y_test, y_pred, output_dict=True)
macro_f1 = report['macro avg']['f1-score']
print(f"Macro F1-Score: {macro_f1}")

Macro F1-Score: 0.2548776945191021


In [348]:
def max_score_finder(num_dim):
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(num_dim, activation='relu')(input_layer)
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)
    encoder_model = Model(inputs=input_layer, outputs=encoder)
    X_reduced = encoder_model.predict(X)
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_f1 = report['macro avg']['f1-score']
    return macro_f1

In [351]:
scores = []
for i in range(1, 21):
    scores.append(max_score_finder(i))

Epoch 1/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 793us/step - loss: 1.0665 - val_loss: 0.9884
Epoch 2/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 730us/step - loss: 0.9776 - val_loss: 0.9874
Epoch 3/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 774us/step - loss: 0.9738 - val_loss: 0.9870
Epoch 4/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 722us/step - loss: 0.9734 - val_loss: 0.9866
Epoch 5/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 761us/step - loss: 0.9799 - val_loss: 0.9862
Epoch 6/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 769us/step - loss: 0.9727 - val_loss: 0.9859
Epoch 7/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 774us/step - loss: 0.9686 - val_loss: 0.9855
Epoch 8/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 719us/step - loss: 0.9777 - val_loss: 0.9850
Epoch 9/

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 1.0384 - val_loss: 0.9633
Epoch 2/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.9496 - val_loss: 0.9596
Epoch 3/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.9436 - val_loss: 0.9580
Epoch 4/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 854us/step - loss: 0.9461 - val_loss: 0.9570
Epoch 5/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 731us/step - loss: 0.9439 - val_loss: 0.9562
Epoch 6/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 726us/step - loss: 0.9454 - val_loss: 0.9552
Epoch 7/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 720us/step - loss: 0.9364 - val_loss: 0.9535
Epoch 8/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 762us/step - loss: 0.9447 - val_loss: 0.9517
Epoch 9/50
[1m5000/5000

In [355]:
max_score = max(scores)
max_index = scores.index(max_score) + 1
print(f"Max Score: {max_score}, Number of Dimensions: {max_index}")

Max Score: 0.25655670542716535, Number of Dimensions: 8


# **Result & Analysis**