# **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# **Importing Dataset**

In [2]:
def import_dataset(path: str, nrows: int | None = None) -> pd.DataFrame:
    chunksize = 100_000
    machine_area_df = pd.read_csv("dataset/Machine-Area.csv")
    machine_list_df = pd.read_csv("dataset/Machine-List.csv")
    area_list_df = pd.read_csv("dataset/Area-List.csv")
    machine_area_df['Last Maintenance'] = pd.to_datetime(machine_area_df['Last Maintenance'])
    machine_area_filtered_df = machine_area_df.sort_values(
        by=['ID_Area', 'ID_Mesin', 'Last Maintenance'], 
        ascending=[True, True, False]
    ).drop_duplicates(subset=['ID_Area', 'ID_Mesin'], keep='first')

    id_list = []
    chunk_list = []
    for chunk in pd.read_csv(path, chunksize=chunksize, nrows=nrows, low_memory=False):
        id_column = chunk.pop('ID_Transaction') if 'ID_Transaction' in chunk.columns else None
        chunk = pd.merge(chunk, area_list_df, on='ID_Area', how='left')
        chunk = pd.merge(chunk, machine_list_df, left_on='Machine', right_on='ID_Mesin', how='left')
        chunk = pd.merge(chunk, machine_area_filtered_df, left_on=['Machine', 'ID_Area'], right_on=['ID_Mesin', 'ID_Area'], how='left')

        chunk_list.append(chunk)
        if id_column is not None:
            id_list.append(id_column)
    combined_df = pd.concat(chunk_list, ignore_index=True)
    id_series = pd.concat(id_list, ignore_index=True) if id_list else None
    return combined_df, id_series


In [3]:
train_df, _ = import_dataset(
    path="dataset/train.csv", 
    nrows=200_000)

In [4]:
test_df, test_id = import_dataset(
    path="dataset/test.csv", 
    nrows=None)

# **Data Cleaning**

## Identify Missing Data

In [5]:
missing = train_df.isnull().sum()
missing_percentage = missing / len(train_df) * 100
print(missing_percentage)

temperature_10H_max (°C)     2.8675
temperature_10H_min (°C)    19.1205
temperature-1                2.9615
temperature-2               11.4110
temperature-3                3.5470
apparent_temperature_max    21.2770
apparent_temperature_min    20.1760
ID_Area                     17.3180
Machine                     16.8720
timestamp                    0.0000
humidity                    19.4545
Voltage-L                    9.7585
Voltage-R                    6.8485
Voltage-M                    3.4415
Current-M                    1.0965
Current-R                    1.3795
Current-T                   22.6750
RPM                         11.0755
RPM-1                       13.3295
RPM-2                       18.6955
RPM-3                       18.3175
Vibration-1                  5.3790
Vibration-2                 15.0745
Power                       12.9790
Power_Backup                 3.3195
Status                       0.0000
Breakdown Category          73.7400
Area                        

## Handling Missing Data

In [6]:

columns_to_impute = missing_percentage[missing_percentage < 5].index
for col in columns_to_impute:
    if train_df[col].dtype in ['float64', 'int64']:
        train_df[col] = train_df[col].fillna(train_df[col].median())

In [7]:
def random_sample_impute(series):
    non_missing = series.dropna()
    return series.apply(lambda x: np.random.choice(non_missing) if pd.isnull(x) else x)

In [8]:
from sklearn.impute import KNNImputer
def knn_impute_single_column(series, n_neighbors=2):
    series_df = series.to_frame()
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_array = knn_imputer.fit_transform(series_df)
    return pd.Series(imputed_array.ravel(), index=series.index, name=series.name)

In [9]:
moderate_missingness_cols = missing_percentage[(missing_percentage >= 5) & (missing_percentage <= 23)].index
numerical_cols = [col for col in moderate_missingness_cols if train_df[col].dtype in ['float64', 'int64']]

for col in numerical_cols:
    train_df[col] = random_sample_impute(train_df[col])
    print(f"Imputed {col}")

Imputed temperature_10H_min (°C)
Imputed temperature-2
Imputed apparent_temperature_max
Imputed apparent_temperature_min
Imputed humidity
Imputed Voltage-L
Imputed Voltage-R
Imputed Current-T
Imputed RPM
Imputed RPM-1
Imputed RPM-2
Imputed RPM-3
Imputed Vibration-1
Imputed Vibration-2
Imputed Power


In [10]:
missing = train_df.isnull().sum()
missing_percentage = missing / len(train_df) * 100
print(missing_percentage)

temperature_10H_max (°C)     0.0000
temperature_10H_min (°C)     0.0000
temperature-1                0.0000
temperature-2                0.0000
temperature-3                0.0000
apparent_temperature_max     0.0000
apparent_temperature_min     0.0000
ID_Area                     17.3180
Machine                     16.8720
timestamp                    0.0000
humidity                     0.0000
Voltage-L                    0.0000
Voltage-R                    0.0000
Voltage-M                    0.0000
Current-M                    0.0000
Current-R                    0.0000
Current-T                    0.0000
RPM                          0.0000
RPM-1                        0.0000
RPM-2                        0.0000
RPM-3                        0.0000
Vibration-1                  0.0000
Vibration-2                  0.0000
Power                        0.0000
Power_Backup                 3.3195
Status                       0.0000
Breakdown Category          73.7400
Area                        

## Dealing with Outliers

## Remove Duplicates

## Feature Engineering

# **Data Preprocessing**

## Feature Scaling

In [11]:
def scaler(df):
    numerical_cols = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df


## Dimensionality Reduction

### bisa pake feature hasher buat categorical

In [12]:
def reduce_dim(df, dim):
    numerical_cols = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
    cols_to_reduce = [col for col in numerical_cols if col != 'Age']
    X = df[cols_to_reduce]
    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(dim, activation='relu')(input_layer)
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)

    encoder_model = Model(inputs=input_layer, outputs=encoder)
    X_reduced = encoder_model.predict(X)
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)])
    df.drop(cols_to_reduce, axis=1, inplace=True)
    df = pd.concat([df, X_reduced], axis=1)
    return df, encoder_model, cols_to_reduce

In [13]:
def predict_reduce_dim(df, encoder_model, cols_to_reduce) :
    X = df[cols_to_reduce].copy()
    for col in X.columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    X_reduced = encoder_model.predict(X)
    dim = X_reduced.shape[1]
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)], index=df.index)

    df.drop(cols_to_reduce, axis=1, inplace=True)
    df = pd.concat([df, X_reduced], axis=1)
    return df

## Feature Encoding

In [14]:
def one_hot_encode(df, cols):
    if df[cols].isnull().any():
        df[cols] = df[cols].fillna("Missing")
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    onehot_encoded = encoder.fit_transform(df[[cols]])
    encoded_df = pd.DataFrame(onehot_encoded, columns=encoder.get_feature_names_out([cols]), index = df.index)
    for column in encoded_df.columns:
        encoded_df[column] = pd.Categorical(encoded_df[column])
    df_encoded = pd.concat([df.drop(columns=cols), encoded_df], axis=1)
    return df_encoded

In [15]:
def time_encoding(df):
    df['Last Maintenance'] = pd.to_datetime(df['Last Maintenance'])
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['days_since_last_maintenance'] = (df['timestamp'] - df['Last Maintenance']).dt.days
    df.drop(['Last Maintenance', 'timestamp'], axis=1, inplace=True)
    return df

## Handling Imbalanced Dataset

In [16]:
def smote_transform(df):
    target = df['Status'].map({'Normal': 0, 'Warning': 1, 'Breakdown': 2})
    df = df.drop('Status', axis=1)
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(df, target)
    X_resampled = pd.DataFrame(X_resampled, columns=df.columns)
    X_resampled['Status'] = y_resampled
    return X_resampled

# Pipeline

In [17]:
def pipeline(df, is_train=False, encoder_model=None, cols_to_reduce=None):
    df = scaler(df)
    if is_train:
        df, encoder_model, cols_to_reduce = reduce_dim(df, 3)
    else:
        if encoder_model is None or cols_to_reduce is None:
            raise ValueError("For non-training data, 'encoder_model' and 'cols_to_reduce' must be provided.")
        df = predict_reduce_dim(df, encoder_model, cols_to_reduce)   
    df = time_encoding(df)
    df = one_hot_encode(df, 'Priority')
    df = one_hot_encode(df, 'Status Sparepart')
    df = one_hot_encode(df, 'Power_Backup')
    if is_train:
        df.drop(columns=['ID_Area', 'ID_Mesin_x', 'Machine', 'Breakdown Category',
                          'Area', 'ID_Mesin_y', 'Mesin_x', 'Mesin_y', 'Country Machine_x', 'Country Machine_y'], inplace=True)
    else:
        df.drop(columns=['ID_Area', 'ID_Mesin_x', 'Machine', 'Area', 'ID_Mesin_y', 'Mesin_x', 'Mesin_y',
                          'Country Machine_x', 'Country Machine_y'], inplace=True)
    for col in df.select_dtypes(include=['category']).columns:
        df[col] = df[col].cat.add_categories([-1])
        df[col] = df[col].fillna(-1)
    df.fillna(-1, inplace=True)
    if is_train:
        df = smote_transform(df)
    return df, encoder_model, cols_to_reduce

# **Modelling & Validation**

In [18]:
train_df, encoder_model, cols_to_reduce = pipeline(train_df, is_train=True)

Epoch 1/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 715us/step - loss: 1.0135 - val_loss: 0.9337
Epoch 2/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 672us/step - loss: 0.9194 - val_loss: 0.9270
Epoch 3/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 849us/step - loss: 0.9144 - val_loss: 0.9242
Epoch 4/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.9123 - val_loss: 0.9225
Epoch 5/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.9136 - val_loss: 0.9213
Epoch 6/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.9073 - val_loss: 0.9204
Epoch 7/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.9102 - val_loss: 0.9197
Epoch 8/50
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 985us/step - loss: 0.9086 - val_loss: 0.9191
Epoch 9/50
[1m5

In [19]:
num_chunks = len(test_df) // 100000 + (len(test_df) % 100000 > 0)
chunks = np.array_split(test_df, num_chunks)
processed_chunks = []

for chunk in chunks:
    processed_chunk, _, a = pipeline(chunk, is_train=False, encoder_model=encoder_model, cols_to_reduce=cols_to_reduce)
    processed_chunks.append(processed_chunk)

testing = pd.concat(processed_chunks, ignore_index=True)

  return bound(*args, **kwds)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 742us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 726us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 728us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 581us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 580us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 623us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 606us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 604us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 572us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 802us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 755us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 614us/step
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [20]:
# test_df.to_csv("test_processed.csv", index=False)

In [21]:
# chunk_size = 100_000
# chunk_list = []
# for chunk in pd.read_csv("test_processed.csv", chunksize=chunk_size, low_memory=False):
#     chunk_list.append(chunk)

# test_df = pd.concat(chunk_list, ignore_index=True)

In [22]:
X = train_df.drop(columns='Status')
y = train_df['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
y.value_counts()

Status
1    123236
0    123236
2    123236
Name: count, dtype: int64

In [24]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix
def precision_recall_f1(tp, fp, fn):
    # Calculate Precision
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    # Calculate Recall
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # Calculate F1 Score
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1

xgb_model = xgb.XGBClassifier(enable_categorical=True)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [25]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.71      0.62     24576
           1       0.49      0.42      0.45     24675
           2       0.50      0.44      0.47     24691

    accuracy                           0.52     73942
   macro avg       0.52      0.52      0.52     73942
weighted avg       0.52      0.52      0.52     73942



In [26]:
from sklearn.tree import DecisionTreeClassifier

# X = pd.DataFrame(train_df['days_since_last_maintenance'])
# y = train_df['Status'].map({'Normal': 0, 'Warning': 1, 'Breakdown': 2})
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.53      0.55     24576
           1       0.52      0.53      0.53     24675
           2       0.53      0.55      0.54     24691

    accuracy                           0.54     73942
   macro avg       0.54      0.54      0.54     73942
weighted avg       0.54      0.54      0.54     73942



In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.79      0.68     24576
           1       0.70      0.59      0.64     24675
           2       0.71      0.61      0.65     24691

    accuracy                           0.66     73942
   macro avg       0.67      0.66      0.66     73942
weighted avg       0.67      0.66      0.66     73942



In [28]:
def predict_in_chunks(test_df, model, batch_size=1000, preprocess_fn=None):
    chunks = np.array_split(test_df, len(test_df) // batch_size + 1)
    predictions = []
    
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}...")
        preds = model.predict(chunk)
        predictions.append(preds)
    all_predictions = np.concatenate(predictions, axis=0)
    
    return all_predictions


In [29]:
#rename col
testing.rename(columns={'Power_Backup_ ': 'Power_Backup_Missing'}, inplace=True)

In [30]:
predictions = predict_in_chunks(testing, rf_model, batch_size=100000)

  return bound(*args, **kwds)


Processing chunk 1/71...
Processing chunk 2/71...
Processing chunk 3/71...
Processing chunk 4/71...
Processing chunk 5/71...
Processing chunk 6/71...
Processing chunk 7/71...
Processing chunk 8/71...
Processing chunk 9/71...
Processing chunk 10/71...
Processing chunk 11/71...
Processing chunk 12/71...
Processing chunk 13/71...
Processing chunk 14/71...
Processing chunk 15/71...
Processing chunk 16/71...
Processing chunk 17/71...
Processing chunk 18/71...
Processing chunk 19/71...
Processing chunk 20/71...
Processing chunk 21/71...
Processing chunk 22/71...
Processing chunk 23/71...
Processing chunk 24/71...
Processing chunk 25/71...
Processing chunk 26/71...
Processing chunk 27/71...
Processing chunk 28/71...
Processing chunk 29/71...
Processing chunk 30/71...
Processing chunk 31/71...
Processing chunk 32/71...
Processing chunk 33/71...
Processing chunk 34/71...
Processing chunk 35/71...
Processing chunk 36/71...
Processing chunk 37/71...
Processing chunk 38/71...
Processing chunk 39/7

In [56]:
submission = pd.read_csv("dataset/submission.csv")

In [55]:
test_predictions = pd.DataFrame({
    'ID_Transaction': test_id,
    'Status': predictions})

test_predictions['Status'] = test_predictions['Status'].map({0: 'Normal', 1: 'Warning', 2: 'Breakdown'})

In [67]:
test_predictions['Status'].value_counts()

Status
Normal       5740479
Breakdown     906504
Name: count, dtype: int64

In [65]:
test_predictions.to_csv("submission2.csv", index=False)

In [57]:
submission = pd.merge(submission, test_predictions, on='ID_Transaction', how='left')

In [59]:
submission = submission.drop(columns='Status_x')

In [61]:
submission.fillna('Normal', inplace=True)

In [62]:
submission.to_csv("submission1.csv", index=False)

In [47]:
# Keep only the column from the predictions DataFrame
submission = submission.drop(columns=['Status_x'])  # Adjust the column name as needed

# Optionally rename 'Status_y' to 'Target' (or your desired name)
submission.rename(columns={'Status_y': 'Status'}, inplace=True)


In [48]:
submission.to_csv("submission1.csv", index=False)

# **Result & Analysis**