# **Importing Libraries**

In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers # type: ignore
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization # type: ignore
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # type: ignore
from tensorflow.keras.models import load_model # type: ignore
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model # type: ignore
from tensorflow.keras.utils import to_categorical # type: ignore
from sklearn.ensemble import RandomForestClassifier

import os
import pickle


# **FUNCTION DECLARATION**

## apply_combine(df)

In [66]:
# Static lookup tables that apply_combine() joins onto every data chunk.
area_list_df = pd.read_csv("dataset/Area-List.csv")
machine_list_df = pd.read_csv("dataset/Machine-List.csv")

# Machine/area maintenance log; parse the maintenance date so it sorts chronologically.
machine_area_df = pd.read_csv("dataset/Machine-Area.csv")
machine_area_df['Last Maintenance'] = pd.to_datetime(machine_area_df['Last Maintenance'])

# Number of log rows per machine, attached to every row of that machine.
maintenance_frequency = (
    machine_area_df
    .groupby('ID_Mesin')
    .size()
    .rename('maintenance_count')
)
machine_area_df = machine_area_df.merge(
    maintenance_frequency,
    how='left',
    left_on='ID_Mesin',
    right_index=True,
)

# Keep only the most recent maintenance row for each (area, machine) pair:
# sort newest-first within the pair, then keep the first row of each pair.
machine_area_filtered_df = (
    machine_area_df
    .sort_values(
        by=['ID_Area', 'ID_Mesin', 'Last Maintenance'],
        ascending=[True, True, False],
    )
    .drop_duplicates(subset=['ID_Area', 'ID_Mesin'], keep='first')
)

def apply_combine(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the area, machine and machine/area lookup tables onto `df`.

    Joins, in order:
      1. `area_list_df` on `ID_Area`
      2. `machine_list_df` matching `df['Machine']` to its `ID_Mesin`
      3. `machine_area_filtered_df` matching (`Machine`, `ID_Area`) to
         (`ID_Mesin`, `ID_Area`)

    All joins are left joins and the input frame is not modified.
    """
    return (
        df.copy()
        .merge(area_list_df, how='left', on='ID_Area')
        .merge(machine_list_df, how='left', left_on='Machine', right_on='ID_Mesin')
        .merge(
            machine_area_filtered_df,
            how='left',
            left_on=['Machine', 'ID_Area'],
            right_on=['ID_Mesin', 'ID_Area'],
        )
    )

## apply_fill_na(df)

In [67]:
# Columns that are never dropped for having too many missing values.
_columns_to_keep = ['Last Maintenance', 'Status Sparepart', 'Power_Backup', 'Priority', 'Status']

# Decisions made by the most recent apply_fill_na() call; apply_fill_na_predict()
# replays them on prediction data.
_columns_to_impute: list[str] = []
_columns_to_drop: list[str] = []

def apply_fill_na(df: pd.DataFrame) -> pd.DataFrame:
    """Fill or drop missing values on a training chunk and remember the decisions.

    Steps:
      * Rows with both `ID_Mesin_x` and `ID_Area` get a missing `Age` filled with
        the year of the earliest `Last Maintenance` in their (machine, area)
        group plus 10. Rows lacking either id have `Age` set to NaN.
      * float64/int64 columns with < 5% missing values are median-imputed.
      * Columns with > 5% missing values are dropped, except `_columns_to_keep`.

    Side effect: the module-level `_columns_to_impute` and `_columns_to_drop`
    are overwritten with this call's choices.

    Returns a new frame; `df` is not modified.
    """
    # Without this declaration the assignments at the end would only create
    # locals and apply_fill_na_predict() would always see the empty defaults.
    global _columns_to_impute, _columns_to_drop

    ret = df.copy()

    # NOTE(review): the fill value is a calendar year + 10, not a duration —
    # confirm this matches the intended meaning of `Age`.
    avg_timespan = 10
    valid_rows = ret['ID_Mesin_x'].notnull() & ret['ID_Area'].notnull()

    if valid_rows.any():
        ret.loc[valid_rows, 'predicted_age'] = (
            ret.loc[valid_rows]
            .groupby(['ID_Mesin_x', 'ID_Area'])['Last Maintenance']
            .transform('min')
            .dt.year + avg_timespan
        )
        ret.loc[valid_rows, 'Age'] = ret.loc[valid_rows, 'Age'].fillna(ret.loc[valid_rows, 'predicted_age'])

    ret.loc[~valid_rows, 'Age'] = np.nan

    ret = ret.drop(columns=['predicted_age'], errors='ignore')

    missing_percentage = ret.isnull().sum() / len(ret) * 100

    # Sparse gaps: impute numeric columns with their median.
    columns_to_impute = missing_percentage[missing_percentage < 5].index
    for col in columns_to_impute:
        if ret[col].dtype in ['float64', 'int64']:
            ret[col] = ret[col].fillna(ret[col].median())

    # Heavy gaps: drop the column unless it is explicitly protected.
    missing_percentage = ret.isnull().sum() / len(ret) * 100
    columns_to_drop = missing_percentage[missing_percentage > 5].index
    filtered_columns_to_drop = [col for col in columns_to_drop if col not in _columns_to_keep]
    ret = ret.drop(columns=filtered_columns_to_drop)

    _columns_to_impute = list(columns_to_impute)
    _columns_to_drop = filtered_columns_to_drop

    return ret

In [68]:
def apply_fill_na_predict(df: pd.DataFrame) -> pd.DataFrame:
    """Replay the missing-value handling chosen by `apply_fill_na` on new data.

    * `Age` is filled exactly as in `apply_fill_na`: earliest `Last Maintenance`
      year per (`ID_Mesin_x`, `ID_Area`) group plus 10; rows lacking either id
      get NaN.
    * float64/int64 columns listed in the module-level `_columns_to_impute` are
      median-imputed. The median is taken from `df` itself, not from training.
    * Columns listed in the module-level `_columns_to_drop` are removed.

    Recorded columns that do not exist in `df` (for example the target column,
    which prediction data does not carry) are skipped instead of raising
    `KeyError`.

    Returns a new frame; `df` is not modified.
    """
    ret = df.copy()

    avg_timespan = 10
    valid_rows = ret['ID_Mesin_x'].notnull() & ret['ID_Area'].notnull()

    if valid_rows.any():
        ret.loc[valid_rows, 'predicted_age'] = (
            ret.loc[valid_rows]
            .groupby(['ID_Mesin_x', 'ID_Area'])['Last Maintenance']
            .transform('min')
            .dt.year + avg_timespan
        )
        ret.loc[valid_rows, 'Age'] = ret.loc[valid_rows, 'Age'].fillna(ret.loc[valid_rows, 'predicted_age'])

    ret.loc[~valid_rows, 'Age'] = np.nan

    ret = ret.drop(columns=['predicted_age'], errors='ignore')

    for col in _columns_to_impute:
        # The training frame may contain columns the prediction frame lacks.
        if col not in ret.columns:
            continue
        if ret[col].dtype in ['float64', 'int64']:
            ret[col] = ret[col].fillna(ret[col].median())

    # errors='ignore': a column dropped during training may be absent here.
    ret = ret.drop(columns=list(_columns_to_drop), errors='ignore')

    return ret

## apply_scaler(df)

In [69]:
# Shared scaler instance: apply_scaler() fits it when train=True and reuses the
# fitted state when train=False.
scaler = StandardScaler()


In [70]:
def apply_scaler(df:pd.DataFrame, train: bool) -> pd.DataFrame:
    """Scale the float64/int64 columns of `df` with the module-level `scaler`.

    Args:
        df: input frame; it is copied, not modified.
        train: when True the scaler is (re)fitted on this frame before
            transforming; when False the scaler's existing fit is used.

    Returns:
        A copy of `df` whose float64/int64 columns hold the scaled values.
        The names of the scaled columns are printed as a side effect.
    """
    scaled = df.copy()

    numeric_names = []
    for name in df.columns:
        if scaled[name].dtype in ['float64', 'int64']:
            numeric_names.append(name)
    print(numeric_names)

    numeric_block = scaled[numeric_names]
    if train:
        scaler.fit(numeric_block)
    scaled[numeric_names] = scaler.transform(numeric_block)

    return scaled

## apply_reduce_dim()

In [71]:
def apply_reduce_dim(df: pd.DataFrame, dim: int, epochs: int = 25) -> tuple:
    """Train a small autoencoder and replace numeric columns with its encoding.

    Every float64/int64 column except `Age` is fed to a one-hidden-layer
    autoencoder (Dense(dim, relu) -> BatchNormalization -> Dense(sigmoid)).
    Those columns are then replaced by `feature_0 .. feature_{dim-1}`, the
    output of the batch-normalised bottleneck.

    Args:
        df: input frame; it is copied, not modified.
        dim: size of the bottleneck, i.e. number of `feature_i` columns.
        epochs: training epochs for the autoencoder.

    Returns:
        (reduced_frame, encoder_model, cols_to_reduce), where `encoder_model`
        maps the original columns to the bottleneck and `cols_to_reduce` lists
        the columns that were replaced, in the order the model expects.
    """
    ret = df.copy()
    numerical_cols = [col for col in ret.columns if ret[col].dtype in ['float64', 'int64']]
    cols_to_reduce = [col for col in numerical_cols if col != 'Age']
    X = ret[cols_to_reduce]
    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(dim, activation='relu')(input_layer)
    encoder = BatchNormalization()(encoder)
    # NOTE(review): a sigmoid output can only reconstruct values in [0, 1];
    # confirm the inputs are scaled accordingly before this step.
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=epochs, batch_size=32, shuffle=True, validation_split=0.2)

    encoder_model = Model(inputs=input_layer, outputs=encoder)
    # Reuse the input index so the column-wise concat below aligns row by row
    # even when `df` does not carry a default 0..n-1 index.
    X_reduced = pd.DataFrame(
        encoder_model.predict(X),
        columns=[f'feature_{i}' for i in range(dim)],
        index=ret.index,
    )
    ret = pd.concat([ret.drop(columns=cols_to_reduce), X_reduced], axis=1)
    return ret, encoder_model, cols_to_reduce

In [72]:
def predict_reduce_dim(df, encoder_model, cols_to_reduce) -> pd.DataFrame:
    """Replace `cols_to_reduce` with the encoding produced by `encoder_model`.

    Args:
        df: input frame; it is left untouched (a new frame is returned).
        encoder_model: object exposing `predict(X)` that returns a 2-D array
            with one row per input row.
        cols_to_reduce: columns fed to the encoder, in the order it expects.

    Returns:
        A new frame without `cols_to_reduce` and with `feature_0 .. feature_k`
        appended, indexed like `df`.
    """
    X = df[cols_to_reduce].copy()
    # Coerce to numeric; unparseable values become NaN.
    for col in X.columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')

    X_reduced = encoder_model.predict(X)
    dim = X_reduced.shape[1]
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)], index=df.index)

    # Non-inplace drop: the caller's frame must not lose its columns.
    remaining = df.drop(columns=cols_to_reduce)
    return pd.concat([remaining, X_reduced], axis=1)

## apply_time_encoding(df)

In [73]:
def apply_time_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the two date columns with the whole days elapsed between them.

    `days_since_last_maintenance` is `timestamp - Last Maintenance` expressed
    in days; both source columns are removed from the returned frame. The
    input frame is not modified.
    """
    last_service = pd.to_datetime(df['Last Maintenance'])
    observed_at = pd.to_datetime(df['timestamp'])
    elapsed_days = (observed_at - last_service).dt.days

    without_dates = df.drop(columns=['Last Maintenance', 'timestamp'])
    return without_dates.assign(days_since_last_maintenance=elapsed_days)

## apply_one_hot_encode(df, cols)

In [74]:
def apply_one_hot_encode(df: pd.DataFrame, cols):
    """One-hot encode a single column and return the widened frame.

    Args:
        df: input frame; it is not modified.
        cols: name of the one column to encode (a single label, not a list).

    Returns:
        A new frame where `cols` is replaced by one categorical indicator
        column per observed value, named `<cols>_<value>`. Missing values are
        encoded as the value "Missing".

    The encoder is fitted on the frame passed in, so the set of indicator
    columns depends on the values present in this particular frame.
    """
    column = df[cols]
    # Fill on a detached Series so the caller's frame keeps its original NaNs.
    if column.isnull().any():
        column = column.fillna("Missing")

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    onehot_encoded = encoder.fit_transform(column.to_frame())
    encoded_df = pd.DataFrame(onehot_encoded, columns=encoder.get_feature_names_out([cols]), index=df.index)
    for indicator in encoded_df.columns:
        encoded_df[indicator] = pd.Categorical(encoded_df[indicator])

    return pd.concat([df.drop(columns=cols), encoded_df], axis=1)

## apply_feature_selection(df)

In [75]:
def apply_feature_selection(df: pd.DataFrame):
    """Project `df` onto the fixed model feature set (plus `Status` if present).

    The three encoder features and `days_since_last_maintenance` are required;
    a `KeyError` is raised if any of them is missing. The one-hot indicator
    columns depend on which categories occurred in the chunk that was encoded,
    so an indicator that is absent from `df` is added as an all-zero
    categorical column instead of raising.

    Returns a new frame with the columns in a fixed order; `df` is not modified.
    """
    required_cols = ['feature_0', 'feature_1', 'feature_2', 'days_since_last_maintenance']
    one_hot_cols = [
        'Priority_High', 'Priority_Low', 'Priority_Medium', 'Priority_Missing',
        'Status Sparepart_Broken', 'Status Sparepart_Empty',
        'Status Sparepart_In Use', 'Status Sparepart_Missing',
        'Status Sparepart_On Check', 'Status Sparepart_Ready',
        'Status Sparepart_Repair', 'Power_Backup_Missing', 'Power_Backup_No',
        'Power_Backup_Yes',
    ]

    absent_required = [col for col in required_cols if col not in df.columns]
    if absent_required:
        raise KeyError(f"{absent_required} not in index")

    # reindex() returns a new frame with exactly these columns, in this order.
    ret = df.reindex(columns=required_cols + one_hot_cols)
    for col in one_hot_cols:
        if col not in df.columns:
            # Category never occurred in this chunk: indicator is 0 everywhere.
            # Kept categorical, like the indicators apply_one_hot_encode() emits.
            ret[col] = pd.Categorical([0.0] * len(ret))

    if "Status" in df.columns:
        ret['Status'] = df['Status']

    return ret

## apply_fill_category(df)

In [76]:
def apply_fill_category(df: pd.DataFrame) -> pd.DataFrame:
    """Replace every missing value in `df` with the sentinel -1.

    Categorical columns get -1 registered as a category first (a categorical
    cannot be filled with a value outside its categories); all remaining
    columns are then filled with -1 as well. Calling the function on a frame
    that already carries the -1 category is safe.

    Returns a new frame; `df` is not modified.
    """
    ret = df.copy()
    for col in ret.select_dtypes(include=['category']).columns:
        # add_categories() raises ValueError if the category already exists.
        if -1 not in ret[col].cat.categories:
            ret[col] = ret[col].cat.add_categories([-1])
        ret[col] = ret[col].fillna(-1)
    ret = ret.fillna(-1)

    return ret

## apply_smote(df)

In [77]:
def apply_smote(df: pd.DataFrame) -> pd.DataFrame:
    """Oversample `df` with SMOTE using `Status` as the class label.

    `Status` is mapped to integer codes (Normal=0, Warning=1, Breakdown=2),
    the remaining columns are resampled together with it, and the resampled
    frame is returned with the integer-coded `Status` as its last column.
    The input frame is not modified.
    """
    # NOTE(review): newer imbalanced-learn releases deprecate SMOTE's n_jobs
    # argument — confirm against the installed version.
    status_codes = {'Normal': 0, 'Warning': 1, 'Breakdown': 2}

    features = df.copy()
    labels = features.pop('Status').map(status_codes)

    sampler = SMOTE(random_state=42, n_jobs=-1)
    features_balanced, labels_balanced = sampler.fit_resample(features, labels)

    balanced = pd.DataFrame(features_balanced, columns=features.columns)
    balanced['Status'] = labels_balanced
    return balanced

# **TRAINING**

In [78]:
# Set to True to train a fresh autoencoder on the first chunk; when False the
# encoder saved by an earlier run is loaded from disk.
first_run = False
idx = 0
# Columns compressed by the encoder (overwritten by apply_reduce_dim on a first run).
cols_to_reduce = ['temperature_10H_max (°C)',
 'temperature-1',
 'temperature-3',
 'Voltage-M',
 'Current-M',
 'Current-R']

# Every model artefact is written below this directory.
os.makedirs("saved_models", exist_ok=True)

# Load the encoder once instead of once per chunk; None means "train it on the
# first chunk".
encoder_model = None if first_run else load_model("saved_models/encoder.keras")

for chunk in pd.read_csv("dataset/train.csv", chunksize=500_000):
    df = chunk.copy()
    df = apply_combine(df)
    df = apply_fill_na(df)

    if encoder_model is None:
        df, encoder_model, cols_to_reduce = apply_reduce_dim(df, 3)
        encoder_model.save("saved_models/encoder.keras")
        first_run = False
    else:
        df = predict_reduce_dim(df, encoder_model, cols_to_reduce)

    df = apply_time_encoding(df)
    df = apply_one_hot_encode(df, 'Priority')
    df = apply_one_hot_encode(df, 'Status Sparepart')
    df = apply_one_hot_encode(df, 'Power_Backup')
    df = apply_fill_category(df)
    df = apply_feature_selection(df)
    # NOTE(review): train=True refits the shared scaler on every chunk, so the
    # scaler used at prediction time reflects only the last chunk.
    df = apply_scaler(df, train=True)
    df = apply_smote(df)

    X = df.drop(columns='Status')
    y = df['Status']

    # One small forest per chunk; predict() combines them by majority vote.
    rf = RandomForestClassifier(n_estimators=25, max_depth=4, n_jobs=-1, random_state=42)
    rf.fit(X, y)
    # Context manager so the file is flushed and closed even if dump() fails.
    with open(f"saved_models/rf_{idx}.pkl", 'wb') as model_file:
        pickle.dump(rf, model_file)

    idx +=1
    print(idx)

del df

KeyboardInterrupt: 

# **PREDICTING**

In [32]:
def predict(id: pd.Series, df: pd.DataFrame, n_models: int = 26,
            model_dir: str = "saved_models") -> pd.DataFrame:
    """Majority-vote prediction over the per-chunk random forests.

    Args:
        id: one identifier per row of `df`, in the same row order.
        df: feature frame accepted by the saved forests.
        n_models: number of `rf_<i>.pkl` files to load (i = 0 .. n_models-1).
        model_dir: directory holding the pickled forests.

    Returns:
        A frame with columns `ID` and `Prediction`, indexed like `id`. Ties are
        resolved in favour of the smallest class label.

    Raises:
        ValueError: if `n_models` is smaller than 1.
    """
    if n_models < 1:
        raise ValueError("n_models must be at least 1")

    votes = []
    for i in range(n_models):
        # SECURITY: pickle.load executes arbitrary code; only load model files
        # produced by this notebook.
        with open(f"{model_dir}/rf_{i}.pkl", 'rb') as model_file:
            rf = pickle.load(model_file)

        proba = np.asarray(rf.predict_proba(df))
        # predict_proba columns follow rf.classes_; translate the winning
        # column position back to the actual class label.
        votes.append(np.asarray(rf.classes_)[proba.argmax(axis=1)])

    # One row per model, one column per sample; mode() pads ties with NaN, and
    # row 0 always holds the smallest most-frequent label.
    majority_votes = pd.DataFrame(votes).mode(axis=0).iloc[0]
    majority_votes = majority_votes.to_numpy().astype(votes[0].dtype)

    # Pair ids and votes by position: `id` keeps the CSV chunk's index while
    # the votes are numbered 0..n-1, so index alignment would mismatch them.
    ids = pd.Series(id)
    result = pd.DataFrame(
        {'ID': ids.to_numpy(), 'Prediction': majority_votes},
        index=ids.index,
    )

    return result


In [None]:
# Per-chunk prediction frames (columns: ID, Prediction).
pred_result = []
for chunk in pd.read_csv("dataset/test.csv", chunksize=100_000):
    df = chunk.copy()
    # Keep the transaction ids aside; the feature pipeline below drops them.
    ID = df['ID_Transaction'].copy()
    # Treat empty / whitespace-only cells as missing values.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = apply_combine(df)
    df = apply_fill_na_predict(df)

    df = predict_reduce_dim(df, encoder_model, cols_to_reduce)

    df = apply_time_encoding(df)
    df = apply_one_hot_encode(df, 'Priority')
    df = apply_one_hot_encode(df, 'Status Sparepart')
    df = apply_one_hot_encode(df, 'Power_Backup')
    df = apply_feature_selection(df)
    df = apply_fill_category(df)
    df = apply_scaler(df, train=False)

    # predict() takes the ids first, then the feature frame.
    y_pred = predict(ID, df)
    pred_result.append(y_pred)

    # NOTE(review): only the first chunk is scored — looks like a debugging
    # leftover; remove this break to predict the whole test set.
    break

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
['days_since_last_maintenance']
