# **Importing Libraries**

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier


# **FUNCTION DECLARTAION**

## apply_combine(df)

In [34]:
machine_area_df = pd.read_csv("dataset/Machine-Area.csv")
machine_list_df = pd.read_csv("dataset/Machine-List.csv")
area_list_df = pd.read_csv("dataset/Area-List.csv")
machine_area_df['Last Maintenance'] = pd.to_datetime(machine_area_df['Last Maintenance'])
machine_area_filtered_df = machine_area_df.sort_values(
    by=['ID_Area', 'ID_Mesin', 'Last Maintenance'], 
    ascending=[True, True, False]
).drop_duplicates(subset=['ID_Area', 'ID_Mesin'], keep='first')

def apply_combine(df: pd.DataFrame) -> pd.DataFrame:
    ret: pd.DataFrame = df.copy()
    
    ret = pd.merge(ret, area_list_df, on='ID_Area', how='left')
    ret = pd.merge(ret, machine_list_df, left_on='Machine', right_on='ID_Mesin', how='left')
    ret = pd.merge(ret, machine_area_filtered_df, left_on=['Machine', 'ID_Area'], right_on=['ID_Mesin', 'ID_Area'], how='left')
    
    return ret

## apply_fill_na(df)

In [35]:
def apply_fill_na(df: pd.DataFrame) -> pd.DataFrame:
    ret = df.copy()
    missing = ret.isnull().sum()

    # missing_percentage = missing / len(ret) * 100
    # columns_to_impute = missing_percentage[missing_percentage < 5].index
    columns_to_impute = ret.columns

    for col in columns_to_impute:
        if ret[col].dtype in ['float64', 'int64']:
            ret[col] = ret[col].fillna(ret[col].median())
    
    return ret

## apply_scaler(df)

In [36]:
scaler = StandardScaler()

def apply_scaler(df:pd.DataFrame, train: bool) -> pd.DataFrame:
    ret = df.copy()
    numerical_cols = [col for col in df.columns if ret[col].dtype in ['float64', 'int64']]
    if train:
        scaler.fit(ret[numerical_cols])
    ret[numerical_cols] = scaler.transform(ret[numerical_cols])
    return ret

## aply_reduce_dim()

In [60]:
def apply_reduce_dim(df: pd.DataFrame, dim: int, epochs: int = 25) -> tuple:
    ret = df.copy()
    numerical_cols = [col for col in ret.columns if ret[col].dtype in ['float64', 'int64']]
    cols_to_reduce = [col for col in numerical_cols if col != 'Age']
    X = ret[cols_to_reduce]
    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(dim, activation='relu')(input_layer)
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=epochs, batch_size=32, shuffle=True, validation_split=0.2)

    encoder_model = Model(inputs=input_layer, outputs=encoder)
    X_reduced = encoder_model.predict(X)
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)])
    ret.drop(cols_to_reduce, axis=1, inplace=True)
    ret = pd.concat([ret, X_reduced], axis=1)
    return ret, encoder_model, cols_to_reduce


def apply_reduce_dim_gpu(df: pd.DataFrame, dim: int, epochs: int = 25) -> tuple:
    ret = df.copy()
    numerical_cols = [col for col in ret.columns if ret[col].dtype in ['float64', 'int64']]
    cols_to_reduce = [col for col in numerical_cols if col != 'Age']
    X = ret[cols_to_reduce]
    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(dim, activation='relu')(input_layer)
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=epochs, batch_size=32, shuffle=True, validation_split=0.2)

    encoder_model = Model(inputs=input_layer, outputs=encoder)
    X_reduced = encoder_model.predict(X)
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)])
    ret.drop(cols_to_reduce, axis=1, inplace=True)
    ret = pd.concat([ret, X_reduced], axis=1)
    return ret, encoder_model, cols_to_reduce

In [63]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


## apply_time_encoding(df)

In [38]:
def apply_time_encoding(df: pd.DataFrame) -> pd.DataFrame:
    ret = df.copy()
    ret['Last Maintenance'] = pd.to_datetime(ret['Last Maintenance'])
    ret['timestamp'] = pd.to_datetime(ret['timestamp'])
    ret['days_since_last_maintenance'] = (ret['timestamp'] - ret['Last Maintenance']).dt.days
    ret.drop(['Last Maintenance', 'timestamp'], axis=1, inplace=True)
    return ret

## apply_one_hot_encode(df, cols)

In [39]:
def apply_one_hot_encode(df: pd.DataFrame, cols):
    if df[cols].isnull().any():
        df[cols] = df[cols].fillna("Missing")
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    onehot_encoded = encoder.fit_transform(df[[cols]])
    encoded_df = pd.DataFrame(onehot_encoded, columns=encoder.get_feature_names_out([cols]), index = df.index)
    for column in encoded_df.columns:
        encoded_df[column] = pd.Categorical(encoded_df[column])
    df_encoded = pd.concat([df.drop(columns=cols), encoded_df], axis=1)
    return df_encoded

## apply_feature_selection(df)

In [50]:
def apply_feature_selection(df: pd.DataFrame):
    ret = df.copy()
    ret = ret.drop(columns=[
        'ID_Area', 'ID_Mesin_x', 'Machine', 'Breakdown Category', 'Area', 'ID_Transaction',
        'ID_Mesin_y', 'Mesin_x', 'Mesin_y', 'Country Machine_x', 'Country Machine_y'])
    if 'Breakdown Category' in ret.columns:
        ret = ret.drop(columns=['Breakdown Category'])
    
    return ret

## apply_fill_category(df)

In [44]:
def apply_fill_category(df: pd.DataFrame) -> pd.DataFrame:
    ret = df.copy()
    for col in ret.select_dtypes(include=['category']).columns:
            ret[col] = ret[col].cat.add_categories([-1])
            ret[col] = ret[col].fillna(-1)
    ret = ret.fillna(-1)

    return ret

## apply_smote(df)

In [45]:
def apply_smote(df: pd.DataFrame) -> pd.DataFrame:
    ret = df.copy()
    target = ret['Status'].map({'Normal': 0, 'Warning': 1, 'Breakdown': 2})
    ret = ret.drop('Status', axis=1)
    smote = SMOTE(random_state=42, n_jobs=-1)
    X_resampled, y_resampled = smote.fit_resample(ret, target)
    X_resampled = pd.DataFrame(X_resampled, columns=ret.columns)
    X_resampled['Status'] = y_resampled
    return X_resampled

# **TRAINING**

In [61]:
models = []

for chunk in pd.read_csv("dataset/train.csv", chunksize=100_000):
    df = chunk.copy()
    df = apply_combine(df)
    df = apply_fill_na(df)
    df = apply_scaler(df, train=True)
    try:
        df, encoder_model, cols_to_reduce = apply_reduce_dim_gpu(df, 3, 1)
    except:
        df, encoder_model, cols_to_reduce = apply_reduce_dim(df, 3, 1)
    df = apply_time_encoding(df)
    df = apply_one_hot_encode(df, 'Priority')
    df = apply_one_hot_encode(df, 'Status Sparepart')
    df = apply_one_hot_encode(df, 'Power_Backup')
    df = apply_feature_selection(df)
    df = apply_fill_category(df)
    df = apply_smote(df)

    X = df.drop(columns='Status')
    y = df['Status']

    rf = RandomForestClassifier(n_jobs=-1)
    rf.fit(X, y)
    models.append(rf)
    
    break

[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss: 1.0545 - val_loss: 0.9382
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step


AttributeError: 'list' object has no attribute 'push'

In [55]:
y

1           Normal
2           Normal
3        Breakdown
4        Breakdown
           ...    
99995    Breakdown
99997    Breakdown
99998       Normal
99999       Normal
Name: Status, Length: 100000, dtype: object