# **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers # type: ignore
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization # type: ignore
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # type: ignore
from tensorflow.keras.models import load_model # type: ignore
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model # type: ignore
from tensorflow.keras.utils import to_categorical # type: ignore
from sklearn.ensemble import RandomForestClassifier

import os
import pickle
import joblib



# **FUNCTION DECLARATION**

## apply_combine(df)

In [2]:
# Reference tables that apply_combine() joins onto every data chunk.
area_list_df = pd.read_csv("dataset/Area-List.csv")
machine_list_df = pd.read_csv("dataset/Machine-List.csv")
machine_area_df = pd.read_csv("dataset/Machine-Area.csv")
machine_area_df['Last Maintenance'] = pd.to_datetime(machine_area_df['Last Maintenance'])

# Number of Machine-Area rows recorded per machine, attached to each row.
maintenance_frequency = (
    machine_area_df
    .groupby('ID_Mesin')
    .size()
    .rename('maintenance_count')
)
machine_area_df = machine_area_df.merge(
    maintenance_frequency,
    left_on='ID_Mesin',
    right_index=True,
    how='left',
)

# Keep one row per (area, machine): the one with the latest 'Last Maintenance'
# (dates are sorted descending within each pair, then the first row is kept).
machine_area_filtered_df = (
    machine_area_df
    .sort_values(
        by=['ID_Area', 'ID_Mesin', 'Last Maintenance'],
        ascending=[True, True, False],
    )
    .drop_duplicates(subset=['ID_Area', 'ID_Mesin'], keep='first')
)

def apply_combine(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the module-level lookup tables onto a copy of ``df``.

    The joins are applied in this order:

    1. ``area_list_df`` on ``ID_Area``
    2. ``machine_list_df`` with ``Machine`` matched to ``ID_Mesin``
    3. ``machine_area_filtered_df`` with ``(Machine, ID_Area)`` matched to
       ``(ID_Mesin, ID_Area)``

    Column names that collide between the two sides of a join receive
    pandas' default ``_x`` / ``_y`` suffixes.

    Args:
        df: Frame containing at least the ``ID_Area`` and ``Machine`` columns.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    merge_plan = (
        (area_list_df, {'on': 'ID_Area'}),
        (machine_list_df, {'left_on': 'Machine', 'right_on': 'ID_Mesin'}),
        (
            machine_area_filtered_df,
            {'left_on': ['Machine', 'ID_Area'], 'right_on': ['ID_Mesin', 'ID_Area']},
        ),
    )

    combined: pd.DataFrame = df.copy()
    for lookup, join_keys in merge_plan:
        combined = pd.merge(combined, lookup, how='left', **join_keys)
    return combined

## apply_fill_na(df)

In [116]:
# Columns that apply_fill_na() never drops, even when more than 5% of their
# values are missing.
_columns_to_keep = ['Last Maintenance', 'Status Sparepart', 'Power_Backup', 'Priority', 'Status', 'Age', 'maintenance_count', 'Country Machine_x']

# Read by apply_fill_na_predict().
# NOTE(review): apply_fill_na() below only builds local equivalents and never
# writes to these two lists, so they stay empty -- confirm whether the training
# path was meant to record its decisions here.
_columns_to_impute: list[str] = []
_columns_to_drop: list[str] = []


def apply_fill_na(df: pd.DataFrame) -> pd.DataFrame:
    """Impute or prune missing values on a copy of ``df``.

    Steps, in order:

    1. For rows where both ``ID_Mesin_x`` and ``ID_Area`` are present, a
       missing ``Age`` is filled with the year of the earliest
       ``Last Maintenance`` in that (machine, area) group plus 10.
       NOTE(review): that value is a calendar year plus an offset, not a
       duration -- confirm it is the intended scale for ``Age``.
    2. ``Age`` is set to NaN on rows lacking either key.
    3. ``float64`` / ``int64`` columns with less than 5% missing values are
       filled with their median.
    4. Columns with more than 5% missing values are dropped unless listed in
       ``_columns_to_keep``.

    Args:
        df: Frame with ``ID_Mesin_x``, ``ID_Area``, ``Last Maintenance``
            (datetime) and ``Age`` columns.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    avg_timespan = 10

    def _missing_pct(frame: pd.DataFrame) -> pd.Series:
        # Percentage of null values per column.
        return frame.isnull().sum() / len(frame) * 100

    ret = df.copy()
    has_keys = ret['ID_Mesin_x'].notnull() & ret['ID_Area'].notnull()

    if has_keys.any():
        earliest_year = (
            ret.loc[has_keys]
            .groupby(['ID_Mesin_x', 'ID_Area'])['Last Maintenance']
            .transform('min')
            .dt.year
        )
        ret.loc[has_keys, 'predicted_age'] = earliest_year + avg_timespan
        known_age = ret.loc[has_keys, 'Age']
        ret.loc[has_keys, 'Age'] = known_age.fillna(ret.loc[has_keys, 'predicted_age'])

    ret.loc[~has_keys, 'Age'] = np.nan

    # The helper column only exists when at least one row had both keys.
    ret = ret.drop(columns=['predicted_age'], errors='ignore')

    pct_before = _missing_pct(ret)
    for col in pct_before[pct_before < 5].index:
        if ret[col].dtype in ['float64', 'int64']:
            ret[col] = ret[col].fillna(ret[col].median())

    pct_after = _missing_pct(ret)
    too_sparse = [
        col for col in pct_after[pct_after > 5].index
        if col not in _columns_to_keep
    ]
    return ret.drop(columns=too_sparse)

In [4]:
def apply_fill_na_predict(df: pd.DataFrame) -> pd.DataFrame:
    """Inference-time counterpart of ``apply_fill_na``.

    Applies the same ``Age`` back-fill, then median-imputes the columns named
    in the module-level ``_columns_to_impute`` and drops the columns named in
    ``_columns_to_drop``. Names in either list that are absent from ``df`` are
    skipped rather than raising ``KeyError``.

    The median used for imputation is computed from ``df`` itself (the frame
    being predicted on).

    Args:
        df: Frame with ``ID_Mesin_x``, ``ID_Area``, ``Last Maintenance``
            (datetime) and ``Age`` columns.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    ret = df.copy()

    avg_timespan = 10
    valid_rows = ret['ID_Mesin_x'].notnull() & ret['ID_Area'].notnull()

    if valid_rows.any():
        # Earliest maintenance year of each (machine, area) group plus the offset.
        ret.loc[valid_rows, 'predicted_age'] = (
            ret.loc[valid_rows]
            .groupby(['ID_Mesin_x', 'ID_Area'])['Last Maintenance']
            .transform('min')
            .dt.year + avg_timespan
        )
        ret.loc[valid_rows, 'Age'] = ret.loc[valid_rows, 'Age'].fillna(ret.loc[valid_rows, 'predicted_age'])

    # Rows without both keys get no Age at all.
    ret.loc[~valid_rows, 'Age'] = np.nan

    # The helper column only exists when at least one row had both keys.
    ret = ret.drop(columns=['predicted_age'], errors='ignore')

    for col in _columns_to_impute:
        # A recorded column may be missing from this frame; skip it.
        if col in ret.columns and ret[col].dtype in ['float64', 'int64']:
            ret[col] = ret[col].fillna(ret[col].median())

    ret = ret.drop(columns=_columns_to_drop, errors='ignore')

    return ret

## apply_scaler(df)

In [5]:
# Shared scaler instance: apply_scaler() fits it when called with train=True
# and always uses it to transform the numerical columns.
scaler = StandardScaler()


In [6]:
def apply_scaler(df: pd.DataFrame, train: bool) -> pd.DataFrame:
    """Scale the sensor columns of ``df`` with the module-level ``scaler``.

    Args:
        df: Frame containing the six numerical sensor columns listed below.
        train: When true, the shared ``scaler`` is (re)fitted on ``df`` before
            transforming; when false, its existing fit is used.

    Returns:
        A copy of ``df`` with the sensor columns replaced by their scaled
        values; ``df`` itself is not modified.
    """
    numerical_cols = [
        'temperature_10H_max (°C)',
        'temperature-1',
        'temperature-3',
        'Voltage-M',
        'Current-M',
        'Current-R',
    ]

    scaled = df.copy()
    sensors = scaled[numerical_cols]
    if train:
        scaler.fit(sensors)
    scaled[numerical_cols] = scaler.transform(sensors)
    return scaled

## apply_reduce_dim()

In [7]:
def apply_reduce_dim(df, dim):
    """Train a single-hidden-layer autoencoder and replace the sensor columns
    with its ``dim``-wide encoding.

    The autoencoder is fitted on the six sensor columns of ``df`` (22 epochs,
    batch size 32, 20% validation split). The encoder half is then used to
    produce ``feature_0 .. feature_{dim-1}``.

    Args:
        df: Frame containing the six sensor columns listed below.
        dim: Width of the encoded representation.

    Returns:
        ``(reduced_df, encoder_model)`` where ``reduced_df`` is a new frame
        with the sensor columns removed and the encoded features appended.
        ``df`` itself is left untouched (previously the sensor columns were
        dropped from the caller's frame in place).
    """
    cols_to_reduce = ['temperature_10H_max (°C)',
                      'temperature-1', 'temperature-3',
                      'Voltage-M', 'Current-M', 'Current-R']
    X = df[cols_to_reduce]
    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(dim, activation='relu')(input_layer)
    # NOTE(review): a sigmoid output is bounded to (0, 1); if X is standardized
    # (e.g. by apply_scaler) the reconstruction cannot reach negative or >1
    # targets -- consider a linear output layer. Left unchanged here.
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=22, batch_size=32, shuffle=True, validation_split=0.2)

    # Reuse the trained input->encoder path as a standalone model.
    encoder_model = Model(inputs=input_layer, outputs=encoder)
    X_reduced = encoder_model.predict(X)
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)], index=df.index)

    # drop() returns a new frame, so the caller's df keeps its sensor columns.
    reduced_df = pd.concat([df.drop(columns=cols_to_reduce), X_reduced], axis=1)
    return reduced_df, encoder_model

In [8]:
def predict_reduce_dim(df, encoder_model):
    """Replace the sensor columns of ``df`` with an already-trained encoding.

    The six sensor columns are coerced to numeric (unparseable values become
    NaN), passed to ``encoder_model.predict`` and the result is appended as
    ``feature_0 .. feature_{k-1}``, where ``k`` is the encoder's output width.

    Args:
        df: Frame containing the six sensor columns listed below.
        encoder_model: Object exposing ``predict(X)`` that returns a 2-D array
            with one row per row of ``df``.

    Returns:
        A new frame with the sensor columns removed and the encoded features
        appended. ``df`` itself is left untouched (previously the sensor
        columns were dropped from the caller's frame in place).
    """
    cols_to_reduce = ['temperature_10H_max (°C)',
                      'temperature-1', 'temperature-3',
                      'Voltage-M', 'Current-M', 'Current-R']
    X = df[cols_to_reduce].apply(pd.to_numeric, errors='coerce')

    X_reduced = encoder_model.predict(X)
    dim = X_reduced.shape[1]
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)], index=df.index)

    # drop() returns a new frame, so the caller's df keeps its sensor columns.
    return pd.concat([df.drop(columns=cols_to_reduce), X_reduced], axis=1)

## apply_time_encoding(df)

In [9]:
def apply_time_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the two date columns with a single elapsed-days feature.

    ``days_since_last_maintenance`` is the whole-day part of
    ``timestamp - Last Maintenance``; both source columns are parsed with
    ``pd.to_datetime`` and then removed from the result.

    Args:
        df: Frame containing ``Last Maintenance`` and ``timestamp`` columns.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    maintained_at = pd.to_datetime(df['Last Maintenance'])
    observed_at = pd.to_datetime(df['timestamp'])

    encoded = df.drop(columns=['Last Maintenance', 'timestamp'])
    encoded['days_since_last_maintenance'] = (observed_at - maintained_at).dt.days
    return encoded

## apply_one_hot_encode(df, cols)

In [10]:
def apply_one_hot_encode(df: pd.DataFrame, cols):
    """One-hot encode a single column and return the widened frame.

    Missing values in the column are encoded as their own ``"Missing"``
    category. The indicator columns are named ``<cols>_<category>`` and stored
    as pandas ``Categorical`` columns.

    NOTE(review): a fresh ``OneHotEncoder`` is fitted on every call, so the set
    of indicator columns depends on the categories present in ``df`` -- two
    different chunks can yield different columns.

    Args:
        df: Input frame. It is not modified (previously the ``"Missing"``
            fill was written back into the caller's frame).
        cols: Name of the single column to encode.

    Returns:
        A new frame with ``cols`` removed and the indicator columns appended.
    """
    values = df[cols]
    if values.isnull().any():
        values = values.fillna("Missing")

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    onehot_encoded = encoder.fit_transform(values.to_frame())
    encoded_df = pd.DataFrame(
        onehot_encoded,
        columns=encoder.get_feature_names_out([cols]),
        index=df.index,
    )
    for column in encoded_df.columns:
        encoded_df[column] = pd.Categorical(encoded_df[column])

    return pd.concat([df.drop(columns=cols), encoded_df], axis=1)

## apply_feature_selection(df)

In [198]:
def apply_feature_selection(df: pd.DataFrame):
    """Keep only the model's feature columns (plus ``Status`` when present).

    The columns are selected in one step and copied, so the result is an
    independent frame; this avoids assigning ``Status`` onto a column slice
    (chained-assignment warning) and avoids copying the full input first.

    Args:
        df: Frame containing every feature column listed below.

    Returns:
        A new frame with the feature columns in the order listed, followed by
        ``Status`` if ``df`` has it.

    Raises:
        KeyError: If any listed feature column is missing from ``df``.
    """
    selected = ['feature_0', 'feature_1', 'days_since_last_maintenance',
                'Priority_High', 'Priority_Low', 'Priority_Medium',
                'Status Sparepart_Broken', 'Status Sparepart_Empty',
                'Status Sparepart_In Use',
                'Status Sparepart_On Check', 'Status Sparepart_Ready',
                'Status Sparepart_Repair', 'Power_Backup_No',
                'Power_Backup_Yes']

    # The label only exists on training data; keep it last when available.
    if "Status" in df.columns:
        selected = selected + ['Status']

    return df[selected].copy()

## apply_fill_category(df)

In [12]:
def apply_fill_category(df: pd.DataFrame) -> pd.DataFrame:
    """Replace every remaining missing value in ``df`` with ``-1``.

    Categorical columns cannot be filled with a value outside their category
    set, so ``-1`` is registered as a category first. The registration is
    skipped when ``-1`` is already a category, which makes the function safe
    to call more than once on the same frame.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` without missing values.
    """
    ret = df.copy()
    for col in ret.select_dtypes(include=['category']).columns:
        # add_categories raises ValueError if the category already exists.
        if -1 not in ret[col].cat.categories:
            ret[col] = ret[col].cat.add_categories([-1])
        ret[col] = ret[col].fillna(-1)

    return ret.fillna(-1)

## apply_smote(df)

In [13]:
def apply_smote(df: pd.DataFrame) -> pd.DataFrame:
    """Oversample ``df`` with SMOTE and return features plus encoded label.

    ``Status`` is mapped to integer codes (Normal=0, Warning=1, Breakdown=2)
    and used as the resampling target; every other column is treated as a
    feature.

    Args:
        df: Frame with a ``Status`` column; it is not modified.

    Returns:
        The resampled feature frame with the integer-coded ``Status`` appended
        as the last column.
    """
    status_codes = {'Normal': 0, 'Warning': 1, 'Breakdown': 2}

    target = df['Status'].map(status_codes)
    features = df.drop(columns='Status')

    sampler = SMOTE(random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(features, target)

    balanced = pd.DataFrame(X_resampled, columns=features.columns)
    balanced['Status'] = y_resampled
    return balanced

# **TRAINING**

In [205]:
# Trains one random forest per 300k-row chunk of the training CSV and saves
# each as saved_models/rf_<idx>.pkl.
encoder_path = "saved_models/encoder.keras"
os.makedirs("saved_models", exist_ok=True)

# Train the autoencoder on the first chunk only when no saved encoder exists
# yet, so the cell also works on a fresh checkout (Restart & Run All).
first_run = not os.path.exists(encoder_path)
encoder_model = None
idx = 0
cols_to_reduce = ['temperature_10H_max (°C)',
 'temperature-1',
 'temperature-3',
 'Voltage-M',
 'Current-M',
 'Current-R']

for chunk in pd.read_csv("dataset/train.csv", chunksize=300_000):
    df = chunk.copy()
    df = apply_combine(df)
    df = apply_fill_na(df)

    # NOTE(review): train=True refits the shared scaler on every chunk, so the
    # fit left behind for inference is the last chunk's -- confirm intended.
    df = apply_scaler(df, train=True)
    if first_run:
        df, encoder_model = apply_reduce_dim(df, 3)
        encoder_model.save(encoder_path)
        first_run = False
    else:
        # Load the encoder once and reuse it for all remaining chunks.
        if encoder_model is None:
            encoder_model = load_model(encoder_path)
        df = predict_reduce_dim(df, encoder_model)

    df = apply_time_encoding(df)
    df = apply_one_hot_encode(df, 'Priority')
    df = apply_one_hot_encode(df, 'Status Sparepart')
    df = apply_one_hot_encode(df, 'Power_Backup')
    df = apply_fill_category(df)
    df = apply_feature_selection(df)
    df = apply_smote(df)

    X = df.drop(columns='Status')
    y = df['Status']

    rf = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=50, max_depth=15)

    rf.fit(X, y)
    joblib.dump(rf, f"saved_models/rf_{idx}.pkl")

    idx += 1
    print(idx)

    # Free the per-chunk model and frame before reading the next chunk.
    del rf
    del df

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 826us/step
1
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 669us/step
2
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 771us/step
3
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 647us/step
4
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 671us/step
5
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 834us/step
6
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 855us/step
7
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 682us/step
8
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 763us/step
9
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 689us/step
10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 771us/step
11
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 675us/step
12
[1m9375/9375[0m [32m━━

KeyboardInterrupt: 

# **PREDICTING**

In [225]:
def predict(id: pd.Series, df: pd.DataFrame, models=None, n_models: int = 21) -> pd.DataFrame:
    """Soft-voting ensemble prediction: sum class probabilities, take argmax.

    Args:
        id: Identifier for each row of ``df``; becomes the ``ID`` column.
        df: Feature frame passed to each model's ``predict_proba``.
        models: Optional iterable of fitted classifiers. When omitted, the
            models are loaded from ``saved_models/rf_<i>.pkl`` for
            ``i in range(n_models)``.
        n_models: Number of saved models to load when ``models`` is omitted.

    Returns:
        Frame with columns ``ID`` and ``Status`` (``Normal`` / ``Warning`` /
        ``Breakdown``).

    Raises:
        ValueError: If there are no models to vote with.

    NOTE(review): the argmax column index is used directly as the class code,
    which assumes every model's ``predict_proba`` columns are ordered 0, 1, 2.
    """
    if models is None:
        # joblib.load unpickles arbitrary objects: only load trusted files.
        models = (joblib.load(f"saved_models/rf_{i}.pkl") for i in range(n_models))

    summed_probabilities = None
    for rf in models:
        y_pred = np.asarray(rf.predict_proba(df), dtype=float)
        if summed_probabilities is None:
            # Copy so the accumulation never writes into a model's own array.
            summed_probabilities = y_pred.copy()
        else:
            summed_probabilities += y_pred

    if summed_probabilities is None:
        raise ValueError("predict() needs at least one model")

    final_predictions = summed_probabilities.argmax(axis=1)

    result = pd.DataFrame({
        'ID': id,
        'Status': final_predictions
    })

    result['Status'] = result['Status'].map({0: 'Normal', 1: 'Warning', 2: 'Breakdown'})

    return result


In [231]:
def predict_medok(id: pd.Series, df: pd.DataFrame, models=None, n_models: int = 21) -> pd.DataFrame:
    """Hard-voting ensemble prediction: per-row mode of the models' labels.

    Ties are resolved by taking the first column of ``DataFrame.mode``.

    The identifiers and the voted labels are combined positionally. Earlier
    they were aligned by index label, which produced NaN-padded rows whenever
    ``id`` did not carry a 0..n-1 index (e.g. any CSV chunk after the first).

    Args:
        id: Identifier for each row of ``df``; becomes ``ID_Transaction``.
        df: Feature frame passed to each model's ``predict``.
        models: Optional iterable of fitted classifiers. When omitted, the
            models are loaded from ``saved_models/rf_<i>.pkl`` for
            ``i in range(n_models)``.
        n_models: Number of saved models to load when ``models`` is omitted.

    Returns:
        Frame with columns ``ID_Transaction`` and ``Status`` (``Normal`` /
        ``Warning`` / ``Breakdown``), indexed like ``id`` when it is a Series.

    Raises:
        ValueError: If there are no models to vote with.
    """
    if models is None:
        # joblib.load unpickles arbitrary objects: only load trusted files.
        models = (joblib.load(f"saved_models/rf_{i}.pkl") for i in range(n_models))

    all_predictions = [np.asarray(rf.predict(df)) for rf in models]
    if not all_predictions:
        raise ValueError("predict_medok() needs at least one model")

    # One row per sample, one column per model.
    all_predictions_df = pd.DataFrame(all_predictions).T

    # to_numpy() discards the 0..n-1 index so it cannot misalign with `id`.
    final_predictions = all_predictions_df.mode(axis=1)[0].to_numpy()

    result = pd.DataFrame(
        {
            'ID_Transaction': np.asarray(id),
            'Status': final_predictions,
        },
        index=id.index if isinstance(id, pd.Series) else None,
    )

    result['Status'] = result['Status'].map({0: 'Normal', 1: 'Warning', 2: 'Breakdown'})

    return result


In [230]:
# Runs the inference pipeline over the test CSV chunk by chunk and collects
# the per-chunk predictions into `submission`.
# Relies on `scaler` having been fitted earlier (apply_scaler is called with
# train=False) and on the saved encoder / random-forest files.
pred_result = []

# The encoder does not change between chunks, so load it once.
encoder_model = load_model("saved_models/encoder.keras")

for chunk in pd.read_csv("dataset/test.csv", chunksize=100_000):
    df = chunk.copy()
    ID = df['ID_Transaction'].copy()
    # Treat empty / whitespace-only cells as missing.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = apply_combine(df)
    df = apply_fill_na_predict(df)

    df = apply_scaler(df, train=False)
    df = predict_reduce_dim(df, encoder_model)

    df = apply_time_encoding(df)
    df = apply_one_hot_encode(df, 'Priority')
    df = apply_one_hot_encode(df, 'Status Sparepart')
    df = apply_one_hot_encode(df, 'Power_Backup')
    df = apply_feature_selection(df)
    df = apply_fill_category(df)

    y_pred = predict_medok(ID, df)

    # Collect per-chunk frames; concatenating once at the end avoids
    # re-copying the growing result on every iteration.
    pred_result.append(y_pred)

submission = pd.concat(pred_result) if pred_result else pd.DataFrame()

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 837us/step


AxisError: axis 1 is out of bounds for array of dimension 1

In [211]:
# Distribution of predicted labels across the whole submission.
submission['Status'].value_counts()

Status
Normal       6984085
Breakdown       4564
Name: count, dtype: int64

In [222]:
# Force the two column names; predict() names its identifier column 'ID',
# whereas predict_medok() already uses 'ID_Transaction'.
submission.columns = ['ID_Transaction', 'Status']

In [223]:
# Display the assembled submission frame.
submission

Unnamed: 0,ID_Transaction,Status
0,TRXb7e33ef41eea9cfdc0d1c338bad7f0d4,Normal
1,TRX8fc5889e25fbf66b21063d165228745f,Normal
2,TRX5c56664724a974cf6c87bd2659fd7046,Normal
3,TRX85fe42a5737897b7649a24fede27e90f,Normal
4,TRX16fae83f3c8c66fb15f088e7da7713ee,Normal
...,...,...
6999995,TRX853c4fa3ea09c9872e544ad425144181,Normal
6999996,TRXec71df6ec2cc1ad5b69d08fdd8d3aea9,Normal
6999997,TRX2cf42b2824f7d87d85549ddbb81d4e51,Normal
6999998,TRX4bc41281d855f597df32e02f0d4e8fc1,Normal


In [224]:
# index=False keeps the file to the two data columns (ID_Transaction, Status);
# the default would also write the row index as a leading unnamed column.
submission.to_csv("submission.csv", index=False)

# **TESTING**

In [213]:
# NOTE(review): presumably a reference submission used for comparison -- the
# origin of farrell.csv is not shown in this notebook; confirm.
farrell = pd.read_csv("farrell.csv")

In [220]:
# Inspect the column names of the loaded file.
farrell.columns

Index(['ID_Transaction', 'Status'], dtype='object')

train

In [None]:
# first_run = False
# idx = 0
# cols_to_reduce = ['temperature_10H_max (°C)',
#  'temperature-1',
#  'temperature-3',
#  'Voltage-M',
#  'Current-M',
#  'Current-R']

# for chunk in pd.read_csv("dataset/train.csv", chunksize=500_000):
#     df = chunk.copy()
#     df = apply_combine(df)
#     df = apply_fill_na(df)

#     df = apply_scaler(df, train=True)
#     if first_run:
#         df, encoder_model = apply_reduce_dim(df, 2)
#         encoder_model.save("saved_models/encoder_2.keras")
#         first_run = False
#     else:
#         encoder_model = load_model("saved_models/encoder.keras")
#         df = predict_reduce_dim(df, encoder_model)
    
#     df = apply_time_encoding(df)
#     df = apply_one_hot_encode(df, 'Priority')
#     df = apply_one_hot_encode(df, 'Status Sparepart')
#     df = apply_one_hot_encode(df, 'Power_Backup')
#     df = apply_fill_category(df)
#     df = apply_feature_selection(df)
#     df = apply_smote(df)

#     X = df.drop(columns='Status')
#     y = df['Status']

#     rf = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=50, max_depth=15)
#     rf.fit(X, y)

#     break

# del df

[1m15625/15625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step


test

In [229]:
# Ad-hoc check: push one chunk of the training CSV through the inference-style
# pipeline and predict on it with `rf`.
# NOTE(review): `rf` is not defined in this cell, and the training loop ends
# each iteration with `del rf`, so this only works with a model left over in
# the kernel -- confirm which model is meant to be evaluated.
skip = 0
for chunk in pd.read_csv("dataset/train.csv", chunksize=300_000):
    # NOTE(review): `skip` starts at 0 and is only incremented inside this
    # branch, so the branch never runs as written -- confirm the intended
    # chunk-skipping behaviour.
    if skip == 5:
        skip += 1
        continue
    df = chunk.copy()
    df = apply_combine(df)
    df = apply_fill_na_predict(df)

    # train=True refits the shared scaler on this chunk.
    df = apply_scaler(df, train=True)
    
    encoder_model = load_model("saved_models/encoder.keras")
    df = predict_reduce_dim(df, encoder_model)
    
    df = apply_time_encoding(df)
    df = apply_one_hot_encode(df, 'Priority')
    df = apply_one_hot_encode(df, 'Status Sparepart')
    df = apply_one_hot_encode(df, 'Power_Backup')
    df = apply_fill_category(df)
    df = apply_feature_selection(df)

    # y keeps the Status values as they appear in df (no SMOTE / integer
    # mapping is applied in this cell).
    X = df.drop(columns='Status')
    y = df['Status']
    
    y_pred = rf.predict(X)

    # Only the first chunk is processed.
    break

[1m2770/9375[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m6s[0m 909us/step

KeyboardInterrupt: 

In [202]:
# from sklearn.metrics import f1_score

# # Calculate F1 score
# _y = y.map({'Normal': 0, 'Warning': 1, 'Breakdown': 2})
# f1 = f1_score(_y, y_pred, average='weighted') 
# # 
# print("F1 Score:", f1)

F1 Score: 0.4846896103436038
