# **Importing Libraries**

In [72]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# **Importing Dataset**

In [73]:
def import_dataset(path: str, nrows: int | None = None, is_train=False) -> pd.DataFrame:
    chunksize = 100_000
    machine_area_df = pd.read_csv("/kaggle/input/penyisihan-data-vers-anava-19/Machine-Area.csv")
    machine_list_df = pd.read_csv("/kaggle/input/penyisihan-data-vers-anava-19/Machine-List.csv")
    area_list_df = pd.read_csv("/kaggle/input/penyisihan-data-vers-anava-19/Area-List.csv")
    machine_area_df['Last Maintenance'] = pd.to_datetime(machine_area_df['Last Maintenance'])
    machine_area_filtered_df = machine_area_df.sort_values(
        by=['ID_Area', 'ID_Mesin', 'Last Maintenance'], 
        ascending=[True, True, False]
    ).drop_duplicates(subset=['ID_Area', 'ID_Mesin'], keep='first')

    id_list = []
    chunk_list = []
    for chunk in pd.read_csv(path, chunksize=chunksize, nrows=nrows, low_memory=False):
        if is_train:
            chunk = chunk[chunk['Machine'].isin(machine_area_df['ID_Mesin'])]
        id_column = chunk.pop('ID_Transaction') if 'ID_Transaction' in chunk.columns else None
        chunk = pd.merge(chunk, area_list_df, on='ID_Area', how='left')
        chunk = pd.merge(chunk, machine_list_df, left_on='Machine', right_on='ID_Mesin', how='left')
        chunk = pd.merge(chunk, machine_area_filtered_df, left_on=['Machine', 'ID_Area'], right_on=['ID_Mesin', 'ID_Area'], how='left')

        chunk_list.append(chunk)
        if id_column is not None:
            id_list.append(id_column)
    combined_df = pd.concat(chunk_list, ignore_index=True)
    maintenance_frequency = machine_area_df.groupby('ID_Mesin').size().rename('maintenance_count')
    combined_df = pd.merge(combined_df, maintenance_frequency, left_on='Machine', right_index=True, how='left')
    id_series = pd.concat(id_list, ignore_index=True) if id_list else None
    return combined_df, id_series


In [74]:
# Load at most 200,000 training rows; the returned ID series is discarded (`_`).
train_df, _ = import_dataset(
    path="/kaggle/input/penyisihan-data-vers-anava-19/train.csv", 
    nrows=200_000, is_train=True)

In [75]:
# Load at most 200,000 test rows and keep their ID_Transaction values in `test_id`
# (used later when the submission frame is built).
test_df, test_id = import_dataset(
    path="/kaggle/input/penyisihan-data-vers-anava-19/test.csv", 
    nrows=200_000, is_train=False)

KeyboardInterrupt: 

In [None]:
# Discard training rows in which any of these three columns is missing.
train_df = train_df.dropna(subset=['Last Maintenance', 'Status Sparepart', 'Age'])

# **Data Cleaning**

## Identify Missing Data

In [None]:
# Per-column count of missing values, then the same figure as a percentage of rows.
missing = train_df.isna().sum()
missing_percentage = missing.div(len(train_df)).mul(100)
print(missing_percentage)

## Handling Missing Data

In [None]:

# Median-impute float64/int64 columns whose missing share is below 5%.
columns_to_impute = missing_percentage.loc[missing_percentage < 5].index
for col in columns_to_impute:
    column = train_df[col]
    if column.dtype not in ['float64', 'int64']:
        continue
    train_df[col] = column.fillna(column.median())

In [None]:
# Columns whose missing share lies between 5% and 23% (both bounds inclusive).
moderate_missingness_cols = missing_percentage[(missing_percentage >= 5) & (missing_percentage <= 23)].index
# float64/int64 subset of those columns; only referenced by the disabled loop below.
numerical_cols = [col for col in moderate_missingness_cols if train_df[col].dtype in ['float64', 'int64']]

# Bare expression so the notebook displays the selected columns.
moderate_missingness_cols

# Disabled imputation step. NOTE(review): random_sample_impute is not defined in
# the visible part of this notebook.
# for col in numerical_cols:
#     train_df[col] = random_sample_impute(train_df[col])
#     print(f"Imputed {col}")

In [None]:
def drop_cols(df):
    """Remove a fixed list of sensor/weather columns from ``df`` in place.

    The frame passed in is modified and also returned, so both
    ``drop_cols(df)`` and ``df = drop_cols(df)`` work. Columns that are already
    absent are skipped, which makes the call safe to repeat on the same frame
    (for example when a cell is re-run, or when a frame that was already
    cleaned is passed through ``pipeline``).

    Args:
        df: Frame to clean.

    Returns:
        The same ``df`` object with the listed columns removed.
    """
    cols = ['temperature_10H_min (°C)', 'temperature-2', 'apparent_temperature_max',
       'apparent_temperature_min', 'humidity', 'Voltage-L', 'Voltage-R',
       'Current-T', 'RPM', 'RPM-1', 'RPM-2', 'RPM-3', 'Vibration-1',
       'Vibration-2', 'Power']
    # errors='ignore': a second call used to raise KeyError because the
    # columns were already gone.
    df.drop(columns=cols, inplace=True, errors='ignore')
    return df

In [None]:
# Recompute the per-column missing percentage on the current state of train_df.
missing = train_df.isna().sum()
missing_percentage = missing.div(len(train_df)).mul(100)
print(missing_percentage)

## Dealing with Outliers

## Remove Duplicates

## Feature Engineering

# **Data Preprocessing**

## Feature Scaling

In [None]:
def scaler(df, transformer=None, fit=True):
    """Scale the float64/int64 columns of ``df`` in place and return ``df``.

    With the defaults a fresh ``StandardScaler`` is fitted on ``df`` itself,
    exactly as before. To scale held-out data with the statistics learned on
    the training data, create one transformer, pass it with ``fit=True`` for the
    training frame and pass the same object with ``fit=False`` for every other
    frame.

    Args:
        df: Frame whose numeric columns are scaled (modified in place).
        transformer: Object exposing ``fit_transform`` and ``transform``.
            Defaults to a new ``StandardScaler``.
        fit: Fit the transformer on ``df`` (True) or reuse its fitted state (False).

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: If ``fit`` is False and no fitted ``transformer`` is given.
    """
    numerical_cols = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
    if not numerical_cols:
        # Nothing to scale; fitting on zero columns would raise.
        return df
    if transformer is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted 'transformer'.")
        transformer = StandardScaler()
    if fit:
        df[numerical_cols] = transformer.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = transformer.transform(df[numerical_cols])
    return df


## Dimensionality Reduction

### Note: a feature hasher could be used for the categorical columns

In [None]:
def reduce_dim(df, dim):
    """Compress six sensor columns into ``dim`` features with a small autoencoder.

    A single-hidden-layer autoencoder is trained on the selected columns of
    ``df``; the hidden layer's activations replace those columns as
    ``feature_0 .. feature_{dim-1}``.

    Args:
        df: Frame containing the six columns listed in ``cols_to_reduce``. The
            columns are dropped from this object in place.
        dim: Size of the encoded representation.

    Returns:
        ``(reduced_df, encoder_model)``: a new frame with the encoded features
        appended, and the Keras model mapping the six inputs to the encoding
        (pass it to ``predict_reduce_dim`` for other frames).
    """
    cols_to_reduce = ['temperature_10H_max (°C)',
                      'temperature-1', 'temperature-3',
                      'Voltage-M', 'Current-M', 'Current-R']
    # Coerce to numeric the same way predict_reduce_dim does, so training and
    # inference feed the network identically typed input.
    # NOTE(review): missing values are not filled here; NaNs in these columns
    # propagate into the loss and the encoded features.
    X = df[cols_to_reduce].apply(pd.to_numeric, errors='coerce')
    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(dim, activation='relu')(input_layer)
    # Linear output: `pipeline` standardises the columns before calling this
    # function, so the reconstruction targets include negative values that a
    # sigmoid (bounded to 0..1) can never reproduce.
    decoder = Dense(input_dim, activation='linear')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X, X, epochs=22, batch_size=32, shuffle=True, validation_split=0.2)

    encoder_model = Model(inputs=input_layer, outputs=encoder)
    X_reduced = encoder_model.predict(X)
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)], index=df.index)
    df.drop(cols_to_reduce, axis=1, inplace=True)
    df = pd.concat([df, X_reduced], axis=1)
    return df, encoder_model

In [None]:
def predict_reduce_dim(df, encoder_model, cols_to_reduce=None):
    """Replace the sensor columns of ``df`` with features from a fitted encoder.

    Args:
        df: Frame containing the columns to encode. They are dropped from this
            object in place.
        encoder_model: Object with a ``predict`` method returning a 2-D array
            with one row per input row.
        cols_to_reduce: Columns fed to the encoder. Defaults to the six sensor
            columns that were hard-coded before. Accepting this argument also
            makes the three-argument call used in ``pipeline`` valid.

    Returns:
        A new frame in which the selected columns are replaced by
        ``feature_0 .. feature_{k-1}``, where ``k`` is the encoder's output width.
    """
    if cols_to_reduce is None:
        cols_to_reduce = ['temperature_10H_max (°C)',
                          'temperature-1', 'temperature-3',
                          'Voltage-M', 'Current-M', 'Current-R']
    else:
        cols_to_reduce = list(cols_to_reduce)
    # Values that cannot be parsed become NaN instead of raising.
    X = df[cols_to_reduce].apply(pd.to_numeric, errors='coerce')
    X_reduced = encoder_model.predict(X)
    dim = X_reduced.shape[1]
    X_reduced = pd.DataFrame(X_reduced, columns=[f'feature_{i}' for i in range(dim)], index=df.index)

    df.drop(cols_to_reduce, axis=1, inplace=True)
    df = pd.concat([df, X_reduced], axis=1)
    return df

## Feature Encoding

In [None]:
def one_hot_encode(df, cols, categories=None):
    """One-hot encode a single column and return a new frame.

    Args:
        df: Input frame. Missing values in the column are replaced by the
            string "Missing" on this object before encoding.
        cols: Name of the one column to encode (a single label, not a list).
        categories: Optional sequence of category values fixing the output
            columns. Because a new encoder is fitted on every call, two frames
            with different observed values otherwise produce different
            columns; passing the same ``categories`` for training and test
            data keeps them aligned. Values outside the list encode as all
            zeros. When omitted, categories are inferred from ``df``.

    Returns:
        A new frame without ``cols`` and with one categorical-dtype indicator
        column per category, named ``<cols>_<category>``.
    """
    if df[cols].isnull().any():
        df[cols] = df[cols].fillna("Missing")
    encoder = OneHotEncoder(
        categories='auto' if categories is None else [list(categories)],
        sparse_output=False,
        handle_unknown='ignore',
    )
    onehot_encoded = encoder.fit_transform(df[[cols]])
    encoded_df = pd.DataFrame(onehot_encoded, columns=encoder.get_feature_names_out([cols]), index = df.index)
    # Indicator columns are stored with categorical dtype; `pipeline` later
    # selects them by that dtype.
    for column in encoded_df.columns:
        encoded_df[column] = pd.Categorical(encoded_df[column])
    df_encoded = pd.concat([df.drop(columns=cols), encoded_df], axis=1)
    return df_encoded

In [None]:
def time_encoding(df):
    """Swap the two timestamp columns for a whole-day difference.

    Adds ``days_since_last_maintenance`` (``timestamp`` minus
    ``Last Maintenance``, in days) to ``df``, removes both source columns
    in place and returns the same frame.
    """
    maintained_at = pd.to_datetime(df['Last Maintenance'])
    observed_at = pd.to_datetime(df['timestamp'])
    elapsed = observed_at - maintained_at
    df['days_since_last_maintenance'] = elapsed.dt.days
    df.drop(columns=['Last Maintenance', 'timestamp'], inplace=True)
    return df

## Handling Imbalanced Dataset

In [None]:
def smote_transform(df):
    """Oversample ``df`` with SMOTE and return the resampled frame.

    ``Status`` is mapped to integer codes (Normal=0, Warning=1, Breakdown=2),
    the remaining columns are resampled with ``SMOTE(random_state=42)``, and
    the result is returned with the integer-coded ``Status`` as last column.
    """
    label_codes = {'Normal': 0, 'Warning': 1, 'Breakdown': 2}
    labels = df['Status'].map(label_codes)
    features = df.drop(columns=['Status'])
    sampler = SMOTE(random_state=42)
    balanced_features, balanced_labels = sampler.fit_resample(features, labels)
    balanced = pd.DataFrame(balanced_features, columns=features.columns)
    balanced['Status'] = balanced_labels
    return balanced

# Pipeline

In [None]:
def pipeline(df, is_train=False, encoder_model=None, cols_to_reduce=None):
    """Run the full preprocessing chain on one frame.

    Steps, in order: drop unused columns, scale numeric columns, compress the
    sensor columns with the autoencoder, derive the days-since-maintenance
    feature, one-hot encode three categorical columns, drop identifier/lookup
    columns, fill remaining missing values with -1 and, for training data only,
    oversample with SMOTE.

    Args:
        df: Raw joined frame as produced by ``import_dataset``.
        is_train: True trains a new encoder and applies SMOTE; False reuses
            ``encoder_model``.
        encoder_model: Encoder returned by a previous training call. Required
            when ``is_train`` is False.
        cols_to_reduce: Column list returned by a previous training call.
            Required when ``is_train`` is False (kept for interface
            compatibility).

    Returns:
        ``(processed_df, encoder_model, cols_to_reduce)``.

    Raises:
        ValueError: If ``is_train`` is False and ``encoder_model`` or
            ``cols_to_reduce`` is missing.
    """
    df = drop_cols(df)
    df = scaler(df)
    if is_train:
        # reduce_dim returns (frame, encoder). Unpacking three values raised
        # ValueError, so the compressed column names are recovered by comparing
        # the columns before and after the call.
        columns_before = list(df.columns)
        df, encoder_model = reduce_dim(df, 3)
        cols_to_reduce = [col for col in columns_before if col not in df.columns]
    else:
        if encoder_model is None or cols_to_reduce is None:
            raise ValueError("For non-training data, 'encoder_model' and 'cols_to_reduce' must be provided.")
        # Called with the frame and the encoder only, matching the
        # (df, encoder_model) positional signature of predict_reduce_dim.
        df = predict_reduce_dim(df, encoder_model)
    df = time_encoding(df)
    for categorical_col in ('Priority', 'Status Sparepart', 'Power_Backup'):
        df = one_hot_encode(df, categorical_col)
    if is_train:
        df.drop(columns=['ID_Area', 'ID_Mesin_x', 'Machine', 'Breakdown Category',
                          'Area', 'ID_Mesin_y', 'Mesin_x', 'Mesin_y', 'Country Machine_x', 'Country Machine_y'], inplace=True)
    else:
        df.drop(columns=['ID_Area', 'ID_Mesin_x', 'Machine', 'Area', 'ID_Mesin_y', 'Mesin_x', 'Mesin_y',
                          'Country Machine_x', 'Country Machine_y'], inplace=True)
    # Categorical columns need -1 registered as a category before it can be
    # used as the fill value.
    for col in df.select_dtypes(include=['category']).columns:
        df[col] = df[col].cat.add_categories([-1])
        df[col] = df[col].fillna(-1)
    df.fillna(-1, inplace=True)
    if is_train:
        df = smote_transform(df)
    return df, encoder_model, cols_to_reduce

# **Modelling & Validation**

In [None]:
# train_df, encoder_model, cols_to_reduce = pipeline(train_df, is_train=True)

In [None]:
# Debug step 1: drop_cols works in place, so `check` and `train_df` are the same object.
check = drop_cols(train_df)
# Remaining missing values per column.
check.isnull().sum()

In [None]:
# Debug step 2: scale the numeric columns (in place) and re-check missing values.
check2 = scaler(check)
check2.isnull().sum()

In [None]:
# Debug step 3: train the autoencoder with a 3-feature encoding; the returned
# encoder model is discarded here.
check3,_ = reduce_dim(check2,3)
check3.isnull().sum()

In [None]:
# Debug step 4: derive days_since_last_maintenance and re-check missing values.
check4 = time_encoding(check3)
check4.isnull().sum()

In [None]:
# Preprocess the test set in pieces of at most 100,000 rows.
# NOTE(review): `encoder_model` and `cols_to_reduce` are produced by the training
# call `pipeline(train_df, is_train=True)`, which is currently commented out
# above; that call has to run first.
num_chunks = len(test_df) // 100000 + (len(test_df) % 100000 > 0)
# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes. The chunk
# boundaries are the same as before.
chunk_positions = np.array_split(np.arange(len(test_df)), num_chunks)
processed_chunks = []

for positions in chunk_positions:
    # Copy so that the in-place operations inside `pipeline` act on an
    # independent frame instead of a slice of test_df.
    chunk = test_df.iloc[positions].copy()
    processed_chunk, _, _ = pipeline(chunk, is_train=False, encoder_model=encoder_model, cols_to_reduce=cols_to_reduce)
    processed_chunks.append(processed_chunk)

testing = pd.concat(processed_chunks, ignore_index=True)

In [None]:
# test_df.to_csv("test_processed.csv", index=False)

In [None]:
# chunk_size = 100_000
# chunk_list = []
# for chunk in pd.read_csv("test_processed.csv", chunksize=chunk_size, low_memory=False):
#     chunk_list.append(chunk)

# test_df = pd.concat(chunk_list, ignore_index=True)

In [None]:
# Separate features from the `Status` target.
# NOTE(review): this expects `train_df` to be the output of `pipeline(...)`;
# the cell that produces it is commented out above.
X = train_df.drop(columns='Status')
y = train_df['Status']

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Number of rows per target class.
y.value_counts()

In [None]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix
def precision_recall_f1(tp, fp, fn):
    """Return ``(precision, recall, f1)`` from true/false positive and false negative counts.

    A ratio whose denominator is not positive is reported as 0, and F1 is 0
    when precision and recall sum to zero.
    """
    def ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)

    combined = precision + recall
    if combined == 0:
        return precision, recall, 0
    return precision, recall, 2 * (precision * recall) / combined

# Train a gradient-boosted classifier with categorical-column support enabled,
# then predict on the held-out split.
xgb_model = xgb.XGBClassifier(enable_categorical=True)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the most recent `y_pred`.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest baseline. The seed is fixed (the same value used for SMOTE and
# the train/validation split) so the fitted model, its report and the
# submission are reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Overwrites the `y_pred` produced by the XGBoost cell.
y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
def predict_in_chunks(test_df, model, batch_size=1000, preprocess_fn=None):
    """Predict on ``test_df`` in consecutive batches and concatenate the results.

    Args:
        test_df: Rows to predict on (a DataFrame, or any sliceable array-like).
        model: Object with a ``predict`` method returning an array per batch.
        batch_size: Maximum number of rows per ``predict`` call.
        preprocess_fn: Optional callable applied to each batch before prediction.

    Returns:
        One array with the predictions for all rows, in input order. An empty
        array when ``test_df`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    total_rows = len(test_df)
    if total_rows == 0:
        # Nothing to predict; np.concatenate would raise on an empty list.
        return np.array([])

    # Fixed-size slices: every batch has exactly batch_size rows except possibly
    # the last one, and no batch is ever empty.
    starts = range(0, total_rows, batch_size)
    predictions = []

    for i, start in enumerate(starts):
        print(f"Processing chunk {i + 1}/{len(starts)}...")
        stop = start + batch_size
        chunk = test_df.iloc[start:stop] if hasattr(test_df, 'iloc') else test_df[start:stop]
        if preprocess_fn is not None:
            chunk = preprocess_fn(chunk)
        predictions.append(model.predict(chunk))

    return np.concatenate(predictions, axis=0)


In [None]:
# Rename the 'Power_Backup_ ' column of the processed test frame to
# 'Power_Backup_Missing'.
# NOTE(review): presumably this aligns the test columns with the training
# columns — confirm against the fitted model's feature names.
testing.rename(columns={'Power_Backup_ ': 'Power_Backup_Missing'}, inplace=True)

In [None]:
# Predict the processed test set with the random forest, 100,000 rows per batch.
predictions = predict_in_chunks(testing, rf_model, batch_size=100000)

In [None]:
# Load the submission file that the predictions are merged into below.
# NOTE(review): this is a relative path, unlike the /kaggle/input paths used for
# the other data files — confirm it exists in the run environment.
submission = pd.read_csv("dataset/submission.csv")

In [None]:
# Pair each test ID with its predicted class code.
test_predictions = pd.DataFrame({
    'ID_Transaction': test_id,
    'Status': predictions})

# Convert the integer codes back to labels (inverse of the mapping in smote_transform).
test_predictions['Status'] = test_predictions['Status'].map({0: 'Normal', 1: 'Warning', 2: 'Breakdown'})

In [None]:
# Number of predicted rows per label.
test_predictions['Status'].value_counts()

In [None]:
# Write the ID/label pairs for the predicted rows only.
test_predictions.to_csv("submission2.csv", index=False)

In [None]:
# Left-join the predictions onto the submission frame by ID_Transaction; rows
# without a prediction get NaN in the prediction's columns.
submission = pd.merge(submission, test_predictions, on='ID_Transaction', how='left')

In [None]:
# Remove the 'Status_x' column from the merged frame.
submission = submission.drop(columns='Status_x')

In [None]:
# Replace every remaining NaN in the frame (all columns) with 'Normal'.
submission.fillna('Normal', inplace=True)

In [None]:
# Write the merged submission. The 'Status_y' -> 'Status' rename only happens in
# a later cell, and the file is written again after that rename.
submission.to_csv("submission1.csv", index=False)

In [None]:
# Keep only the prediction column. 'Status_x' has already been removed by an
# earlier cell, so a missing column is tolerated instead of raising KeyError.
submission = submission.drop(columns=['Status_x'], errors='ignore')

# Give the prediction column its final name.
submission = submission.rename(columns={'Status_y': 'Status'})


In [None]:
# Overwrite submission1.csv with the frame after the column rename.
submission.to_csv("submission1.csv", index=False)

# **Result & Analysis**