<a href="https://colab.research.google.com/github/ben-velastegui/dxc-ai-assessment/blob/main/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from tensorflow.keras.utils import to_categorical


In [None]:
# Core libraries
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

# Deep learning
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Feature importance
!pip install shap
import shap


In [None]:
# Load the dataset into a DataFrame; the absolute /content path assumes the
# Colab runtime this notebook links to.
df = pd.read_csv("/content/full_data_long.csv")


In [None]:
# Columns that are standardised with StandardScaler before modelling.
numeric_features = [
    "total_logs", "error_logs", "warning_logs", "info_logs",
    "cpu_user_mean", "cpu_user_max",
    "mem_used_mean", "mem_used_max",
    "load1_mean", "load5_mean", "load15_mean",
    "total_traces", "missing_data"
]


# Columns that are one-hot encoded.
# NOTE(review): "pid" and "user_id" look like identifiers; one-hot encoding them
# may create many columns — confirm their cardinality.
categorical_features = ["Hostname", "program", "pid", "user_id"]


In [None]:
# Name of the label column that the classifier predicts.
target = "operation"


In [None]:
# Name of the timestamp column.
# NOTE: the sorting cell refers to the literal "bin_time" rather than this variable.
time_column = "bin_time"


In [None]:
# Standardise the numeric columns in place.
# NOTE(review): the scaler is fitted on the full frame before any train/test
# split, so statistics of rows that later land in the test set influence the
# training features — confirm this is acceptable or fit on training rows only.
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding (``sparse_output`` is the spelling used by newer
# scikit-learn releases).
ohe = OneHotEncoder(sparse_output=False)
cat_encoded = ohe.fit_transform(df[categorical_features])

# Assemble the model input table: numeric columns first, one-hot columns after.
# Both parts carry a fresh 0..n-1 index so they are joined row by row.
numeric_part = df[numeric_features].reset_index(drop=True)
onehot_columns = ohe.get_feature_names_out(categorical_features)
onehot_part = pd.DataFrame(cat_encoded, columns=onehot_columns)
df_encoded = pd.concat([numeric_part, onehot_part], axis=1)


In [None]:
# Map each operation name to an integer id, keep the ids on the frame, and
# also build the row-level one-hot matrix.
label_encoder = LabelEncoder()
operation_ids = label_encoder.fit_transform(df[target])
df["operation_label"] = operation_ids
y = to_categorical(df["operation_label"])


In [None]:
# Parse bin_time as datetimes; values that cannot be parsed become NaT
# (errors="coerce"). Date-only strings get a time component of 00:00:00.
# ``infer_datetime_format`` is deprecated in pandas 2.x, so it is not passed.
df["bin_time"] = pd.to_datetime(df["bin_time"], errors="coerce")

# Verify
print(df["bin_time"].head())

# Sort by Hostname and bin_time.
# df_encoded was built row-for-row from df BEFORE this sort. Sorting and
# re-indexing df alone would leave df_encoded in the old row order, so that
# ``df_encoded.loc[<df index>]`` in the sequence-building cell would pair each
# label with the features of a different row. Apply the same permutation to both.
if not df.index.equals(df_encoded.index):
    raise ValueError(
        "df and df_encoded are not row-aligned; rebuild df_encoded before sorting."
    )
sorted_index = df.sort_values(by=["Hostname", "bin_time"]).index
df = df.loc[sorted_index].reset_index(drop=True)
df_encoded = df_encoded.loc[sorted_index].reset_index(drop=True)


In [None]:
sequence_length = 10  # window size, in time steps

X_sequences = []
y_sequences = []

# Slide a fixed-size window over each host's rows. Every window is labelled
# with the operation id found at its final time step.
for host in df["Hostname"].unique():
    host_rows = df.index[df["Hostname"] == host]
    host_matrix = df_encoded.loc[host_rows].values
    host_targets = df.loc[host_rows, "operation_label"].values

    # A non-positive count (host shorter than one window) yields no windows.
    window_count = len(host_matrix) - sequence_length + 1
    X_sequences.extend(
        host_matrix[start:start + sequence_length] for start in range(window_count)
    )
    y_sequences.extend(
        host_targets[start + sequence_length - 1] for start in range(window_count)
    )

X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)
y_sequences_cat = to_categorical(y_sequences)

print("Sequences shape:", X_sequences.shape)
print("Labels shape:", y_sequences_cat.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the windowed sequences with a fixed seed.
# NOTE(review): consecutive windows of one host overlap in all but one time
# step, so a shuffled split can place near-duplicates on both sides — confirm
# whether a per-host or chronological split is wanted.
split_parts = train_test_split(
    X_sequences,
    y_sequences_cat,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Dense, Flatten, Dropout, BatchNormalization

# One sample is a (sequence_length, num_features) matrix.
input_shape = X_train.shape[1:]

# Hidden stack, applied in order: two Conv1D + BatchNorm stages, then a dense
# head with dropout.
hidden_layers = [
    Conv1D(64, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    Conv1D(32, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
]

inputs = Input(shape=input_shape)
x = inputs
for hidden in hidden_layers:
    x = hidden(x)

# One softmax unit per column of the one-hot label matrix.
outputs = Dense(y_sequences_cat.shape[1], activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
# Baseline training run on the un-resampled training split; the held-out split
# is scored after every epoch.
fit_options = {"epochs": 20, "batch_size": 32}
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    **fit_options,
)


In [None]:
# Cast every operation value to text so the encoder sees a single dtype.
# NOTE: missing values turn into the literal string "nan" at this point.
df[target] = df[target].astype(str)

label_encoder = LabelEncoder()
operation_ids = label_encoder.fit_transform(df[target])
df["operation_label"] = operation_ids
y = to_categorical(df["operation_label"])


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fold both real missing values and the text "nan" into one "unknown" category.
cleaned_operations = (
    df[target]
    .replace("nan", np.nan)  # the string form first becomes a real NaN
    .fillna("unknown")
    .astype(str)
)
df[target] = cleaned_operations

# Refit the encoder on the cleaned column.
label_encoder = LabelEncoder()
df["operation_label"] = label_encoder.fit_transform(df[target])
y = to_categorical(df["operation_label"])

print("Classes:", label_encoder.classes_)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Derive the weights from the labels the model is actually trained on (the
# training sequences), not from every row of df. The row-level labels include
# rows that end up in the test split and can have been re-encoded after the
# sequences were built, so their class ids need not match the model's outputs.
y_train_labels = np.argmax(y_train, axis=1)  # integer labels, not one-hot
present_classes = np.unique(y_train_labels)

class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=y_train_labels,
)

# Give every output index a key so the mapping matches the one-hot width.
# Classes that never occur in the training split keep a neutral weight of 1.0.
class_weights = {class_id: 1.0 for class_id in range(y_train.shape[1])}
class_weights.update(
    {
        int(class_id): float(weight)
        for class_id, weight in zip(present_classes, class_weights_array)
    }
)

print("Class weights:", class_weights)


In [None]:
# Continue training the current model, this time weighting the loss per class.
validation_pair = (X_test, y_test)
history = model.fit(
    X_train,
    y_train,
    validation_data=validation_pair,
    epochs=20,
    batch_size=32,
    class_weight=class_weights,
)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Predictions: class probabilities -> hard class ids.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Pin the label set explicitly. Without ``labels=`` the report raises when a
# class is missing from both y_true and the predictions (its length then
# differs from target_names), and the confusion matrix shrinks so the heatmap
# tick labels no longer line up with its rows and columns.
class_names = [str(c) for c in label_encoder.classes_]
class_ids = np.arange(len(class_names))

# Report (zero_division=0 reports 0 for classes that are never predicted).
print(
    classification_report(
        y_true,
        y_pred_classes,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()


In [None]:
!pip install imbalanced-learn

from imblearn.over_sampling import RandomOverSampler
import numpy as np

# Oversample the TRAINING split only. Resampling all of X_sequences would put
# every test window (and duplicates of it) into the training data, so the
# validation scores computed on X_test would be measured on seen samples.
# Flatten sequences for oversampling (the sampler works on 2-D input).
X_flat = X_train.reshape(len(X_train), -1)
y_flat = np.argmax(y_train, axis=1)  # integer labels, not one-hot

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_flat, y_flat)

# Reshape back into sequence form
X_resampled = X_resampled.reshape(-1, X_train.shape[1], X_train.shape[2])
# Keep the one-hot width identical to y_test even if a class is absent here.
y_resampled_cat = to_categorical(y_resampled, num_classes=y_train.shape[1])

print("Original class distribution:", np.bincount(y_flat))
print("Resampled class distribution:", np.bincount(y_resampled))


In [None]:
# Train on the class-balanced data while still validating on the held-out split.
fit_options = {"epochs": 30, "batch_size": 32, "verbose": 1}
history = model.fit(
    X_resampled,
    y_resampled_cat,
    validation_data=(X_test, y_test),
    **fit_options,
)


In [None]:
# Class probabilities for the held-out sequences, reduced to hard class ids;
# the one-hot ground truth is reduced the same way.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_test.argmax(axis=1)

print("Predictions shape:", y_pred_classes.shape)
print("True labels shape:", y_true.shape)

In [None]:
from sklearn.metrics import classification_report

# Human-readable class names, in encoder order.
target_names = [str(c) for c in label_encoder.classes_]

# Pass the label ids explicitly: if a class occurs neither in y_true nor in
# the predictions, the inferred label set is shorter than target_names and
# classification_report raises.
report = classification_report(
    y_true,
    y_pred_classes,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fix the matrix to one row/column per known class. Without ``labels=`` a class
# missing from the test split shrinks the matrix and the tick labels below no
# longer line up with its rows and columns.
cm = confusion_matrix(
    y_true, y_pred_classes, labels=list(range(len(target_names)))
)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=target_names,
            yticklabels=target_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

In [None]:
import numpy as np

# Per-class accuracy: the share of each class's test samples predicted correctly.
for i, class_name in enumerate(target_names):
    idx = y_true == i
    support = np.sum(idx)
    if support == 0:
        # Avoid a 0/0 division (nan plus a RuntimeWarning) for absent classes.
        print(f"Accuracy for {class_name}: No samples in test set")
        continue
    class_acc = np.sum(y_pred_classes[idx] == i) / support
    print(f"Accuracy for {class_name}: {class_acc:.2f}")

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right, each
# with the train and validation series from the most recent fit.
plt.figure(figsize=(12, 4))

panels = (("accuracy", "Accuracy"), ("loss", "Loss"))
for position, (metric, axis_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric], label='train')
    plt.plot(history.history[f"val_{metric}"], label='val')
    plt.title(f"{axis_label} over epochs")
    plt.xlabel("Epoch")
    plt.ylabel(axis_label)
    plt.legend()

plt.show()

This explains why every prediction still comes out as boot_delete: the model did not learn anything for the minority classes, which causes both:
	1.	Precision/recall = 0 for all non-majority classes.
	2.	SHAP errors, because the outputs for the other classes are effectively zero, so the explanation cannot properly distribute contributions across them.

This happens because:
	•	Extreme class imbalance → boot_delete dominates (~70% of sequences).
	•	Minority classes too rare → the CNN never sees enough examples to learn patterns.
	•	Class weights alone aren’t enough; we need oversampling or sequence-level augmentation.

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# Oversample the TRAINING split only. Resampling all of X_sequences would copy
# test windows into the training data and invalidate the "unchanged test set"
# used for validation.
# Flatten sequences for oversampling (the sampler works on 2-D input).
X_flat = X_train.reshape(len(X_train), -1)
y_flat = np.argmax(y_train, axis=1)  # integer labels, not one-hot

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_flat, y_flat)

# Reshape back into sequences
X_resampled = X_resampled.reshape(-1, X_train.shape[1], X_train.shape[2])
# Keep the one-hot width identical to y_test even if a class is absent here.
y_resampled_cat = to_categorical(y_resampled, num_classes=y_train.shape[1])

print("Before oversampling:", np.bincount(y_flat))
print("After oversampling:", np.bincount(y_resampled))

In [None]:
# Fit on the resampled data; the test split is passed through untouched so the
# validation metrics stay comparable between runs.
held_out = (X_test, y_test)
history = model.fit(
    X_resampled,
    y_resampled_cat,
    validation_data=held_out,
    epochs=30,
    batch_size=32,
)

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Predictions on test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

print("Predictions shape:", y_pred_classes.shape)
print("True labels shape:", y_true.shape)

# Classification report.
# The label ids are passed explicitly: when a class is missing from both
# y_true and the predictions, the inferred label set is shorter than
# target_names and classification_report raises.
target_names = [str(c) for c in label_encoder.classes_]
class_ids = np.arange(len(target_names))
report = classification_report(
    y_true,
    y_pred_classes,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,
)
print("Classification Report:\n")
print(report)

# Confusion matrix, fixed to one row/column per known class so the tick labels
# always line up with the matrix.
cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=target_names,
            yticklabels=target_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

# Per-class accuracy
print("Per-class accuracy:")
for i, class_name in enumerate(target_names):
    idx = y_true == i
    if np.sum(idx) > 0:
        class_acc = np.sum(y_pred_classes[idx] == i) / np.sum(idx)
        print(f"  {class_name}: {class_acc:.2f}")
    else:
        print(f"  {class_name}: No samples in test set")

# Training history plots (only when a fit has been run in this session)
if 'history' in globals():
    plt.figure(figsize=(12,4))

    # Accuracy plot
    plt.subplot(1,2,1)
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='val')
    plt.title("Model Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()

    # Loss plot
    plt.subplot(1,2,2)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='val')
    plt.title("Model Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    plt.show()

In [None]:
# Fresh, untrained copy of the CNN, sized from the resampled tensors.
layers = tf.keras.layers
input_shape = X_resampled.shape[1:]  # (sequence_length, num_features)

hidden_layers = [
    layers.Conv1D(64, kernel_size=3, activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.Conv1D(32, kernel_size=3, activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
]

inputs = tf.keras.Input(shape=input_shape)
x = inputs
for hidden in hidden_layers:
    x = hidden(x)

# One softmax unit per column of the resampled one-hot labels.
outputs = layers.Dense(y_resampled_cat.shape[1], activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Replace NaN with 0 and +/-inf with +/-1e5 in the training tensor so that
# non-finite inputs cannot propagate into the loss.
X_resampled = np.nan_to_num(X_resampled, nan=0.0, posinf=1e5, neginf=-1e5)

In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted

# Only scale numeric columns — and only once.
# df[numeric_features] is already standardised by the first scaling cell.
# Refitting a new scaler on those standardised values leaves the data
# practically unchanged but replaces `scaler` with a near-identity transform.
# That object is what gets saved to scaler.pkl later, so it would be useless
# on raw inputs. Fit only when no fitted scaler exists yet.
try:
    check_is_fitted(scaler)
except (NameError, NotFittedError):
    scaler = StandardScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])

# One-hot categorical columns are already 0/1 → no scaling needed

In [None]:
# Clip extreme values to [-10, 10] to prevent exploding activations.
# This also caps the +/-1e5 placeholders that nan_to_num substitutes for
# infinities.
X_resampled = np.clip(X_resampled, -10, 10)

In [None]:
from tensorflow.keras.optimizers import Adam

# Re-compile the current `model` with Adam at a reduced learning rate.
optimizer = Adam(learning_rate=1e-4)  # lower LR for stability
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
import numpy as np

# Sanity-check the one-hot label matrix: it must contain no non-finite
# entries, and the printed range shows its minimum and maximum values.
labels_have_nan = np.isnan(y_resampled_cat).any()
labels_have_inf = np.isinf(y_resampled_cat).any()
label_min, label_max = y_resampled_cat.min(), y_resampled_cat.max()

print("Any NaNs in y_resampled_cat?", labels_have_nan)
print("Any Infs in y_resampled_cat?", labels_have_inf)
print("Min/Max values in y_resampled_cat:", label_min, label_max)

In [None]:
# Training-tensor clean-up in one pass: replace non-finite values, cast to
# float32, then clip to [-10, 10] for numerical stability.
finite_inputs = np.nan_to_num(X_resampled, nan=0.0, posinf=1e5, neginf=-1e5)
X_resampled = np.clip(finite_inputs.astype(np.float32), -10, 10)

# Labels only need the dtype change.
y_resampled_cat = y_resampled_cat.astype(np.float32)

In [None]:
from tensorflow.keras.optimizers import Adam

# New Adam instance; the next model-building cell passes it to compile().
optimizer = Adam(learning_rate=1e-4)  # smaller LR prevents exploding gradients

In [None]:
import tensorflow as tf

# Compact CNN: two Conv1D stages followed by a small dense head, without
# BatchNorm or Dropout.
layers = tf.keras.layers
hidden_layers = [
    layers.Conv1D(32, 3, activation='relu', padding='same'),
    layers.Conv1D(16, 3, activation='relu', padding='same'),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
]

inputs = tf.keras.Input(shape=X_resampled.shape[1:])
x = inputs
for hidden in hidden_layers:
    x = hidden(x)
outputs = layers.Dense(y_resampled_cat.shape[1], activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)
# `optimizer` is the Adam instance created in the preceding cell.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Give the validation/test tensors the same treatment as the training data:
# replace non-finite values, cast to float32, then clip to [-10, 10].
finite_test = np.nan_to_num(X_test, nan=0.0, posinf=1e5, neginf=-1e5)
X_test = np.clip(finite_test.astype(np.float32), -10, 10)

y_test = y_test.astype(np.float32)

In [None]:
# Short verification run after cleaning X_test: five epochs are enough to see
# whether val_loss is still NaN.
smoke_run = {"epochs": 5, "batch_size": 32, "verbose": 1}
history = model.fit(
    X_resampled,
    y_resampled_cat,
    validation_data=(X_test, y_test),
    **smoke_run,
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class probabilities -> hard class ids for the held-out sequences.
y_pred_probs = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Pin the label set so that a class missing from the test split neither makes
# classification_report raise (label count != target_names length) nor drops
# its row/column from the confusion matrix.
class_names = [str(c) for c in label_encoder.classes_]
class_ids = np.arange(len(class_names))

print("Classification Report:")
print(
    classification_report(
        y_true,
        y_pred_classes,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)

cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
print("Confusion Matrix:")
print(cm)

In [None]:
# Persist the current model to an .h5 file in the working directory.
model.save("cnn_model.h5")
# Later or in another notebook
# (run in this cell, the reload rebinds `model` to the copy read from disk)
from tensorflow.keras.models import load_model
model = load_model("cnn_model.h5")

Why is the accuracy “too good”?

	1.	The training set is heavily oversampled → the model sees many exact duplicates of minority-class sequences (RandomOverSampler copies existing samples rather than synthesizing new ones). This makes it very easy to memorize class patterns.
	2.	If the features are strongly correlated with the labels (like program or pid giving away the operation), the model may have “shortcuts” that make predictions trivial.
	3.	The gap between train (≈ 99.8% accuracy) and test (≈ 99.6% accuracy) is very small → possible leakage in preprocessing (scaler/OHE fit on all data).


Right now, RandomOverSampler fully balances all classes, which can cause overfitting.

Better strategies:
	•	imblearn.over_sampling.SMOTE (synthesizes new minority samples instead of duplicating).
	•	Limit oversampling: instead of making classes perfectly balanced, bring them up to e.g. 50% of the majority class.

In [None]:
from sklearn.impute import SimpleImputer

# Flatten to 2D for imputer (one column per time-step/feature pair).
X_train_2d = X_train.reshape(len(X_train), -1)

# keep_empty_features=True keeps columns that are entirely NaN. By default
# SimpleImputer drops such columns, which changes the column count and makes
# the reshape back to 3-D fail.
imputer = SimpleImputer(strategy="median", keep_empty_features=True)
X_train_imputed = imputer.fit_transform(X_train_2d)

# Reshape back to sequences
X_train = X_train_imputed.reshape(X_train.shape)

In [None]:
from imblearn.over_sampling import SMOTE

# Integer class ids for the training split. This name was previously used here
# without being defined (it was first assigned in a later cell), so the cell
# failed with NameError when the notebook was run top to bottom.
y_train_int = np.argmax(y_train, axis=1)

# Get class counts
unique, counts = np.unique(y_train_int, return_counts=True)
class_counts = dict(zip(unique, counts))
print("Original class counts:", class_counts)

# Define sampling strategy (e.g. minority = 50% of majority)
majority_class = max(class_counts, key=class_counts.get)
majority_count = class_counts[majority_class]

sampling_strategy = {
    cls: int(0.5 * majority_count) if count < 0.5 * majority_count else count
    for cls, count in class_counts.items()
}

# SMOTE looks up k neighbours inside each class it enlarges; k must be smaller
# than the size of the smallest such class, so cap the default of 5.
enlarged_sizes = [
    count for cls, count in class_counts.items() if sampling_strategy[cls] > count
]
k_neighbors = int(max(1, min(5, min(enlarged_sizes, default=6) - 1)))

smote = SMOTE(
    sampling_strategy=sampling_strategy,
    k_neighbors=k_neighbors,
    random_state=42,
)

X_train_res, y_train_res = smote.fit_resample(
    X_train.reshape(len(X_train), -1), y_train_int
)

# Reshape back to sequences
X_train_res = X_train_res.reshape(-1, X_train.shape[1], X_train.shape[2])
# Keep the one-hot width equal to y_train/y_test even if the highest class id
# does not occur in the training split.
y_train_res_cat = to_categorical(y_train_res, num_classes=y_train.shape[1])

print("Resampled class counts:", np.bincount(y_train_res))

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

# Permutation feature importance: shuffle one feature across samples and
# record how much test accuracy drops relative to the unshuffled baseline.

# Feature names in df_encoded column order: numeric columns, then one-hot columns.
feature_names = numeric_features + list(ohe.get_feature_names_out(categorical_features))
if len(feature_names) != X_test.shape[2]:
    # zip() below would otherwise silently pair scores with the wrong names.
    raise ValueError(
        f"Expected {X_test.shape[2]} feature names, got {len(feature_names)}."
    )

# True labels do not change between permutations, so compute them once.
y_test_labels = np.argmax(y_test, axis=1)

# Baseline accuracy
y_pred_base = model.predict(X_test, verbose=0)
baseline_acc = accuracy_score(y_test_labels, np.argmax(y_pred_base, axis=1))

# Seeded generator so the ranking is reproducible, like the other seeded steps.
rng = np.random.default_rng(42)

importances = []
for i in range(X_test.shape[2]):  # loop over features
    X_test_perm = X_test.copy()
    # Reorder whole (time-step) rows of feature i across samples.
    X_test_perm[:, :, i] = X_test[rng.permutation(len(X_test)), :, i]
    y_pred_perm = model.predict(X_test_perm, verbose=0)
    acc_perm = accuracy_score(y_test_labels, np.argmax(y_pred_perm, axis=1))
    importances.append(baseline_acc - acc_perm)

# Rank features by importance
feat_importances = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

print("Top 15 most important features:")
for name, score in feat_importances[:15]:
    print(f"{name}: {score:.4f}")

Observations from feature importance

	•	The top features are mostly numeric system metrics: mem_used_max, load15_mean, cpu_user_max, warning_logs, total_traces, missing_data, etc.
	•	Some categorical identifiers appear (Hostname_wally123, pid_6.0), but their importance is much smaller.
	•	Many features have near-zero contribution → keeping them might just add noise and risk overfitting.

  What this tells us

	1.	System metrics drive the model — makes sense, the operation affects CPU/memory/load.
	2.	Identifiers (hostname/pid) contribute, but minimally → including them is optional.
	•	Keeping them might help a little, but can also cause overfitting or memorization of host-specific patterns.
	3.	Top 10–15 features explain most of the predictive power.
	•	You can safely drop the rest to simplify the model and improve generalization.

In [None]:
# Names of the 15 highest-ranked features.
top_features = [name for name, _ in feat_importances[:15]]

# Numeric features: a feature's position in numeric_features is used directly
# as its column position in the sequence tensor.
numeric_positions = [
    numeric_features.index(name)
    for name in top_features
    if name in numeric_features
]
X_train_top = X_train_res[:, :, numeric_positions]

# Categorical (one-hot) features: column positions within the full feature list.
cat_indices = [
    position
    for position, name in enumerate(feature_names)
    if name in top_features and name not in numeric_features
]
# If using OHE, you can slice columns accordingly
# X_train_top_cat = X_train_res[:, :, cat_indices]

In [None]:
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Select top features
top_features = [name for name, _ in feat_importances[:15]]

# Get indices for numeric features
numeric_idx = [numeric_features.index(f) for f in top_features if f in numeric_features]

# Get indices for categorical features (OHE columns)
cat_idx = [i for i, f in enumerate(ohe.get_feature_names_out(categorical_features)) if f in top_features]

# Combine indices (one-hot columns sit after the numeric block)
top_indices = numeric_idx + [len(numeric_features) + i for i in cat_idx]

# Extract top features
X_train_top = X_train[:, :, top_indices]
X_test_top = X_test[:, :, top_indices]

seq_len = X_train.shape[1]
n_top = len(top_indices)
n_classes = y_train.shape[1]

# Impute any remaining NaNs. keep_empty_features=True keeps all-NaN columns;
# by default they are dropped, which would break the reshape below.
imputer = SimpleImputer(strategy="median", keep_empty_features=True)
X_train_top = imputer.fit_transform(X_train_top.reshape(len(X_train_top), -1))
X_test_top = imputer.transform(X_test_top.reshape(len(X_test_top), -1))

# Reshape back to sequences
X_train_top = X_train_top.reshape(-1, seq_len, n_top)
X_test_top = X_test_top.reshape(-1, seq_len, n_top)

# Step 3: Re-run SMOTE on training data
y_train_int = np.argmax(y_train, axis=1)  # ensure integer labels

# Define partial oversampling strategy (50% of majority)
unique, counts = np.unique(y_train_int, return_counts=True)
class_counts = dict(zip(unique, counts))
majority_count = max(class_counts.values())
sampling_strategy = {cls: int(0.5*majority_count) if count < 0.5*majority_count else count
                     for cls, count in class_counts.items()}

# SMOTE looks up k neighbours inside each class it enlarges; k must be smaller
# than the size of the smallest such class, so cap the default of 5.
enlarged_sizes = [
    count for cls, count in class_counts.items() if sampling_strategy[cls] > count
]
k_neighbors = int(max(1, min(5, min(enlarged_sizes, default=6) - 1)))

smote = SMOTE(
    sampling_strategy=sampling_strategy,
    k_neighbors=k_neighbors,
    random_state=42,
)
X_train_res, y_train_res = smote.fit_resample(X_train_top.reshape(len(X_train_top), -1), y_train_int)

# Reshape back to sequences
X_train_res = X_train_res.reshape(-1, seq_len, n_top)
# Fix the one-hot width so it always matches y_test, even if the highest class
# id does not occur in the training split.
y_train_res_cat = to_categorical(y_train_res, num_classes=n_classes)

print("Resampled class counts:", np.bincount(y_train_res))

# Build smaller CNN
input_shape = (seq_len, n_top)
inputs = Input(shape=input_shape)
x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(inputs)
x = BatchNormalization()(x)
x = Conv1D(16, kernel_size=3, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = Flatten()(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(n_classes, activation='softmax')(x)

model_small = Model(inputs, outputs)
model_small.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model_small.fit(
    X_train_res, y_train_res_cat,
    validation_data=(X_test_top, y_test),
    epochs=20,
    batch_size=32,
    verbose=1
)

# Evaluate
y_pred = np.argmax(model_small.predict(X_test_top), axis=1)
y_true = np.argmax(y_test, axis=1)

# Pin the label set so classes missing from the test split neither make the
# report raise nor drop rows/columns from the confusion matrix.
class_names = [str(c) for c in label_encoder.classes_]
class_ids = np.arange(len(class_names))

print("Classification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
print("Confusion Matrix:\n", cm)

In [None]:
# Persist the reduced-feature model.
model_small.save("cnn_top15_features.h5")

# Save preprocessing objects, one joblib pickle per fitted object.
import joblib

preprocessing_artifacts = (
    (imputer, "imputer.pkl"),
    (scaler, "scaler.pkl"),
    (ohe, "ohe.pkl"),
    (label_encoder, "label_encoder.pkl"),
)
for fitted_object, filename in preprocessing_artifacts:
    joblib.dump(fitted_object, filename)