# **AI Models for Engine Data 📊**

### Engine Data Preprocessing

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

# Reshape the raw engine log from long format (one row per reading, several
# readings per 'Car ID') to wide format (one row per car).
df = pd.read_csv('/content/engine_data.csv')

# Sensor columns, spelled exactly as they appear in the raw file.
sensor_columns = ['Engine rpm', 'Lub oil pressure', 'Fuel pressure', 'Coolant pressure', 'lub oil temp', 'Coolant temp']

# Reshape the data so that each car has its own row.
# cumcount() numbers the readings of each car 0, 1, 2, ...; +1 makes them start at 1.
df_pivot = df.pivot_table(index='Car ID',
                          columns=df.groupby('Car ID').cumcount() + 1,
                          values=sensor_columns,
                          aggfunc='first')

# Flatten the (feature, reading) column pairs into names like FeatureR1, FeatureR2, ...
df_pivot.columns = [f'{col[0]}R{col[1]}' for col in df_pivot.columns]

# Arrange the columns in the desired order: all sensors of reading 1, then reading 2, ...
# NOTE(review): 'Lub oil tempR{i}' below never matches, because the pivot
# produces 'lub oil tempR{i}' (lower-case 'l', as in the raw column name).
# The lub-oil-temperature readings are therefore discarded by the filter that
# follows. Correcting the casing would add one feature per reading to the saved
# file, so any downstream code that assumes the current width must be updated
# at the same time.
columns = []
for i in range(1, 11):  # To cover 10 readings (from R1 to R10)
    columns.extend([f'Engine rpmR{i}', f'Lub oil pressureR{i}', f'Fuel pressureR{i}', f'Coolant pressureR{i}',
                    f'Lub oil tempR{i}', f'Coolant tempR{i}'])

# Keep only the requested columns that actually exist after the pivot
existing_columns = [col for col in columns if col in df_pivot.columns]

# Report what is about to be thrown away instead of dropping it silently
requested_columns = set(columns)
dropped_columns = [col for col in df_pivot.columns if col not in requested_columns]
if dropped_columns:
    print(f"Warning: dropping {len(dropped_columns)} pivoted column(s) that are not in the "
          f"requested layout, e.g. {dropped_columns[:5]}")

df_pivot = df_pivot[existing_columns]

# Take the "Label" from the last row of each "Car ID"
df_labels = df.groupby('Car ID').last()[['Label']]

# Attach the label to the reshaped data (both frames are keyed by 'Car ID')
df_pivot = df_pivot.merge(df_labels, on='Car ID', how='left')

# Save the result to a new CSV file; index=True keeps 'Car ID' as the first column
df_pivot.to_csv('transformed_dataset_2.csv', index=True)

# Display the result
print(df_pivot.head())

# Load the reshaped dataset from the same relative path the reshaping step
# writes to, so the two steps agree regardless of the working directory.
file_path = "transformed_dataset_2.csv"
df = pd.read_csv(file_path)

# Drop every row that has at least one missing value
df = df.dropna()

# Drop 'Car ID' column as it's an identifier, not a feature
df = df.drop(columns=['Car ID'])

# Encode categorical labels as integers
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

# Standardize numerical features.
# Select the features by name rather than by position so the result does not
# depend on 'Label' being the last column.
scaler = StandardScaler()
feature_columns = df.columns.drop('Label')  # All columns except 'Label'
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Address class imbalance using SMOTE
# NOTE(review): the scaler is fit and SMOTE is applied on the full dataset,
# before any train/validation/test split. Every split made later from the
# saved file will contain synthetic samples and statistics derived from its
# own test rows, which tends to inflate the reported metrics.
X = df[feature_columns]
y = df['Label']
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create balanced dataframe
df_balanced = pd.DataFrame(X_resampled, columns=feature_columns)
df_balanced['Label'] = y_resampled

# Shuffle the dataset (fixed seed for reproducibility)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the preprocessed dataset
df_balanced.to_csv("preprocessed_car_engine_data.csv", index=False)

print("Preprocessing complete with SMOTE applied and data shuffled. Preprocessed dataset saved as 'preprocessed_car_engine_data.csv'.")

### XGBoost Model - Engine Data

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Train and evaluate a multi-class XGBoost model on the preprocessed data.

# Load dataset
file_path = "preprocessed_car_engine_data.csv"
df = pd.read_csv(file_path)

# Encode target labels as contiguous integers 0..K-1 (required by num_class)
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

# Define features and target variable.
# The preprocessing step already removes 'Car ID', so dropping it
# unconditionally raises KeyError; errors='ignore' handles both cases.
X = df.drop(columns=['Label', 'Car ID'], errors='ignore')
y = df['Label']

# Split into train (70%), validation (15%), and test (15%) sets.
# Stratified like the ANN and Random Forest cells so every model is evaluated
# on splits with the same class proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Convert data to DMatrix format for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
val_dmatrix = xgb.DMatrix(X_val, label=y_val)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # Multi-class classification, predicts class ids
    'num_class': len(np.unique(y)),
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Train the XGBoost model.
# Early stopping monitors the last entry of the watchlist (the validation set).
watchlist = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
model = xgb.train(params, train_dmatrix, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Make predictions; multi:softmax returns class ids as floats, so cast them
# to int to match the dtype of y_test in the reports below.
preds = model.predict(test_dmatrix).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(y_test, preds))

# Confusion Matrix
plt.figure(figsize=(6,4))
sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Feature Importance
xgb.plot_importance(model)
plt.show()


### ANN Model - Engine Data

In [None]:
import pandas as pd

# Load the preprocessed dataset from the same relative path the preprocessing
# step writes to, so this cell also works outside a '/content' working directory.
new_file_path = "preprocessed_car_engine_data.csv"
df_new = pd.read_csv(new_file_path)

# Display the first few rows to inspect the structure
df_new.head()

from sklearn.model_selection import train_test_split
import numpy as np

# Separate features and target
X_new = df_new.drop(columns=['Label'])  # Feature columns
y_new = df_new['Label']  # Target column

# Split data into train (70%), validation (15%), and test (15%).
# Both splits are stratified so each subset keeps the class proportions.
X_train_new, X_temp_new, y_train_new, y_temp_new = train_test_split(
    X_new, y_new, test_size=0.3, random_state=42, stratify=y_new
)
X_val_new, X_test_new, y_val_new, y_test_new = train_test_split(
    X_temp_new, y_temp_new, test_size=0.5, random_state=42, stratify=y_temp_new
)

# Check dataset sizes
X_train_new.shape, X_val_new.shape, X_test_new.shape

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Build, train and evaluate a small fully connected classifier.

# Derive the layer sizes from the data instead of hard-coding them, so the
# network keeps working if the number of feature columns or classes changes.
n_features = X_train_new.shape[1]
n_classes = len(y_new.unique())

# Define the ANN model
model_new = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(n_features,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(n_classes, activation='softmax')  # One output unit per class
])

# Compile the model.
# sparse_categorical_crossentropy expects integer class ids, not one-hot labels.
model_new.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
history_new = model_new.fit(
    X_train_new, y_train_new,
    validation_data=(X_val_new, y_val_new),
    epochs=30,
    batch_size=32,
    verbose=1
)

# Evaluate the model on the test set
test_loss_new, test_accuracy_new = model_new.evaluate(X_test_new, y_test_new, verbose=1)
print(f'Test Accuracy: {test_accuracy_new:.4f}')

# Visualization of training history
plt.figure(figsize=(12, 5))

# Plot training & validation accuracy values
plt.subplot(1, 2, 1)
plt.plot(history_new.history['accuracy'], label='Train Accuracy')
plt.plot(history_new.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history_new.history['loss'], label='Train Loss')
plt.plot(history_new.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

# Predictions on test data: pick the class with the highest softmax probability
y_pred_new = np.argmax(model_new.predict(X_test_new), axis=1)

# Classification report
print("Classification Report:")
print(classification_report(y_test_new, y_pred_new))

# Confusion matrix
conf_matrix_new = confusion_matrix(y_test_new, y_pred_new)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_new, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_new), yticklabels=np.unique(y_new))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# Model summary
print("Model Summary:")
model_new.summary()

### Bagging Model - Engine Data

In [None]:
import numpy as np
import pandas as pd
import joblib  # Library for saving and loading models
import sklearn  # To check the scikit-learn version
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Train, save and evaluate a bagging ensemble of random forests.

# Compatibility shim: only when ClassifierMixin has no `__sklearn_tags__`
# attribute, install a stub that returns an empty dict.
if not hasattr(sklearn.base.ClassifierMixin, "__sklearn_tags__"):
    sklearn.base.ClassifierMixin.__sklearn_tags__ = lambda self: {}

# Print the current scikit-learn version for compatibility
print(f"Using scikit-learn version: {sklearn.__version__}")

# Load the dataset from the same relative path the preprocessing step writes
# to, so this cell also works outside a '/content' working directory.
df = pd.read_csv('preprocessed_car_engine_data.csv')

# Replace spaces in column names with underscores
df.columns = df.columns.str.replace(' ', '_')

# Separate features and target variable
X = df.drop(columns=['Label'])
y = df['Label']

# Split data: 70% train, 15% validation, 15% test.
# Stratified like the ANN and Random Forest cells so every model is evaluated
# on splits with the same class proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Initialize base models.
# NOTE(review): only rf_model is used in this cell; xgb_model and lgb_model
# are created but never fitted here.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
xgb_model = XGBClassifier(random_state=42, n_jobs=-1)
lgb_model = LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1)

# Create BaggingClassifier using `estimator` (not `base_estimator`)
bagging_clf = BaggingClassifier(
    estimator=rf_model,  # Each bagged member is a full random forest
    n_estimators=10,  # Number of models inside Bagging
    bootstrap=True,  # Each member is trained on a bootstrap resample of the training set
    random_state=42,
    n_jobs=-1
)

# Train the model on training data
bagging_clf.fit(X_train, y_train)

# Save the fitted model
joblib.dump(bagging_clf, "bagging_model.pkl")
print("✅ Model saved successfully as bagging_model.pkl")

# Evaluate on validation set
y_val_pred = bagging_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Evaluate on test set
y_test_pred = bagging_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_test_pred))

# Learning curve for the bagging model.
# train_sizes / train_scores / test_scores were used below without ever being
# computed, so compute them here with cross-validation on the training split.
from sklearn.model_selection import learning_curve

# NOTE: this refits the whole bagging ensemble once per (training size, fold)
# pair, i.e. 5 sizes x 5 folds = 25 fits, so it can take a while.
train_sizes, train_scores, test_scores = learning_curve(
    bagging_clf,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='accuracy',
)

# Average the per-fold scores for each training size
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')
plt.plot(train_sizes, test_mean, 'o-', color='green', label='Validation Score')

plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.title('Learning Curve')

# Fix the Y-axis to the full accuracy range so curves are comparable across runs
plt.ylim([0.0, 1.01])

plt.legend(loc='best')
plt.grid()
plt.show()

# One-vs-rest ROC curves for the bagging model on the test set.
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# y_prob / y_test_binarized were used below without ever being computed.
# Use the classifier's own class order so that column i of y_prob and
# column i of y_test_binarized refer to the same class.
roc_classes = bagging_clf.classes_
y_prob = bagging_clf.predict_proba(X_test)
y_test_binarized = label_binarize(y_test, classes=roc_classes)

# For a two-class problem label_binarize returns a single column (the
# indicator of the second class); expand it to one column per class.
if y_test_binarized.shape[1] == 1:
    y_test_binarized = np.hstack([1 - y_test_binarized, y_test_binarized])

plt.figure(figsize=(8, 6))

for i, class_label in enumerate(roc_classes):
    fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_prob[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'Class {class_label} (AUC = {roc_auc:.2f})')

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (One-vs-Rest)')
plt.legend(loc='lower right')
plt.grid()
plt.show()



### Random Forest Model - Engine Data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train and evaluate a Random Forest classifier on the preprocessed data.

# Load data from the same relative path the preprocessing step writes to,
# so this cell also works outside a '/content' working directory.
file_path = "preprocessed_car_engine_data.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["Label"])  # Features
y = df["Label"]  # Target variable

# Split data into 70% training and 30% validation + test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Split remaining 30% into 15% validation and 15% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Create Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on training data
model.fit(X_train, y_train)

# Predict on validation data
y_val_pred = model.predict(X_val)

# Evaluate model on validation data
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation accuracy: {val_accuracy:.4f}")

# Predict on test data
y_test_pred = model.predict(X_test)

# Evaluate model on test data
test_accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

# Print results
print(f"Test accuracy: {test_accuracy:.4f}")
print("\nClassification Report:\n", class_report)

# Plot confusion matrix.
# Tick labels come from the data instead of a hard-coded [0, 1], so the plot
# stays correctly labelled if the dataset has a different set of classes.
class_labels = np.unique(y)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predictions")
plt.ylabel("Actual Values")
plt.title("Confusion Matrix")
plt.show()
