# Sensor Fault Detection: EDA and Modeling

This notebook performs an exploratory data analysis (EDA) and evaluates multiple machine learning models for the Sensor Fault Detection dataset. 

**Enhancements for Kaggle:**
*   All required libraries are installed in the first cell.
*   Computationally expensive steps like data imputation and resampling are cached. The notebook saves the results of these steps and loads them on subsequent runs to save time.
*   Code errors from the original file have been fixed and highlighted.

In [None]:
# <<< NEW SECTION >>>
# This cell installs all required libraries for the notebook to run on Kaggle.
# The -q flag is used for a quieter installation output.
!pip install -q xgboost catboost scikit-learn imbalanced-learn miceforest kneed prettytable

In [None]:
# <<< NEW SECTION >>>
# All imports are consolidated here for better organization and to avoid errors.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mean
import warnings
import joblib
import json
import os

# Preprocessing and Imputation
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.decomposition import PCA
from kneed import KneeLocator
import miceforest as mf

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Metrics and Evaluation
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn import metrics

# Settings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Source of the training CSV; cells holding the text 'na' are read as missing values.
APS_TRAINING_CSV_URL = 'https://raw.githubusercontent.com/avnyadav/sensor-fault-detection/main/aps_failure_training_set1.csv'
df = pd.read_csv(APS_TRAINING_CSV_URL, na_values='na')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

## Exploratory Data Analysis (EDA)

In [None]:
# Partition the columns into numeric ones and object-typed ("categorical") ones.
numeric_columns = list(filter(lambda col: pd.api.types.is_numeric_dtype(df[col]), df.columns))
categorical_columns = list(filter(lambda col: df[col].dtype == 'object', df.columns))

print(f"We have {len(numeric_columns)} numeric columns: {numeric_columns}")
print(f"We have {len(categorical_columns)} categorical columns: {categorical_columns}")

In [None]:
# Percentage of missing entries per column, largest first.
missing_values = (
    df.isna()
    .sum()
    .div(df.shape[0])
    .mul(100)
    .to_frame(name='missing_percent')
    .sort_values(by='missing_percent', ascending=False)
)

# One bar per column; labels are rotated so the column names stay readable.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(missing_values.index, missing_values['missing_percent'], color='orange')
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('Percentage of missing values')
ax.set_title('Missing Values in Each Column')
fig.tight_layout()
plt.show()


### Exclude Columns with 70%+ Null Values

In [None]:
# Rows of `missing_values` whose missing share is above 70% — the columns to drop.
high_missing_mask = missing_values['missing_percent'] > 70
dropcols = missing_values.loc[high_missing_mask]
dropcols

In [None]:
# Remove the high-missing columns from `df` in place.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df.drop(columns=dropcols.index, inplace=True, errors='ignore')
df.shape

In [None]:
# <<< CHANGED SECTION >>>
# This cell now correctly calculates the total missing values *after* dropping the highly-null columns.

# Per-column missing counts, then totals over the whole (already reduced) frame.
missing_values_count = df.isna().sum()
total_missing = missing_values_count.sum()
total_cells = np.prod(df.shape)
missing_share = (total_missing / total_cells) * 100
print(f"Total missing values: {total_missing} out of {total_cells} cells ({missing_share:.2f}%)")

### Visualize the Target Variable

In [None]:
# Count each label once; a label absent from the data counts as 0.
class_counts = df['class'].value_counts()
positive_count = class_counts.get('pos', 0)
negative_count = class_counts.get('neg', 0)
print(f"positive: {positive_count}", f", Negative: {negative_count}")

# Bar chart of the class frequencies.
sns.catplot(x='class', data=df, kind='count', alpha=0.6, palette="winter_r")
plt.show()

## Helper Functions for Model Evaluation

In [None]:
def evaluate_clf(true, predicted):
    """
    Score predictions against the ground-truth labels.

    Args:
        true: ground-truth labels.
        predicted: predicted labels, aligned with ``true``. The same values
            are passed to every metric, including ROC-AUC.

    Returns:
        Tuple ``(accuracy, f1, precision, recall, roc_auc)``.
    """
    # Callers unpack the result positionally, so the order here is part of the contract.
    scorers = (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)
    return tuple(scorer(true, predicted) for scorer in scorers)

In [None]:
def total_cost(y_true, y_pred, fp_cost=10, fn_cost=500):
    """
    Return the total misclassification cost of a set of binary predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        fp_cost: cost charged per false positive (default 10).
        fn_cost: cost charged per false negative (default 500).

    Returns:
        ``fp * fp_cost + fn * fn_cost``, with the counts read from the
        confusion matrix of ``y_true`` versus ``y_pred``.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        # Row-major order of a 2x2 confusion matrix: tn, fp, fn, tp.
        _, fp, fn, _ = cm.ravel()
    else:
        # A 1x1 matrix means one label covers both arrays, so nothing was
        # misclassified. Any other shape is not a binary problem; as before,
        # no false positives or negatives are counted for it.
        fp, fn = 0, 0
    return fp * fp_cost + fn * fn_cost

In [None]:
# <<< CHANGED SECTION >>>
# Fixed a bug where the function would return after only the first model.
# The creation of the report and the return statement are now correctly placed outside the loop.
def _print_split_metrics(split_name, accuracy, f1, precision, recall, roc_auc, cost):
    """Print one data split's metrics in the notebook's fixed report layout."""
    print(f'--- Model performance for {split_name} set ---')
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- F1-Score: {f1:.4f}")
    print(f"- Precision: {precision:.4f}")
    print(f"- Recall: {recall:.4f}")
    print(f"- ROC-AUC Score: {roc_auc:.4f}")
    print(f"- Total Cost: {cost:.2f}")


def evaluate_models(X, y, models, test_size=0.2, random_state=42):
    """
    Fit every model on a train split and report its metrics on both splits.

    Args:
        X: feature matrix.
        y: target labels aligned with ``X``.
        models: mapping of display name -> estimator. Each estimator is
            fitted in place, so the passed objects are left trained.
        test_size: share of the data held out for testing (default 0.2).
        random_state: seed for the train/test split (default 42).

    Returns:
        DataFrame with one row per model holding its test-set metrics,
        sorted by ascending 'Total Cost'. An empty ``models`` mapping yields
        an empty frame that still carries the report columns.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Declared up front so an empty result still has a 'Total Cost' column to sort by.
    report_columns = ['Model Name', 'Accuracy', 'F1-Score', 'Precision', 'Recall', 'ROC-AUC', 'Total Cost']
    results_list = []

    for name, model in models.items():
        model.fit(X_train, y_train)  # train the model

        # make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        train_scores = evaluate_clf(y_train, y_train_pred)
        train_cost = total_cost(y_train, y_train_pred)

        # Test set performance
        test_scores = evaluate_clf(y_test, y_test_pred)
        test_cost = total_cost(y_test, y_test_pred)

        print(f"Model: {name}")
        _print_split_metrics('Training', *train_scores, train_cost)
        print("--------------------------------------------------------------")
        _print_split_metrics('Test', *test_scores, test_cost)
        print("==============================================================\n")

        # Only the test-set numbers go into the returned report.
        test_accuracy, test_f1, test_precision, test_recall, test_rocauc = test_scores
        results_list.append({
            'Model Name': name,
            'Accuracy': test_accuracy,
            'F1-Score': test_f1,
            'Precision': test_precision,
            'Recall': test_recall,
            'ROC-AUC': test_rocauc,
            'Total Cost': test_cost
        })

    report = pd.DataFrame(results_list, columns=report_columns).sort_values(by='Total Cost', ascending=True)
    return report

## Data Preprocessing

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='class')
# Target vector: the raw 'class' labels.
y = df['class']

In [None]:
# Encode the target as 1 ('pos') / 0 ('neg').
# It is derived from df['class'] rather than from `y` itself so the cell gives the
# same result when re-run, and it uses map() so the integer dtype does not depend
# on replace() downcasting an object column.
y = df['class'].map({'pos': 1, 'neg': 0})
# map() yields NaN for any label outside the mapping; fail loudly instead of training on it.
assert y.notna().all(), "Unexpected label in 'class' column (expected only 'pos'/'neg')"
y = y.astype(int)

### Experiment 1: KNN Imputer with Robust Scaling

KNN Imputer is computationally expensive. The imputed and resampled datasets will be saved to disk to avoid re-computation in subsequent runs.

In [None]:
# <<< NEW SECTION >>>
# The following code for finding the best K for KNNImputer was likely run once.
# We will proceed with the chosen K=3 based on the original notebook's implicit choice.
# This step is very time-consuming and is shown here for completeness.

# results = []
# strategies = [str(i) for i in [1,3,5,7,9]]
# for s in strategies:
#     pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m',LogisticRegression())])
#     scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=2, n_jobs=-1)
#     results.append(scores)
#     print(f"n_neighbours = {s} - Accuracy: {mean(scores):.4f} ± {np.std(scores):.4f}")

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the KNN imputed data.

KNN_IMPUTED_DATA_PATH = 'X_knn_imputed.pkl'

# Compute once and cache; later runs reuse the pickle.
# NOTE(review): only the file's existence is checked, so a stale cache is not detected if `X` changes.
if not os.path.exists(KNN_IMPUTED_DATA_PATH):
    print("Performing KNN Imputation (this may take a long time)...")
    # Impute with 3 nearest neighbours, then scale with RobustScaler.
    knn_pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('RobustScaler', RobustScaler()),
    ])
    X_knn = knn_pipeline.fit_transform(X)
    joblib.dump(X_knn, KNN_IMPUTED_DATA_PATH)
    print(f"Saved imputed data to {KNN_IMPUTED_DATA_PATH}")
else:
    print(f"Loading pre-computed data from {KNN_IMPUTED_DATA_PATH}")
    X_knn = joblib.load(KNN_IMPUTED_DATA_PATH)

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the resampled data.

X_RESAMPLED_KNN_PATH = 'X_resampled_knn.pkl'
Y_RESAMPLED_KNN_PATH = 'y_resampled_knn.pkl'

# Both cache files must be present; otherwise resample again and rewrite both.
if not (os.path.exists(X_RESAMPLED_KNN_PATH) and os.path.exists(Y_RESAMPLED_KNN_PATH)):
    print("Performing SMOTETomek resampling for KNN data...")
    smote_tomek = SMOTETomek(sampling_strategy='minority', random_state=42, n_jobs=-1)
    X_res, y_res = smote_tomek.fit_resample(X_knn, y)
    joblib.dump(X_res, X_RESAMPLED_KNN_PATH)
    joblib.dump(y_res, Y_RESAMPLED_KNN_PATH)
    print("Saved resampled KNN data.")
else:
    print("Loading pre-resampled KNN data...")
    X_res = joblib.load(X_RESAMPLED_KNN_PATH)
    y_res = joblib.load(Y_RESAMPLED_KNN_PATH)

In [None]:
# Dictionary which contains models for all experiments.
# The same estimator objects are re-fitted by every evaluate_models call.
# The scikit-learn estimators that accept `random_state` get a fixed seed so repeated
# runs produce the same reports.
# NOTE(review): XGBClassifier and CatBoostClassifier are left at their library defaults —
# confirm whether they also need an explicit seed.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42)
}

In [None]:
# Fit and score every entry of `models` on the KNN-imputed, resampled data.
# NOTE(review): X_res/y_res were resampled before evaluate_models performs its
# train/test split, so the held-out split also contains resampled rows.
report_knn = evaluate_models(X_res, y_res, models)

In [None]:
# Show the KNN-experiment report (evaluate_models sorts rows by ascending 'Total Cost').
report_knn

### Experiment 2: Simple Imputer with Strategy 'Median'

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the median imputed data.

MEDIAN_IMPUTED_DATA_PATH = 'X_median_imputed.pkl'

# Compute once and cache; later runs reuse the pickle.
# NOTE(review): only the file's existence is checked, so a stale cache is not detected if `X` changes.
if not os.path.exists(MEDIAN_IMPUTED_DATA_PATH):
    print("Performing Median Imputation...")
    # Fill gaps with each column's median, then scale with RobustScaler.
    median_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('RobustScaler', RobustScaler()),
    ])
    X_median = median_pipeline.fit_transform(X)
    joblib.dump(X_median, MEDIAN_IMPUTED_DATA_PATH)
    print(f"Saved imputed data to {MEDIAN_IMPUTED_DATA_PATH}")
else:
    print(f"Loading pre-computed data from {MEDIAN_IMPUTED_DATA_PATH}")
    X_median = joblib.load(MEDIAN_IMPUTED_DATA_PATH)

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the resampled data.

X_RESAMPLED_MEDIAN_PATH = 'X_resampled_median.pkl'
Y_RESAMPLED_MEDIAN_PATH = 'y_resampled_median.pkl'

# Both cache files must be present; otherwise resample again and rewrite both.
if not (os.path.exists(X_RESAMPLED_MEDIAN_PATH) and os.path.exists(Y_RESAMPLED_MEDIAN_PATH)):
    print("Performing SMOTETomek resampling for median data...")
    smt = SMOTETomek(sampling_strategy='minority', random_state=42, n_jobs=-1)
    X_res, y_res = smt.fit_resample(X_median, y)
    joblib.dump(X_res, X_RESAMPLED_MEDIAN_PATH)
    joblib.dump(y_res, Y_RESAMPLED_MEDIAN_PATH)
    print("Saved resampled median data.")
else:
    print("Loading pre-resampled median data...")
    X_res = joblib.load(X_RESAMPLED_MEDIAN_PATH)
    y_res = joblib.load(Y_RESAMPLED_MEDIAN_PATH)

In [None]:
# Fit and score every entry of `models` on the median-imputed, resampled data.
# NOTE(review): X_res/y_res were resampled before evaluate_models performs its
# train/test split, so the held-out split also contains resampled rows.
report_median = evaluate_models(X_res, y_res, models)

In [None]:
# Show the median-experiment report (evaluate_models sorts rows by ascending 'Total Cost').
report_median

### Experiment 3: MICE Forest for Imputing Null Values

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the MICE imputed data.

MICE_IMPUTED_DATA_PATH = 'X_mice_imputed.pkl'

# Compute once and cache; later runs reuse the pickle.
# NOTE(review): only the file's existence is checked, so a stale cache is not detected if `X` changes.
if os.path.exists(MICE_IMPUTED_DATA_PATH):
    print(f"Loading pre-computed data from {MICE_IMPUTED_DATA_PATH}")
    X_mice_imputed = joblib.load(MICE_IMPUTED_DATA_PATH)
else:
    print("Performing MICE Imputation (this may take a while)...")
    # The kernel gets its own copy of X so the feature frame is never modified.
    # The former `save_all_iterations=True` argument is omitted on purpose:
    # NOTE(review): that keyword's name appears to differ between miceforest major
    # versions and the install is unpinned; the library default is presumably
    # equivalent — confirm against the installed version.
    kernel = mf.ImputationKernel(
      X.copy(),
      random_state=1989
    )
    kernel.mice(3) # Run MICE for 3 iterations
    X_mice_imputed = kernel.complete_data()
    joblib.dump(X_mice_imputed, MICE_IMPUTED_DATA_PATH)
    print(f"Saved imputed data to {MICE_IMPUTED_DATA_PATH}")

# Scale the data after imputation (the cache stores the unscaled imputed data,
# so scaling runs on every execution).
mice_pipeline = Pipeline(steps=[
    ('RobustScaler', RobustScaler())
])
X_mice = mice_pipeline.fit_transform(X_mice_imputed)

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the resampled data.

X_RESAMPLED_MICE_PATH = 'X_resampled_mice.pkl'
Y_RESAMPLED_MICE_PATH = 'y_resampled_mice.pkl'

# Both cache files must be present; otherwise resample again and rewrite both.
if not (os.path.exists(X_RESAMPLED_MICE_PATH) and os.path.exists(Y_RESAMPLED_MICE_PATH)):
    print("Performing SMOTETomek resampling for MICE data...")
    smt = SMOTETomek(sampling_strategy='minority', random_state=42, n_jobs=-1)
    X_res, y_res = smt.fit_resample(X_mice, y)
    joblib.dump(X_res, X_RESAMPLED_MICE_PATH)
    joblib.dump(y_res, Y_RESAMPLED_MICE_PATH)
    print("Saved resampled MICE data.")
else:
    print("Loading pre-resampled MICE data...")
    X_res = joblib.load(X_RESAMPLED_MICE_PATH)
    y_res = joblib.load(Y_RESAMPLED_MICE_PATH)

In [None]:
# Fit and score every entry of `models` on the MICE-imputed, resampled data.
# NOTE(review): X_res/y_res were resampled before evaluate_models performs its
# train/test split, so the held-out split also contains resampled rows.
report_mice = evaluate_models(X_res, y_res, models)

In [None]:
# Show the MICE-experiment report (evaluate_models sorts rows by ascending 'Total Cost').
report_mice

### Experiment 4: Simple Imputer with Strategy 'Constant'

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the constant imputed data.

CONST_IMPUTED_DATA_PATH = 'X_const_imputed.pkl'

# Compute once and cache; later runs reuse the pickle.
# NOTE(review): only the file's existence is checked, so a stale cache is not detected if `X` changes.
if not os.path.exists(CONST_IMPUTED_DATA_PATH):
    print("Performing Constant Imputation...")
    # Fill every gap with 0, then scale with RobustScaler.
    constant_pipeline = Pipeline([
        ('Imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('RobustScaler', RobustScaler()),
    ])
    X_const = constant_pipeline.fit_transform(X)
    joblib.dump(X_const, CONST_IMPUTED_DATA_PATH)
    print(f"Saved imputed data to {CONST_IMPUTED_DATA_PATH}")
else:
    print(f"Loading pre-computed data from {CONST_IMPUTED_DATA_PATH}")
    X_const = joblib.load(CONST_IMPUTED_DATA_PATH)

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the resampled data.

X_RESAMPLED_CONST_PATH = 'X_resampled_const.pkl'
Y_RESAMPLED_CONST_PATH = 'y_resampled_const.pkl'

# Both cache files must be present; otherwise resample again and rewrite both.
if not (os.path.exists(X_RESAMPLED_CONST_PATH) and os.path.exists(Y_RESAMPLED_CONST_PATH)):
    print("Performing SMOTETomek resampling for constant data...")
    smt = SMOTETomek(sampling_strategy='minority', random_state=42, n_jobs=-1)
    X_res, y_res = smt.fit_resample(X_const, y)
    joblib.dump(X_res, X_RESAMPLED_CONST_PATH)
    joblib.dump(y_res, Y_RESAMPLED_CONST_PATH)
    print("Saved resampled constant data.")
else:
    print("Loading pre-resampled constant data...")
    X_res = joblib.load(X_RESAMPLED_CONST_PATH)
    y_res = joblib.load(Y_RESAMPLED_CONST_PATH)

In [None]:
# Fit and score every entry of `models` on the constant-imputed, resampled data.
# NOTE(review): X_res/y_res were resampled before evaluate_models performs its
# train/test split, so the held-out split also contains resampled rows.
report_const = evaluate_models(X_res, y_res, models)

In [None]:
# Show the constant-experiment report (evaluate_models sorts rows by ascending 'Total Cost').
report_const

### Experiment 5: Simple Imputer with Strategy 'Mean'

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the mean imputed data.

MEAN_IMPUTED_DATA_PATH = 'X_mean_imputed.pkl'

# Compute once and cache; later runs reuse the pickle.
# NOTE(review): only the file's existence is checked, so a stale cache is not detected if `X` changes.
if not os.path.exists(MEAN_IMPUTED_DATA_PATH):
    print("Performing Mean Imputation...")
    # Fill gaps with each column's mean, then scale with RobustScaler.
    mean_pipeline = Pipeline([
        ('Imputer', SimpleImputer(strategy='mean')),
        ('RobustScaler', RobustScaler()),
    ])
    X_mean = mean_pipeline.fit_transform(X)
    joblib.dump(X_mean, MEAN_IMPUTED_DATA_PATH)
    print(f"Saved imputed data to {MEAN_IMPUTED_DATA_PATH}")
else:
    print(f"Loading pre-computed data from {MEAN_IMPUTED_DATA_PATH}")
    X_mean = joblib.load(MEAN_IMPUTED_DATA_PATH)

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the resampled data.

X_RESAMPLED_MEAN_PATH = 'X_resampled_mean.pkl'
Y_RESAMPLED_MEAN_PATH = 'y_resampled_mean.pkl'

# Both cache files must be present; otherwise resample again and rewrite both.
if not (os.path.exists(X_RESAMPLED_MEAN_PATH) and os.path.exists(Y_RESAMPLED_MEAN_PATH)):
    print("Performing SMOTETomek resampling for mean data...")
    smt = SMOTETomek(sampling_strategy='minority', random_state=42, n_jobs=-1)
    X_res, y_res = smt.fit_resample(X_mean, y)
    joblib.dump(X_res, X_RESAMPLED_MEAN_PATH)
    joblib.dump(y_res, Y_RESAMPLED_MEAN_PATH)
    print("Saved resampled mean data.")
else:
    print("Loading pre-resampled mean data...")
    X_res = joblib.load(X_RESAMPLED_MEAN_PATH)
    y_res = joblib.load(Y_RESAMPLED_MEAN_PATH)

In [None]:
# Fit and score every entry of `models` on the mean-imputed, resampled data.
# NOTE(review): X_res/y_res were resampled before evaluate_models performs its
# train/test split, so the held-out split also contains resampled rows.
report_mean = evaluate_models(X_res, y_res, models)

In [None]:
# Show the mean-experiment report (evaluate_models sorts rows by ascending 'Total Cost').
report_mean

### Experiment 6: Principal Component Analysis (PCA)

Using the best imputation method from above (constant fill) and then applying PCA for dimensionality reduction.

In [None]:
# Data is already imputed and scaled from Experiment 4, stored in X_const
# (bound by the Experiment 4 cell, either freshly computed or loaded from its cache).
# This is an alias, not a copy: X_pca_input and X_const refer to the same object.
X_pca_input = X_const

In [None]:
# Applying PCA to find the optimal number of components.
# One exact fit with the largest component count is enough: the cumulative sum of
# its explained_variance_ratio_ gives the variance retained for every smaller n,
# so the model does not have to be re-fitted for each n.
# The count is capped by the data's own dimensions so narrow inputs do not error out.
max_components = min(149, *X_pca_input.shape)
pca_full = PCA(n_components=max_components, svd_solver='full').fit(X_pca_input)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Same mapping as before: number of components -> total explained variance ratio.
var_ratio = {n: float(cumulative_variance[n - 1]) for n in range(2, max_components + 1)}

#### Variance Plot

In [None]:
# Plot the total explained variance ratio against the number of components kept.
fig, ax = plt.subplots()
pd.Series(var_ratio).plot(ax=ax)
ax.set_xlabel("Number of Components")
ax.set_ylabel("Explained Variance")
ax.set_title("Explained Variance vs. Number of PCA Components")
plt.show()

#### Kneedle Algorithm (`kneed`) to Find the Elbow Point

In [None]:
# x = candidate component counts, y = explained variance ratio for each count.
variance_ratio_list = list(var_ratio.values())
components = list(var_ratio.keys())
knee = KneeLocator(components, variance_ratio_list, S=1.0, curve='concave', direction='increasing')

# plot_knee opens its own figure, so the size is passed to it directly instead of
# creating a separate (and otherwise empty) figure first.
knee.plot_knee(figsize=(5, 5))
plt.xlabel("Number of Components")
plt.ylabel("Explained Variance")
plt.show()

optimal_k = knee.knee
if optimal_k is None:
    # Stop here rather than handing None to the cells that consume optimal_k.
    raise ValueError("KneeLocator found no knee in the explained-variance curve; choose the number of components manually.")
optimal_k = int(optimal_k)
print('Optimal number of components (k) found by KneeLocator:', optimal_k)

In [None]:
# Reducing the dimensions of the data based on the optimal k
# NOTE(review): optimal_k comes from knee.knee in the previous cell — confirm it is
# a valid component count (not None) before relying on this result.
pca_final = PCA(n_components=optimal_k, random_state=42)
# Fit on, and project, the constant-imputed scaled data bound to X_pca_input.
X_pca_reduced = pca_final.fit_transform(X_pca_input)

In [None]:
# <<< NEW SECTION >>>
# Save/Load functionality for the resampled PCA data.

X_RESAMPLED_PCA_PATH = 'X_resampled_pca.pkl'
Y_RESAMPLED_PCA_PATH = 'y_resampled_pca.pkl'

# Both cache files must be present; otherwise resample again and rewrite both.
if not (os.path.exists(X_RESAMPLED_PCA_PATH) and os.path.exists(Y_RESAMPLED_PCA_PATH)):
    print("Performing SMOTETomek resampling for PCA data...")
    smt = SMOTETomek(sampling_strategy='minority', random_state=42, n_jobs=-1)
    X_res, y_res = smt.fit_resample(X_pca_reduced, y)
    joblib.dump(X_res, X_RESAMPLED_PCA_PATH)
    joblib.dump(y_res, Y_RESAMPLED_PCA_PATH)
    print("Saved resampled PCA data.")
else:
    print("Loading pre-resampled PCA data...")
    X_res = joblib.load(X_RESAMPLED_PCA_PATH)
    y_res = joblib.load(Y_RESAMPLED_PCA_PATH)

In [None]:
# Fit and score every entry of `models` on the PCA-reduced, resampled data.
# NOTE(review): X_res/y_res were resampled before evaluate_models performs its
# train/test split, so the held-out split also contains resampled rows.
report_pca = evaluate_models(X_res, y_res, models)

In [None]:
# Show the PCA-experiment report (evaluate_models sorts rows by ascending 'Total Cost').
report_pca

## Final Results Summary

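The XGBoost Classifier with Simple Imputation (constant fill value) gave the lowest total cost among the configurations compared, so it is selected as the final model for this problem. Note that every experiment applies SMOTETomek resampling *before* the train/test split, so the reported test costs are measured on data that includes resampled rows and are likely optimistic compared with performance on untouched data.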

In [None]:
from prettytable import PrettyTable

# (label shown, imputation method shown, report to read, 'Model Name' to look up in it)
summary_rows = [
    ("XGBClassifier", "Simple Imputer-Constant", report_const, 'XGBClassifier'),
    ("XGBClassifier", "Mice", report_mice, 'XGBClassifier'),
    ("XGBClassifier", "Knn-Imputer", report_knn, 'XGBClassifier'),
    ("XGBClassifier", "Simple Imputer-Mean", report_mean, 'XGBClassifier'),
    ("CatBoostClassifier", "Median", report_median, 'CatBoosting Classifier'),
    ("Random Forest", "PCA", report_pca, 'Random Forest'),
]

pt = PrettyTable()
pt.field_names = ["Model", "Imputation_method", "Total_cost"]
for row_label, method_label, source_report, model_key in summary_rows:
    # First (only) matching row's test-set total cost.
    row_cost = source_report.loc[source_report['Model Name'] == model_key, 'Total Cost'].iloc[0]
    pt.add_row([row_label, method_label, row_cost])
print(pt)

## Final Model Training (Best Performing Model)

In [None]:
# Reload the cached resampled arrays of the constant-imputation experiment;
# X_res/y_res have been overwritten by later experiments by this point.
X_final = joblib.load(X_RESAMPLED_CONST_PATH)
y_final = joblib.load(Y_RESAMPLED_CONST_PATH)

# Fresh, unfitted classifier with default settings.
final_model = XGBClassifier()

In [None]:
# Same 80/20 split and seed that evaluate_models uses.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42
)

# Train on the training split, then predict the held-out split.
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

In [None]:
# Accuracy on the training split, via the estimator's own score().
train_accuracy = final_model.score(X_train, y_train)
print("Final XGBoost Classifier Accuracy Score (Train) :", train_accuracy)
# Accuracy on the held-out split, from the stored predictions.
test_accuracy = accuracy_score(y_test, y_pred)
print("Final XGBoost Classifier Accuracy Score (Test) :", test_accuracy)

In [None]:
# Misclassification cost of the final model on the held-out split.
final_test_cost = total_cost(y_test, y_pred)
print("Final XGBoost Classifier Cost Metric(Test) :", final_test_cost)

In [None]:
# <<< CHANGED SECTION >>>
# `plot_confusion_matrix` is deprecated. Using `ConfusionMatrixDisplay.from_estimator` instead.

# plots Confusion matrix
# Draw the held-out confusion matrix on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(
    final_model,
    X_test,
    y_test,
    ax=ax,
    cmap='Blues',
    values_format='d',
)
ax.set_title('Confusion Matrix for Final XGBoost Model')
plt.show()