In [1]:
# Import packages
import pandas as pd
import seaborn as sns
import plotly.express as px
%matplotlib inline
import matplotlib.pyplot as plt

from dash import Dash, html, dash_table, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Input, Output

sns.set_theme(style="darkgrid")

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_info_columns', 200)
pd.set_option('display.max_colwidth', None)

# Suppress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# Initialize the app
app = Dash()

## Read in CSV File

In [2]:
file_path = '/Users/alihushyar/Documents/Development/Python/NaaS/incidents_2024-08-31_L90.csv'
df_inc = pd.read_csv(file_path, low_memory=False, index_col=0)

# Get number of rows and columns in dataset
df_inc.shape

(76251, 94)

In [3]:
df_inc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76251 entries, 0 to 76250
Data columns (total 94 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   state                         76251 non-null  object 
 1   priority                      76251 non-null  object 
 2   identified_by                 76251 non-null  object 
 3   alert_id                      68324 non-null  object 
 4   impacted_device_count         76251 non-null  int64  
 5   recommendations               76251 non-null  object 
 6   ai_category                   72816 non-null  object 
 7   ai_sub_category               72816 non-null  object 
 8   is_actionable                 76251 non-null  bool   
 9   device_name                   47725 non-null  object 
 10  account_name                  76251 non-null  object 
 11  sub_account_name              76251 non-null  object 
 12  site_name                     76251 non-null  object 
 13  sub_si

## Clean data

In [4]:
# Drop any column that is completely empty
df_inc.dropna(axis=1, how='all', inplace=True)
df_inc.shape

(76251, 84)

In [54]:
cols = ['number',  'priority', 'state',  'incCreatedAt', 'case_number__pk', 'case_state', 'resolved_at', 'alert_id', 'alert_last_occurred_at', 'ai_category', 'ai_sub_category', 'issue_type', 'sub_issue_type', 'shortDescription', 'ticketClassification', 'device_name', 'configurationItem', 'assetSerialNum', 'account_id', 'account_name', 'sub_site_id', 'sub_site_name', 'incCreatedBy', 'assignmentGroup', 'resolved_by']
cols1 = ['number',  'priority', 'state',  'alert_last_occurred_at', 'incCreatedAt', 'resolved_at', 'case_number__pk', 'case_state', 'ai_category', 'ai_sub_category', 'device_name', 'configurationItem', 'account_id', 'incCreatedBy']
df_incf = df_inc[cols1]
#df_incf.head()

In [6]:
# Auto created incidents only
df_incf = df_incf[df_incf['incCreatedBy'] == 'serviceinsights']
df_incf.drop(columns=['incCreatedBy'], inplace=True)
df_incf.head(10)

Unnamed: 0,number,priority,state,alert_last_occurred_at,incCreatedAt,resolved_at,case_number__pk,case_state,ai_category,ai_sub_category,device_name,configurationItem,account_id
0,INC1024863,2 - High,New,,2024-08-31 00:16:20+00:00,,,,Security,Vulnerabilities,switch,6200F,ACCT0015152
1,INC1024862,2 - High,New,,2024-08-31 00:16:17+00:00,,,,Security,Vulnerabilities,switch,6300F,ACCT0010456
2,INC1024861,1 - Critical,On Hold,2024-08-31 00:12:06+00:00,2024-08-31 00:13:22+00:00,,,,Switch,Switch Port Input Errors,switch,AOS-CX Switches Default Model,ACCT0010456
3,INC1024860,3 - Moderate,New,2024-08-30 23:50:54+00:00,2024-08-31 00:10:52+00:00,,,,IAP,Radio Offline,iap,IAP - Other,ACCT0011196
4,INC1024859,2 - High,On Hold,,2024-08-31 00:03:57+00:00,,,,Security,Vulnerabilities,switch,8325,ACCT0010456
5,INC1024858,2 - High,On Hold,,2024-08-31 00:03:40+00:00,,,,Security,Vulnerabilities,switch,AOS-CX Switches Default Model,ACCT0011332
6,INC1024857,3 - Moderate,New,2024-08-31 00:03:01+00:00,2024-08-31 00:03:08+00:00,,,,PL-MULTI-SITE-INCIDENT,PL-MULTI-SITE-INCIDENT-VSAT,,,ACCT0011196
7,INC1024856,2 - High,On Hold,,2024-08-31 00:03:07+00:00,,,,Security,Vulnerabilities,switch,AOS-CX Switches Default Model,ACCT0010456
8,INC1024853,2 - High,In Progress,,2024-08-31 00:02:50+00:00,,,,Security,Vulnerabilities,switch,AOS-CX Switches Default Model,ACCT0011332
9,INC1024852,4 - Low,New,2024-08-31 00:02:03+00:00,2024-08-31 00:02:30+00:00,,,,IAP,Client Authentication Failure,,,ACCT0011332


In [7]:
df_incf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76075 entries, 0 to 76250
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   number                  76075 non-null  object
 1   priority                76075 non-null  object
 2   state                   76075 non-null  object
 3   alert_last_occurred_at  47079 non-null  object
 4   incCreatedAt            76075 non-null  object
 5   resolved_at             44795 non-null  object
 6   case_number__pk         21240 non-null  object
 7   case_state              21240 non-null  object
 8   ai_category             72816 non-null  object
 9   ai_sub_category         72816 non-null  object
 10  device_name             47656 non-null  object
 11  configurationItem       61750 non-null  object
 12  account_id              76075 non-null  object
dtypes: object(13)
memory usage: 8.1+ MB


In [8]:
# Rename priority column values
df_incf.loc[df_incf['priority'] == '1 - Critical', 'priority'] = 'P1'
df_incf.loc[df_incf['priority'] == '2 - High', 'priority'] = 'P2'
df_incf.loc[df_incf['priority'] == '3 - Moderate', 'priority'] = 'P3'
df_incf.loc[df_incf['priority'] == '4 - Low', 'priority'] = 'P4'

In [9]:
priority_mapping = {'P1': 1, 'P2': 2, 'P3': 3, 'P4': 4}
df_incf['priority'] = df_incf['priority'].map(priority_mapping)


In [10]:
# Replace values
df_incf['ai_category'] = df_incf['ai_category'].fillna('NOTASSIGNED')
df_incf['ai_sub_category'] = df_incf['ai_sub_category'].fillna('NOTASSIGNED')
df_incf['configurationItem'] = df_incf['configurationItem'].fillna('Other')

In [11]:
# Filter out New and In Progress tickets from state column
ticket_state = ['New', 'In Progress']
df_incf = df_incf.query('state not in @ticket_state').reset_index(drop=True)
print(f'# Records after filter on \'ticket state\' is {df_incf.shape[0]}')

# Records after filter on 'ticket state' is 75442


In [12]:
df_incf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75442 entries, 0 to 75441
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   number                  75442 non-null  object
 1   priority                75442 non-null  int64 
 2   state                   75442 non-null  object
 3   alert_last_occurred_at  46783 non-null  object
 4   incCreatedAt            75442 non-null  object
 5   resolved_at             44795 non-null  object
 6   case_number__pk         21089 non-null  object
 7   case_state              21089 non-null  object
 8   ai_category             75442 non-null  object
 9   ai_sub_category         75442 non-null  object
 10  device_name             47198 non-null  object
 11  configurationItem       75442 non-null  object
 12  account_id              75442 non-null  object
dtypes: int64(1), object(12)
memory usage: 7.5+ MB


In [55]:
#df_incf['ai_sub_category'].value_counts().reset_index()

In [56]:
#df_incf['configurationItem'].value_counts().reset_index()

In [57]:
#df_incf['account_id'].value_counts().reset_index()

In [16]:
df_incf['state'].value_counts().reset_index()

Unnamed: 0,state,count
0,Resolved,43924
1,Cancelled,21261
2,On Hold,9386
3,Closed,871


In [17]:
df_incf['mystate'] = df_incf.apply(lambda row: 'RWC' if row['state'] == 'Resolved' and pd.notna(row['case_number__pk']) 
                                  else 'RNC' if row['state'] == 'Resolved' and pd.isna(row['case_number__pk']) 
                                  else 'CWC' if row['state'] == 'Cancelled' and pd.notna(row['case_number__pk']) 
                                  else 'CNC' if row['state'] == 'Cancelled' and pd.isna(row['case_number__pk']) 
                                  else 'HWC' if row['state'] == 'On Hold' and pd.notna(row['case_number__pk']) 
                                  else 'HNC' if row['state'] == 'On Hold' and pd.isna(row['case_number__pk']) 
                                  else row['state'], axis=1)


In [18]:
df_incf['mystate'].value_counts().reset_index()

Unnamed: 0,mystate,count
0,RNC,26218
1,CNC,20619
2,RWC,17706
3,HNC,7503
4,HWC,1883
5,Closed,871
6,CWC,642


In [58]:
#df_incf.head(10)

In [20]:
df_incf['mystate'].value_counts().reset_index()

Unnamed: 0,mystate,count
0,RNC,26218
1,CNC,20619
2,RWC,17706
3,HNC,7503
4,HWC,1883
5,Closed,871
6,CWC,642


In [None]:
cols2 = ['priority', 'ai_category', 'ai_sub_category', 'configurationItem', 'account_id', 'mystate']
df_class = df_incf[cols2]
#df_class.head(10)

In [22]:
df_class.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75442 entries, 0 to 75441
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   priority           75442 non-null  int64 
 1   ai_category        75442 non-null  object
 2   ai_sub_category    75442 non-null  object
 3   configurationItem  75442 non-null  object
 4   account_id         75442 non-null  object
 5   mystate            75442 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.5+ MB


In [23]:
df_class['account_id'] = pd.factorize(df_class['account_id'])[0]
df_class.reset_index(drop=True, inplace=True)


In [24]:
df_class.head(10)

Unnamed: 0,priority,ai_category,ai_sub_category,configurationItem,account_id,mystate
0,1,Switch,Switch Port Input Errors,AOS-CX Switches Default Model,0,HNC
1,2,Security,Vulnerabilities,8325,0,HNC
2,2,Security,Vulnerabilities,AOS-CX Switches Default Model,1,HNC
3,2,Security,Vulnerabilities,AOS-CX Switches Default Model,0,HNC
4,3,uxi,internal,Aruba UXI G5C sensor (LTE),0,HNC
5,3,uxi,internal,Aruba UXI G5C sensor (LTE),0,HNC
6,3,IAP,Radio Offline,Instant Access Points(IAP) Default Model,2,RNC
7,4,Switch,NAE Status,6300M,3,HWC
8,3,uxi,internal,Aruba UXI G5C sensor (LTE),0,RNC
9,3,uxi,internal,Aruba UXI G5C sensor (LTE),0,RNC


In [25]:
df_class['account_id'].value_counts().reset_index()

Unnamed: 0,account_id,count
0,3,39052
1,5,12282
2,6,11437
3,0,7773
4,4,1686
5,1,1500
6,2,1498
7,7,214


### Baseline multi-class logistic regression on state

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assume df_class is your DataFrame
df = df_class.copy()

# One-hot encode categorical features (excluding target column 'mystate')
categorical_features = ['priority', 'ai_category', 'ai_sub_category', 'configurationItem', 'account_id']
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Encode the target variable
label_encoder = LabelEncoder()
df_encoded['mystate'] = label_encoder.fit_transform(df['mystate'])

# Split into features (X) and target (y)
X = df_encoded.drop(columns=['mystate'])
y = df_encoded['mystate']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression with class weighting
#model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500, class_weight='balanced')
model = LogisticRegression(solver='lbfgs', max_iter=500, class_weight='balanced')

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.5908


### Decision Tree

In [27]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assume df_class is your DataFrame
df = df_class.copy()

# One-hot encode categorical features (excluding target column 'mystate')
categorical_features = ['ai_category', 'ai_sub_category', 'configurationItem', 'account_id']
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Encode the target variable (mystate)
label_encoder = LabelEncoder()
df_encoded['mystate'] = label_encoder.fit_transform(df['mystate'])

# Split into features (X) and target (y)
X = df_encoded.drop(columns=['mystate'])
y = df_encoded['mystate']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')
start_time = time.time()
dt_model.fit(X_train, y_train)
dt_train_time = time.time() - start_time

# Evaluate Decision Tree
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

print("Decision Tree Results:")
print(f"Train Time: {dt_train_time:.4f} seconds")
print(f"Accuracy: {dt_accuracy:.4f}")
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))


Decision Tree Results:
Train Time: 0.1830 seconds
Accuracy: 0.5992
              precision    recall  f1-score   support

         CNC       0.99      0.81      0.89      4130
         CWC       0.04      0.50      0.08       133
      Closed       0.08      0.79      0.14       168
         HNC       0.91      0.91      0.91      1501
         HWC       0.13      0.51      0.21       403
         RNC       0.79      0.57      0.66      5253
         RWC       0.59      0.26      0.36      3501

    accuracy                           0.60     15089
   macro avg       0.50      0.62      0.47     15089
weighted avg       0.78      0.60      0.66     15089



### GridSearchCV with Random Forest

In [28]:
# Define hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, None],  # Tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples per leaf
    'class_weight': ['balanced']
}

# Initialize Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Grid Search
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid_search.fit(X_train, y_train)
rf_train_time = time.time() - start_time

# Best model from grid search
best_rf = grid_search.best_estimator_

# Evaluate Random Forest
y_pred_rf = best_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Results:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train Time: {rf_train_time:.4f} seconds")
print(f"Accuracy: {rf_accuracy:.4f}")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))


Random Forest Results:
Best Parameters: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Train Time: 106.3309 seconds
Accuracy: 0.6140
              precision    recall  f1-score   support

         CNC       0.99      0.81      0.89      4130
         CWC       0.04      0.47      0.08       133
      Closed       0.08      0.82      0.14       168
         HNC       0.93      0.90      0.92      1501
         HWC       0.16      0.48      0.24       403
         RNC       0.76      0.64      0.69      5253
         RWC       0.61      0.24      0.34      3501

    accuracy                           0.61     15089
   macro avg       0.51      0.62      0.47     15089
weighted avg       0.77      0.61      0.66     15089



###  GridSearchCV with Multinomial Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]  # Regularization strength
}

# Initialize logistic regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

# Grid search with cross-validation
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get best model
best_log_reg = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Make predictions
y_pred = best_log_reg.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'C': 100}
Accuracy: 0.5890
              precision    recall  f1-score   support

         CNC       0.98      0.80      0.88      4130
         CWC       0.04      0.47      0.07       133
      Closed       0.08      0.81      0.14       168
         HNC       0.87      0.90      0.89      1501
         HWC       0.13      0.41      0.20       403
         RNC       0.75      0.59      0.66      5253
         RWC       0.57      0.21      0.31      3501

    accuracy                           0.59     15089
   macro avg       0.49      0.60      0.45     15089
weighted avg       0.75      0.59      0.64     15089



### Stacking Random Forest with XGBoost

In [40]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Initialize Logistic Regression with the best C parameter (from previous tuning)
log_reg = LogisticRegression(max_iter=500, C=10, random_state=42)

# Initialize Random Forest with balanced class weight
rf_model = RandomForestClassifier(
    class_weight='balanced',  # Handle class imbalance
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=100,
    random_state=42
)

# Initialize XGBoost (no scale_pos_weight as we're using multiclass)
xgb_model = xgb.XGBClassifier(
    n_estimators=100, 
    learning_rate=0.1, 
    max_depth=3, 
    random_state=42
)

# Create a Stacking Classifier with both models
stacking_clf = StackingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model)],
    final_estimator=LogisticRegression()  # Meta-model
)

# Train the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Make predictions
y_pred = stacking_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Detailed classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.7714
              precision    recall  f1-score   support

         CNC       0.99      0.82      0.89      4130
         CWC       0.00      0.00      0.00       133
      Closed       0.26      0.05      0.09       168
         HNC       0.97      0.89      0.93      1501
         HWC       0.44      0.10      0.16       403
         RNC       0.69      0.90      0.78      5253
         RWC       0.65      0.62      0.63      3501

    accuracy                           0.77     15089
   macro avg       0.57      0.48      0.50     15089
weighted avg       0.77      0.77      0.76     15089




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Stacking Random Forest with XGBoost changing base learners

In [47]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Base learners
#base_learners = [
#    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')),
#    ('xgb', XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
#    ('svc', SVC(kernel='linear', random_state=42, class_weight='balanced'))
#]

# Best Parameters: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=2, min_samples_split=5, class_weight='balanced', random_state=42)),
    ('xgb', XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)),
    ('svc', SVC(kernel='linear', random_state=42, class_weight='balanced'))
]

# Meta-model (Logistic Regression)
#meta_model = LogisticRegression(max_iter=5000, random_state=42)

meta_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)


# Stacking model with cross-validation and improved base learners
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model, cv=5)

# Fit stacking model
stacking_model.fit(X_train, y_train)

# Make predictions
y_pred = stacking_model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.7724
              precision    recall  f1-score   support

         CNC       0.98      0.82      0.89      4130
         CWC       0.00      0.00      0.00       133
      Closed       0.29      0.01      0.02       168
         HNC       0.97      0.89      0.93      1501
         HWC       0.43      0.10      0.16       403
         RNC       0.69      0.90      0.78      5253
         RWC       0.65      0.62      0.64      3501

    accuracy                           0.77     15089
   macro avg       0.57      0.48      0.49     15089
weighted avg       0.77      0.77      0.76     15089



### Neural Networks

In [52]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report

# Encode target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_class['mystate'])  # Target column

# Encode categorical features
for col in ['ai_category', 'ai_sub_category', 'configurationItem']:
    df_class[col] = LabelEncoder().fit_transform(df_class[col])

X = df_class.drop(columns=['mystate'])  # Drop target column

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define Neural Network Model with Regularization
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dense(len(np.unique(y_train)), activation='softmax')  # Multi-class classification
])

# Compile Model with Learning Rate Scheduler
lr_schedule = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=0.001, decay_steps=1000, decay_rate=0.9)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Implement Early Stopping
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the Model with Early Stopping
history = model.fit(X_train, y_train, epochs=150, batch_size=32, validation_data=(X_test, y_test),
                    class_weight=class_weight_dict, callbacks=[early_stopping], verbose=1)

# Make Predictions
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Neural Network Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Epoch 1/150
[1m1887/1887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 984us/step - accuracy: 0.4701 - loss: 1.7405 - val_accuracy: 0.5116 - val_loss: 1.4375
Epoch 2/150
[1m1887/1887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5263 - loss: 1.4152 - val_accuracy: 0.5405 - val_loss: 1.2718
Epoch 3/150
[1m1887/1887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 935us/step - accuracy: 0.5349 - loss: 1.3010 - val_accuracy: 0.5472 - val_loss: 1.2166
Epoch 4/150
[1m1887/1887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 928us/step - accuracy: 0.5569 - loss: 1.2328 - val_accuracy: 0.5703 - val_loss: 1.2063
Epoch 5/150
[1m1887/1887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 925us/step - accuracy: 0.5557 - loss: 1.2075 - val_accuracy: 0.5778 - val_loss: 1.1275
Epoch 6/150
[1m1887/1887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 934us/step - accuracy: 0.5685 - loss: 1.1618 - val_accuracy: 0.5713 - val_loss: 1.1465
