In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'DataSet.csv'
df = pd.read_csv(file_path)

# 1. Drop unnecessary columns (like 'index')
if 'index' in df.columns:
    df.drop(columns=['index'], inplace=True)

# 2. Handle missing values (fill NaN with the mean of the column)
df.fillna(df.mean(), inplace=True)

# Define scaling criteria
def needs_scaling(row):
    if row['cpu'] > 70 or row['ram'] > 85 or row['disk'] > 70 or row['network'] > 15:
        return 1  # Scaling needed
    return 0  # No scaling needed

# Apply the criteria to the dataset
df['needs_scaling'] = df.apply(needs_scaling, axis=1)

# Split the dataset into features (X) and target (y)
X = df.drop(columns=['needs_scaling'])  # Features
y = df['needs_scaling']                # Target variable

# Balance the dataset using SMOTE if necessary
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    'SVM': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=7),
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis()
}

results = {}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

# Cross-validation for further evaluation
print("\nCross-Validation Results:")
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"{name} - Mean CV Accuracy: {cv_scores.mean():.2f}")

# Compare results and highlight the best model
best_model = max(results, key=results.get)
print(f"\nBest Model: {best_model} with Test Accuracy: {results[best_model]:.2f}")


Model: SVM
Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       165
           1       0.97      0.97      0.97       167

    accuracy                           0.97       332
   macro avg       0.97      0.97      0.97       332
weighted avg       0.97      0.97      0.97       332

--------------------------------------------------
Model: Logistic Regression
Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.81       165
           1       0.80      0.85      0.83       167

    accuracy                           0.82       332
   macro avg       0.82      0.82      0.82       332
weighted avg       0.82      0.82      0.82       332

--------------------------------------------------
Model: KNN
Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.93   