<a href="https://colab.research.google.com/github/castronyabola/test/blob/master/CS7642ASS1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. IT Expenditure Classification - Decision Tree Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# --- Data loading ---------------------------------------------------------
file_path = 'IT_Expenditure_Data_Classification.csv'  # Replace with your dataset file path
it_expenditure_df = pd.read_csv(file_path)

# --- Feature matrix -------------------------------------------------------
# The two categorical columns are one-hot encoded; the two numeric columns are
# used unscaled in this section.
one_hot_encoder = OneHotEncoder()
X_categorical = one_hot_encoder.fit_transform(
    it_expenditure_df[['Company Size', 'Sector']]
)
X_numerical = it_expenditure_df[['Total Revenue', 'IT Spending as % of Revenue']].to_numpy()
# The encoded block is converted with .toarray() so it can sit next to the
# numeric block in a single 2-D array.
X = np.concatenate([X_categorical.toarray(), X_numerical], axis=1)
y = it_expenditure_df['Rating']

# --- Hold-out split (80 % train / 20 % test, fixed seed) --------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Model: decision tree splitting on entropy ----------------------------
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# --- Evaluation on the held-out split -------------------------------------
y_pred = decision_tree.predict(X_test)
print('Classification Report for Decision Trees (IT Expenditure Classification)')
print(classification_report(y_test, y_pred))

# Function to plot learning curves
def plot_learning_curve(estimator, X, y, cv=5, n_jobs=4, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the decision-tree learning curve for the IT Expenditure data.

    Runs scikit-learn's ``learning_curve`` and draws the mean training and
    cross-validation scores, each with a +/- one standard deviation band,
    against the number of training examples.

    Args:
        estimator: Estimator forwarded to ``learning_curve``.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallel job count forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Only the scores are plotted, so fit/score timings are not requested
    # (the previous version asked for them and discarded the result).
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Collapse axis 1 of the score arrays into a mean and a spread per size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.grid()
    # Shaded bands first, then the mean curves on top of them.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.title("Decision Trees Learning Curve (IT Expenditure Classification)")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()

# Draw the learning curve for the decision tree.
# NOTE(review): this passes the full X, y, whereas the other sections of this
# notebook pass X_train, y_train — confirm this is intended.
plot_learning_curve(estimator=decision_tree, X=X, y=y)

# Function to plot model complexity graph for 'max_depth'
def plot_model_complexity_graph(estimator, X, y, cv=5, n_jobs=4, max_depth_range=np.arange(1, 15)):
    """Plot accuracy against ``max_depth`` for the IT Expenditure decision tree.

    Calls ``validation_curve`` over ``max_depth_range`` and draws the mean
    training and cross-validation accuracy with +/- one standard deviation
    bands. Shows the figure and returns nothing.
    """
    fit_scores, cv_scores = validation_curve(
        estimator,
        X,
        y,
        param_name="max_depth",
        param_range=max_depth_range,
        cv=cv,
        scoring='accuracy',
        n_jobs=n_jobs,
    )

    # One entry per curve: (mean, std, colour, legend label); training first.
    series = (
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-validation score"),
    )

    plt.figure(figsize=(10, 6))
    plt.grid()
    # Bands are drawn before the lines, matching the drawing order of the curves.
    for avg, spread, colour, _label in series:
        plt.fill_between(max_depth_range, avg - spread, avg + spread, alpha=0.1, color=colour)
    for avg, _spread, colour, label in series:
        plt.plot(max_depth_range, avg, 'o-', color=colour, label=label)
    plt.title("Decision Trees Model Complexity - Max Depth (IT Expenditure Classification)")
    plt.xlabel("Max Depth")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()

# Draw the 'max_depth' model-complexity graph for the decision tree
plot_model_complexity_graph(estimator=decision_tree, X=X, y=y)


IT Expenditure Classification - Neural Networks

In [None]:
# Complete code to preprocess the data, train an MLPClassifier, and plot learning and validation curves.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report

# Load the dataset (the variable name suggests a noisy variant; the CSV read
# here is the same file as in the decision-tree section).
file_path = 'IT_Expenditure_Data_Classification.csv'
it_expenditure_noisy_df = pd.read_csv(file_path)

# Transformers: one-hot encoder for categorical columns, standard scaler for the rest
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()

# Categorical block, converted to a dense array
X_categorical_encoded = one_hot_encoder.fit_transform(
    it_expenditure_noisy_df[['Company Size', 'Sector']]
).toarray()

# Numeric block: every column except the categorical ones and the target.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test rows influence the scaling — consider fitting on the
# training split only.
X_numerical_scaled = scaler.fit_transform(
    it_expenditure_noisy_df.drop(columns=['Company Size', 'Sector', 'Rating'])
)

# Side-by-side feature matrix and target
X_combined = np.concatenate([X_categorical_encoded, X_numerical_scaled], axis=1)
y = it_expenditure_noisy_df['Rating']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

# Single hidden layer (50 units), ReLU, Adam, up to 500 iterations
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train, y_train)

# Report on the held-out split
y_pred = mlp.predict(X_test)
print('Classification Report for Neural Networks (IT Expenditure Classification)')
print(classification_report(y_test, y_pred))

# Learning curve for the MLP, cross-validated on the training split
train_sizes, train_scores, test_scores = learning_curve(
    mlp,
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
)

# Mean and standard deviation along axis 1 for each training size
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.figure(figsize=(10, 6))
plt.title("Neural Networks Learning Curve for MLPClassifier (IT Expenditure Classification)")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim(0.7, 1.01)  # values below 0.7 fall outside the visible range
plt.grid()
# Each curve is drawn as its mean line followed by its +/- one std band.
for curve_mean, curve_std, curve_color, curve_label in (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
):
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)
    plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                     alpha=0.1, color=curve_color)
plt.legend(loc="best")
plt.show()

# Validation curve over the 'max_iter' hyperparameter (5 integer values, 100..1000)
max_iter_range = np.linspace(100, 1000, 5).astype(int)
train_scores, test_scores = validation_curve(
    mlp,
    X_train,
    y_train,
    param_name="max_iter",
    param_range=max_iter_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Mean and standard deviation along axis 1 for each max_iter value
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.figure(figsize=(10, 6))
plt.title("Neural Networks Validation Curve with MLPClassifier - max_iter (IT Expenditure Classification)")
plt.xlabel("max_iter")
plt.ylabel("Score")
plt.ylim(0.7, 1.01)  # values below 0.7 fall outside the visible range
plt.grid()
# Mean line first, then the +/- one std band, for training then validation.
for curve_mean, curve_std, curve_color, curve_label in (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
):
    plt.plot(max_iter_range, curve_mean, 'o-', color=curve_color, label=curve_label)
    plt.fill_between(max_iter_range, curve_mean - curve_std, curve_mean + curve_std,
                     alpha=0.1, color=curve_color)
plt.legend(loc="best")
plt.show()



IT Expenditure Classification - Boosted Decision Trees


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import validation_curve, learning_curve
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'IT_Expenditure_Data_Classification.csv'
it_expenditure_noisy_df = pd.read_csv(file_path)

# Preprocess the data: one-hot encode the categorical columns and standardise
# every remaining non-target column.
# NOTE(review): the encoder and scaler are fitted on the full dataset before
# the train/test split, so test rows influence the transform — consider
# fitting on the training split only.
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()
X_categorical_encoded = one_hot_encoder.fit_transform(it_expenditure_noisy_df[['Company Size', 'Sector']]).toarray()
X_numerical_scaled = scaler.fit_transform(it_expenditure_noisy_df.drop(columns=['Company Size', 'Sector', 'Rating']))
X_combined = np.hstack((X_categorical_encoded, X_numerical_scaled))
y = it_expenditure_noisy_df['Rating']

# Split the dataset (80 % train / 20 % test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Initialize AdaBoost with depth-1 decision trees as the base estimator.
# `estimator=` replaces `base_estimator=`: the old keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, and the IT Infrastructure AdaBoost
# section of this notebook already uses `estimator=`.
ada_boost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    random_state=42
)

# Train the AdaBoost model
ada_boost.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred_ada = ada_boost.predict(X_test)
print('Classification Report for Boosted Decision Trees (IT Expenditure Classification)')
print(classification_report(y_test, y_pred_ada))

# Learning-curve plotting helper (takes the figure title as an argument)
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation score against training-set size.

    Scores come from ``learning_curve``; each curve is drawn with a
    +/- one standard deviation band. The y-axis is fixed to [0.7, 1.01].
    Shows the figure and returns nothing.
    """
    def _summarise(score_matrix):
        # Reduce axis 1 to a mean curve plus the lower/upper band edges.
        centre = np.mean(score_matrix, axis=1)
        spread = np.std(score_matrix, axis=1)
        return centre, centre - spread, centre + spread

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    fit_mean, fit_low, fit_high = _summarise(fit_scores)
    cv_mean, cv_low, cv_high = _summarise(cv_scores)

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    plt.grid()
    # Bands first, mean lines afterwards.
    plt.fill_between(sizes, fit_low, fit_high, alpha=0.1, color="r")
    plt.fill_between(sizes, cv_low, cv_high, alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()

# Learning curve for the AdaBoost model, cross-validated on the training split
plot_learning_curve(
    estimator=ada_boost,
    title="Learning Curve for AdaBoost (IT Expenditure Classification)",
    X=X_train,
    y=y_train,
)

# Function to plot model complexity curve
def plot_model_complexity_curve(param_range, train_scores, test_scores, title):
    """Plot precomputed validation-curve scores against ``param_range``.

    ``train_scores`` and ``test_scores`` are reduced along axis 1 to a mean
    and a standard deviation; each mean is drawn as a line with a
    +/- one standard deviation band. The x-axis is labelled
    "Number of Estimators" and the y-axis is fixed to [0.7, 1.01].
    Shows the figure and returns nothing.
    """
    curves = (
        (train_scores, "Training score", "r"),
        (test_scores, "Cross-validation score", "g"),
    )

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Number of Estimators")
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    # Line then band for the training scores, then the same for validation.
    for scores, legend_text, colour in curves:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(param_range, centre, 'o-', label=legend_text, color=colour)
        plt.fill_between(param_range, centre - spread, centre + spread, alpha=0.1, color=colour)
    plt.legend(loc="best")
    plt.show()

# Validation curve for AdaBoost over 'n_estimators' = 10, 20, ..., 100
n_estimators_range = np.arange(10, 110, 10)
train_scores, test_scores = validation_curve(
    ada_boost,
    X_train,
    y_train,
    param_name="n_estimators",
    param_range=n_estimators_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
plot_model_complexity_curve(
    param_range=n_estimators_range,
    train_scores=train_scores,
    test_scores=test_scores,
    title="AdaBoost Model Complexity - Number of Estimators (IT Expenditure Classification)",
)



IT Expenditure Classification - Support Vector Machines

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load the dataset
file_path = 'IT_Expenditure_Data_Classification.csv'
it_expenditure_noisy_df = pd.read_csv(file_path)

# Preprocess: one-hot encode the categorical columns, standardise the rest.
# NOTE(review): both transformers are fitted on the full dataset before the
# split below, so test rows influence the transform — consider fitting on the
# training split only.
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()
X_categorical_encoded = one_hot_encoder.fit_transform(
    it_expenditure_noisy_df[['Company Size', 'Sector']]
).toarray()
X_numerical_scaled = scaler.fit_transform(
    it_expenditure_noisy_df.drop(columns=['Company Size', 'Sector', 'Rating'])
)
X_combined = np.concatenate([X_categorical_encoded, X_numerical_scaled], axis=1)
y = it_expenditure_noisy_df['Rating']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

# RBF-kernel SVM: fit, predict, report
svm_rbf = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42).fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)
print("Classification Report for SVM with RBF Kernel (IT Expenditure Classification):")
print(classification_report(y_test, y_pred_rbf))

# Linear-kernel SVM: fit, predict, report
svm_linear = SVC(C=1.0, kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)
print("\nClassification Report for SVM with Linear Kernel (IT Expenditure Classification):")
print(classification_report(y_test, y_pred_linear))

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw a titled learning curve for ``estimator``.

    ``learning_curve`` supplies the training and cross-validation scores;
    their axis-1 mean is plotted with a +/- one standard deviation band
    against the training-set size. The y-axis is fixed to [0.7, 1.01].
    Shows the figure and returns nothing.
    """
    sizes, *score_sets = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Parallel sequences, training curve first and validation curve second.
    colours = ("r", "g")
    legend_labels = ("Training score", "Cross-validation score")
    means = [np.mean(scores, axis=1) for scores in score_sets]
    stds = [np.std(scores, axis=1) for scores in score_sets]

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    plt.grid()
    for mu, sigma, colour in zip(means, stds, colours):
        plt.fill_between(sizes, mu - sigma, mu + sigma, alpha=0.1, color=colour)
    for mu, colour, text in zip(means, colours, legend_labels):
        plt.plot(sizes, mu, 'o-', color=colour, label=text)
    plt.legend(loc="best")
    plt.show()

# Learning curves for both SVM variants: RBF kernel first, then linear kernel
for svm_model, curve_title in (
    (svm_rbf, "Learning Curve for SVM with RBF Kernel (IT Expenditure Classification)"),
    (svm_linear, "Learning Curve for SVM with Linear Kernel (IT Expenditure Classification)"),
):
    plot_learning_curve(svm_model, curve_title, X_train, y_train)

# Part 2B: Plotting Validation Curves for SVM Classifiers

# Function to plot validation curve
def plot_validation_curve(estimator, X, y, param_name, param_range, title, cv=5, n_jobs=None):
    """Plot accuracy against one hyperparameter of ``estimator``.

    ``validation_curve`` is run for ``param_name`` over ``param_range``; the
    axis-1 mean of the training and cross-validation scores is drawn as a
    plain line with a +/- one standard deviation band. The x-axis label is
    ``param_name`` and the y-axis is fixed to [0.7, 1.01].
    Shows the figure and returns nothing.
    """
    fit_scores, cv_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=param_name,
        param_range=param_range,
        cv=cv,
        scoring="accuracy",
        n_jobs=n_jobs,
    )

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    plt.grid()
    # Line then band, for the training scores and then the validation scores.
    for scores, legend_text, colour in (
        (fit_scores, "Training score", "r"),
        (cv_scores, "Cross-validation score", "g"),
    ):
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(param_range, centre, label=legend_text, color=colour)
        plt.fill_between(param_range, centre - spread, centre + spread, alpha=0.1, color=colour)
    plt.legend(loc="best")
    plt.show()

# Five log-spaced values of C from 1e-2 to 1e2.
# NOTE(review): these values are log-spaced but plot_validation_curve draws
# them on a linear x-axis — consider a logarithmic x scale.
C_range = np.logspace(-2, 2, num=5)

# Validation curves for 'C': RBF kernel first, then linear kernel
for svm_model, curve_title in (
    (svm_rbf, "Validation Curve for SVM (RBF Kernel) - C parameter (IT Expenditure Classification)"),
    (svm_linear, "Validation Curve for SVM (Linear Kernel) - C parameter (IT Expenditure Classification)"),
):
    plot_validation_curve(svm_model, X_train, y_train, "C", C_range, curve_title)





IT Expenditure Classification - k-Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve, learning_curve, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the IT Expenditure CSV
file_path = 'IT_Expenditure_Data_Classification.csv'
it_expenditure_noisy_df = pd.read_csv(file_path)

# Preprocess: one-hot encode the categorical columns, standardise the rest.
# NOTE(review): both transformers are fitted on the full dataset before the
# split below, so test rows influence the transform — consider fitting on the
# training split only.
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()
X_categorical_encoded = one_hot_encoder.fit_transform(
    it_expenditure_noisy_df[['Company Size', 'Sector']]
).toarray()
X_numerical_scaled = scaler.fit_transform(
    it_expenditure_noisy_df.drop(columns=['Company Size', 'Sector', 'Rating'])
)
X_combined = np.concatenate([X_categorical_encoded, X_numerical_scaled], axis=1)
y = it_expenditure_noisy_df['Rating']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

# Candidate neighbour counts: k = 1 .. 30
k_range = np.arange(1, 31)

# Fit one k-NN classifier per k and print its report on the held-out split
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    print(f"\nClassification Report for k-NN with k={k} (IT Expenditure Classification):")
    print(classification_report(y_test, y_pred_knn))

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Show a titled learning curve (training vs. cross-validation score).

    Scores are obtained from ``learning_curve`` and reduced along axis 1 to a
    mean and standard deviation per training-set size. The y-axis is fixed to
    [0.7, 1.01]. Shows the figure and returns nothing.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Mean curves and the edges of their +/- one std bands.
    fit_mean = fit_scores.mean(axis=1)
    fit_std = fit_scores.std(axis=1)
    fit_lower, fit_upper = fit_mean - fit_std, fit_mean + fit_std

    cv_mean = cv_scores.mean(axis=1)
    cv_std = cv_scores.std(axis=1)
    cv_lower, cv_upper = cv_mean - cv_std, cv_mean + cv_std

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    plt.grid()
    plt.fill_between(sizes, fit_lower, fit_upper, alpha=0.1, color="r")
    plt.fill_between(sizes, cv_lower, cv_upper, alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()

# Learning curve for a k-NN classifier with k = 5 (used as an example value)
plot_learning_curve(
    estimator=KNeighborsClassifier(n_neighbors=5),
    title="Learning Curve for k-NN (IT Expenditure Classification)",
    X=X_train,
    y=y_train,
)

# Function to plot validation curve
def plot_knn_validation_curve(X, y, k_range, title, cv=5):
    """Plot k-NN accuracy against the number of neighbours.

    Runs ``validation_curve`` on a default ``KNeighborsClassifier`` over
    ``n_neighbors`` in ``k_range`` (using all cores, ``n_jobs=-1``) and draws
    the axis-1 mean of the training and cross-validation scores as plain
    lines with +/- one standard deviation bands. The y-axis is fixed to
    [0.7, 1.01]. Shows the figure and returns nothing.
    """
    fit_scores, cv_scores = validation_curve(
        KNeighborsClassifier(),
        X,
        y,
        param_name="n_neighbors",
        param_range=k_range,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
    )

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    plt.grid()
    # Line then band, training scores first.
    for scores, legend_text, colour in (
        (fit_scores, "Training score", "r"),
        (cv_scores, "Cross-validation score", "g"),
    ):
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(k_range, centre, label=legend_text, color=colour)
        plt.fill_between(k_range, centre - spread, centre + spread, alpha=0.1, color=colour)
    plt.legend(loc="best")
    plt.show()

# Validation curve for k-NN over every k in k_range
plot_knn_validation_curve(
    X=X_train,
    y=y_train,
    k_range=k_range,
    title="Validation Curve for k-NN (IT Expenditure Classification)",
)



IT Infrastructure Classification - Decision Trees

In [None]:
# Let's adapt the code to work with the IT Infrastructure dataset.

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the IT Infrastructure CSV
file_path = 'IT_Infrastructure_Data_Modified.csv'  # Replace with your dataset file path
it_infrastructure_df = pd.read_csv(file_path)

# Every column except the target is one-hot encoded; the encoder output is
# used directly (no .toarray() conversion in this section).
one_hot_encoder = OneHotEncoder()
X = one_hot_encoder.fit_transform(it_infrastructure_df.drop(columns='Rating'))
y = it_infrastructure_df['Rating']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Decision tree splitting on entropy
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# Report on the held-out split
y_pred = decision_tree.predict(X_test)
print('Classification Report for Decision Trees (IT Infrastructure Classification)')
print(classification_report(y_test, y_pred))

# Function to plot learning curves
def plot_learning_curve(estimator, X, y, cv=5, n_jobs=4, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the decision-tree learning curve for the IT Infrastructure data.

    Runs scikit-learn's ``learning_curve`` and draws the mean training and
    cross-validation scores, each with a +/- one standard deviation band,
    against the number of training examples.

    Args:
        estimator: Estimator forwarded to ``learning_curve``.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallel job count forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Only the scores are plotted, so fit/score timings are not requested
    # (the previous version asked for them and discarded the result).
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Collapse axis 1 of the score arrays into a mean and a spread per size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.grid()
    # Shaded bands first, then the mean curves on top of them.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.title("Decision Trees Learning Curve (IT Infrastructure Classification)")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()

# Draw the learning curve for the decision tree.
# NOTE(review): this passes the full X, y, whereas the neural-network and
# AdaBoost sections of this notebook pass X_train, y_train — confirm this is
# intended.
plot_learning_curve(estimator=decision_tree, X=X, y=y)

# Function to plot model complexity graph for 'max_depth'
def plot_model_complexity_graph(estimator, X, y, cv=5, n_jobs=4, max_depth_range=np.arange(1, 15)):
    """Plot accuracy against ``max_depth`` for the IT Infrastructure decision tree.

    ``validation_curve`` is evaluated over ``max_depth_range``; the mean
    training and cross-validation accuracy are drawn with +/- one standard
    deviation bands. Shows the figure and returns nothing.
    """
    def _band(score_matrix):
        # Mean along axis 1 together with the lower/upper band edges.
        centre = np.mean(score_matrix, axis=1)
        spread = np.std(score_matrix, axis=1)
        return centre, centre - spread, centre + spread

    fit_scores, cv_scores = validation_curve(
        estimator, X, y,
        param_name="max_depth", param_range=max_depth_range,
        cv=cv, scoring='accuracy', n_jobs=n_jobs)
    fit_mean, fit_low, fit_high = _band(fit_scores)
    cv_mean, cv_low, cv_high = _band(cv_scores)

    plt.figure(figsize=(10, 6))
    plt.grid()
    plt.fill_between(max_depth_range, fit_low, fit_high, alpha=0.1, color="r")
    plt.fill_between(max_depth_range, cv_low, cv_high, alpha=0.1, color="g")
    plt.plot(max_depth_range, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(max_depth_range, cv_mean, 'o-', color="g", label="Cross-validation score")
    plt.title("Decision Trees Model Complexity - Max Depth (IT Infrastructure Classification)")
    plt.xlabel("Max Depth")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()

# Draw the 'max_depth' model-complexity graph for the decision tree
plot_model_complexity_graph(estimator=decision_tree, X=X, y=y)



IT Infrastructure Classification - Neural Networks

In [None]:
# Let's adapt the code to work with the IT Infrastructure dataset using an MLPClassifier.

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the IT Infrastructure CSV
file_path = 'IT_Infrastructure_Data_Modified.csv'
df_infrastructure = pd.read_csv(file_path)

# Every column except the target is one-hot encoded
encoder = OneHotEncoder()
X = encoder.fit_transform(df_infrastructure.drop(columns='Rating'))
y = df_infrastructure['Rating']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single hidden layer (50 units), ReLU, Adam, up to 500 iterations
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train, y_train)

# Report on the held-out split
y_pred = mlp.predict(X_test)
print('Classification Report for Neural Networks (IT Infrastructure Classification)')
print(classification_report(y_test, y_pred))

# Learning curve for the MLP, cross-validated on the training split
train_sizes, train_scores, test_scores = learning_curve(
    mlp,
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
)

# Mean and standard deviation along axis 1 for each training size
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.figure(figsize=(10, 6))
plt.title("Neural Networks Learning Curve for MLPClassifier (IT Infrastructure Classification)")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim(0.7, 1.01)  # values below 0.7 fall outside the visible range
plt.grid()
# Each curve is drawn as its mean line followed by its +/- one std band.
for curve_mean, curve_std, curve_color, curve_label in (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
):
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)
    plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                     alpha=0.1, color=curve_color)
plt.legend(loc="best")
plt.show()

# Validation curve over the 'max_iter' hyperparameter (5 integer values, 100..1000)
max_iter_range = np.linspace(100, 1000, 5).astype(int)
train_scores, test_scores = validation_curve(
    mlp,
    X_train,
    y_train,
    param_name="max_iter",
    param_range=max_iter_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Mean and standard deviation along axis 1 for each max_iter value
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.figure(figsize=(10, 6))
plt.title("Neural Networks Validation Curve with MLPClassifier (max_iter) (IT Infrastructure Classification)")
plt.xlabel("max_iter")
plt.ylabel("Score")
plt.ylim(0.7, 1.01)  # values below 0.7 fall outside the visible range
plt.grid()
# Mean line first, then the +/- one std band, for training then validation.
for curve_mean, curve_std, curve_color, curve_label in (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
):
    plt.plot(max_iter_range, curve_mean, 'o-', color=curve_color, label=curve_label)
    plt.fill_between(max_iter_range, curve_mean - curve_std, curve_mean + curve_std,
                     alpha=0.1, color=curve_color)
plt.legend(loc="best")
plt.show()



IT Infrastructure Classification - Boosted Decision Trees

In [None]:
# Let's modify the provided code to work with the IT Infrastructure dataset using AdaBoostClassifier.

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import validation_curve, learning_curve, train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

# Read the IT Infrastructure CSV
file_path = 'IT_Infrastructure_Data_Modified.csv'  # Adjust path as needed
it_infrastructure_df = pd.read_csv(file_path)

# Every column except the target is one-hot encoded
one_hot_encoder = OneHotEncoder()
X = one_hot_encoder.fit_transform(it_infrastructure_df.drop(columns='Rating'))
y = it_infrastructure_df['Rating']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# AdaBoost over depth-3 decision trees, 50 boosting rounds
ada_boost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=50,
    random_state=42,
)
ada_boost.fit(X_train, y_train)

# Report on the held-out split
y_pred_ada = ada_boost.predict(X_test)
print('Classification Report for Boosted Decision Trees (IT Infrastructure Classification)')
print(classification_report(y_test, y_pred_ada))

# Learning-curve plotting helper (takes the figure title as an argument)
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation score against training-set size.

    Scores come from ``learning_curve`` and are reduced along axis 1; each
    mean curve is drawn with a +/- one standard deviation band. The y-axis is
    fixed to [0.7, 1.01]. Shows the figure and returns nothing.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean, std, colour, legend label); training first.
    series = (
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-validation score"),
    )

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0.7, 1.01)
    plt.grid()
    # Bands first, mean lines afterwards.
    for avg, spread, colour, _label in series:
        plt.fill_between(sizes, avg - spread, avg + spread, alpha=0.1, color=colour)
    for avg, _spread, colour, label in series:
        plt.plot(sizes, avg, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    plt.show()

# Learning curve for the AdaBoost model, cross-validated on the training split
plot_learning_curve(
    estimator=ada_boost,
    title="Boosted Decision Trees Learning Curve for AdaBoost (IT Infrastructure Classification)",
    X=X_train,
    y=y_train,
)

# Function to plot model complexity curve
def plot_model_complexity_curve(estimator, X, y, title, param_name, param_range,
                                *, cv=5, scoring="accuracy", n_jobs=-1,
                                ylim=(0.7, 1.01)):
    """Plot training and cross-validation scores across one hyperparameter.

    Runs ``validation_curve`` for ``estimator`` while varying ``param_name``
    over ``param_range`` and draws, for both score sets, the mean across
    folds as a line with a shaded band of one standard deviation around it.

    Args:
        estimator: Estimator handed to ``validation_curve``.
        X: Feature matrix forwarded to ``validation_curve``.
        y: Target values forwarded to ``validation_curve``.
        title: Title of the figure.
        param_name: Name of the hyperparameter being varied; also used as the
            x-axis label.
        param_range: Values tried for ``param_name``; used as x-coordinates.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        scoring: Scoring setting forwarded to ``validation_curve``.
        n_jobs: Parallelism setting forwarded to ``validation_curve``.
        ylim: ``(bottom, top)`` limits of the score axis. The default keeps
            the previously hard-coded range; pass ``None`` to let matplotlib
            autoscale so that scores below 0.7 are not cut off.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    train_scores, cv_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring=scoring, n_jobs=n_jobs)

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    # Pass True explicitly: a bare grid() call toggles the current state, so
    # it would switch the grid off under a style that enables it by default.
    plt.grid(True)

    curves = (
        (train_scores, "Training score", "r"),
        (cv_scores, "Cross-validation score", "g"),
    )
    for scores, label, color in curves:
        # Aggregate over the folds (axis 1): one mean/std per parameter value.
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(param_range, mean, 'o-', label=label, color=color)
        plt.fill_between(param_range, mean - std, mean + std, alpha=0.1, color=color)

    plt.legend(loc="best")
    plt.show()

# Model-complexity curve for AdaBoost: sweep 'n_estimators' over 10, 20, ..., 100
n_estimators_range = np.arange(10, 110, 10)
plot_model_complexity_curve(
    estimator=ada_boost,
    X=X_train,
    y=y_train,
    title="Boosted Decision Trees AdaBoost Model Complexity - Number of Estimators (IT Infrastructure Classification)",
    param_name="n_estimators",
    param_range=n_estimators_range,
)



IT Infrastructure Classification - Support Vector Machines

In [None]:
# Adapting the code to work with the IT Infrastructure dataset using SVM classifiers.

from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the infrastructure data set from disk
file_path = 'IT_Infrastructure_Data_Modified.csv'  # Adjust path as needed
df_infrastructure = pd.read_csv(file_path)

# 'Rating' is the target; every remaining column is one-hot encoded
one_hot_encoder = OneHotEncoder()
X = one_hot_encoder.fit_transform(df_infrastructure.drop(columns='Rating'))
y = df_infrastructure['Rating']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# RBF-kernel SVM: fit on the training split, report on the held-out split
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)
print("Classification Report for SVM with RBF Kernel (IT Infrastructure Classification):")
print(classification_report(y_true=y_test, y_pred=y_pred_rbf))

# Linear-kernel SVM: same procedure
svm_linear = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)
print("\nClassification Report for SVM with Linear Kernel (IT Infrastructure Classification):")
print(classification_report(y_true=y_test, y_pred=y_pred_linear))

# Plot learning curve function
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=None,
                        train_sizes=np.linspace(.1, 1.0, 5), *, ylim=(0.7, 1.01)):
    """Plot training and cross-validation scores against training-set size.

    Runs ``learning_curve`` for ``estimator`` and draws, for both the training
    scores and the cross-validation scores, the mean across folds as a line
    and one standard deviation either side of it as a shaded band.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the figure.
        X: Feature matrix forwarded to ``learning_curve``.
        y: Target values forwarded to ``learning_curve``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
        ylim: ``(bottom, top)`` limits of the score axis. The default keeps
            the previously hard-coded range; pass ``None`` to let matplotlib
            autoscale so that scores below 0.7 are not cut off.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The sizes returned by learning_curve (not the requested ones) are used
    # as x-coordinates.
    sizes, train_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean over folds, std over folds, colour, label).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), color, label)
        for scores, color, label in (
            (train_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    # Pass True explicitly: a bare grid() call toggles the current state, so
    # it would switch the grid off under a style that enables it by default.
    plt.grid(True)

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, color, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=color)
    for mean, _, color, label in curves:
        plt.plot(sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    plt.show()

# Learning curves for both SVM variants, computed on the training split
plot_learning_curve(
    estimator=svm_rbf,
    title="Learning Curve for SVM with RBF Kernel (IT Infrastructure Classification)",
    X=X_train,
    y=y_train,
)
plot_learning_curve(
    estimator=svm_linear,
    title="Learning Curve for SVM with Linear Kernel (IT Infrastructure Classification)",
    X=X_train,
    y=y_train,
)

# Plot validation curve function
def plot_validation_curve(estimator, X, y, param_name, param_range, title, cv=5, n_jobs=None,
                          *, ylim=(0.7, 1.01), xscale=None):
    """Plot training and cross-validation accuracy across one hyperparameter.

    Runs ``validation_curve`` (scored by accuracy) for ``estimator`` while
    varying ``param_name`` over ``param_range`` and draws, for both score
    sets, the mean across folds as a line with a shaded band of one standard
    deviation around it.

    Args:
        estimator: Estimator handed to ``validation_curve``.
        X: Feature matrix forwarded to ``validation_curve``.
        y: Target values forwarded to ``validation_curve``.
        param_name: Name of the hyperparameter being varied; also used as the
            x-axis label.
        param_range: Values tried for ``param_name``; used as x-coordinates.
        title: Title of the figure.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        n_jobs: Parallelism setting forwarded to ``validation_curve``.
        ylim: ``(bottom, top)`` limits of the score axis. The default keeps
            the previously hard-coded range; pass ``None`` to let matplotlib
            autoscale so that scores below 0.7 are not cut off.
        xscale: Optional x-axis scale name passed to ``plt.xscale``, e.g.
            ``"log"`` for a range built with ``np.logspace``, whose points
            bunch together on a linear axis. ``None`` leaves the axis as
            matplotlib creates it.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    train_scores, cv_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring="accuracy", n_jobs=n_jobs)

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    if xscale is not None:
        plt.xscale(xscale)
    # Pass True explicitly: a bare grid() call toggles the current state, so
    # it would switch the grid off under a style that enables it by default.
    plt.grid(True)

    curves = (
        (train_scores, "Training score", "r"),
        (cv_scores, "Cross-validation score", "g"),
    )
    for scores, label, color in curves:
        # Aggregate over the folds (axis 1): one mean/std per parameter value.
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(param_range, mean, 'o-', label=label, color=color)
        plt.fill_between(param_range, mean - std, mean + std, alpha=0.1, color=color)

    plt.legend(loc="best")
    plt.show()

# Five values of C spaced logarithmically between 1e-2 and 1e2
C_range = np.logspace(start=-2, stop=2, num=5)

# Validation curve over 'C' for the RBF-kernel SVM
plot_validation_curve(
    estimator=svm_rbf,
    X=X_train,
    y=y_train,
    param_name="C",
    param_range=C_range,
    title="Validation Curve for SVM (RBF Kernel) - C parameter (IT Infrastructure Classification)",
)

# Validation curve over 'C' for the linear-kernel SVM
plot_validation_curve(
    estimator=svm_linear,
    X=X_train,
    y=y_train,
    param_name="C",
    param_range=C_range,
    title="Validation Curve for SVM (Linear Kernel) - C parameter (IT Infrastructure Classification)",
)



IT Infrastructure Classification - k-Nearest Neighbours

In [None]:
# Adapting the code to work with the IT Infrastructure dataset using KNeighborsClassifier.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve, learning_curve, train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the infrastructure data set from disk
file_path = 'IT_Infrastructure_Data_Modified.csv'  # Adjust path as needed
it_infrastructure_df = pd.read_csv(file_path)

# 'Rating' is the target; every remaining column is one-hot encoded
one_hot_encoder = OneHotEncoder()
X = one_hot_encoder.fit_transform(it_infrastructure_df.drop(columns='Rating'))
y = it_infrastructure_df['Rating']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate neighbour counts: k = 1 .. 30
k_range = np.arange(1, 31)

# Fit one k-NN classifier per candidate k and print its test-set report
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    print(f"\nClassification Report for k-NN with k={k} (IT Infrastructure Classification):")
    print(classification_report(y_true=y_test, y_pred=y_pred_knn))

# Function to plot learning curve
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=None,
                        train_sizes=np.linspace(.1, 1.0, 5), *, ylim=(0.7, 1.01)):
    """Plot training and cross-validation scores against training-set size.

    Runs ``learning_curve`` for ``estimator`` and draws, for both the training
    scores and the cross-validation scores, the mean across folds as a line
    and one standard deviation either side of it as a shaded band.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the figure.
        X: Feature matrix forwarded to ``learning_curve``.
        y: Target values forwarded to ``learning_curve``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
        ylim: ``(bottom, top)`` limits of the score axis. The default keeps
            the previously hard-coded range; pass ``None`` to let matplotlib
            autoscale so that scores below 0.7 are not cut off.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The sizes returned by learning_curve (not the requested ones) are used
    # as x-coordinates.
    sizes, train_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean over folds, std over folds, colour, label).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), color, label)
        for scores, color, label in (
            (train_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    # Pass True explicitly: a bare grid() call toggles the current state, so
    # it would switch the grid off under a style that enables it by default.
    plt.grid(True)

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, color, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=color)
    for mean, _, color, label in curves:
        plt.plot(sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    plt.show()

# Learning curve for a k-NN classifier, using 5 neighbors as an example
plot_learning_curve(
    estimator=KNeighborsClassifier(n_neighbors=5),
    title="Learning Curve for k-NN (IT Infrastructure Classification)",
    X=X_train,
    y=y_train,
)

# Function to plot validation curve
def plot_knn_validation_curve(X, y, k_range, title, cv=5, *, n_jobs=-1, ylim=(0.7, 1.01)):
    """Plot k-NN training and cross-validation accuracy against ``n_neighbors``.

    Runs ``validation_curve`` (scored by accuracy) on a default
    ``KNeighborsClassifier`` while varying ``n_neighbors`` over ``k_range``
    and draws, for both score sets, the mean across folds as a line with a
    shaded band of one standard deviation around it.

    Args:
        X: Feature matrix forwarded to ``validation_curve``.
        y: Target values forwarded to ``validation_curve``.
        k_range: Values tried for ``n_neighbors``; used as x-coordinates.
        title: Title of the figure.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        n_jobs: Parallelism setting forwarded to ``validation_curve``.
        ylim: ``(bottom, top)`` limits of the score axis. The default keeps
            the previously hard-coded range; pass ``None`` to let matplotlib
            autoscale so that scores below 0.7 are not cut off.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    train_scores, cv_scores = validation_curve(
        KNeighborsClassifier(), X, y, param_name="n_neighbors", param_range=k_range,
        cv=cv, scoring="accuracy", n_jobs=n_jobs)

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    # Pass True explicitly: a bare grid() call toggles the current state, so
    # it would switch the grid off under a style that enables it by default.
    plt.grid(True)

    curves = (
        (train_scores, "Training score", "r"),
        (cv_scores, "Cross-validation score", "g"),
    )
    for scores, label, color in curves:
        # Aggregate over the folds (axis 1): one mean/std per value of k.
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(k_range, mean, 'o-', label=label, color=color)
        plt.fill_between(k_range, mean - std, mean + std, alpha=0.1, color=color)

    plt.legend(loc="best")
    plt.show()

# Validation curve for k-NN over the candidate neighbor counts
plot_knn_validation_curve(
    X=X_train,
    y=y_train,
    k_range=k_range,
    title="Validation Curve for k-NN (IT Infrastructure Classification)",
)

