In [5]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp312-cp312-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB 435.7 kB/s eta 0:00:25
   ---------------------------------------- 0.0/10.6 MB 326.8 kB/s eta 0:00:33
   ---------------------------------------- 0.1/10.6 MB 525.1 kB/s eta 0:00:21
   ---------------------------------------- 0.1/10.6 MB 602.4 kB/s eta 0:00:18
    --------------------------------------- 0.2/10.6 MB 787.7 kB/s eta 0:00:14
    --------------------------------------- 0.2/10.6 MB 724.0 k


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, silhouette_score, adjusted_rand_score
from VectorSpaceModel import VectorSpaceModel

In [8]:
# Instantiate the Vector Space Model.
# Its `index` attribute is read further down without any other setup call, so
# the index is presumably built or loaded during construction — confirm in
# VectorSpaceModel.py.
vsm = VectorSpaceModel()

In [9]:
# Ground-truth topic of every labelled document: topic name -> list of document ids.
# Built from (topic, ids) pairs; insertion order follows the order listed here.
doc_labels = {
    topic: list(doc_ids)
    for topic, doc_ids in (
        ("Explainable Artificial Intelligence", (1, 2, 3, 7)),
        ("Heart Failure", (8, 9, 11)),
        ("Time Series Forecasting", (12, 13, 14, 15, 16)),
        ("Transformer Model", (17, 18, 21)),
        ("Feature Selection", (22, 23, 24, 25, 26)),
    )
}

In [10]:
# Flatten the topic -> documents mapping into (topic, doc_id) pairs, keeping the
# dictionary's insertion order so that row i of X lines up with entry i of Y.
labelled_docs = [
    (topic, doc_id)
    for topic, doc_ids in doc_labels.items()
    for doc_id in doc_ids
]

# Feature matrix: one weight vector per document, looked up in the VSM index
# under the document id converted to a string key.
X = np.array([vsm.index[str(doc_id)]['tf-id-frequencies'] for _, doc_id in labelled_docs])

# Label array: the topic name of each document, in the same row order as X.
Y = np.array([topic for topic, _ in labelled_docs])

In [12]:
# Inspect the raw feature matrix (one row per labelled document) before scaling.
print(X)


[[-6.23331375e+01 -1.89709549e+01 -1.08405457e+01 -0.00000000e+00
  -0.00000000e+00 -0.00000000e+00 -8.67243652e+01 -9.48547744e+00
  -1.35506821e+01 -0.00000000e+00 -2.71013641e+00 -2.16810913e+01
  -1.34151752e+02 -1.34151752e+02 -6.63983421e+01 -2.57462959e+01
  -1.62608185e+01 -1.08405457e+01 -0.00000000e+00 -0.00000000e+00]
 [-9.00385013e+00 -1.57567377e+01 -9.00385013e+00 -0.00000000e+00
  -0.00000000e+00 -0.00000000e+00 -5.62740633e+01 -4.50192506e+00
  -4.50192506e+00 -0.00000000e+00 -2.25096253e+00 -1.12548127e+01
  -5.51485820e+01 -5.51485820e+01 -4.27682881e+01 -1.35057752e+01
  -1.12548127e+01 -1.01293314e+01 -0.00000000e+00 -0.00000000e+00]
 [-1.12742310e+01 -1.03347118e+01 -6.57663477e+00 -0.00000000e+00
  -0.00000000e+00 -0.00000000e+00 -3.85202894e+01 -3.75807701e+00
  -4.69759626e+00 -0.00000000e+00 -1.87903851e+00 -6.57663477e+00
  -1.50323080e+01 -1.50323080e+01 -2.53670198e+01 -1.40927888e+01
  -5.63711552e+00 -4.69759626e+00 -0.00000000e+00 -0.00000000e+00]
 [-3.19

In [13]:
# Inspect the topic-name label stored for each row of X.
print(Y)

['Explainable Artificial Intelligence'
 'Explainable Artificial Intelligence'
 'Explainable Artificial Intelligence'
 'Explainable Artificial Intelligence' 'Heart Failure' 'Heart Failure'
 'Heart Failure' 'Time Series Forecasting' 'Time Series Forecasting'
 'Time Series Forecasting' 'Time Series Forecasting'
 'Time Series Forecasting' 'Transformer Model' 'Transformer Model'
 'Transformer Model' 'Feature Selection' 'Feature Selection'
 'Feature Selection' 'Feature Selection' 'Feature Selection']


In [14]:
# Standardise every feature column using statistics computed over all documents:
# learn the per-column parameters first, then apply them to the same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [15]:
# Inspect the standardised feature matrix produced by the scaler.
print(X_scaled)

[[-4.24852826 -2.97422232 -2.81068175 -0.         -0.         -0.
  -3.39009868 -3.68105723 -3.90016214 -0.         -2.41845058 -3.27925869
  -4.00489045 -4.00489045 -3.45512318 -3.42389844 -2.26483735 -2.95367629
  -0.         -0.        ]
 [-0.31055726 -2.36644913 -2.19807749 -0.         -0.         -0.
  -2.01794544 -1.4359805  -0.92400775 -0.         -1.89650666 -1.39751118
  -1.44010632 -1.44010632 -2.02042184 -1.52863407 -1.40621745 -2.73179457
  -0.         -0.        ]
 [-0.478208   -1.34120352 -1.38851355 -0.         -0.         -0.
  -1.21792386 -1.10087899 -0.9883644  -0.         -1.47373983 -0.55318791
  -0.1377588  -0.1377588  -0.96390191 -1.61952449 -0.4426815  -1.03722404
  -0.         -0.        ]
 [ 0.11857166  0.13005183 -0.25976453 -0.         -0.         -0.
  -0.05755317 -0.27078856 -0.49331755 -0.         -1.51514905  0.05759456
   0.26734237  0.26734237 -0.08276158 -0.22834896  0.19564792  0.22911957
  -0.         -0.        ]
 [ 0.14914436 -0.43776069 -0.5850239

In [16]:
# Map each topic name to an integer class id. The encoder is kept around so the
# ids can be translated back to topic names later if needed.
label_encoder = LabelEncoder().fit(Y)
Y_encoded = label_encoder.transform(Y)

In [17]:
# Inspect the integer class id assigned to each document.
print(Y_encoded)

[0 0 0 0 2 2 2 3 3 3 3 3 4 4 4 1 1 1 1 1]


In [18]:
# Number of distinct classes present in the encoded labels.
num_classes = np.unique(Y_encoded).size

# Held-out fraction: 20% by default, but raised when necessary so that the test
# set contains at least as many documents as there are classes.
test_size = max(0.2, num_classes / Y_encoded.size)

In [19]:
# Split the *unscaled* features first, then fit a scaler on the training
# documents only, so that no test-set statistics leak into the classifier's
# inputs. Stratifying on the encoded labels keeps the class proportions as close
# as possible on both sides; the fixed random_state makes the partition
# reproducible.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y_encoded, test_size=test_size, random_state=42, stratify=Y_encoded
)

# Standardisation parameters come from the training rows alone and are then
# applied unchanged to the held-out rows.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)


In [20]:
# Fit a 5-nearest-neighbour classifier on the training split and label the
# held-out documents. fit() returns the estimator itself, so it can be chained.
# NOTE(review): the smallest classes in doc_labels have only 3 documents in
# total, so a 5-document neighbourhood always includes other classes — consider
# a smaller k.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
knn_predictions = knn_classifier.predict(X_test)

In [21]:
# Cluster all (scaled) documents into five groups. fit_predict() fits the model
# and hands back the cluster index assigned to each document in one step; the
# fixed random_state keeps the centroid initialisation reproducible.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_cluster_labels = kmeans.fit_predict(X_scaled)


In [22]:
# Inspect the encoded class id predicted for each held-out document.
print(knn_predictions)

[3 3 1 1 1]


In [23]:
# Options shared by the per-class metrics: average the per-class scores weighted
# by class support, and score a class with no predictions as 0.
weighted_avg = {"average": "weighted", "zero_division": 0}

precision = precision_score(Y_test, knn_predictions, **weighted_avg)
recall = recall_score(Y_test, knn_predictions, **weighted_avg)
f1 = f1_score(Y_test, knn_predictions, **weighted_avg)

# Accuracy is a single overall ratio and takes no averaging options.
accuracy = accuracy_score(Y_test, knn_predictions)

In [24]:
# Internal quality: silhouette of the cluster assignment on the scaled features.
silhouette = silhouette_score(X=X_scaled, labels=kmeans_cluster_labels)

# External quality: agreement between the clusters and the known topic labels.
rand_index = adjusted_rand_score(
    labels_true=Y_encoded,
    labels_pred=kmeans_cluster_labels,
)

In [25]:
# Assemble the full evaluation report first, then emit it with a single print.
# The leading "\n" on the clustering header leaves a blank line between sections.
report_lines = [
    "Text Classification Metrics:",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Accuracy: {accuracy}",
    "\nText Clustering Metrics:",
    f"Silhouette Score: {silhouette}",
    f"Adjusted Rand Index: {rand_index}",
]
print("\n".join(report_lines))

Text Classification Metrics:
Precision: 0.06666666666666667
Recall: 0.2
F1 Score: 0.1
Accuracy: 0.2

Text Clustering Metrics:
Silhouette Score: 0.5514196969509567
Adjusted Rand Index: 0.06792452830188679
