In [1]:
import sys
# Put the project's Transformer directory first on the import path (presumably so
# that the local `utils` package imported later in this notebook can be found).
# Raw string: the Windows path contains backslash pairs such as `\M`, `\F` and `\T`,
# which are invalid escape sequences in a normal literal and trigger a warning on
# recent Python versions. The resulting path text is unchanged.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
sys.path.insert(0, r'D:\My Work\Final Year Project\Main\FYP23-Deep-Document-Clustering\Transformer')

In [2]:
import numpy as np
import pandas as pd
import nltk
import os
import warnings
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import f1_score 
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import torch
import random

from utils.Encoder import Encoder

# Ask OpenMP-based libraries to use a single thread.
# NOTE(review): this is set after numpy / sklearn / torch have already been imported;
# confirm the setting still takes effect at this point.
os.environ.update({'OMP_NUM_THREADS': '1'})

# Silence UserWarnings raised from these two scikit-learn modules.
for _noisy_module in ("sklearn.cluster._kmeans", "sklearn.feature_extraction.text"):
    warnings.filterwarnings("ignore", category=UserWarning, module=_noisy_module)
del _noisy_module


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def reset_random_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and then
    seeds torch, NumPy's legacy global generator and Python's ``random``
    module, in that order, with the same value.

    Parameters
    ----------
    seed : int
        Seed value handed to each generator.

    Returns
    -------
    None
    """
    # NOTE(review): changing PYTHONHASHSEED here presumably does not alter hashing
    # in the already-running interpreter; confirm whether it is needed.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (torch.manual_seed, np.random.seed, random.seed):
        apply_seed(seed)

def Purity_Score(label_seq, pred_labels):
    """Return the purity of a clustering with respect to reference labels.

    For every predicted cluster the count of its most frequent reference label
    is taken; purity is the sum of those counts divided by the total number of
    samples in the confusion matrix.

    Parameters
    ----------
    label_seq : array-like
        Reference (true) label for each sample.
    pred_labels : array-like
        Cluster id assigned to each sample.

    Returns
    -------
    Purity as the ratio of the two sums described above.
    """
    # confusion_matrix(y_true, y_pred): rows are reference labels, columns are clusters.
    contingency = confusion_matrix(label_seq, pred_labels)
    dominant_per_cluster = contingency.max(axis=0)
    return dominant_per_cluster.sum() / contingency.sum()

def Evaluate(X, true_labels, predicted_labels):
    """Print purity, silhouette, ARI and NMI for a clustering.

    All four metrics are computed first and then printed one per line.

    Parameters
    ----------
    X : array-like
        Feature matrix given to ``silhouette_score`` (euclidean metric).
    true_labels : array-like
        Reference labels used for purity, ARI and NMI.
    predicted_labels : array-like
        Cluster assignments being evaluated.

    Returns
    -------
    None
        Results are only printed.
    """
    report = (
        ("Purity", Purity_Score(true_labels, predicted_labels)),
        ("Silhouette Score", silhouette_score(X, predicted_labels, metric='euclidean')),
        ("ARI Score", ari_score(true_labels, predicted_labels)),
        ("NMI Score", nmi_score(true_labels, predicted_labels)),
    )

    for metric_name, metric_value in report:
        print(f"{metric_name}: {metric_value}")

def SaveFeatures(X, file_name):
    """Pickle ``X`` to ``file_name``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten.

    Parameters
    ----------
    X : object
        Any picklable object (e.g. a feature matrix).
    file_name : str or path-like
        Destination path.

    Returns
    -------
    None
    """
    # The context manager closes the handle even when pickling raises.
    with open(file_name, 'wb') as pickle_file:
        pickle.dump(X, pickle_file)

def ReadFeatures(file_name):
    """Load and return the object pickled in ``file_name``.

    Parameters
    ----------
    file_name : str or path-like
        Path of a pickle file.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # The context manager closes the handle even when unpickling raises.
    with open(file_name, 'rb') as pickle_file:
        # SECURITY: pickle.load can execute arbitrary code; only use this on
        # files that come from a trusted source.
        return pickle.load(pickle_file)

In [4]:
# Read the feature matrix and the label vector from text files in the current
# working directory. `x` is reported as 2225 x 9635 further down, and `y` is
# used later as the ground-truth labels for evaluation.
x = np.loadtxt('bbc.txt', dtype=float)
y = np.loadtxt('bbc_label.txt', dtype=int)

In [5]:
# Convert the loaded NumPy matrix to a float32 torch tensor.
# NOTE(review): `x` is rebound in place, so re-running this cell feeds a tensor
# (not the original array) back into torch.tensor.
x = torch.tensor(x, dtype=torch.float32)

In [6]:
x.size()

torch.Size([2225, 9635])

In [7]:
# Add a leading batch dimension of size 1: (rows, features) -> (1, rows, features).
# The last two dimensions are addressed with negative indices so the cell is
# idempotent: re-running it on the already 3-D tensor leaves the shape unchanged
# instead of raising on a mismatched reshape.
x = x.reshape(1, x.size(-2), x.size(-1))

In [8]:
# Hyper-parameters for the positional encoding and the encoder built below.
d_model = 9635  # matches the last dimension of `x` (torch.Size([1, 2225, 9635]))
num_heads = 1  # passed to Encoder
drop_prob = 0.1  # passed to Encoder
batch_size = 1  # NOTE(review): not referenced by any visible cell
max_sequence_length = 2225  # passed to PositionalEncoding; matches the number of rows in `x`
ffn_hidden = 2048  # passed to Encoder
num_layers = 10  # passed to Encoder

In [9]:
# Seed the Python, NumPy and torch generators before the positional encoding
# and the encoder are created.
reset_random_seeds(42)

In [10]:
x.size()

torch.Size([1, 2225, 9635])

In [11]:
# NOTE(review): project-local import placed mid-notebook rather than in the import cell.
from utils.PositionalEncoding import PositionalEncoding
# Build the positional-encoding table for `max_sequence_length` positions and
# `d_model` features; forward() takes no arguments here.
pe = PositionalEncoding(d_model=d_model, max_sequence_length=max_sequence_length)
positional_encoding = pe.forward()

In [12]:
# Keep exactly `d_model` columns so the table lines up with `x`. The generated
# table is one column wider than d_model here (presumably because d_model is odd).
# Slicing to `:d_model` rather than dropping "the last column" keeps this cell
# idempotent: re-running it no longer removes one more column each time.
positional_encoding = positional_encoding[:, :d_model]
positional_encoding.size()

torch.Size([2225, 9635])

In [15]:
# Add the (2225, 9635) positional table to the (1, 2225, 9635) input; the
# leading batch dimension is handled by broadcasting.
# NOTE(review): `x` is rebound, so re-running this cell adds the encoding again.
x = x + positional_encoding

In [16]:
# Instantiate the project's Encoder with the hyper-parameters defined above.
# No training step appears in the visible cells, so it is used as constructed.
encoder = Encoder(d_model=d_model, ffn_hidden=ffn_hidden, num_heads=num_heads, drop_prob=drop_prob, num_layers=num_layers)

In [17]:
# Forward pass only: the result is detached and used purely as clustering
# features, so gradient tracking is disabled. This avoids building an autograd
# graph across all encoder layers and does not change the computed values.
# NOTE(review): the encoder is not switched to eval mode; if it is an nn.Module
# with dropout, dropout stays active during this pass. Confirm that is intended.
with torch.no_grad():
    out = encoder(x)

x.size(): torch.Size([1, 2225, 9635])
tensor([[[ 2.6300e-02,  1.0792e+00,  1.0320e-01,  ...,  0.0000e+00,
           1.0000e+00,  0.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  8.9384e-01,  ...,  1.0029e-04,
           1.0000e+00,  1.0010e-04],
         [ 9.0930e-01, -3.4795e-01,  9.1088e-01,  ...,  2.0057e-04,
           1.0000e+00,  2.0019e-04],
         ...,
         [-7.7958e-01, -6.2630e-01, -2.0705e-01,  ...,  2.2100e-01,
           9.7527e-01,  2.2058e-01],
         [-9.4822e-01,  3.1760e-01,  7.1017e-01,  ...,  2.2110e-01,
           9.7525e-01,  2.2068e-01],
         [-2.4508e-01,  9.8140e-01,  9.7655e-01,  ...,  2.2119e-01,
           9.7523e-01,  2.2078e-01]]])
qkv.size(): torch.Size([1, 2225, 28905])
1, 2225, 1, 9635
qkv.size(): torch.Size([1, 2225, 1, 28905])
q.size(): torch.Size([1, 1, 2225, 9635]), k.size(): torch.Size([1, 1, 2225, 9635]), v.size(): torch.Size([1, 1, 2225, 9635])
values.size(): torch.Size([1, 1, 2225, 9635]), attention.size(): torch.Size([1, 1, 2225, 

In [18]:
# Drop the notebook's reference to the encoder; only `out` is used from here on.
del encoder

In [19]:
def KMeans_Labels(X, n, rstate_limit, true_labels):
    """Run KMeans once per random state and return the labels of the purest run.

    KMeans (k-means++ initialisation, ``n_init='auto'``) is fitted for every
    ``random_state`` in ``range(rstate_limit)``. Each result is scored with
    ``Purity_Score`` against ``true_labels`` and the best-scoring assignment is
    returned. On ties the lowest random state wins.

    The labels of the best run are kept during the search, so the winning model
    is not fitted a second time.

    Parameters
    ----------
    X : array-like
        Data passed to ``KMeans.fit``.
    n : int
        Number of clusters.
    rstate_limit : int
        Number of random states to try (``0 .. rstate_limit - 1``).
    true_labels : array-like
        Reference labels used to score each run.

    Returns
    -------
    Cluster assignment (``labels_``) of the run with the highest purity.

    Raises
    ------
    ValueError
        If ``rstate_limit`` is smaller than 1, i.e. there is nothing to search.

    Notes
    -----
    The random state is selected using the reference labels, so the reported
    purity is the best over ``rstate_limit`` runs rather than an unbiased
    estimate.
    """
    if rstate_limit < 1:
        raise ValueError("rstate_limit must be at least 1")

    best_state = None
    best_purity = None
    best_labels = None

    for state in range(rstate_limit):
        labels = KMeans(n_init='auto', n_clusters=n, random_state=state, init='k-means++').fit(X).labels_
        purity = Purity_Score(true_labels, labels)
        # Strict comparison keeps the first (lowest) random state on ties.
        if best_purity is None or purity > best_purity:
            best_state, best_purity, best_labels = state, purity, labels

    print(f"Maximum purity of {best_purity} found on random state {best_state}")

    return best_labels

In [20]:
# Reference labels come straight from the loaded label file.
true_labels = y
# Encoder output as a NumPy array; index 0 selects the single batch entry.
enhanced_x = out.detach().numpy()
# 5 clusters, searching random states 0..699 for the purest assignment.
pred_lables = KMeans_Labels(X=enhanced_x[0], n=5, rstate_limit=700, true_labels=true_labels)

[[-1.4829814  -0.30550644 -2.6516962  ... -0.27367076  1.0434115
  -0.1667899 ]
 [-0.7777049  -0.36999097 -1.2766799  ... -0.6501163   0.7576171
  -0.4635597 ]
 [ 0.09600183 -1.6981431  -2.1667325  ... -1.2274513   0.816791
  -0.53556496]
 ...
 [-2.5809865  -1.1995898   0.02883495 ...  0.9508646  -0.26006764
  -0.6930991 ]
 [-2.0074213   0.51819694  0.20549996 ...  0.9733817  -0.05098412
  -0.540231  ]
 [-1.506138    0.07542907  0.51014477 ...  0.7031725  -0.5116253
  -0.5572052 ]]
Maximum purity of 0.9267415730337079 found on random state 199


In [21]:
# Print purity, silhouette, ARI and NMI for the selected clustering of the
# encoder features (batch entry 0).
Evaluate(enhanced_x[0], true_labels, pred_lables)

Purity: 0.9267415730337079
Silhouette Score: 0.07887065410614014
ARI Score: 0.831234633058411
NMI Score: 0.8632482451556426
