In [19]:
import sys
sys.path.insert(0, 'D:\My Work\Final Year Project\Main\FYP23-Deep-Document-Clustering\Transformer')

In [20]:
import numpy as np
import pandas as pd
import nltk
import os
import warnings
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import f1_score 
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import torch
import random

from utils.Encoder import Encoder

os.environ['OMP_NUM_THREADS'] = '1'
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")


In [21]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = []
corpus = []
doc_list_sequence = []

In [22]:
def reset_random_seeds(seed):
   os.environ['PYTHONHASHSEED']=str(seed)
   torch.manual_seed(seed)
   np.random.seed(seed)
   random.seed(seed)


def ReadDocuments(dir_name):
    for Path in os.listdir(dir_name):
        file_p = os.path.join(dir_name, Path)
        with open(file_p, "r") as file:
            FileContents = file.read()
            doc_content.append(FileContents.lower())
            doc_name.append(Path)
            files_path.append(file_p)

def Purity_Score(label_seq, pred_labels):
    # Calculate the confusion matrix to compare true labels and cluster assignments
    confusion = confusion_matrix(label_seq, pred_labels)
    # Calculate the purity
    purity = np.sum(np.max(confusion, axis=0)) / np.sum(confusion)
    return purity

def Evaluate(X, true_labels, predicted_labels):
    purity = Purity_Score(true_labels, predicted_labels)
    silhouette = silhouette_score(X, predicted_labels, metric='euclidean')
    ari = ari_score(true_labels, predicted_labels)
    nmi = nmi_score(true_labels, predicted_labels)
    
    print(f"Purity: {purity}")
    print(f"Silhouette Score: {silhouette}")
    print(f"ARI Score: {ari}")
    print(f"NMI Score: {nmi}")

def SaveFeatures(X, file_name):
    pickle_path = open(file_name, 'wb')
    pickle.dump(X, pickle_path)
    pickle_path.close()

def ReadFeatures(file_name):
    pickle_read = open(file_name, 'rb')
    x = pickle.load(pickle_read)
    pickle_read.close()
    return x

In [23]:
file_name = "DOC50_Features/DOC50_TFIDF_Features.pkl"
x = ReadFeatures(file_name)

In [24]:
x.shape

(50, 3885)

In [25]:
x = x.toarray()

In [26]:
x = torch.tensor(x, dtype=torch.float32)

In [27]:
x = x.reshape(shape=(1, x.size()[0], x.size()[1]))

In [28]:
x.size()

torch.Size([1, 50, 3885])

In [29]:
d_model = 3885
num_heads = 1
drop_prob = 0.1
batch_size = 1
max_sequence_length = 50
ffn_hidden = 2048
num_layers = 30

In [30]:
reset_random_seeds(42)

In [31]:
from utils.PositionalEncoding import PositionalEncoding
pe = PositionalEncoding(d_model=d_model, max_sequence_length=max_sequence_length)
positional_encoding = pe.forward()

In [32]:
positional_encoding = positional_encoding[:,0:-1]

In [33]:
positional_encoding.size()

torch.Size([50, 3885])

In [34]:
x = x + positional_encoding

In [35]:
encoder = Encoder(d_model=d_model, ffn_hidden=ffn_hidden, num_heads=num_heads, drop_prob=drop_prob, num_layers=num_layers)

In [36]:
out = encoder(x)

x.size(): torch.Size([1, 50, 3885])
tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           1.0000e+00,  0.0000e+00],
         [ 8.4147e-01,  5.6684e-01,  8.3891e-01,  ...,  3.3962e-02,
           1.0000e+00,  3.3961e-02],
         [ 9.0930e-01, -4.1615e-01,  9.1319e-01,  ...,  2.0143e-04,
           1.0000e+00,  2.0047e-04],
         ...,
         [ 1.2357e-01, -9.9234e-01,  3.3934e-01,  ...,  4.7335e-03,
           9.9999e-01,  4.7111e-03],
         [-7.6825e-01, -6.4014e-01, -6.0443e-01,  ...,  4.8342e-03,
           9.9999e-01,  4.8114e-03],
         [-9.5375e-01,  3.0059e-01, -9.9730e-01,  ...,  4.9350e-03,
           9.9999e-01,  4.9116e-03]]])
qkv.size(): torch.Size([1, 50, 11655])
1, 50, 1, 3885
qkv.size(): torch.Size([1, 50, 1, 11655])
q.size(): torch.Size([1, 1, 50, 3885]), k.size(): torch.Size([1, 1, 50, 3885]), v.size(): torch.Size([1, 1, 50, 3885])
values.size(): torch.Size([1, 1, 50, 3885]), attention.size(): torch.Size([1, 1, 50, 50])
x.size(): tor

In [37]:
def KMeans_Labels(X, n, rstate_limit, true_labels):
    print(X)
    # Specify the number of clusters (you can choose an appropriate value)
    num_clusters = n
    
    # find centoids which give maximum purity
    purity_collection = {}
    for i in range(rstate_limit):
        clusters = KMeans(n_init='auto', n_clusters=num_clusters, random_state=i, init='k-means++').fit(X).labels_
        purity_collection[i] = Purity_Score(true_labels, clusters)
    
    max_rand_state = max(purity_collection, key=purity_collection.get)
    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    # Create a KMeans model
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, random_state=max_rand_state, init='k-means++')
    # Fit the KMeans model to the TF-IDF data
    kmeans.fit(X)
    # Get the cluster assignments for each document
    cluster_assignments = kmeans.labels_
    
    return cluster_assignments

def Actual_Labels():
    ReadDocuments(os.getcwd() + "\Doc50")
    actual_labels = {} # dictionary to store true assignments for each document | read sequence not followed
    label_path = os.getcwd() + '\\Doc50 GT\\'
    for labels_directory in os.listdir(label_path): # for each assignment folder
        actual_cluster = int(labels_directory[1]) # extract cluster label from directory name
        doc_labels = os.listdir(label_path + f"\\{labels_directory}") # for all document ids assigned to this cluster
        for doc in doc_labels:
            actual_labels[doc] = actual_cluster-1 # save cluster label
    
    label_seq = [] # save labels in order of documents read
    for doc in doc_name:
        label_seq.append(actual_labels[doc])
    return label_seq

def print_results(true_labels, predicted_labels, X):
    print("RESULTS:")
    print(f"Purity: {Purity_Score(true_labels, predicted_labels)}")
    print(f"Silhouette Score: {silhouette_score(X, predicted_labels)}")


def wrapperFunction():
    # ReadDocuments('Doc50')
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
    X = vectorizer.fit_transform(doc_content)

    SaveFeatures(X, 'DOC50_TFIDF_Features.pkl')
    
    true_labels = Actual_Labels()
    predicted_labels = KMeans_Labels(X, 5, 1500, true_labels)
    Evaluate(X, true_labels, predicted_labels)
    return predicted_labels, X


In [38]:
del encoder

In [39]:
enhanced_x = out.detach().numpy()

In [40]:
enhanced_x[0].shape

(50, 3885)

In [41]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = []
corpus = []
doc_list_sequence = []

In [42]:
true_labels = Actual_Labels()
pred_lables = KMeans_Labels(enhanced_x[0], 5, 700, true_labels)

[[ 0.22546476 -0.752269   -0.2797785  ... -1.6128085  -0.86013365
  -0.50368404]
 [ 0.23896243 -1.9640815  -0.22243536 ... -1.8387103   0.96590763
  -0.9953801 ]
 [ 1.5015172  -0.8540485  -0.70489764 ... -0.60399705 -0.7053957
  -0.3041028 ]
 ...
 [ 0.3390127  -0.3497321  -0.73360956 ... -1.7436712   1.2708973
  -2.2900813 ]
 [-0.33624384 -1.62918    -0.90313387 ... -1.5827706   0.40459463
  -1.3033046 ]
 [-0.11811316 -0.96314085  0.02694708 ... -1.8406849   0.39085737
  -1.0530618 ]]
Maximum purity of 0.9 found on random state 338


In [43]:
len(true_labels)

50

In [44]:
Evaluate(enhanced_x[0], true_labels, pred_lables)

Purity: 0.9
Silhouette Score: 0.02138533629477024
ARI Score: 0.7606082230883241
NMI Score: 0.8141170603643352
