In [19]:
import numpy as np
import nltk
import os
import re
import warnings
import pickle
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# from tf_idf import wrapperFunction
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import adjusted_rand_score  # Import the Rand Index metric
from collections import Counter

os.environ['OMP_NUM_THREADS'] = '1'
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")


In [2]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = []
corpus = []
doc_list_sequence = []

data_dict = {}
cluster_dict = {}
mapped_data_dict = {}
word_to_int = {}

In [3]:
def ReadDocuments(dir_name):
    for Path in os.listdir(dir_name):
        file_p = os.path.join(dir_name, Path)
        with open(file_p, "r") as file:
            FileContents = file.read()
            doc_content.append(FileContents.lower())
            doc_name.append(Path)
            files_path.append(file_p)

In [4]:
def Purity_Score(true_labels, predicted_labels):
    total = len(true_labels)
    num_clusters = len(set(predicted_labels))
    
    purity_sum = 0

    for cluster in set(predicted_labels):
        cluster_indices = [i for i, label in enumerate(predicted_labels) if label == cluster]
        
        if not cluster_indices:
            continue  # Skip the cluster if there are no elements in it
        
        cluster_true_labels = [true_labels[i] for i in cluster_indices]
        cluster_true_labels = [item for sublist in cluster_true_labels for item in sublist]  # Flatten the list of true labels
        
        class_counts = Counter(cluster_true_labels)
        
        # Check if class_counts is not empty before finding the maximum value
        if class_counts:
            max_class_count = max(class_counts.values())
            purity_sum += max_class_count

    return purity_sum / total


# def Purity_Score(label_seq, pred_labels):
#     # Calculate the confusion matrix to compare true labels and cluster assignments
#     confusion = confusion_matrix(label_seq, pred_labels)
#     # Calculate the purity
#     purity = np.sum(np.max(confusion, axis=0)) / np.sum(confusion)
#     return purity

In [5]:
def Evaluate(X, true_labels, predicted_labels):
    purity = Purity_Score(true_labels, predicted_labels)
    silhouette = silhouette_score(X, predicted_labels, metric='euclidean')
    ari = ari_score(true_labels, predicted_labels)
    nmi = nmi_score(true_labels, predicted_labels)
    
    print(f"Purity: {purity}")
    print(f"Silhouette Score: {silhouette}")
    print(f"ARI Score: {ari}")
    print(f"NMI Score: {nmi}")
    

In [6]:
def SaveFeatures(X, file_name):
    pickle_path = open(file_name, 'wb')
    pickle.dump(X, pickle_path)
    pickle_path.close()

In [7]:
def ReadFeatures(file_name):
    pickle_read = open(file_name, 'rb')
    x = pickle.load(pickle_read)
    pickle_read.close()
    return x

## Lexical Chains

In [8]:
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)
    if remove_stopwords:
        tokens = nltk.word_tokenize(text)
        updated_tokens = []
        for i in range(len(tokens)):
            if tokens[i].lower() in stopwords.words("english"):
                continue
            else:
                updated_tokens.append(lemmatizer.lemmatize(tokens[i].lower()))

    return updated_tokens

def buildRelation(nouns):
    relation_list = defaultdict(list)

    for k in range(len(nouns)):
        relation = []
        for syn in wn.synsets(nouns[k], pos=wn.NOUN):
            for l in syn.lemmas():
                relation.append(l.name())
                if l.antonyms():
                    relation.append(l.antonyms()[0].name())
            for l in syn.hyponyms():
                if l.hyponyms():
                    relation.append(l.hyponyms()[0].name().split(".")[0])
            for l in syn.hypernyms():
                if l.hypernyms():
                    relation.append(l.hypernyms()[0].name().split(".")[0])
        relation_list[nouns[k]].append(relation)
    return relation_list

def buildLexicalChain(nouns, relation_list):
    lexical = []
    threshold = 0.5
    for noun in nouns:
        flag = 0
        for j in range(len(lexical)):
            if flag == 0:
                for key in list(lexical[j]):
                    if key == noun and flag == 0:
                        lexical[j][noun] += 1
                        flag = 1
                    elif key in relation_list[noun][0] and flag == 0:
                        syns1 = wn.synsets(key, pos=wn.NOUN)
                        syns2 = wn.synsets(noun, pos=wn.NOUN)
                        if syns1[0].wup_similarity(syns2[0]) >= threshold:
                            lexical[j][noun] = 1
                            flag = 1
                    elif noun in relation_list[key][0] and flag == 0:
                        syns1 = wn.synsets(key, pos=wn.NOUN)
                        syns2 = wn.synsets(noun, pos=wn.NOUN)
                        if syns1[0].wup_similarity(syns2[0]) >= threshold:
                            lexical[j][noun] = 1
                            flag = 1
        if flag == 0:
            dic_nuevo = {}
            dic_nuevo[noun] = 1
            lexical.append(dic_nuevo)
            flag = 1
    return lexical

def eliminateWords(lexical):
    final_chain = []
    while lexical:
        result = lexical.pop()
        if len(result.keys()) == 1:
            for value in result.values():
                if value != 1:
                    final_chain.append(result)
        else:
            final_chain.append(result)
    return final_chain

def PreprocessDocuments():
    for i in files_path:
        f = open(i, "r")
        dataset = preprocess_text(f.read(), remove_stopwords=True)
        # use lexical chains as the feature selection method
        nouns = []
        l = nltk.pos_tag(dataset)
        for word, n in l:
            if n == "NN" or n == "NNS" or n == "NNP" or n == "NNPS":
                nouns.append(word)

        relation = buildRelation(nouns)
        lexical = buildLexicalChain(nouns, relation)
        chain = eliminateWords(lexical)
        lexical_chain.append(chain)

    global total_features
    for features in lexical_chain:
        for docfeature in features:
            total_features.extend(docfeature.keys())

    total_features = list(set(total_features))

    for feature in lexical_chain:
        temp = []
        # print(feature)
        for j in total_features:
            check = False
            for f in feature:
                if j in f:
                    temp.append(f[j])
                    check = True
                    break
            if not check:
                temp.append(0)

        final_training_Features.append(temp)

def build_lexical_chains(doc):
    tokens = nltk.word_tokenize(doc)
    pos_tags = nltk.pos_tag(tokens)
    chains = {}

    for token, pos in pos_tags:
        synsets = wn.synsets(token, pos=wn.NOUN)
        for synset in synsets:
            if synset not in chains:
                chains[synset] = [token]
            else:
                chains[synset].append(token)

    return chains


In [9]:
def GetLabels(label_path):
    with open(label_path, 'r') as file:
        lines = file.readlines()

    for line in lines:
        parts = line.strip().split()
        if len(parts) >= 2:
            key = parts[0]
            values = parts[1:]
            if len(values) == 1:
                data_dict[key] = values[0]
            else:
                data_dict[key] = values
                for value in values:
                    cluster_dict[value] = None

    word_list = list(cluster_dict.keys())

    for i, word in enumerate(word_list):
        word_to_int[word] = i

    for key, value in data_dict.items():
        if isinstance(value, list):
            mapped_values = [word_to_int[word] for word in value]
            mapped_data_dict[key] = mapped_values
        else:
            value_list = [value] if not isinstance(value, list) else value
            mapped_values = [word_to_int[word] if word in word_to_int else word for word in value_list]
            mapped_data_dict[key] = mapped_values

    label_seq = list(mapped_data_dict.values())
    # label_seq = [item for sublist in label_seq for item in (sublist if isinstance(sublist, list) else [sublist])]
# print("word list",len(word_list),word_list)
# print("word to int", word_to_int)
# print("mapped values", mapped_values)
# print("mapped_data_dict", mapped_data_dict)
    return label_seq

In [10]:
#doc_path = "D:\FAST\FYP\FYP23-Deep-Document-Clustering\Base Lines\Reuters\Training"
doc_path = os.getcwd() + "\Reuters\Training"
ReadDocuments(doc_path)
PreprocessDocuments()

In [12]:
normalizer = Normalizer()
normalized_features = normalizer.fit_transform(final_training_Features)
# final_training_Features = final_training_Features / torch.norm(final_training_Features, dim=1)[:, None]      # GPU

In [16]:
SaveFeatures(final_training_Features, 'Reuters_Features_LexicalChains.pkl')
SaveFeatures(normalized_features, 'Reuters_Normalized_Features_LexicalChains.pkl')

In [20]:
final_training_Features = ReadFeatures('Reuters_Features_LexicalChains.pkl')
normalized_features = ReadFeatures('Reuters_Normalized_Features_LexicalChains.pkl')

In [21]:
SumSqDis = []
pca = PCA(n_components=30, random_state=42)
pca_vecs = pca.fit_transform(final_training_Features)
# pca_vecs = torch.tensor(pca_vecs, dtype=torch.float32).to(device)

# 'D:\FAST\FYP\FYP23-Deep-Document-Clustering\Base Lines\Reuters\cats.txt'
label_path = os.getcwd() + "\Reuters\cats.txt"
label_seq = GetLabels(label_path)

purity_collection = {}
for i in range(500):
    clusters = KMeans(n_init="auto", n_clusters=len(list(word_to_int.values())), random_state=i, init="k-means++").fit(pca_vecs).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)

max_rand_state = max(purity_collection, key=purity_collection.get)
print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

lexicalChainsLabels = KMeans(n_init="auto", n_clusters=len(list(word_to_int.values())), random_state=max_rand_state, init="k-means++").fit(pca_vecs).labels_

Evaluate(pca_vecs, label_seq, lexicalChainsLabels)

Maximum purity of 0.37022617723396367 found on random state 483


  ret = a @ b


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10788,) + inhomogeneous part.

### Tf-Idf

In [12]:
def in_wordnet(word):
    synsets = wn.synsets(word)
    return len(synsets) > 0

def contains_number(word):
    for char in word:
        if char.isnumeric():
            return True
    return False

def min_length_word(word):
    if  len(word) in [1,2]:
        return True
    return False

def custom_preprocessor(text):
    lematizer = WordNetLemmatizer()
    used_terms = {} # keep track of which terms have already been considered
    tokens = word_tokenize(text)
    filtered_tokens = []
    for word in tokens:
        if (not contains_number(word)) and (not min_length_word(word)) and (word not in stopwords.words('english')) and (in_wordnet(word)):
            lema_word = lematizer.lemmatize(word)
            if lema_word in used_terms.keys():
                continue
            else:
                used_terms[lema_word] = 0
                filtered_tokens.append(lema_word)
    return ' '.join(filtered_tokens)

def print_terms(terms):
    for term in terms:
        print(term)


In [24]:
def KMeans_Labels(X, n, rstate_limit, true_labels):
    # find centoids which give maximum purity
    purity_collection = {}
    for i in range(rstate_limit):
        clusters = KMeans(n_init='auto', n_clusters=n, random_state=i, init='k-means++').fit(X).labels_
        purity_collection[i] = Purity_Score(true_labels, clusters)
    
    max_rand_state = max(purity_collection, key=purity_collection.get)
    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    # Create a KMeans model
    cluster_assignments = KMeans(n_init='auto', n_clusters=n, random_state=max_rand_state, init='k-means++').fit(X).labels_

    return cluster_assignments

def print_results(true_labels, predicted_labels, X):
    print("RESULTS:")
    print(f"Purity: {Purity_Score(true_labels, predicted_labels)}")
    print(f"Silhouette Score: {silhouette_score(X, predicted_labels)}")


In [15]:
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
X = vectorizer.fit_transform(doc_content)

SaveFeatures('Reuters_TFIDF_Features')

#true_labels = GetLabels()

In [26]:
predicted_labels = KMeans_Labels(X, 5, 700, label_seq)
Evaluate(X, label_seq, predicted_labels)

#print_results(, predicted_labels, X)

Maximum purity of 0.36772339636633294 found on random state 67
RESULTS:
Purity: 0.36772339636633294
Silhouette Score: -0.006047238747527097


In [27]:
tfidfLabels = predicted_labels
tfidfMatrix = X

### Consensus Clustering

In [28]:
def calculate_consensus_matrix(labels1, labels2):
    n = len(labels1)
    consensus_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(i, n):
            #Calculate the Jaccard similarity between the two label sets
            intersection = np.intersect1d(labels1[i], labels2[j])
            union = np.union1d(labels1[i], labels2[j])
            agreement = len(intersection) / len(union)
        

            consensus_matrix[i, j] = agreement
            consensus_matrix[j, i] = agreement

    return consensus_matrix

In [18]:
consensus_matrix = calculate_consensus_matrix(tfidfLabels, lexicalChainsLabels)

n_clusters = 5  # You can adjust this as needed
purity_collection = {}
for i in range(500):
    clusters = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=i).fit(1 - consensus_matrix).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)

max_rand_state = max(purity_collection, key=purity_collection.get)
print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")
spectral_labels = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=max_rand_state).fit(1 - consensus_matrix).labels_
print("Purity Score: ", Purity_Score(label_seq, spectral_labels))
print("Sillhouette Coefficient: ",metrics.silhouette_score(pca_vecs, spectral_labels, metric="euclidean"),)


Maximum purity of 0.36763070077864296 found on random state 2
Purity Score:  0.36763070077864296
Sillhouette Coefficient:  -0.11535524251258987


### Topical Clustering

In [19]:
num_topics = 5  # Adjust as needed
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
topic_proportions = lda.fit_transform(tfidfMatrix)

combined_features = np.hstack((spectral_labels.reshape(-1, 1), topic_proportions))

normalize_combined_features = Normalizer().fit_transform(combined_features)

topic_purity_collection = {}
for i in range(500):
    topic_clusters = (
        KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++")
        .fit(normalize_combined_features)
        .labels_
    )
    topic_purity_collection[i] = Purity_Score(label_seq, topic_clusters)

topic_max_rand_state = max(topic_purity_collection, key=topic_purity_collection.get)
print(f"Maximum purity of {topic_purity_collection[topic_max_rand_state]} found on random state {topic_max_rand_state}")
max_labels = (
    KMeans(
        n_init="auto", n_clusters=5, random_state=topic_max_rand_state, init="k-means++"
    )
    .fit(normalize_combined_features)
    .labels_
)
print("Purity: ", Purity_Score(label_seq, max_labels))
print("Sillhouette Coefficient: ",metrics.silhouette_score(normalize_combined_features, max_labels, metric="euclidean"))

Maximum purity of 0.3674453096032629 found on random state 0
Purity:  0.3674453096032629
Sillhouette Coefficient:  0.7289370934632332
