In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import re
import warnings
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import LabelEncoder
import pickle

os.environ['OMP_NUM_THREADS'] = '1'
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")


In [2]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = [] # to store features for training
corpus = [] # to store all text
doc_list_sequence = [] # store sequence of document read
actual_labels = {} # to store actual cluster of documents

In [3]:
def ReadDocuments(dir_name):
    current_file_id = 0
    current_dir_id = 0
    for Path in os.listdir(dir_name):
        sub_dir_path = os.path.join(dir_name, Path)
        for sub_dir_files in os.listdir(sub_dir_path):
            file_p = os.path.join(sub_dir_path, sub_dir_files)
            with open(file_p, "r") as file:
                FileContents = file.read()
                doc_content.append(FileContents.lower())
                doc_name.append(current_file_id)
                actual_labels[current_file_id] = current_dir_id 
                current_file_id+=1
                files_path.append(file_p)
        current_dir_id+=1

In [4]:
def Purity_Score(label_seq, pred_labels):
    # Calculate the confusion matrix to compare true labels and cluster assignments
    confusion = confusion_matrix(label_seq, pred_labels)
    # Calculate the purity
    purity = np.sum(np.max(confusion, axis=0)) / np.sum(confusion)
    return purity

# Lexical Chains

In [5]:
 def preprocess_text(text: str, remove_stopwords: bool) -> str:
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)
    if remove_stopwords:
        tokens = nltk.word_tokenize(text)
        updated_tokens = []
        for i in range(len(tokens)):
            if tokens[i].lower() in stopwords.words("english"):
                continue
            else:
                updated_tokens.append(lemmatizer.lemmatize(tokens[i].lower()))

    return updated_tokens

def buildRelation(nouns):
    relation_list = defaultdict(list)

    for k in range(len(nouns)):
        relation = []
        for syn in wn.synsets(nouns[k], pos=wn.NOUN):
            for l in syn.lemmas():
                relation.append(l.name())
                if l.antonyms():
                    relation.append(l.antonyms()[0].name())
            for l in syn.hyponyms():
                if l.hyponyms():
                    relation.append(l.hyponyms()[0].name().split(".")[0])
            for l in syn.hypernyms():
                if l.hypernyms():
                    relation.append(l.hypernyms()[0].name().split(".")[0])
        relation_list[nouns[k]].append(relation)
    return relation_list

def buildLexicalChain(nouns, relation_list):
    lexical = []
    threshold = 0.5
    for noun in nouns:
        flag = 0
        for j in range(len(lexical)):
            if flag == 0:
                for key in list(lexical[j]):
                    if key == noun and flag == 0:
                        lexical[j][noun] += 1
                        flag = 1
                    elif key in relation_list[noun][0] and flag == 0:
                        syns1 = wn.synsets(key, pos=wn.NOUN)
                        syns2 = wn.synsets(noun, pos=wn.NOUN)
                        if syns1[0].wup_similarity(syns2[0]) >= threshold:
                            lexical[j][noun] = 1
                            flag = 1
                    elif noun in relation_list[key][0] and flag == 0:
                        syns1 = wn.synsets(key, pos=wn.NOUN)
                        syns2 = wn.synsets(noun, pos=wn.NOUN)
                        if syns1[0].wup_similarity(syns2[0]) >= threshold:
                            lexical[j][noun] = 1
                            flag = 1
        if flag == 0:
            dic_nuevo = {}
            dic_nuevo[noun] = 1
            lexical.append(dic_nuevo)
            flag = 1
    return lexical

def eliminateWords(lexical):
    final_chain = []
    while lexical:
        result = lexical.pop()
        if len(result.keys()) == 1:
            for value in result.values():
                if value != 1:
                    final_chain.append(result)
        else:
            final_chain.append(result)
    return final_chain

def PreprocessDocuments():
    for i in files_path:
        f = open(i, "r")
        dataset = preprocess_text(f.read(), remove_stopwords=True)
        # use lexical chains as the feature selection method
        nouns = []
        l = nltk.pos_tag(dataset)
        for word, n in l:
            if n == "NN" or n == "NNS" or n == "NNP" or n == "NNPS":
                nouns.append(word)

        relation = buildRelation(nouns)
        lexical = buildLexicalChain(nouns, relation)
        chain = eliminateWords(lexical)
        lexical_chain.append(chain)

    global total_features
    for features in lexical_chain:
        for docfeature in features:
            total_features.extend(docfeature.keys())

    total_features = list(set(total_features))

    for feature in lexical_chain:
        temp = []
        # print(feature)
        for j in total_features:
            check = False
            for f in feature:
                if j in f:
                    temp.append(f[j])
                    check = True
                    break
            if not check:
                temp.append(0)

        final_training_Features.append(temp)

In [6]:
BBC_path = os.getcwd() + "\BBC"
ReadDocuments(BBC_path)
#PreprocessDocuments()

In [57]:
#save training features
# import pickle
# pickle_path = open('BBC_Features_LexicalChains.pkl', 'wb')
# pickle.dump(final_training_Features, pickle_path)
# pickle_path.close()

In [58]:
#save normalized features
#normalizer = Normalizer()
#normalize_features = normalizer.fit_transform(final_training_Features)
#pickle_path = open('BBC_Normalized_Features_LexicalChains.pkl', 'wb')
#pickle.dump(normalize_features, pickle_path)
#pickle_path.close()

In [7]:
#read final training features:
pickle_read = open('BBC_Features_LexicalChains.pkl', 'rb')
final_training_Features = pickle.load(pickle_read)
pickle_read.close()

In [8]:
#read normalized features:
pickle_read = open('BBC_Normalized_Features_LexicalChains.pkl', 'rb')
normalize_features = pickle.load(pickle_read)
pickle_read.close()

In [83]:
# dimensionality can be reduced to check in increase in accuracy
pca = PCA(n_components=30, random_state=42)
pca_vecs = pca.fit_transform(normalize_features)

In [9]:
label_seq = list(actual_labels.values())
# purity_collection = {}
# for i in range(610):
#     clusters = KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++").fit(normalize_features).labels_
#     purity_collection[i] = Purity_Score(label_seq, clusters)

# #highest found on 606 using normalized features
# max_rand_state = max(purity_collection, key=purity_collection.get)
# print(
#     f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}"
# )

lexicalChainsLabels = KMeans(n_init="auto", n_clusters=5, random_state=606, init="k-means++").fit(normalize_features).labels_

print(f"""K-means lables using lexical chains: {lexicalChainsLabels}
      \nPurity {Purity_Score(label_seq, lexicalChainsLabels)}
      \nSilhoutte Score: {metrics.silhouette_score(final_training_Features, lexicalChainsLabels, metric='euclidean')}""")

K-means lables using lexical chains: [3 3 3 ... 2 2 1]
      
Purity 0.7478651685393258
      
Silhoutte Score: -0.024339736196491817


  ret = a @ b


In [10]:
normalize_features.shape

(2225, 6276)

# TF-IDF

In [10]:
def in_wordnet(word):
    synsets = wn.synsets(word)
    return len(synsets) > 0

def contains_number(word):
    for char in word:
        if char.isnumeric():
            return True
    return False

def min_length_word(word):
    if  len(word) in [1,2]:
        return True
    return False

def custom_preprocessor(text):
    lematizer = WordNetLemmatizer()
    used_terms = {} # keep track of which terms have already been considered
    tokens = word_tokenize(text)
    filtered_tokens = []
    for word in tokens:
        if (not contains_number(word)) and (not min_length_word(word)) and (word not in stopwords.words('english')) and (in_wordnet(word)):
            lema_word = lematizer.lemmatize(word)
            if lema_word in used_terms.keys():
                continue
            else:
                used_terms[lema_word] = 0
                filtered_tokens.append(lema_word)
    return ' '.join(filtered_tokens)

def print_terms(terms):
    for term in terms:
        print(term)

def KMeans_Labels(X, n, rstate_limit, true_labels):

    # Specify the number of clusters (you can choose an appropriate value)
    num_clusters = n
    
    # find centoids which give maximum purity
    purity_collection = {}
    for i in range(rstate_limit):
        clusters = KMeans(n_init='auto', n_clusters=num_clusters, random_state=i, init='k-means++').fit(X).labels_
        purity_collection[i] = Purity_Score(true_labels, clusters)
    
    max_rand_state = max(purity_collection, key=purity_collection.get)
    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    # Create a KMeans model
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, random_state=max_rand_state, init='k-means++')
    # Fit the KMeans model to the TF-IDF data
    kmeans.fit(X)
    # Get the cluster assignments for each document
    cluster_assignments = kmeans.labels_
    
    return cluster_assignments

def print_results(true_labels, predicted_labels, X):
    print("RESULTS:")
    print(f"Purity: {Purity_Score(true_labels, predicted_labels)}")
    print(f"Silhouette Score: {silhouette_score(X, predicted_labels)}")

def wrapperFunction():
    # ReadDocuments(os.getcwd() + "\BBC")
    # vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
    # print("Building Features...")
    # X = vectorizer.fit_transform(doc_content)

    # # save tdidf-features
    # print("Saving tf-idf features")
    # pickle_write = open('BBC_Features_TFIDF.pkl', 'wb')
    # pickle.dump(X, pickle_write)
    # pickle_write.close()

    #load tdidf features
    pickle_read = open('BBC_Features_TFIDF.pkl', 'rb')
    X = pickle.load(pickle_read)
    pickle_read.close()
    
    
    true_labels = list(actual_labels.values())
    print(len(true_labels))
    print("Applying KMeans Clustering...")
    predicted_labels = KMeans_Labels(X, 5, 650, true_labels)
    print_results(true_labels, predicted_labels, X)
    return predicted_labels, X

In [11]:
doc_content = []
tfidfLabels, tfidfMatrix = wrapperFunction()

2225
Applying KMeans Clustering...
Maximum purity of 0.9622471910112359 found on random state 499
RESULTS:
Purity: 0.9622471910112359
Silhouette Score: 0.010218972318231987


# Consensus Clustering

In [14]:
def calculate_consensus_matrix(labels1, labels2):
    n = len(labels1)
    consensus_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(i, n):
            #Calculate the Jaccard similarity between the two label sets
            intersection = np.intersect1d(labels1[i], labels2[j])
            union = np.union1d(labels1[i], labels2[j])
            agreement = len(intersection) / len(union)
        

            consensus_matrix[i, j] = agreement
            consensus_matrix[j, i] = agreement

    return consensus_matrix

In [19]:
print("Building Consensus Matrix...")
consensus_matrix = calculate_consensus_matrix(tfidfLabels, lexicalChainsLabels)

n_clusters = 5  # You can adjust this as needed
purity_collection = {}
for i in range(600):
    print(f"Trying clustering on random state {i}..")
    clusters = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=i).fit(1 - consensus_matrix).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)

max_rand_state = max(purity_collection, key=purity_collection.get)
print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")
spectral_labels = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=max_rand_state).fit(1 - consensus_matrix).labels_
print("Purity Score: ", Purity_Score(label_seq, spectral_labels))
print("Sillhouette Coefficient: ",metrics.silhouette_score(normalize_features, spectral_labels, metric="euclidean"),)

Building Consensus Matrix...
Trying clustering on random state 0..
Trying clustering on random state 1..
Trying clustering on random state 2..
Trying clustering on random state 3..
Trying clustering on random state 4..
Trying clustering on random state 5..
Trying clustering on random state 6..
Trying clustering on random state 7..
Trying clustering on random state 8..
Trying clustering on random state 9..
Trying clustering on random state 10..
Trying clustering on random state 11..
Trying clustering on random state 12..
Trying clustering on random state 13..
Trying clustering on random state 14..
Trying clustering on random state 15..
Trying clustering on random state 16..
Trying clustering on random state 17..
Trying clustering on random state 18..
Trying clustering on random state 19..
Trying clustering on random state 20..
Trying clustering on random state 21..
Trying clustering on random state 22..
Trying clustering on random state 23..
Trying clustering on random state 24..
Trying

  ret = a @ b


In [16]:
# import threading

# # release unrequired memory
# # del doc_content
# # del tfidfMatrix
# # del normalize_features
# # del final_training_Features
# # gc.collect()

# threads = []
# n_clusters = 5  # You can adjust this as needed
# purity_collection = {}
# lock = threading.Lock()
# random_state_count = 200

# def SpectralClusteringParallel(consensus_matrix, n_clust, rand_state):
#     print(f"Trying clustering on random state {i}..")
#     specteral_ = SpectralClustering(n_clusters=n_clust, affinity="precomputed", random_state=rand_state)
#     clusters = specteral_.fit(1 - consensus_matrix).labels_
#     purity_collection[rand_state] = Purity_Score(label_seq, clusters)
#     print(f"Thread {rand_state} complete")

# print("Building Consensus Matrix...")
# consensus_matrix = calculate_consensus_matrix(tfidfLabels, lexicalChainsLabels)

# for i in range(random_state_count):
#     threads.append(threading.Thread(target=SpectralClusteringParallel, args=(consensus_matrix, n_clusters, i)))
#     threads[i].start()
    
#     #clusters = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=i).fit(1 - consensus_matrix).labels_
#     #purity_collection[i] = Purity_Score(label_seq, clusters)

# for i in range(random_state_count):
#     threads[i].join()

# max_rand_state = max(purity_collection, key=purity_collection.get)
# print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")
# spectral_labels = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=max_rand_state).fit(1 - consensus_matrix).labels_
# print("Purity Score: ", Purity_Score(label_seq, spectral_labels))
# print("Sillhouette Coefficient: ",metrics.silhouette_score(pca_vecs, spectral_labels, metric="euclidean"),)

# Topical Clustering

In [20]:
num_topics = 5  # Adjust as needed
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

lda.fit(tfidfMatrix)

# Get the topic assignments for each document
topic_labels = lda.transform(tfidfMatrix).argmax(axis=1)

combined_labels = [lexicalChainsLabels, tfidfLabels, topic_labels]
combined_labels = list(map(list, zip(*combined_labels)))

normalize_combined_features = Normalizer().fit_transform(combined_labels)
topic_purity_collection = {}
for i in range(600):
    topic_clusters = (KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++").fit(normalize_combined_features).labels_)
    topic_purity_collection[i] = Purity_Score(label_seq, topic_clusters)

topic_max_rand_state = max(topic_purity_collection, key=topic_purity_collection.get)
print(f"Maximum purity of {topic_purity_collection[topic_max_rand_state]} found on random state {topic_max_rand_state}")
max_labels = (KMeans(n_init="auto", n_clusters=5, random_state=topic_max_rand_state, init="k-means++").fit(normalize_combined_features)
              .labels_)
print("Purity: ", Purity_Score(label_seq, max_labels))
print("Sillhouette Coefficient: ",metrics.silhouette_score(normalize_combined_features, max_labels, metric="euclidean"))

Maximum purity of 0.8588764044943821 found on random state 71
Purity:  0.8588764044943821
Sillhouette Coefficient:  0.6609660724161398
