In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import re
import warnings
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import f1_score 
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
import pickle

# Ask OpenMP-backed libraries to use a single thread.
# NOTE(review): this assignment runs after numpy/sklearn were imported above; a runtime that
# already read the variable may ignore it -- TODO confirm, or move it ahead of the imports.
os.environ['OMP_NUM_THREADS'] = '1'
# Hide UserWarnings emitted from sklearn's k-means and text feature-extraction modules.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")


In [3]:
# Module-level accumulators shared by ReadDocuments() and PreprocessDocuments().
# The per-document lists are parallel: index i always refers to the same document.
doc_content = []  # lower-cased text of every document read
doc_name = []  # name of the sub-directory each document was read from
files_path = []  # path of every document read
lexical_chain = []  # per document: list of lexical chains (dicts mapping word -> count)
total_features = []  # distinct chain words over all documents (original note: 1652)
final_training_Features = []  # per document: one count per entry of total_features
corpus = []  # NOTE(review): not referenced anywhere else in the visible notebook
doc_list_sequence = []  # file names in the order the documents were read

In [4]:
def ReadDocuments(dir_name):
    """Read every file found one level below ``dir_name`` into the module-level lists.

    ``dir_name`` is resolved against the current working directory and is expected to
    contain one sub-directory per class, each holding the documents of that class.

    For every document this appends, in lock-step:
      * its path to ``files_path``
      * its lower-cased text to ``doc_content``
      * the name of its sub-directory to ``doc_name``
      * its file name to ``doc_list_sequence``

    Paths are built with ``os.path.join`` instead of hard-coded backslashes (the
    previous ``"\\{"`` was also an invalid escape sequence), so the function works
    on any operating system.
    """
    root = os.path.join(os.getcwd(), dir_name)
    for Path in os.listdir(root):
        class_dir = os.path.join(root, Path)
        for files in os.listdir(class_dir):
            document_path = os.path.join(class_dir, files)
            with open(document_path, 'r', encoding='utf-8') as handle:
                FileContents = handle.read()
            # Append only after a successful read so the parallel lists stay aligned.
            files_path.append(document_path)
            doc_content.append(FileContents.lower())
            doc_name.append(Path)
            doc_list_sequence.append(files)

In [5]:
def Purity_Score(label_seq, pred_labels):
    """Return the purity of a clustering.

    ``label_seq`` holds the true labels and ``pred_labels`` the cluster assignments.
    Purity is the sum, over the columns of their confusion matrix, of the largest
    entry in each column, divided by the sum of all entries.
    """
    table = confusion_matrix(label_seq, pred_labels)
    # For every predicted cluster (column) keep the size of its dominant true class.
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

In [6]:
def Evaluate(X, true_labels, predicted_labels):
    """Print purity, silhouette, ARI and NMI for one clustering.

    ``X`` is only used for the silhouette score, which is computed with the
    Euclidean metric; the other three scores compare ``predicted_labels``
    against ``true_labels``. All four scores are computed before anything is
    printed. Nothing is returned.
    """
    nmi = nmi_score(true_labels, predicted_labels)
    ari = ari_score(true_labels, predicted_labels)
    silhouette = silhouette_score(X, predicted_labels, metric='euclidean')
    purity = Purity_Score(true_labels, predicted_labels)

    print(f"Purity: {purity}")
    print(f"Silhouette Score: {silhouette}")
    print(f"ARI Score: {ari}")
    print(f"NMI Score: {nmi}")
    

In [7]:
def SaveFeatures(X, file_name):
    """Pickle ``X`` to the file ``file_name`` (overwriting it if it exists).

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.dump`` raises.
    """
    with open(file_name, 'wb') as pickle_file:
        pickle.dump(X, pickle_file)

In [8]:
def ReadFeatures(file_name):
    """Load and return the object pickled in ``file_name``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling fails.

    NOTE: unpickling can execute arbitrary code; only load files this notebook
    (or another trusted source) produced.
    """
    with open(file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)

## Lexical Chains

In [9]:
def preprocess_text(text: str, remove_stopwords: bool) -> list:
    """Tokenise ``text`` and return a list of lower-cased, lemmatised tokens.

    URLs (``http...``) are removed and every run of non-letters is replaced by a
    single space before tokenising.

    Args:
        text: raw document text.
        remove_stopwords: when true, English stop words are dropped. When false
            every token is kept (previously this path raised ``UnboundLocalError``
            because the result list was only created inside the ``if``).

    Returns:
        The processed tokens, in document order.
    """
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)

    # Load the stop-word list once per call instead of once per token.
    stop_words = frozenset(stopwords.words("english")) if remove_stopwords else frozenset()

    updated_tokens = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        updated_tokens.append(lemmatizer.lemmatize(lowered))

    return updated_tokens


def buildRelation(nouns):
    """Collect WordNet-related words for every noun.

    For each noun synset of a noun this gathers:
      * every lemma name, plus the first antonym of a lemma when it has one;
      * for each direct hyponym that itself has hyponyms, the word part of its
        first hyponym's name;
      * for each direct hypernym that itself has hypernyms, the word part of its
        first hypernym's name.

    Returns a ``defaultdict(list)`` mapping each noun to a list holding one such
    list of related words per occurrence of the noun in ``nouns``.
    """
    relation_list = defaultdict(list)

    for noun in nouns:
        related = []
        for synset in wn.synsets(noun, pos=wn.NOUN):
            for lemma in synset.lemmas():
                related.append(lemma.name())
                opposites = lemma.antonyms()
                if opposites:
                    related.append(opposites[0].name())
            for child in synset.hyponyms():
                grandchildren = child.hyponyms()
                if grandchildren:
                    # Synset names look like "word.n.01"; keep only the word.
                    related.append(grandchildren[0].name().split(".")[0])
            for parent in synset.hypernyms():
                grandparents = parent.hypernyms()
                if grandparents:
                    related.append(grandparents[0].name().split(".")[0])
        relation_list[noun].append(related)
    return relation_list


def _nouns_similar(word_a, word_b, threshold):
    """Return True when the first noun senses of both words reach ``threshold`` Wu-Palmer similarity."""
    senses_a = wn.synsets(word_a, pos=wn.NOUN)
    senses_b = wn.synsets(word_b, pos=wn.NOUN)
    if not senses_a or not senses_b:
        # A related word without a noun sense cannot be compared; treat as dissimilar.
        return False
    score = senses_a[0].wup_similarity(senses_b[0])
    # Guard against a missing score instead of comparing None with a float.
    return score is not None and score >= threshold


def buildLexicalChain(nouns, relation_list):
    """Group ``nouns`` into lexical chains.

    Args:
        nouns: the nouns of one document, in order of appearance.
        relation_list: mapping noun -> list whose first element is the list of
            words related to that noun (as produced by ``buildRelation``).

    Returns:
        A list of chains. Each chain is a dict mapping a word to the number of
        times it was seen after joining the chain.

    Chains are scanned in creation order. A noun already present in a chain has
    its count incremented. Otherwise it joins the first chain holding a key that
    is related to it (in either direction) and similar enough; if there is no
    such chain it starts a new one.

    Fix: the membership test now runs before the relatedness test. Previously a
    related key that came earlier in the same chain reset the count of an
    already-present noun back to 1.
    """
    lexical = []
    threshold = 0.5
    for noun in nouns:
        placed = False
        for chain in lexical:
            if noun in chain:
                chain[noun] += 1
                placed = True
                break
            if any(
                (key in relation_list[noun][0] or noun in relation_list[key][0])
                and _nouns_similar(key, noun, threshold)
                for key in chain
            ):
                chain[noun] = 1
                placed = True
                break
        if not placed:
            lexical.append({noun: 1})
    return lexical

In [10]:
def eliminateWords(lexical):
    """Drop chains made of a single word that was seen exactly once.

    Args:
        lexical: list of chains (dicts mapping word -> count).

    Returns:
        A new list with the surviving chains in reverse input order (the order
        the earlier pop-based implementation produced). The chain dicts
        themselves are reused, not copied.

    The input list is no longer emptied as a side effect.
    """
    final_chain = []
    for chain in reversed(list(lexical or ())):
        is_lone_single_use_word = len(chain) == 1 and next(iter(chain.values())) == 1
        if not is_lone_single_use_word:
            final_chain.append(chain)
    return final_chain


def PreprocessDocuments():
    """Build lexical-chain features for every path in ``files_path``.

    Side effects on module-level state:
      * ``lexical_chain``: one list of chains per document;
      * ``total_features``: rebound to the sorted list of distinct chain words;
      * ``final_training_Features``: one row per document holding, for every
        entry of ``total_features``, the word's count in the first chain of
        that document containing it, or 0.

    Changes from the previous version:
      * the accumulators are reset first, so re-running the cell no longer
        appends duplicate rows;
      * features are sorted, so the column order is reproducible between runs
        (``list(set(...))`` depends on string hashing);
      * each row is built from one word -> count lookup instead of rescanning
        every chain for every feature.
    """
    global total_features
    noun_tags = {"NN", "NNS", "NNP", "NNPS"}

    lexical_chain.clear()
    final_training_Features.clear()

    for document_path in files_path:
        with open(document_path, "r", encoding='utf-8') as file:
            dataset = preprocess_text(file.read(), remove_stopwords=True)

        # Lexical chains are the feature-selection method: keep the nouns only.
        nouns = [word for word, tag in nltk.pos_tag(dataset) if tag in noun_tags]

        relation = buildRelation(nouns)
        lexical = buildLexicalChain(nouns, relation)
        lexical_chain.append(eliminateWords(lexical))

    vocabulary = set()
    for chains in lexical_chain:
        for chain in chains:
            vocabulary.update(chain)
    total_features = sorted(vocabulary)

    for chains in lexical_chain:
        counts = {}
        for chain in chains:
            for word, count in chain.items():
                # setdefault keeps the value from the first chain containing the word.
                counts.setdefault(word, count)
        final_training_Features.append([counts.get(word, 0) for word in total_features])


def build_lexical_chains(doc):
    """Group the tokens of ``doc`` by WordNet noun synset.

    Every token is looked up as a noun regardless of its POS tag, and is added
    to the list of each synset it belongs to.

    Returns a dict mapping synset -> list of tokens, in order of appearance.
    """
    chains = {}
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))

    for token, _tag in tagged_tokens:
        for synset in wn.synsets(token, pos=wn.NOUN):
            chains.setdefault(synset, []).append(token)

    return chains


In [11]:
# Directory (relative to the working directory) holding one sub-directory per class.
WebKb_path = "WebKB"
print("Reading Documents...")
# Fills doc_content, doc_name, files_path and doc_list_sequence.
ReadDocuments(WebKb_path)
print("Building Relations...")
# Fills lexical_chain, total_features and final_training_Features.
PreprocessDocuments()

Reading Documents...
Building Relations...


In [13]:
# Scale every document's feature row with sklearn's Normalizer (default settings).
normalizer = Normalizer()
normalize_features = normalizer.fit_transform(final_training_Features)

In [15]:
# SaveFeatures(final_training_Features, 'WebKB_Features_LexicalChains.pkl')
# SaveFeatures(normalize_features, 'WebKB_Normalized_Features_LexicalChains.pkl')

In [29]:
# Reload previously cached features, replacing whatever the cells above computed.
# NOTE(review): the SaveFeatures calls that create these files are commented out, so both
# pickles must already exist, and they must have been built from the same document order
# as doc_list_sequence -- TODO confirm.
final_training_Features = ReadFeatures('WebKB_Features_LexicalChains.pkl')
normalize_features = ReadFeatures('WebKB_Normalized_Features_LexicalChains.pkl')

In [32]:
# Map every document file name to the integer id of the class directory it lives in.
# The dictionary is keyed by file name only, so it does not follow the read order.
# NOTE(review): two files with the same name in different class directories would
# overwrite each other here -- presumably names are unique; verify against the dataset.
actual_labels = {}
dir_num = 0

label_path = os.path.join(os.getcwd(), 'WebKB')
for labels_directory in os.listdir(label_path):  # one directory per true class
    actual_cluster = dir_num
    # os.path.join instead of a hard-coded backslash keeps this portable.
    doc_labels = os.listdir(os.path.join(label_path, labels_directory))
    for doc in doc_labels:
        actual_labels[doc] = actual_cluster
    dir_num += 1

# True labels in the same order the documents were read.
label_seq = [actual_labels[doc_file] for doc_file in doc_list_sequence]

In [33]:
# Sanity check: one true label per document read.
len(label_seq)

8282

In [35]:
# Reduce the normalised lexical-chain features to 30 principal components.
pca = PCA(n_components=30, random_state=42)
pca_vecs = pca.fit_transform(normalize_features)

print("Applying Kmeans Clustering...")
# Try 700 k-means seeds and keep the one with the highest purity.
# NOTE(review): the seed is selected against the ground-truth labels, so the scores
# reported below are best-case numbers rather than an unbiased estimate.
purity_collection = {}
max_rand_state = None
lexicalChainsLabels = None
for i in range(700):
    clusters = KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++").fit(pca_vecs).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)
    # Strict '>' keeps the earliest seed on ties, matching max() over the dict.
    # Keeping the labels here avoids refitting the winning model afterwards.
    if max_rand_state is None or purity_collection[i] > purity_collection[max_rand_state]:
        max_rand_state = i
        lexicalChainsLabels = clusters

print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

Evaluate(pca_vecs, label_seq, lexicalChainsLabels)

Applying Kmeans Clustering...
Maximum purity of 0.5400869355228206 found on random state 343
Purity: 0.5400869355228206
Silhouette Score: 0.12729090342431598
ARI Score: 0.1591516593740077
NMI Score: 0.15750300304197687


## TF-IDF 

In [36]:
def in_wordnet(word):
    """Return True when WordNet has at least one synset for ``word``."""
    return bool(wn.synsets(word))

def contains_number(word):
    """Return True when any character of ``word`` is numeric (``str.isnumeric``)."""
    return any(symbol.isnumeric() for symbol in word)

def min_length_word(word):
    """Return True when ``word`` is one or two characters long, False otherwise."""
    return 1 <= len(word) <= 2

def custom_preprocessor(text):
    """Reduce ``text`` to a space-separated string of distinct lemmas.

    A token is kept only when it contains no numeric character, is longer than
    two characters, is not an English stop word and is known to WordNet. Kept
    tokens are lemmatised, and each lemma appears once, in order of first
    appearance.

    The stop-word list is loaded once per call into a set; previously it was
    reloaded and scanned linearly for every token.
    """
    lematizer = WordNetLemmatizer()
    stop_words = frozenset(stopwords.words('english'))
    seen_lemmas = set()  # lemmas already emitted for this text
    filtered_tokens = []
    for word in word_tokenize(text):
        if contains_number(word) or min_length_word(word) or word in stop_words or not in_wordnet(word):
            continue
        lema_word = lematizer.lemmatize(word)
        if lema_word not in seen_lemmas:
            seen_lemmas.add(lema_word)
            filtered_tokens.append(lema_word)
    return ' '.join(filtered_tokens)

def print_terms(terms):
    """Print every element of ``terms`` on its own line."""
    for entry in terms:
        print(entry)


In [None]:
def KMeans_Labels(X, n, rstate_limit, true_labels):
    """Run k-means with several seeds and return the labels of the purest run.

    Args:
        X: feature matrix handed to ``KMeans.fit``.
        n: number of clusters.
        rstate_limit: seeds ``0 .. rstate_limit - 1`` are tried.
        true_labels: ground-truth labels used to score each run with ``Purity_Score``.

    Returns:
        The cluster assignments of the seed with the highest purity (the
        earliest such seed on ties).

    Raises:
        ValueError: if ``rstate_limit`` is smaller than 1, i.e. no seed would be tried.

    NOTE(review): the seed is selected against the ground truth, so the purity
    printed here is a best-case figure.
    """
    if rstate_limit < 1:
        raise ValueError("rstate_limit must be at least 1")

    num_clusters = n

    purity_collection = {}
    max_rand_state = None
    cluster_assignments = None
    for i in range(rstate_limit):
        clusters = KMeans(n_init='auto', n_clusters=num_clusters, random_state=i, init='k-means++').fit(X).labels_
        purity_collection[i] = Purity_Score(true_labels, clusters)
        # Strict '>' keeps the earliest seed on ties, like max() over the dict.
        # Remembering the labels avoids fitting the winning model a second time.
        if max_rand_state is None or purity_collection[i] > purity_collection[max_rand_state]:
            max_rand_state = i
            cluster_assignments = clusters

    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    return cluster_assignments

def print_results(true_labels, predicted_labels, X):
    """Print a "RESULTS:" header followed by the purity and silhouette score.

    Purity compares ``predicted_labels`` with ``true_labels``; the silhouette
    score is computed from ``X`` and ``predicted_labels``. Each score is
    evaluated right before its line is printed.
    """
    print("RESULTS:")
    print(f"Purity: {Purity_Score(true_labels, predicted_labels)}")
    print(f"Silhouette Score: {silhouette_score(X, predicted_labels)}")


def wrapperFunction():
    """Cluster the cached TF-IDF matrix with k-means and print its evaluation.

    Loads ``WebKB_TFIDF_Features.pkl``, searches 700 seeds for 5 clusters with
    ``KMeans_Labels`` against the module-level ``label_seq``, prints the scores
    through ``Evaluate`` and returns ``(predicted_labels, X)``.
    """
    # Recipe that produced the cached matrix (kept for reference):
    # ReadDocuments('WebKB')
    #vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
    #X = vectorizer.fit_transform(doc_content)
    #SaveFeatures(X, 'WebKB_TFIDF_Features.pkl')

    X = ReadFeatures('WebKB_TFIDF_Features.pkl')

    predicted_labels = KMeans_Labels(X, 5, 700, label_seq)
    Evaluate(X, label_seq, predicted_labels)
    return predicted_labels, X

In [None]:
# Cluster the cached TF-IDF features; keep both the labels and the matrix for later cells.
tfidfLabels, tfidfMatrix = wrapperFunction()

## Consensus Clustering

In [None]:
def calculate_consensus_matrix(labels1, labels2):
    """Return the co-association (consensus) matrix of two clusterings.

    Entry ``[i, j]`` is the fraction of the two clusterings that put samples
    ``i`` and ``j`` in the same cluster: 1.0 when both agree they belong
    together, 0.5 when only one does, 0.0 when neither does. The matrix is
    symmetric with a unit diagonal.

    Args:
        labels1: cluster ids of the first clustering, one per sample.
        labels2: cluster ids of the second clustering, in the same sample order.

    Returns:
        An ``(n, n)`` float array.

    Raises:
        ValueError: if the two label sequences differ in length.

    Fix: the previous version compared ``labels1[i]`` with ``labels2[j]``
    directly. Cluster ids from two independent clusterings are arbitrary, so
    equal ids carry no meaning, and the "Jaccard" of two scalars collapsed to
    that equality test. It also ran as an O(n^2) Python loop.
    """
    first = np.asarray(labels1).ravel()
    second = np.asarray(labels2).ravel()
    if first.shape != second.shape:
        raise ValueError("labels1 and labels2 must have the same length")

    # Build in place to keep peak memory close to a single (n, n) float matrix.
    consensus_matrix = (first[:, None] == first[None, :]).astype(float)
    consensus_matrix += second[:, None] == second[None, :]
    consensus_matrix /= 2.0
    return consensus_matrix

In [None]:
consensus_matrix = calculate_consensus_matrix(tfidfLabels, lexicalChainsLabels)

n_clusters = 5  # You can adjust this as needed
# Try 1500 seeds and keep the clustering with the highest purity.
# NOTE(review): the seed is selected against the ground truth, so the reported scores
# are best-case numbers.
purity_collection = {}
max_rand_state = None
spectral_labels = None
for i in range(1500):
    # affinity="precomputed" expects a similarity matrix (larger = more alike), so the
    # consensus matrix is passed as is; `1 - consensus_matrix` is a dissimilarity.
    clusters = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=i).fit(consensus_matrix).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)
    # Strict '>' keeps the earliest seed on ties; keeping the labels avoids a refit.
    if max_rand_state is None or purity_collection[i] > purity_collection[max_rand_state]:
        max_rand_state = i
        spectral_labels = clusters

print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

# NOTE(review): Evaluate computes the silhouette with the Euclidean metric, so the rows of
# this dissimilarity matrix are treated as feature vectors, not as precomputed distances.
Evaluate(1-consensus_matrix, label_seq, spectral_labels)

## Topical Clustering

In [None]:
num_topics = 5  # Adjust as needed
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# NOTE(review): LDA is fitted on the TF-IDF matrix here -- confirm that weighted
# values rather than raw term counts are intended.
lda.fit(tfidfMatrix)

# Assign every document to its highest-weighted topic.
topic_labels = lda.transform(tfidfMatrix).argmax(axis=1)

# One row per document: [lexical-chain cluster id, TF-IDF cluster id, topic id].
# NOTE(review): these ids are categorical but are used as numeric coordinates below
# (and a row of all zeros stays the zero vector after normalisation) -- TODO confirm
# this encoding is intended, or one-hot encode the ids.
combined_labels = [lexicalChainsLabels, tfidfLabels, topic_labels]
combined_labels = list(map(list, zip(*combined_labels)))

normalize_combined_features = Normalizer().fit_transform(combined_labels)
# Try 1500 k-means seeds and keep the run with the highest purity.
# NOTE(review): the seed is selected against the ground truth, so the reported scores
# are best-case numbers.
topic_purity_collection = {}
topic_max_rand_state = None
max_labels = None
for i in range(1500):
    topic_clusters = (KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++").fit(normalize_combined_features).labels_)
    topic_purity_collection[i] = Purity_Score(label_seq, topic_clusters)
    # Strict '>' keeps the earliest seed on ties; keeping the labels avoids a refit.
    if topic_max_rand_state is None or topic_purity_collection[i] > topic_purity_collection[topic_max_rand_state]:
        topic_max_rand_state = i
        max_labels = topic_clusters

print(f"Maximum purity of {topic_purity_collection[topic_max_rand_state]} found on random state {topic_max_rand_state}")

Evaluate(normalize_combined_features, label_seq, max_labels)