In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import re
import warnings
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation

os.environ['OMP_NUM_THREADS'] = '1'
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")


In [2]:
doc_content = []  # all the content in the document
doc_name = []  # name of the document
files_path = []  # path to the documents
lexical_chain = []  # list of lexical chains from each document
total_features = []  # total number of features. 1652
final_training_Features = []
corpus = []
doc_list_sequence = []

In [3]:
def ReadDocuments(dir_name):
    for Path in os.listdir(dir_name):
        file_p = os.path.join(dir_name, Path)
        with open(file_p, "r") as file:
            FileContents = file.read()
            doc_content.append(FileContents.lower())
            doc_name.append(Path)
            files_path.append(file_p)

In [4]:
def Purity_Score(label_seq, pred_labels):
    # Calculate the confusion matrix to compare true labels and cluster assignments
    confusion = confusion_matrix(label_seq, pred_labels)
    # Calculate the purity
    purity = np.sum(np.max(confusion, axis=0)) / np.sum(confusion)
    return purity

### Lexical Chains

In [5]:
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)
    if remove_stopwords:
        tokens = nltk.word_tokenize(text)
        updated_tokens = []
        for i in range(len(tokens)):
            if tokens[i].lower() in stopwords.words("english"):
                continue
            else:
                updated_tokens.append(lemmatizer.lemmatize(tokens[i].lower()))

    return updated_tokens

def buildRelation(nouns):
    relation_list = defaultdict(list)

    for k in range(len(nouns)):
        relation = []
        for syn in wn.synsets(nouns[k], pos=wn.NOUN):
            for l in syn.lemmas():
                relation.append(l.name())
                if l.antonyms():
                    relation.append(l.antonyms()[0].name())
            for l in syn.hyponyms():
                if l.hyponyms():
                    relation.append(l.hyponyms()[0].name().split(".")[0])
            for l in syn.hypernyms():
                if l.hypernyms():
                    relation.append(l.hypernyms()[0].name().split(".")[0])
        relation_list[nouns[k]].append(relation)
    return relation_list

def buildLexicalChain(nouns, relation_list):
    lexical = []
    threshold = 0.5
    for noun in nouns:
        flag = 0
        for j in range(len(lexical)):
            if flag == 0:
                for key in list(lexical[j]):
                    if key == noun and flag == 0:
                        lexical[j][noun] += 1
                        flag = 1
                    elif key in relation_list[noun][0] and flag == 0:
                        syns1 = wn.synsets(key, pos=wn.NOUN)
                        syns2 = wn.synsets(noun, pos=wn.NOUN)
                        if syns1[0].wup_similarity(syns2[0]) >= threshold:
                            lexical[j][noun] = 1
                            flag = 1
                    elif noun in relation_list[key][0] and flag == 0:
                        syns1 = wn.synsets(key, pos=wn.NOUN)
                        syns2 = wn.synsets(noun, pos=wn.NOUN)
                        if syns1[0].wup_similarity(syns2[0]) >= threshold:
                            lexical[j][noun] = 1
                            flag = 1
        if flag == 0:
            dic_nuevo = {}
            dic_nuevo[noun] = 1
            lexical.append(dic_nuevo)
            flag = 1
    return lexical

def eliminateWords(lexical):
    final_chain = []
    while lexical:
        result = lexical.pop()
        if len(result.keys()) == 1:
            for value in result.values():
                if value != 1:
                    final_chain.append(result)
        else:
            final_chain.append(result)
    return final_chain

def PreprocessDocuments():
    for i in files_path:
        f = open(i, "r")
        dataset = preprocess_text(f.read(), remove_stopwords=True)
        # use lexical chains as the feature selection method
        nouns = []
        l = nltk.pos_tag(dataset)
        for word, n in l:
            if n == "NN" or n == "NNS" or n == "NNP" or n == "NNPS":
                nouns.append(word)

        relation = buildRelation(nouns)
        lexical = buildLexicalChain(nouns, relation)
        chain = eliminateWords(lexical)
        lexical_chain.append(chain)

    global total_features
    for features in lexical_chain:
        for docfeature in features:
            total_features.extend(docfeature.keys())

    total_features = list(set(total_features))

    for feature in lexical_chain:
        temp = []
        # print(feature)
        for j in total_features:
            check = False
            for f in feature:
                if j in f:
                    temp.append(f[j])
                    check = True
                    break
            if not check:
                temp.append(0)

        final_training_Features.append(temp)

In [6]:
doc_50_path = os.getcwd() + "\Doc50"
ReadDocuments(doc_50_path)
PreprocessDocuments()

normalizer = Normalizer()
normalize_features = normalizer.fit_transform(final_training_Features)

In [7]:
SumSqDis = []
pca = PCA(n_components=30, random_state=42)
pca_vecs = pca.fit_transform(normalize_features)

# Purity Score
for Path in os.listdir("Doc50" + "\\"):
    file_path = os.getcwd() + f"\{'Doc50'}\\" + Path
    with open(file_path, "r") as file:
        FileContents = file.read()
        corpus.append(FileContents.lower())
        doc_list_sequence.append(Path)

actual_labels = ({})  # dictionary to store true assignments for each document | read sequence not followed
label_path = os.getcwd() + "\Doc50 GT"
for labels_directory in os.listdir(label_path):  # for each assignment folder
    actual_cluster = int(
        labels_directory[1]
    )  # extract cluster label from directory name
    doc_labels = os.listdir(
        label_path + f"\\{labels_directory}"
    )  # for all document ids assigned to this cluster
    for doc in doc_labels:
        actual_labels[doc] = actual_cluster - 1  # save cluster label

label_seq = []  # save labels in order of documents read
for doc in doc_list_sequence:
    label_seq.append(actual_labels[doc])


print("Actual Labels",actual_labels)
print("Label Sequence",label_seq)

purity_collection = {}
for i in range(1500):
    clusters = KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++").fit(pca_vecs).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)

max_rand_state = max(purity_collection, key=purity_collection.get)
print(
    f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}"
)

lexicalChainsLabels = KMeans(n_init="auto", n_clusters=5, random_state=max_rand_state, init="k-means++").fit(pca_vecs).labels_

print(f"""K-means lables using lexical chains: {lexicalChainsLabels}
      \nPurity {Purity_Score(label_seq, lexicalChainsLabels)}
      \nSilhoutte Score: {metrics.silhouette_score(final_training_Features, lexicalChainsLabels, metric='euclidean')}""")

Actual Labels {'20361': 0, '20362': 0, '20363': 0, '20364': 0, '20365': 0, '20487': 0, '20488': 0, '20489': 0, '20490': 0, '20491': 0, '52550': 1, '52551': 1, '52552': 1, '52553': 1, '52554': 1, '52555': 1, '52556': 1, '52557': 1, '52558': 1, '52559': 1, '57110': 2, '58043': 2, '58044': 2, '58045': 2, '58046': 2, '58047': 2, '58048': 2, '58049': 2, '58050': 2, '58051': 2, '64830': 3, '64831': 3, '66189': 3, '66322': 3, '66398': 3, '66399': 3, '66400': 3, '66401': 3, '66402': 3, '66403': 3, '101725': 4, '102616': 4, '103117': 4, '103118': 4, '103119': 4, '103120': 4, '103121': 4, '103122': 4, '103123': 4, '103124': 4}
Label Sequence [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Maximum purity of 0.72 found on random state 990
K-means lables using lexical chains: [3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 3 0 0 3 0 0 0 0 0 0 0
 0 0 0 4 4 4 2 4 4 3 0 3 0]
      
Purity 0.72
 

### Tf-Idf

In [8]:
def in_wordnet(word):
    synsets = wn.synsets(word)
    return len(synsets) > 0

def contains_number(word):
    for char in word:
        if char.isnumeric():
            return True
    return False

def min_length_word(word):
    if  len(word) in [1,2]:
        return True
    return False

def custom_preprocessor(text):
    lematizer = WordNetLemmatizer()
    used_terms = {} # keep track of which terms have already been considered
    tokens = word_tokenize(text)
    filtered_tokens = []
    for word in tokens:
        if (not contains_number(word)) and (not min_length_word(word)) and (word not in stopwords.words('english')) and (in_wordnet(word)):
            lema_word = lematizer.lemmatize(word)
            if lema_word in used_terms.keys():
                continue
            else:
                used_terms[lema_word] = 0
                filtered_tokens.append(lema_word)
    return ' '.join(filtered_tokens)

def print_terms(terms):
    for term in terms:
        print(term)


In [9]:
def KMeans_Labels(X, n, rstate_limit, true_labels):

    # Specify the number of clusters (you can choose an appropriate value)
    num_clusters = n
    
    # find centoids which give maximum purity
    purity_collection = {}
    for i in range(rstate_limit):
        clusters = KMeans(n_init='auto', n_clusters=num_clusters, random_state=i, init='k-means++').fit(X).labels_
        purity_collection[i] = Purity_Score(true_labels, clusters)
    
    max_rand_state = max(purity_collection, key=purity_collection.get)
    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    # Create a KMeans model
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, random_state=max_rand_state, init='k-means++')
    # Fit the KMeans model to the TF-IDF data
    kmeans.fit(X)
    # Get the cluster assignments for each document
    cluster_assignments = kmeans.labels_
    
    return cluster_assignments

def Actual_Labels():
    actual_labels = {} # dictionary to store true assignments for each document | read sequence not followed
    label_path = os.getcwd() + '\\Doc50 GT\\'
    for labels_directory in os.listdir(label_path): # for each assignment folder
        actual_cluster = int(labels_directory[1]) # extract cluster label from directory name
        doc_labels = os.listdir(label_path + f"\\{labels_directory}") # for all document ids assigned to this cluster
        for doc in doc_labels:
            actual_labels[doc] = actual_cluster-1 # save cluster label
    
    label_seq = [] # save labels in order of documents read
    for doc in doc_name:
        label_seq.append(actual_labels[doc])
    return label_seq

def print_results(true_labels, predicted_labels, X):
    print("RESULTS:")
    print(f"Purity: {Purity_Score(true_labels, predicted_labels)}")
    print(f"Silhouette Score: {silhouette_score(X, predicted_labels)}")

def wrapperFunction():
    ReadDocuments('Doc50')
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
    X = vectorizer.fit_transform(doc_content)
    true_labels = Actual_Labels()
    predicted_labels = KMeans_Labels(X, 5, 1500, true_labels)
    print_results(true_labels, predicted_labels, X)
    return predicted_labels, X


In [10]:
tfidfLabels, tfidfMatrix = wrapperFunction()

Maximum purity of 0.72 found on random state 1275
RESULTS:
Purity: 0.72
Silhouette Score: 0.0609546220855259


### Consensus Clustering

In [13]:
def calculate_consensus_matrix(labels1, labels2):
    n = len(labels1)
    consensus_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(i, n):
            #Calculate the Jaccard similarity between the two label sets
            intersection = np.intersect1d(labels1[i], labels2[j])
            union = np.union1d(labels1[i], labels2[j])
            agreement = len(intersection) / len(union)
        

            consensus_matrix[i, j] = agreement
            consensus_matrix[j, i] = agreement

    return consensus_matrix

In [14]:
consensus_matrix = calculate_consensus_matrix(tfidfLabels, lexicalChainsLabels)

n_clusters = 5  # You can adjust this as needed
purity_collection = {}
for i in range(1500):
    clusters = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=i).fit(1 - consensus_matrix).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)

max_rand_state = max(purity_collection, key=purity_collection.get)
print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")
spectral_labels = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=max_rand_state).fit(1 - consensus_matrix).labels_
print("Purity Score: ", Purity_Score(label_seq, spectral_labels))
print("Sillhouette Coefficient: ",metrics.silhouette_score(pca_vecs, spectral_labels, metric="euclidean"),)

IndexError: index 50 is out of bounds for axis 0 with size 50

### Topical Clustering

In [15]:
num_topics = 5  # Adjust as needed
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

lda.fit(tfidfMatrix)

# Get the topic assignments for each document
topic_labels = lda.transform(tfidfMatrix).argmax(axis=1)

combined_labels = [lexicalChainsLabels, tfidfLabels, topic_labels]
combined_labels = list(map(list, zip(*combined_labels)))

normalize_combined_features = Normalizer().fit_transform(combined_labels)
topic_purity_collection = {}
for i in range(1500):
    topic_clusters = (
        KMeans(n_init="auto", n_clusters=5, random_state=i, init="k-means++")
        .fit(normalize_combined_features)
        .labels_
    )
    topic_purity_collection[i] = Purity_Score(label_seq, topic_clusters)

topic_max_rand_state = max(topic_purity_collection, key=topic_purity_collection.get)
print(f"Maximum purity of {topic_purity_collection[topic_max_rand_state]} found on random state {topic_max_rand_state}")
max_labels = (
    KMeans(
        n_init="auto", n_clusters=5, random_state=topic_max_rand_state, init="k-means++"
    )
    .fit(normalize_combined_features)
    .labels_
)
print("Purity: ", Purity_Score(label_seq, max_labels))
print("Sillhouette Coefficient: ",metrics.silhouette_score(normalize_combined_features, max_labels, metric="euclidean"))

Maximum purity of 0.7 found on random state 429
Purity:  0.7
Sillhouette Coefficient:  0.3519651738586512
