In [1]:
import time
# Wall-clock start time (seconds since the epoch); the final cells subtract it
# from the end time to report the notebook's total running time.
start = time.time()

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import reuters
import os
import re
import warnings
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from nltk.corpus import wordnet as wn, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import f1_score 
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
import pickle

# Restrict OpenMP to a single thread for this process.
# NOTE(review): this assignment runs after numpy/sklearn have already been
# imported above — confirm the setting still takes effect for them, or move it
# ahead of those imports.
os.environ['OMP_NUM_THREADS'] = '1'
# Silence UserWarnings raised from sklearn's k-means and text feature-extraction modules.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.cluster._kmeans")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.feature_extraction.text")

In [3]:
# Start a fresh results file ("w" truncates any previous content) and write the
# dataset heading; Evaluate() later appends one section per algorithm.
with open("../Results.txt", "w") as f:
    f.write("Reuters\n\n")

## Prepare Dataset

In [4]:
# Provided list of topics (fine-grained labels that are grouped into four
# high-level categories below).


topics = [
    'acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'categories', 'cocoa', 'coconut', 'coconut-oil', 'coffee',
    'copper', 'copra-cake', 'corn', 'corporate', 'corporate/industrial', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude',
    'dfl', 'dlr', 'dmk', 'earn', 'economics', 'fuel', 'gas', 'gnp', 'gold', 'government', 'government/social', 'grain',
    'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel',
    'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'markets', 'meal-feed', 'money-fx',
    'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil',
    'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail',
    'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar',
    'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc'
]

# Define categories: each list names the topics that map to one of the four
# high-level classes (corporate, social, markets, economics).
# NOTE(review): 'instal-debt' is listed in both corporate_categories and
# economics_categories. Every lookup in this notebook tests the corporate list
# first, so it is always treated as corporate — confirm that is intended.

corporate_categories = ['corporate', 'corporate/industrial', 'instal-debt', 'jobs', 'lei', 'livestock',
                        'pet-chem', 'ship', 'strategic-metal', 'acq', 'l-cattle', 'nkr']

social_categories = ['carcass', 'categories', 'dfl', 'housing', 'income', 'retail', 'government', 'government/social']

markets_categories = ['alum', 'barley', 'bop', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn',
                      'cotton', 'cotton-oil', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'meal-feed', 'naphtha',
                      'nat-gas', 'nickel', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel',
                      'platinum', 'potato', 'propane', 'rape-oil', 'rapeseed', 'rice', 'rubber', 'rye', 'silver',
                      'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin',
                      'veg-oil', 'wheat', 'zinc', 'crude', 'markets', 'lumber', 'castor-oil', 'lin-oil', 'lead']

economics_categories = ['cpi', 'cpu', 'dlr', 'dmk', 'earn', 'economics', 'fuel', 'gas', 'gnp', 'gold', 'instal-debt',
                        'ipi', 'iron-steel', 'jet', 'money-fx', 'money-supply', 'reserves', 'interest', 'trade', 'wpi', 'yen', 'rand', 'nzdlr']

# Tally how many of the provided topics fall into each high-level category.
# Groups are tested in this fixed order and the first match wins, so a topic
# listed in more than one group is counted once, for the earliest group.
grouped_topics = (
    ("corporate", corporate_categories),
    ("social", social_categories),
    ("markets", markets_categories),
    ("economics", economics_categories),
)
group_totals = {group_name: 0 for group_name, _ in grouped_topics}

for topic in topics:
    for group_name, group_members in grouped_topics:
        if topic in group_members:
            group_totals[group_name] += 1
            break

corporate_count = group_totals["corporate"]
social_count = group_totals["social"]
markets_count = group_totals["markets"]
economics_count = group_totals["economics"]

# Report the per-category totals and their sum
print(f"Corporate: {corporate_count}")
print(f"Social: {social_count}")
print(f"Markets: {markets_count}")
print(f"Economics: {economics_count}")
print(f"Total: {corporate_count + social_count + markets_count + economics_count}")


Corporate: 12
Social: 8
Markets: 55
Economics: 22
Total: 97


In [5]:
# Extract fileids from the reuters corpus
fileids = reuters.fileids()

# For every corpus file, gather its list of category labels and its raw text
# (both lists are in the same order as `fileids`).
categories = [reuters.categories(file_id) for file_id in fileids]
text = [reuters.raw(file_id) for file_id in fileids]

# One row per document: corpus file id, its category list, and its raw text.
df = pd.DataFrame({'ids':fileids, 'categories':categories, 'text':text})

In [6]:
# Display the assembled DataFrame (columns: ids, categories, text).
df

Unnamed: 0,ids,categories,text
0,test/14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"[crude, nat-gas]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...
...,...,...,...
10783,training/999,"[interest, money-fx]",U.K. MONEY MARKET SHORTAGE FORECAST REVISED DO...
10784,training/9992,[earn],KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY\n Q...
10785,training/9993,[earn],TECHNITROL INC &lt;TNL> SETS QUARTERLY\n Qtly...
10786,training/9994,[earn],NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH ...


In [7]:
# Notebook-wide mutable state shared by the functions defined below.
doc_content = []  # raw text of every document, appended by ReadDocuments()
doc_name = []  # document names; read by Actual_Labels(), not filled in the visible cells
files_path = []  # path to the documents (not referenced in the visible cells)

lexical_chain = []  # one entry per document: its list of lexical chains (dict word -> count)
total_features = []  # vocabulary of chain words across all documents (original note: 1652 features)
final_training_Features = []  # one row per document, one column per entry of total_features
corpus = []  # not referenced in the visible cells
doc_list_sequence = []  # not referenced in the visible cells

data_dict = {}  # not referenced in the visible cells
cluster_dict = {}  # not referenced in the visible cells
mapped_data_dict = {}  # not referenced in the visible cells
actual_labels = {}  # document id -> integer class label, filled by ReadDocuments()

# Integer class id used as the ground-truth label for each high-level category.
category_value_map = {'corporate':0, 'social':1, 'market':2, 'economics':3}


In [8]:
def ReadDocuments():
    """Fill ``doc_content`` and ``actual_labels`` from the rows of ``df``.

    For each row the raw text is appended to ``doc_content``. The row's first
    category decides the ground-truth class: the category groups are tested in
    the order corporate, social, markets, economics and the first group that
    contains it wins. The label is stored in ``actual_labels`` under the part of
    the file id that follows the ``/``. A row whose first category is in none of
    the groups gets no label entry.
    """
    label_groups = (
        ('corporate', corporate_categories),
        ('social', social_categories),
        ('market', markets_categories),
        ('economics', economics_categories),
    )

    for position in range(len(df)):
        record = df.iloc[position]
        doc_content.append(record['text'])

        # ids look like "<split>/<number>"; keep only the part after the slash
        doc_key = record['ids'].split('/')[1]
        primary_topic = record['categories'][0]

        for group_name, members in label_groups:
            if primary_topic in members:
                actual_labels[doc_key] = category_value_map[group_name]
                break

In [9]:
# Populate doc_content and actual_labels from df. Each call appends to
# doc_content again, so run this cell once per kernel session.
ReadDocuments()

## Utility Functions

In [10]:
def Purity_Score(label_seq, pred_labels):
    """Return the clustering purity of ``pred_labels`` against ``label_seq``.

    Purity is the sum, over the columns of the confusion matrix (one column per
    predicted label), of the largest count in that column, divided by the total
    number of samples.
    """
    contingency = confusion_matrix(label_seq, pred_labels)
    dominant_counts = np.max(contingency, axis=0)
    return np.sum(dominant_counts) / np.sum(contingency)

In [11]:
def Evaluate(X, true_labels, predicted_labels, algo):
    """Score a clustering, print the scores and append them to ``../Results.txt``.

    Args:
        X: feature matrix passed to the silhouette score (euclidean metric).
        true_labels: ground-truth class labels.
        predicted_labels: cluster assignments to evaluate.
        algo: heading written above the scores in the results file.
    """
    purity = Purity_Score(true_labels, predicted_labels)
    silhouette = silhouette_score(X, predicted_labels, metric='euclidean')
    ari = ari_score(true_labels, predicted_labels)
    nmi = nmi_score(true_labels, predicted_labels)

    # Full-precision values go to the notebook output ...
    console_rows = (
        ("Purity", purity),
        ("Silhouette Score", silhouette),
        ("ARI Score", ari),
        ("NMI Score", nmi),
    )
    for caption, value in console_rows:
        print(f"{caption}: {value}")

    # ... and two-decimal values are appended to the shared results file.
    with open("../Results.txt", "a") as results_file:
        report = "".join([
            f"{algo}\n",
            f"\t Purity: {purity:.2f}\n",
            f"\t Silhouette Score: {silhouette:.2f}\n",
            f"\t NMI: {nmi:.2f}\n",
            f"\t ARI: {ari:.2f}\n",
        ])
        results_file.write(report)

In [12]:
def SaveFeatures(X, file_name):
    """Pickle ``X`` to ``file_name``.

    Args:
        X: any picklable object (here: a feature matrix).
        file_name: destination path; an existing file is overwritten.

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.dump`` raises.
    """
    with open(file_name, 'wb') as pickle_file:
        pickle.dump(X, pickle_file)

In [13]:
def ReadFeatures(file_name):
    """Load and return the object pickled in ``file_name``.

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.load`` raises.

    WARNING: unpickling can execute arbitrary code; only load files that this
    project produced itself.
    """
    with open(file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)

## Lexical Chains

In [14]:
def preprocess_text(text: str, remove_stopwords: bool) -> list:
    """Clean ``text`` and return its lower-cased, lemmatized tokens.

    URLs (``http...``) are removed and every run of non-letter characters is
    replaced by a single space before tokenizing.

    Args:
        text: raw document text.
        remove_stopwords: when True, English stopwords are dropped before
            lemmatization; when False, every token is kept.

    Returns:
        A list of token strings (not a single string).
    """
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)

    lowered_tokens = [token.lower() for token in nltk.word_tokenize(text)]
    if remove_stopwords:
        # Load the stopword list once per call instead of once per token.
        stop_set = set(stopwords.words("english"))
        lowered_tokens = [token for token in lowered_tokens if token not in stop_set]

    # Previously the result list only existed when remove_stopwords was True,
    # so calling with False raised UnboundLocalError.
    return [lemmatizer.lemmatize(token) for token in lowered_tokens]

def buildRelation(nouns):
    """Map every noun to the WordNet terms it is related to.

    For each noun synset of a noun this collects: the names of its lemmas, the
    first antonym of each lemma (if any), and — for each hyponym / hypernym of
    the synset — the base name of that neighbour's own first hyponym /
    hypernym (if it has one).

    Returns:
        A ``defaultdict(list)``; one list of related terms is appended under a
        noun for every occurrence of that noun in ``nouns``.
    """
    relations_by_noun = defaultdict(list)

    for noun in nouns:
        related_terms = []
        for synset in wn.synsets(noun, pos=wn.NOUN):
            for lemma in synset.lemmas():
                related_terms.append(lemma.name())
                opposites = lemma.antonyms()
                if opposites:
                    related_terms.append(opposites[0].name())
            for narrower in synset.hyponyms():
                narrower_children = narrower.hyponyms()
                if narrower_children:
                    # synset names look like "word.n.01"; keep only the word part
                    related_terms.append(narrower_children[0].name().split(".")[0])
            for broader in synset.hypernyms():
                broader_parents = broader.hypernyms()
                if broader_parents:
                    related_terms.append(broader_parents[0].name().split(".")[0])
        relations_by_noun[noun].append(related_terms)

    return relations_by_noun

def _first_noun_wup_similarity(first_word, second_word):
    """Wu-Palmer similarity of the first WordNet noun synsets of two words.

    Returns None when either word has no noun synset or WordNet reports no
    similarity, so callers can treat the pair as "not similar" instead of
    failing on an empty list or a None comparison.
    """
    first_synsets = wn.synsets(first_word, pos=wn.NOUN)
    second_synsets = wn.synsets(second_word, pos=wn.NOUN)
    if not first_synsets or not second_synsets:
        return None
    return first_synsets[0].wup_similarity(second_synsets[0])


def _place_in_existing_chain(noun, lexical, relation_list, threshold, similarity):
    """Try to add ``noun`` to the first suitable chain; return True on success."""
    for chain in lexical:
        # A noun already in this chain is a repeat occurrence: count it. This is
        # checked before the relatedness test so that an earlier, related key in
        # the same chain can no longer reset the noun's count back to 1.
        if noun in chain:
            chain[noun] += 1
            return True
        for key in chain:
            linked = key in relation_list[noun][0] or noun in relation_list[key][0]
            if not linked:
                continue
            score = similarity(key, noun)
            if score is not None and score >= threshold:
                chain[noun] = 1
                return True
    return False


def buildLexicalChain(nouns, relation_list, threshold=0.5, similarity=None):
    """Group nouns into lexical chains.

    Each chain is a dict mapping a word to its occurrence count. A noun joins
    the first chain that already contains it (its count is incremented) or that
    contains a key it is related to — one word appears in the other's relation
    list — with a similarity of at least ``threshold``. Otherwise the noun
    starts a new chain.

    Args:
        nouns: nouns in document order.
        relation_list: mapping noun -> list whose first element is the list of
            terms related to that noun (as produced by ``buildRelation``).
        threshold: minimum similarity for two related words to share a chain.
        similarity: optional callable ``(key, noun) -> float or None``;
            defaults to the Wu-Palmer similarity of the words' first WordNet
            noun synsets.

    Returns:
        The list of chains, in creation order.
    """
    if similarity is None:
        similarity = _first_noun_wup_similarity

    lexical = []
    for noun in nouns:
        if not _place_in_existing_chain(noun, lexical, relation_list, threshold, similarity):
            lexical.append({noun: 1})
    return lexical

def eliminateWords(lexical):
    """Drop chains that consist of a single word seen exactly once.

    Chains are popped from the end of ``lexical``, so the input list is emptied
    and the surviving chains are returned in reverse of their original order.
    """
    kept_chains = []
    while lexical:
        chain = lexical.pop()
        lone_single_occurrence = len(chain) == 1 and next(iter(chain.values())) == 1
        if not lone_single_occurrence:
            kept_chains.append(chain)
    return kept_chains

def PreprocessDocuments():
    """Build the lexical-chain feature matrix for every text in ``doc_content``.

    Steps:
      1. Per document: preprocess, keep the nouns (POS tags NN/NNS/NNP/NNPS),
         build their lexical chains and append the filtered chains to the
         global ``lexical_chain``.
      2. Rebuild the global ``total_features`` as the sorted set of all chain
         words. Sorting keeps the column order identical between runs; a bare
         ``list(set(...))`` of strings can change order from one interpreter
         session to the next.
      3. Append one row per document to the global ``final_training_Features``:
         for each feature, the count from the first chain of that document that
         contains it, or 0 when no chain does.

    The function appends to module-level lists, so calling it twice in one
    session duplicates the rows.
    """
    global total_features

    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    for document in doc_content:
        tokens = preprocess_text(document, remove_stopwords=True)
        # use lexical chains as the feature selection method
        nouns = [word for word, tag in nltk.pos_tag(tokens) if tag in noun_tags]

        relation = buildRelation(nouns)
        lexical = buildLexicalChain(nouns, relation)
        lexical_chain.append(eliminateWords(lexical))

    for document_chains in lexical_chain:
        for chain in document_chains:
            total_features.extend(chain.keys())

    total_features = sorted(set(total_features))

    for document_chains in lexical_chain:
        # word -> count taken from the first chain that contains the word
        first_chain_count = {}
        for chain in document_chains:
            for word, count in chain.items():
                first_chain_count.setdefault(word, count)

        final_training_Features.append(
            [first_chain_count.get(feature, 0) for feature in total_features]
        )

In [14]:
# PreprocessDocuments()

In [15]:
# normalizer = Normalizer()
# normalize_features = normalizer.fit_transform(final_training_Features)

In [16]:
# SaveFeatures(final_training_Features, 'Reuters_Features_LexicalChains.pkl')

In [17]:
# SaveFeatures(normalize_features, 'Reuters_Normalize_Features_LexicalChains.pkl')

In [15]:
# Load the pre-computed lexical-chain feature matrices. The path is built with
# os.path.join because the previous backslash literals contained invalid escape
# sequences ("\S", "\R") and only resolved on Windows.
stored_features_dir = os.path.join('..', 'Stored Feature Matrix', 'Reuter_Features')
final_training_Features = ReadFeatures(os.path.join(stored_features_dir, 'Reuters_Features_LexicalChains.pkl'))
normalize_features = ReadFeatures(os.path.join(stored_features_dir, 'Reuters_Normalize_Features_LexicalChains.pkl'))

In [19]:
# pd.DataFrame(final_training_Features).shape

In [20]:
# NOTE(review): SumSqDis and pca_vecs are not used by the clustering below,
# which runs on normalize_features directly — confirm whether the PCA
# projection was meant to be the K-Means input.
SumSqDis = []
pca = PCA(n_components=30, random_state=42)
pca_vecs = pca.fit_transform(normalize_features)

# Ground-truth labels in the insertion order of actual_labels.
label_seq = list(actual_labels.values())

print("Applying K-Means Clustering...")
# Try 100 seeds and keep the one with the highest purity against label_seq.
purity_collection = {}
max_rand_state = None
lexicalChainsLabels = None
for i in range(100):
    print(f"Checking on random state: {i}")
    clusters = KMeans(n_init="auto", n_clusters=4, random_state=i, init="k-means++").fit(normalize_features).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)
    # Strict ">" keeps the earliest seed on ties, matching max(dict, key=dict.get).
    # Keeping the winning labels avoids fitting the same seeded model a second time.
    if max_rand_state is None or purity_collection[i] > purity_collection[max_rand_state]:
        max_rand_state = i
        lexicalChainsLabels = clusters

print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

Evaluate(normalize_features, label_seq, lexicalChainsLabels, "Lexical Chains")

Applying K-Means Clustering...
Checking on random state: 0
Checking on random state: 1
Checking on random state: 2
Checking on random state: 3
Checking on random state: 4


: 

## TF-IDF

In [16]:
def in_wordnet(word):
    """Return True when WordNet has at least one synset for ``word``."""
    return bool(wn.synsets(word))

def contains_number(word):
    """Return True when any character of ``word`` is numeric (``str.isnumeric``)."""
    return any(symbol.isnumeric() for symbol in word)

def min_length_word(word):
    """Return True when ``word`` is one or two characters long (too short to keep)."""
    return len(word) in (1, 2)

def custom_preprocessor(text):
    """Reduce ``text`` to a space-joined string of its distinct, filtered lemmas.

    A token is kept only if it contains no numeric character, is longer than two
    characters, is not an English stopword and is known to WordNet. Kept tokens
    are lemmatized, and each lemma is emitted once, in order of first
    appearance.

    NOTE(review): tokens are not lower-cased before the stopword test, so
    capitalised stopwords such as "The" pass it — confirm this is intended.
    """
    lematizer = WordNetLemmatizer()
    # Load the stopword list once per document rather than once per token.
    stop_words = set(stopwords.words('english'))
    seen_lemmas = set()  # lemmas already emitted for this document
    filtered_tokens = []
    for word in word_tokenize(text):
        if contains_number(word) or min_length_word(word) or word in stop_words or not in_wordnet(word):
            continue
        lema_word = lematizer.lemmatize(word)
        if lema_word not in seen_lemmas:
            seen_lemmas.add(lema_word)
            filtered_tokens.append(lema_word)
    return ' '.join(filtered_tokens)

def print_terms(terms):
    """Print every item of ``terms`` on its own line."""
    for entry in terms:
        print(entry)


In [21]:
def KMeans_Labels(X, n, rstate_limit, true_labels):
    """Run K-Means with seeds ``0 .. rstate_limit - 1`` and return the purest labelling.

    Args:
        X: feature matrix to cluster.
        n: number of clusters.
        rstate_limit: how many consecutive random states (starting at 0) to try.
        true_labels: ground-truth labels used to score each run with
            ``Purity_Score``.

    Returns:
        The cluster assignments of the run with the highest purity (the lowest
        seed wins ties).

    Raises:
        ValueError: if ``rstate_limit`` is smaller than 1, i.e. no run was made.
    """
    num_clusters = n

    purity_collection = {}
    max_rand_state = None
    cluster_assignments = None
    for seed in range(rstate_limit):
        candidate = KMeans(n_init='auto', n_clusters=num_clusters, random_state=seed, init='k-means++').fit(X).labels_
        purity_collection[seed] = Purity_Score(true_labels, candidate)
        # Strict ">" keeps the earliest seed on ties, matching max(dict, key=dict.get).
        # Keeping the winning labels avoids refitting the same seeded model at the end.
        if max_rand_state is None or purity_collection[seed] > purity_collection[max_rand_state]:
            max_rand_state = seed
            cluster_assignments = candidate

    if max_rand_state is None:
        raise ValueError("rstate_limit must be at least 1")

    print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

    return cluster_assignments

def Actual_Labels():
    """Read ``Reuters/cats.txt`` (under the current directory) and return labels for ``doc_name``.

    Each line is split on whitespace: the first field is a document id, the
    remaining fields are categories. Every all-digit category sets the
    document's label to that number minus one (the last one on the line wins).

    Returns:
        A list with one label per entry of the global ``doc_name``, using -1 for
        documents that have no label in the file.
    """
    cats_file = os.path.join(os.getcwd(), 'Reuters', 'cats.txt')

    labels_by_doc = {}  # document id -> zero-based label
    with open(cats_file, 'r') as handle:
        for record in handle:
            fields = record.split()
            doc_id = fields[0]
            for tag in fields[1:]:
                if tag.isdigit():
                    # the file's numbering is assumed to start at 1; shift to 0-based
                    labels_by_doc[doc_id] = int(tag) - 1

    # keep the labels in the order the documents were read
    return [labels_by_doc.get(doc, -1) for doc in doc_name]

def print_results(true_labels, predicted_labels, X):
    """Print the purity and silhouette score of a clustering of ``X``."""
    print("RESULTS:")
    purity_value = Purity_Score(true_labels, predicted_labels)
    print(f"Purity: {purity_value}")
    silhouette_value = silhouette_score(X, predicted_labels)
    print(f"Silhouette Score: {silhouette_value}")

def wrapperFunction():
    """Cluster the documents on TF-IDF features and record the evaluation.

    The stored TF-IDF matrix is loaded when it exists. Only when it is missing
    is the matrix recomputed from ``doc_content`` and pickled to
    ``Reuters_TFIDF_Features.pkl`` in the working directory. Previously the
    matrix was always recomputed and then immediately replaced by the stored
    copy.

    Returns:
        ``(predicted_labels, X)``: the K-Means assignments with the highest
        purity over 700 seeds, and the TF-IDF matrix that was clustered.
    """
    # os.path.join replaces a backslash literal that contained invalid escape
    # sequences ("\S", "\R") and only resolved on Windows.
    stored_matrix_path = os.path.join('..', 'Stored Feature Matrix', 'Reuter_Features', 'Reuters_TFIDF_Features.pkl')

    if os.path.exists(stored_matrix_path):
        X = ReadFeatures(stored_matrix_path)
    else:
        vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', preprocessor=custom_preprocessor)
        X = vectorizer.fit_transform(doc_content)
        SaveFeatures(X, 'Reuters_TFIDF_Features.pkl')

    true_labels = list(actual_labels.values())
    predicted_labels = KMeans_Labels(X, 4, 700, true_labels)
    Evaluate(X, true_labels, predicted_labels, "Tf-Idf")
    return predicted_labels, X


In [22]:
# Run the TF-IDF pipeline; keep its cluster labels and feature matrix for the
# consensus and topical clustering cells below.
tfidfLabels, tfidfMatrix = wrapperFunction()

Maximum purity of 0.724972191323693 found on random state 555
Purity: 0.724972191323693
Silhouette Score: 0.017570933661307995
ARI Score: 0.21702607367248578
NMI Score: 0.3494098071368686


: 

## Consensus Clustering

In [None]:
def _flat_numeric_labels(labels):
    """Return ``labels`` as a 1-D numeric/bool NumPy array, or None if it is not one."""
    try:
        candidate = np.asarray(labels)
    except ValueError:
        # e.g. ragged nested sequences that cannot form a regular array
        return None
    if candidate.ndim != 1 or candidate.dtype.kind not in "biuf":
        return None
    return candidate


def calculate_consensus_matrix(labels1, labels2):
    """Build a symmetric n x n agreement matrix from two label sequences.

    Entry ``(i, j)`` with ``i <= j`` is the Jaccard similarity between
    ``labels1[i]`` and ``labels2[j]``; the value is mirrored to ``(j, i)``.
    For scalar labels this is 1.0 when the two labels are equal and 0.0
    otherwise.

    NOTE(review): this compares the label of document i in the first clustering
    with the label of document j in the second. Cluster ids from two
    independent clusterings are not generally comparable — confirm this is the
    intended notion of consensus (a co-association matrix would instead compare
    ``labels[i] == labels[j]`` within each clustering).

    Args:
        labels1: sequence of n labels (or label collections).
        labels2: sequence of at least n labels (or label collections).

    Returns:
        A float ``numpy.ndarray`` of shape ``(n, n)``.
    """
    n = len(labels1)
    consensus_matrix = np.zeros((n, n))

    flat1 = _flat_numeric_labels(labels1)
    flat2 = _flat_numeric_labels(labels2)
    if flat1 is not None and flat2 is not None and len(flat2) >= n:
        # Fast path for plain label vectors: one vectorised comparison per row
        # instead of two NumPy set operations per pair of documents.
        for i in range(n):
            row_agreement = (flat2[i:n] == flat1[i]).astype(float)
            consensus_matrix[i, i:] = row_agreement
            consensus_matrix[i:, i] = row_agreement
        return consensus_matrix

    # General path: each label entry may itself be a collection of labels.
    for i in range(n):
        for j in range(i, n):
            # Calculate the Jaccard similarity between the two label sets
            intersection = np.intersect1d(labels1[i], labels2[j])
            union = np.union1d(labels1[i], labels2[j])
            agreement = len(intersection) / len(union)

            consensus_matrix[i, j] = agreement
            consensus_matrix[j, i] = agreement

    return consensus_matrix

In [None]:
# print("Building Consensus Matrix...")
# consensus_matrix = calculate_consensus_matrix(tfidfLabels, lexicalChainsLabels)

# print("Saving Consensus Matrix...")
# SaveFeatures(consensus_matrix, "Reuters_Consensus_Matrix.pkl")

# Load the stored consensus matrix. os.path.join replaces a backslash literal
# that contained invalid escape sequences ("\S", "\R") and only resolved on Windows.
consensus_matrixs = ReadFeatures(os.path.join("..", "Stored Feature Matrix", "Reuter_Features", "Reuters_Consensus_Matrix.pkl"))

# NOTE(review): with affinity="precomputed" scikit-learn treats larger values as
# MORE similar, yet the matrix passed below is 1 - consensus (a dissimilarity).
# Evaluate() also scores this square matrix with a euclidean silhouette. Both
# are kept unchanged here; confirm whether consensus_matrixs itself should be
# the affinity.
consensus_complement = 1 - consensus_matrixs  # computed once instead of on every iteration

print("Applying Spectral Clustering...")
n_clusters = 4  # You can adjust this as needed
# Try 50 seeds and keep the labelling with the highest purity against label_seq.
purity_collection = {}
max_rand_state = None
spectral_labels = None
for i in range(50):
    print(f"Trying clustering on random state {i}")
    clusters = SpectralClustering(n_clusters=n_clusters, affinity="precomputed", random_state=i).fit(consensus_complement).labels_
    purity_collection[i] = Purity_Score(label_seq, clusters)
    # Strict ">" keeps the earliest seed on ties, matching max(dict, key=dict.get).
    # Keeping the winning labels avoids refitting the same seeded model at the end.
    if max_rand_state is None or purity_collection[i] > purity_collection[max_rand_state]:
        max_rand_state = i
        spectral_labels = clusters

print(f"Maximum purity of {purity_collection[max_rand_state]} found on random state {max_rand_state}")

Evaluate(consensus_complement, label_seq, spectral_labels, "Consensus Clustering")

Applying Kmeans Clustering...
Trying clustering on random state 0
Trying clustering on random state 1
Trying clustering on random state 2
Trying clustering on random state 3
Trying clustering on random state 4
Trying clustering on random state 5
Trying clustering on random state 6
Trying clustering on random state 7
Trying clustering on random state 8
Trying clustering on random state 9
Trying clustering on random state 10
Trying clustering on random state 11
Trying clustering on random state 12
Trying clustering on random state 13
Trying clustering on random state 14
Trying clustering on random state 15
Trying clustering on random state 16
Trying clustering on random state 17
Trying clustering on random state 18
Trying clustering on random state 19
Trying clustering on random state 20
Trying clustering on random state 21
Trying clustering on random state 22
Trying clustering on random state 23
Trying clustering on random state 24
Trying clustering on random state 25
Trying clustering 

  ret = a @ b


Purity: 0.621153133110864
Silhouette Score: 0.2591406333596326
ARI Score: 0.06628240284570283
NMI Score: 0.14891482754107224


## Topical Clustering

In [None]:
num_topics = 4  # Adjust as needed
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

lda.fit(tfidfMatrix)

# Get the topic assignments for each document (index of the highest-weight topic)
topic_labels = lda.transform(tfidfMatrix).argmax(axis=1)
# One row per document: [lexical-chain cluster, TF-IDF cluster, LDA topic]
combined_labels = [lexicalChainsLabels, tfidfLabels, topic_labels]
combined_labels = list(map(list, zip(*combined_labels)))

normalize_combined_features = Normalizer().fit_transform(combined_labels)
# Try 500 seeds and keep the labelling with the highest purity against label_seq.
topic_purity_collection = {}
topic_max_rand_state = None
max_labels = None
for i in range(500):
    print(f"Trying Clustering on random state {i}")
    topic_clusters = (KMeans(n_init="auto", n_clusters=4, random_state=i, init="k-means++").fit(normalize_combined_features).labels_)
    topic_purity_collection[i] = Purity_Score(label_seq, topic_clusters)
    # Strict ">" keeps the earliest seed on ties, matching max(dict, key=dict.get).
    # Keeping the winning labels avoids refitting the same seeded model at the end.
    if topic_max_rand_state is None or topic_purity_collection[i] > topic_purity_collection[topic_max_rand_state]:
        topic_max_rand_state = i
        max_labels = topic_clusters

print(f"Maximum purity of {topic_purity_collection[topic_max_rand_state]} found on random state {topic_max_rand_state}")

Evaluate(normalize_combined_features, label_seq, max_labels, "Topical Clustering")

Trying Clustering on random state 0
Trying Clustering on random state 1
Trying Clustering on random state 2
Trying Clustering on random state 3
Trying Clustering on random state 4
Trying Clustering on random state 5
Trying Clustering on random state 6
Trying Clustering on random state 7
Trying Clustering on random state 8
Trying Clustering on random state 9
Trying Clustering on random state 10
Trying Clustering on random state 11
Trying Clustering on random state 12
Trying Clustering on random state 13
Trying Clustering on random state 14
Trying Clustering on random state 15
Trying Clustering on random state 16
Trying Clustering on random state 17
Trying Clustering on random state 18
Trying Clustering on random state 19
Trying Clustering on random state 20
Trying Clustering on random state 21
Trying Clustering on random state 22
Trying Clustering on random state 23
Trying Clustering on random state 24
Trying Clustering on random state 25
Trying Clustering on random state 26
Trying Clus

In [None]:
# Elapsed wall-clock seconds since `start` was recorded in the first cell.
end = time.time()
time_taken = end - start

In [None]:
# Report the total running time, rounded to two decimals.
print(f"Time taken: {time_taken:.2f}s")