# Imports

## Libraries

In [8]:
import os
import re
import string
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import BisectingKMeans
from sklearn.cluster import DBSCAN


import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import gensim.downloader as api
# NOTE(review): `word2vec_model` is not referenced by any other cell visible in
# this notebook (the word2vec representations are loaded from .npy files below).
# Confirm this pretrained-model load is still needed in the import cell.
word2vec_model = api.load("word2vec-google-news-300")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data

### Raw

In [2]:
# Root folder of the raw corpus. It is walked recursively, and the name of each
# file's parent directory is later used as that document's category label.
# NOTE(review): hard-coded absolute path; it only resolves on a machine with
# this exact layout, while the representation files are loaded via relative paths.
data_folder_path = '/text-mining/data/02_text_representation/Corpus-representacion'

In [3]:
def save_files_to_dict(folder_path):
    """
    Read every file found under ``folder_path`` (recursively) into memory.

    Files are decoded as UTF-8; undecodable bytes are replaced rather than
    raising. A file that cannot be read is reported on stdout and skipped.

    Args:
        folder_path: Directory to walk.

    Returns:
        Dict mapping each file's path (``os.path.join`` of its directory and
        name) to its text content, inserted in ``os.walk`` traversal order.
    """
    contents_by_path = {}
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        full_paths = (os.path.join(dirpath, name) for name in filenames)
        for full_path in full_paths:
            try:
                with open(full_path, 'r', encoding='utf-8', errors='replace') as handle:
                    text = handle.read()
            except Exception as err:
                # Best effort: report the unreadable file and keep going.
                print(f"Could not read file {full_path}: {err}")
                continue
            contents_by_path[full_path] = text
    return contents_by_path

In [4]:
# Load the corpus as {file path: text}, then keep the texts alone in the same order.
data_dict = save_files_to_dict(data_folder_path)
data = [text for text in data_dict.values()]
len(data)

866

### Labels

In [5]:
# The category of a document is the name of the directory that contains it.
# Use os.path instead of splitting on '/' so this also works when the paths
# (built with os.path.join) use a different separator.
categories = [os.path.basename(os.path.dirname(path)) for path in data_dict.keys()]

# Integer-encode the category names to obtain the gold-standard labels.
label_encoder = LabelEncoder()
true_labels = label_encoder.fit_transform(categories)

# Take the unique names from the fitted encoder rather than from set():
# the order is then deterministic across runs, and position i in this list
# is the category that was encoded as label i.
unique_categories = label_encoder.classes_.tolist()
print(unique_categories)

['comp.sys.ibm.pc.hardware', 'talk.politics.mideast', 'talk.politics.guns', 'sci.electronics', 'rec.autos', 'rec.sport.hockey', 'comp.sys.mac.hardware']


### Representations

In [6]:
# Load the four precomputed document representations (TF, TF-IDF, and the
# averaged / summed word2vec vectors) in one pass, in the order listed.
(
    tf_representations,
    tfidf_representations,
    word2vec_avg_representations,
    word2vec_sum_representations,
) = (
    np.load(npy_path)
    for npy_path in (
        "../02_text_representation/02_text_representations_tf.npy",
        "../02_text_representation/02_text_representations_tfidf.npy",
        "../02_text_representation/02_text_representations_word2vec_avg.npy",
        "../02_text_representation/02_text_representations_word2vec_sum.npy",
    )
)

# Analysis

In [18]:
def analyze_document_statistics(data, categories, unique_categories):
    """
    Summarise, print and plot per-category corpus statistics.

    For every name in ``unique_categories`` the function counts the documents
    carrying that label and computes the mean and standard deviation of their
    whitespace-token counts. It prints the figures and draws two bar charts:
    documents per category, and mean word count per category with the
    standard deviation as error bars.

    Args:
        data: List of document texts.
        categories: Category label of each document, aligned with ``data``.
        unique_categories: Category names to report, in plotting order.

    Returns:
        Dict mapping each category name to a dict with the keys
        ``'doc_count'``, ``'word_count_mean'`` and ``'word_count_std'``.
        Mean and std are 0 for a category without documents.
    """
    def _label_bar_heights(axes):
        # Write each bar's height (no decimals) centred just above the bar.
        for bar in axes.patches:
            height = bar.get_height()
            axes.annotate(f'{height:.0f}',
                          (bar.get_x() + bar.get_width() / 2., height),
                          ha='center', va='bottom', fontsize=10)

    # --- Per-category statistics ---
    category_stats = {}
    for name in unique_categories:
        # Word count (whitespace split) of every document labelled `name`.
        lengths = [len(data[idx].split())
                   for idx, label in enumerate(categories)
                   if label == name]
        category_stats[name] = {
            'doc_count': len(lengths),
            'word_count_mean': np.mean(lengths) if lengths else 0,
            'word_count_std': np.std(lengths) if lengths else 0,
        }

    # --- Text report ---
    print("Document Statistics by Category:")
    for name, summary in category_stats.items():
        print(f"\n{name}:")
        print(f"  Number of documents: {summary['doc_count']}")
        print(f"  Mean word count: {summary['word_count_mean']:.2f} words")
        print(f"  Std word count: {summary['word_count_std']:.2f} words")

    names = list(category_stats)
    summaries = list(category_stats.values())

    # Tabular form of the word-count statistics, used by the second chart.
    word_count_df = pd.DataFrame({
        'Category': names,
        'Mean Word Count': [summary['word_count_mean'] for summary in summaries],
        'Std Word Count': [summary['word_count_std'] for summary in summaries],
    })

    # --- Chart 1: number of documents per category ---
    plt.figure(figsize=(10, 8))
    ax_doc_count = sns.barplot(x=names,
                               y=[summary['doc_count'] for summary in summaries])
    _label_bar_heights(ax_doc_count)
    plt.title('Number of Documents by Category')
    plt.ylabel('Number of Documents')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # --- Chart 2: mean word count per category, std as error bars ---
    plt.figure(figsize=(10, 8))
    ax_word_count = sns.barplot(x='Category', y='Mean Word Count', data=word_count_df)
    _label_bar_heights(ax_word_count)
    plt.errorbar(
        x=np.arange(len(word_count_df)),
        y=word_count_df['Mean Word Count'],
        yerr=word_count_df['Std Word Count'],
        fmt='none', capsize=5, color='black'
    )
    plt.title('Mean Word Count by Category')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    return category_stats

In [None]:
# Print and plot the per-category corpus statistics, keeping the numbers for later use.
category_stats = analyze_document_statistics(
    data=data,
    categories=categories,
    unique_categories=unique_categories,
)

# Clustering

In [7]:
def analyze_clustering_confusion(true_labels, predicted_clusters, title='Confusion Matrix After Reassignment: Goldstandard vs Clustering Results'):
    """
    Compare a clustering against gold-standard labels via majority-label mapping.

    Every cluster is mapped to the gold label that is most frequent among its
    members (several clusters may map to the same label). The mapped
    assignments are then compared with the gold labels in a confusion matrix,
    which is shown as a heatmap, and the resulting accuracy is printed.

    Args:
        true_labels: Gold-standard label of each sample. Either non-negative
            integer codes, or arbitrary label values (e.g. category names),
            which are encoded to 0..k-1 in sorted order.
        predicted_clusters: Cluster id of each sample, aligned with
            ``true_labels``. Any integer ids are accepted; a DBSCAN noise id
            (-1) is treated like any other cluster.
        title: Title of the heatmap.

    Returns:
        Tuple ``(cm_reassigned, correspondences, accuracy)``: the confusion
        matrix (rows = gold labels, columns = mapped clusters), a dict
        mapping each cluster id to its assigned gold label code, and the
        accuracy of the mapped assignments.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Use the labels that were passed in (not notebook globals), and accept
    # plain lists: the boolean masks below need real numpy arrays.
    gold = np.asarray(true_labels)
    predicted_clusters = np.asarray(predicted_clusters)

    if len(gold) != len(predicted_clusters):
        raise ValueError(
            f"true_labels has {len(gold)} entries but predicted_clusters has "
            f"{len(predicted_clusters)}"
        )
    if len(gold) == 0:
        raise ValueError("Cannot analyze an empty clustering")

    if gold.dtype.kind in 'iu':
        # Already integer-coded: keep the codes exactly as given.
        encoded_labels = gold
        class_ids = np.unique(gold)
        tick_names = class_ids.tolist()
    else:
        # E.g. category names: encode to 0..k-1 in sorted order and keep the
        # names for the axis ticks.
        class_names, encoded_labels = np.unique(gold, return_inverse=True)
        class_ids = np.arange(len(class_names))
        tick_names = class_names.tolist()

    # Map every cluster to the most frequent gold label among its members.
    # Each id comes from np.unique, so every cluster has at least one member.
    correspondences = {}
    unique_clusters = np.unique(predicted_clusters)
    for cluster in unique_clusters:
        members = encoded_labels[predicted_clusters == cluster]
        correspondences[cluster] = np.bincount(members).argmax()

    # Replace each sample's cluster id by the gold label assigned to that cluster.
    reassigned_clusters = np.array([correspondences[cluster] for cluster in predicted_clusters])

    # Fix the label set so the matrix always has one row/column per gold class,
    # even when no cluster was mapped to some class.
    cm_reassigned = confusion_matrix(encoded_labels, reassigned_clusters, labels=class_ids)

    # After reassignment the columns are gold classes, not raw cluster ids, so
    # both axes carry the same class ticks. The number of clusters may differ
    # from the number of classes.
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_reassigned, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_names,
                yticklabels=tick_names)
    plt.title(title)
    plt.xlabel('Predicted Clusters (mapped to majority gold label)')
    plt.ylabel('Goldstandard Labels')
    plt.show()

    # Accuracy of the majority-mapped assignments.
    accuracy = accuracy_score(encoded_labels, reassigned_clusters)
    print(f"Accuracy after reassignment: {accuracy:.2f}")

    return cm_reassigned, correspondences, accuracy

In [8]:
# Ask for one cluster per gold-standard category (7 for this corpus) so the
# clusterings can be compared against the labels; derived from the data so it
# stays correct if the corpus changes.
n_clusters = len(set(categories))

## Partitional: K-means

In [9]:
def perform_kmeans_clustering(representations, n_clusters, random_state=42):
    """
    Cluster document representations with K-means.

    Args:
        representations: Feature matrix accepted by ``KMeans.fit_predict``
            (one row per document).
        n_clusters: Number of clusters to form.
        random_state: Seed passed to ``KMeans`` so repeated runs give the
            same clustering. Defaults to 42, the value previously hard-coded.

    Returns:
        The cluster index assigned to each row of ``representations``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    return kmeans.fit_predict(representations)

In [None]:
# Perform K-means clustering on each representation
predicted_clusters_tf = perform_kmeans_clustering(tf_representations, n_clusters)
predicted_clusters_tfidf = perform_kmeans_clustering(tfidf_representations, n_clusters)
predicted_clusters_word2vec_avg = perform_kmeans_clustering(word2vec_avg_representations, n_clusters)
predicted_clusters_word2vec_sum = perform_kmeans_clustering(word2vec_sum_representations, n_clusters)

# Analyze clustering results and store the confusion matrices
cm_tf, correspondences_tf, accuracy_tf = analyze_clustering_confusion(categories, predicted_clusters_tf)
cm_tfidf, correspondences_tfidf, accuracy_tfidf = analyze_clustering_confusion(true_labels, predicted_clusters_tfidf)
cm_word2vec_avg, correspondences_word2vec_avg, accuracy_word2vec_avg = analyze_clustering_confusion(true_labels, predicted_clusters_word2vec_avg)
cm_word2vec_sum, correspondences_word2vec_sum, accuracy_word2vec_sum = analyze_clustering_confusion(true_labels, predicted_clusters_word2vec_sum)

## Hierarchical: Agglomerative

In [None]:
# Perform Agglomerative clustering on each representation
agglomerative_clusters_tf = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(tf_representations)
agglomerative_clusters_tfidf = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(tfidf_representations)
agglomerative_clusters_word2vec_avg = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(word2vec_avg_representations)
agglomerative_clusters_word2vec_sum = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(word2vec_sum_representations)

# Analyze clustering results and store the confusion matrices
cm_agglomerative_tf, correspondences_agglomerative_tf, accuracy_agglomerative_tf = analyze_clustering_confusion(categories, agglomerative_clusters_tf)
cm_agglomerative_tfidf, correspondences_agglomerative_tfidf, accuracy_agglomerative_tfidf = analyze_clustering_confusion(true_labels, agglomerative_clusters_tfidf)
cm_agglomerative_word2vec_avg, correspondences_agglomerative_word2vec_avg, accuracy_agglomerative_word2vec_avg = analyze_clustering_confusion(true_labels, agglomerative_clusters_word2vec_avg)
cm_agglomerative_word2vec_sum, correspondences_agglomerative_word2vec_sum, accuracy_agglomerative_word2vec_sum = analyze_clustering_confusion(true_labels, agglomerative_clusters_word2vec_sum)


## Density: DBSCAN

In [None]:
# Perform DBSCAN clustering on each representation
dbscan_clusters_tf = DBSCAN().fit_predict(tf_representations)
dbscan_clusters_tfidf = DBSCAN().fit_predict(tfidf_representations)
dbscan_clusters_word2vec_avg = DBSCAN().fit_predict(word2vec_avg_representations)
dbscan_clusters_word2vec_sum = DBSCAN().fit_predict(word2vec_sum_representations)

# Analyze clustering results and store the confusion matrices
cm_dbscan_tf, correspondences_dbscan_tf, accuracy_dbscan_tf = analyze_clustering_confusion(categories, dbscan_clusters_tf)
cm_dbscan_tfidf, correspondences_dbscan_tfidf, accuracy_dbscan_tfidf = analyze_clustering_confusion(true_labels, dbscan_clusters_tfidf)
cm_dbscan_word2vec_avg, correspondences_dbscan_word2vec_avg, accuracy_dbscan_word2vec_avg = analyze_clustering_confusion(true_labels, dbscan_clusters_word2vec_avg)
cm_dbscan_word2vec_sum, correspondences_dbscan_word2vec_sum, accuracy_dbscan_word2vec_sum = analyze_clustering_confusion(true_labels, dbscan_clusters_word2vec_sum)


# Evaluation
## Internal validation
### Silhouette score
### Calinski-Harabasz index
### Davies-Bouldin index
## External validation
### Confusion matrix
### Classification report
### Adjusted Rand index