In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [None]:
# Precomputed tweet embeddings, one .npy array per encoder (SBERT and LASER).
sbert = np.load(file='tweets/it_sbert_11578.npy')
laser = np.load(file='tweets/it_laser_11578.npy')

In [None]:
# Cleaned tweet table; later cells read its 'text_noent_clean' and 'id' columns.
df = pd.read_excel(io='tweets/tweets_11578_clean.xlsx')

In [None]:
# LASER + PCA: project the LASER embeddings to 2-D with PCA, then report the
# k-means silhouette score for each candidate number of clusters.

# Seed PCA like every other stochastic step in this notebook: with the default
# svd_solver='auto' scikit-learn may pick the randomized solver, whose output
# varies between runs unless random_state is fixed.
pca_reducer = PCA(n_components=2, random_state=42)
laser_pca = pca_reducer.fit_transform(laser)

cluster_range = [30, 40, 50, 60, 70, 80]
silhouette_scores = []  # one score per entry of cluster_range, same order

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(laser_pca)
    sil_score = silhouette_score(laser_pca, cluster_labels)
    silhouette_scores.append(sil_score)
    print(f"Silhouette Score for {k} clusters: {sil_score:.4f}")

In [None]:
# LASER + UMAP: project the LASER embeddings to 2-D with UMAP, then report the
# k-means silhouette score for each candidate number of clusters.

umap_reducer = umap.UMAP(n_components=2, random_state=42)
laser_umap = umap_reducer.fit_transform(laser)

silhouette_scores = []  # one score per entry of cluster_range, same order
cluster_range = list(range(30, 81, 10))  # 30, 40, ..., 80

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit(laser_umap).labels_
    sil_score = silhouette_score(laser_umap, cluster_labels)
    print(f"Silhouette Score for {k} clusters: {sil_score:.4f}")
    silhouette_scores.append(sil_score)

In [None]:
# SBERT + PCA: project the SBERT embeddings to 2-D with PCA, then report the
# k-means silhouette score for each candidate number of clusters.

# Seed PCA like every other stochastic step in this notebook: with the default
# svd_solver='auto' scikit-learn may pick the randomized solver, whose output
# varies between runs unless random_state is fixed.
pca_reducer = PCA(n_components=2, random_state=42)
sbert_pca = pca_reducer.fit_transform(sbert)

cluster_range = [30, 40, 50, 60, 70, 80]
silhouette_scores = []  # one score per entry of cluster_range, same order

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(sbert_pca)
    sil_score = silhouette_score(sbert_pca, cluster_labels)
    silhouette_scores.append(sil_score)
    print(f"Silhouette Score for {k} clusters: {sil_score:.4f}")

In [None]:
# SBERT + UMAP: project the SBERT embeddings to 2-D with UMAP, then report the
# k-means silhouette score for each candidate number of clusters.
# `sbert_umap` is reused by the SSE, final clustering and plotting cells.

umap_reducer = umap.UMAP(n_components=2, random_state=42)
sbert_umap = umap_reducer.fit_transform(sbert)

silhouette_scores = []  # one score per entry of cluster_range, same order
cluster_range = list(range(30, 81, 10))  # 30, 40, ..., 80

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit(sbert_umap).labels_
    sil_score = silhouette_score(sbert_umap, cluster_labels)
    print(f"Silhouette Score for {k} clusters: {sil_score:.4f}")
    silhouette_scores.append(sil_score)

In [None]:
# Elbow plot: k-means inertia (sum of squared errors) on the SBERT-UMAP
# embedding for k = 30, 35, ..., 100, used to choose the number of clusters.

sse = []
cluster_range = list(range(30, 101, 5))
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(sbert_umap)
    sse.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
# The format string carries only marker and line style. The original 'bo-'
# also requested blue, which the color keyword overrode (with a matplotlib
# warning about the colour being defined twice); the rendered line is the
# same black circles-and-line as before.
plt.plot(cluster_range, sse, 'o-', color='black')
plt.xlabel('Number of Clusters', fontsize=16)
plt.ylabel('Sum of Squared Errors', fontsize=16)
plt.xticks(cluster_range)
plt.grid(True)
plt.show()

In [None]:
# Final model: k-means with 50 clusters on the SBERT-UMAP embedding.
kmeans_model = KMeans(n_clusters=50, random_state=42, n_init=10).fit(sbert_umap)

# Store each tweet's cluster id next to its metadata.
df['n_50'] = kmeans_model.labels_

In [None]:
# Tokenise each cleaned tweet on whitespace, then pool the tokens of all
# tweets that share a cluster into one list per cluster.

tweet_tokens = df['text_noent_clean'].str.split()
df['keywords'] = tweet_tokens

# Summing a column of lists concatenates them group by group.
keywords_grouped = df.groupby('n_50')['keywords']
top_keywords_by_cluster = keywords_grouped.sum()

def get_top_keywords(keywords, top_n=5):
    """Return the `top_n` most frequent items of `keywords`.

    The result is a list of (item, count) tuples ordered from most to least
    frequent, as produced by `Counter.most_common`.
    """
    return Counter(keywords).most_common(top_n)

# One list of the five most common (keyword, count) pairs per cluster id.
df_top_keywords = top_keywords_by_cluster.apply(get_top_keywords, top_n=5)

In [None]:
# Attach each cluster's top keywords to its tweets, then order the rows by tweet id.
# NOTE(review): re-running this cell merges a second 'top_keywords' column into `df`.
cluster_keyword_lookup = df_top_keywords.rename('top_keywords')
df = df.merge(cluster_keyword_lookup, left_on='n_50', right_index=True).sort_values(by='id')

In [None]:
# Draw 10 tweets from every cluster (same seed for each cluster) and export
# them for manual labelling.
# NOTE(review): the file is written under tweets/tweets_results/, while the
# labelled file is read back from tweets/ - presumably it is moved after
# labelling; confirm.

sampled_per_cluster = df.groupby('n_50').apply(lambda group: group.sample(n=10, random_state=42))
df_random = sampled_per_cluster.reset_index(drop=True)
df_random.to_excel('tweets/tweets_results/tweets_500_random.xlsx', index=False)

In [None]:
# Per cluster, add up the manually assigned label values of each category column.

categories = ['Agriculture', 'Water', 'Ecosystem', 'Economy', 'Society', 'General']

df_label = pd.read_excel('tweets/tweets_500_random.xlsx')
labels_by_cluster = df_label.groupby('n_50')[categories]
cluster_sum = labels_by_cluster.sum()

In [None]:
# Assign category scores to each cluster and map the scores back to tweets

def assign_scores(count):
    """Translate a per-cluster label count into a category score.

    Counts of 6 or more score 1, 3 to 5 score 0.5, 1 to 2 score 0.2, and
    anything else (including NaN, which fails every comparison) scores 0.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, score in ((6, 1), (3, 0.5), (1, 0.2)):
        if count >= threshold:
            return score
    return 0
    
# Convert every per-cluster label count into a category score, then attach the
# scores of a tweet's cluster to the tweet.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when it exists and fall back to applymap on older pandas.
if hasattr(pd.DataFrame, 'map'):
    cluster_scores = cluster_sum.map(assign_scores)
else:
    cluster_scores = cluster_sum.applymap(assign_scores)
# Turn the 'n_50' index back into a column so it can serve as the merge key.
cluster_scores = cluster_scores.reset_index()

df_scores = df.merge(cluster_scores, on='n_50', how='left')

In [None]:
# Export the scored tweets; the DataFrame index is written as the first column.
df_scores.to_excel('tweets/tweets_11578_scores.xlsx', index=True)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from matplotlib import colors

In [None]:
# Visualize embeddings results

# One matplotlib colormap per sector; each cluster of a sector is later drawn
# in its own shade taken from that sector's colormap.
sector_color_maps = {
    "Water": cm.Blues,
    "Society": cm.RdPu,
    "General": cm.Greys,
    "Agriculture": cm.YlOrBr,
    "Ecosystem": cm.Greens,
    "Economy": cm.Purples
}

# Map clusters to their dominant sectors
# Keys are the k-means cluster ids 0-49 stored in df['n_50']; values must be
# keys of sector_color_maps.
# NOTE(review): presumably derived from the manual labelling results - confirm.
cluster_to_category = {
    0: "Society", 1: "Water", 2: "Water", 3: "Agriculture", 4: "Society",
    5: "Water", 6: "Water", 7: "Water", 8: "Agriculture", 9: "Water",
    10: "General", 11: "General", 12: "Society", 13: "Water", 14: "General",
    15: "General", 16: "Society", 17: "Society", 18: "Ecosystem", 19: "Society",
    20: "Water", 21: "Society", 22: "Economy", 23: "Water", 24: "Agriculture",
    25: "Water", 26: "Society", 27: "Agriculture", 28: "General", 29: "Water",
    30: "Water", 31: "Society", 32: "Water", 33: "General", 34: "General",
    35: "General", 36: "Society", 37: "Society", 38: "Ecosystem", 39: "Society",
    40: "Water", 41: "Agriculture", 42: "Society", 43: "Society", 44: "General",
    45: "Water", 46: "Society", 47: "Water", 48: "Society", 49: "Water"
}

# Give every cluster its own shade of its sector's colormap.
cluster_colors = {}
for category, color_map in sector_color_maps.items():
    category_clusters = [cluster for cluster, cat in cluster_to_category.items() if cat == category]
    num_clusters = len(category_clusters)
    # Evenly spaced positions between 0.4 and 0.8 of the colormap, one RGBA row per cluster.
    # Named `shades` rather than `colors` so the `matplotlib.colors` module
    # imported under the name `colors` is not overwritten.
    shades = color_map(np.linspace(0.4, 0.8, num_clusters))
    for cluster, shade in zip(category_clusters, shades):
        cluster_colors[cluster] = shade

# Scatter the 2-D SBERT-UMAP embedding with one colour per cluster and annotate
# three clusters with their most frequent keywords.
plt.figure(figsize=(12, 8))

# Select points with the fitted model's labels rather than with `df`: `df` was
# re-sorted by 'id' after the keyword merge, so its row order only matches
# `sbert_umap` if the spreadsheet was already ordered by id, whereas
# `kmeans_model.labels_` is row-aligned with `sbert_umap` by construction.
point_labels = kmeans_model.labels_

unique_clusters = df['n_50'].unique()
for cluster in unique_clusters:
    cluster_points = sbert_umap[point_labels == cluster]
    color = cluster_colors.get(cluster, "black")
    # Same fallback policy as the colour lookup: a cluster missing from the
    # mapping is drawn in black and labelled "Unknown" instead of raising KeyError.
    category = cluster_to_category.get(cluster, "Unknown")
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster} ({category})", color=color, s=10, alpha=0.7)

# Calculate centroids and add keyword labels only for clusters 18, 22, and 27
for cluster in [18, 22, 27]:
    if cluster in unique_clusters:
        # Every row of a cluster carries the same keyword list, so the first row is enough.
        keywords = df.loc[df['n_50'] == cluster, 'top_keywords'].iloc[0]
        # Each entry is a (keyword, count) pair; show the three most frequent keywords.
        keywords_text = ', '.join([keyword[0] for keyword in keywords[:3]])

        label_text = f"Cluster {cluster}\n{keywords_text}"

        cluster_points = sbert_umap[point_labels == cluster]
        centroid = cluster_points.mean(axis=0)

        plt.text(centroid[0], centroid[1], label_text, fontsize=8, ha='center', va='center',
                 bbox=dict(facecolor='white', alpha=0.6))

plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.title("UMAP Visualization of Clustered Embeddings with Sector-Based Color Schemes")
plt.show()