In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import warnings

# Silence FutureWarnings for the whole session. They presumably come from a
# deprecated pandas call made inside scikit-learn (unconfirmed), so they are
# treated as noise here.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the TF-IDF table from the Excel file.
data = pd.read_excel("tdf-vectors.xlsx")

# Transpose the data so that documents are rows and words are columns
# (assumes the "Unnamed: 0" column holds the words -- TODO confirm).
transposed_data = data.set_index("Unnamed: 0").transpose()

# Seed documents whose TF-IDF vectors are used as the initial centroids.
# The ORDER of this list defines the cluster numbering: the i-th name seeds
# cluster i. Everything below (cluster count, cluster labels) is derived from
# this list so the three cannot drift apart when it is edited.
centroid_names = [
    "9901_sports.txt-tfidf",
    "9902_food.txt-tfidf",
    "9903_tech.txt-tfidf",
    "9904_science.txt-tfidf",
    "9905_business.txt-tfidf",
    "9906_politics.txt-tfidf",
]
centroids = transposed_data.loc[centroid_names]

# Prepare the data for clustering (excluding the centroids themselves).
data_for_clustering = transposed_data.drop(centroid_names)

# Run KMeans starting from the seed vectors. n_init=1 because the explicit
# initial centroids make repeated restarts pointless.
kmeans = KMeans(n_clusters=len(centroid_names), init=centroids, n_init=1)
labels = kmeans.fit_predict(data_for_clustering)

# Map cluster number -> topic name. The topic is the text between the first
# "_" and the first "." of the seed name ("9901_sports.txt-tfidf" -> "sports"),
# which yields {0: "sports", 1: "food", 2: "tech", 3: "science",
# 4: "business", 5: "politics"} for the list above.
cluster_mapping = {
    cluster_id: seed_name.partition("_")[2].partition(".")[0]
    for cluster_id, seed_name in enumerate(centroid_names)
}

# Attach the cluster name of every document to the dataframe.
# NOTE: this is done after fitting, so the label column is never part of the
# features the model was trained on.
data_for_clustering['Cluster'] = [cluster_mapping[label] for label in labels]

# Create a simplified dataframe showing each document and its cluster.
document_clusters = data_for_clustering[['Cluster']].reset_index()
document_clusters.columns = ['Document', 'Cluster']

# At this point, 'document_clusters' contains the documents with their respective cluster names.

# Dictionary keyed by cluster name; each value is the list of documents
# assigned to that cluster, in their original row order.
clustered_documents = {
    cluster: list(group['Document'])
    for cluster, group in document_clusters.groupby('Cluster')
}

# Shallow copy kept under its own name, then displayed as the cell's output.
clustered_document_samples = dict(clustered_documents)
clustered_document_samples

{'business': ['0203_food.txt-tfidf'],
 'food': ['0201_food.txt-tfidf'],
 'politics': ['0105_food.txt-tfidf',
  '0101_sports.txt-tfidf',
  '0117_sports.txt-tfidf',
  '0306_science.txt-tfidf',
  '0111_sports.txt-tfidf',
  '0502_business.txt-tfidf',
  '0202_food.txt-tfidf',
  '0114_sports.txt-tfidf',
  '0108_sports.txt-tfidf',
  '0520_politics.txt-tfidf',
  '0116_food.txt-tfidf',
  '0302_science.txt-tfidf'],
 'science': ['0118_sports.txt-tfidf',
  '0518_business.txt-tfidf',
  '0515_politics.txt-tfidf',
  '0104_sports.txt-tfidf',
  '0305_tech.txt-tfidf',
  '0310_science.txt-tfidf',
  '0109_sports.txt-tfidf',
  '0409_science.txt-tfidf',
  '0519_business.txt-tfidf',
  '0309_science.txt-tfidf',
  '0420_tech.txt-tfidf',
  '0313_tech.txt-tfidf',
  '0312_tech.txt-tfidf',
  '0315_science.txt-tfidf',
  '0419_science.txt-tfidf',
  '0415_tech.txt-tfidf',
  '0216_food.txt-tfidf',
  '0217_food.txt-tfidf',
  '0319_tech.txt-tfidf',
  '0318_tech.txt-tfidf',
  '0406_science.txt-tfidf',
  '0106_sports.txt-