In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
dataset = ["I love playing football on the weekends",
"I enjoy hiking and camping in the mountains",
"I like to read books and watch movies",
"I prefer playing video games over sports",
"I love listening to music and going to concerts"]

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def preprocess(text):
    #convert text to lowercase
    text = text.lower()
    #tokenize
    tokens  = text.split()
    #remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [5]:
preprocessed_dataset = [preprocess(doc) for doc in dataset]
preprocessed_dataset

['love playing football weekend',
 'enjoy hiking camping mountain',
 'like read book watch movie',
 'prefer playing video game sport',
 'love listening music going concert']

# TEXT CLUSTERING USING TF-IDF VECTORIZER

In [6]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

In [7]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

  super()._check_params_vs_input(X, default_n_init=10)


Document                              Predicted Cluster
----------------------------------  -------------------
love playing football weekend                         1
enjoy hiking camping mountain                         0
like read book watch movie                            0
prefer playing video game sport                       0
love listening music going concert                    1

Top terms per cluster:
Cluster 0:
 camping
 mountain
 hiking
 enjoy
 video
 sport
 prefer
 game
 book
 read

Cluster 1:
 love
 football
 weekend
 going
 music
 concert
 listening
 playing
 sport
 camping



In [8]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


# TEXT CLUSTERING USING WORD2VEC VECTORIZER

In [9]:
tokenized_dataset = [doc.split() for doc in preprocessed_dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)

In [10]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in word2vec_model.wv], axis=0) for doc in preprocessed_dataset])

In [11]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

  super()._check_params_vs_input(X, default_n_init=10)


Document                              Predicted Cluster
----------------------------------  -------------------
love playing football weekend                         1
enjoy hiking camping mountain                         0
like read book watch movie                            0
prefer playing video game sport                       1
love listening music going concert                    0


In [12]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)

Purity: 0.6
