In [3]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import nltk

In [18]:
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    
    text = re.sub(r'[^a-z\s]', '', text)
    
    tokens = text.split()
    
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /Users/ammar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]


In [9]:
preprocessed_dataset = [preprocess_text(doc) for doc in dataset]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

k = 2
km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              0
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 camp
 mountain
 hike
 enjoy
 video
 sport
 prefer
 game
 book
 read

Cluster 1:
 love
 footbal
 weekend
 go
 music
 concert
 listen
 play
 sport
 camp

Purity: 0.6


In [16]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter
import nltk

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    
    text = re.sub(r'[^a-z\s]', '', text)
    
    tokens = text.split()
    
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

preprocessed_dataset = [preprocess_text(doc) for doc in dataset]

tokenized_dataset = [doc.split() for doc in preprocessed_dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
                           window=5, min_count=1, workers=4)

X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
                       word2vec_model.wv], axis=0) for doc in preprocessed_dataset])

k = 2
km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1
Purity: 0.6


[nltk_data] Downloading package stopwords to /Users/ammar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
