In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
import numpy as np

# Sentence encoder shared by every embedding computed in this notebook.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Questions to cluster. The order matters: later cells index into this list
# by position.
corpus = [
    'I want to learn more on cache.',
    # (sic) the trailing '.' is kept on purpose; editing the text would
    # change the sentence's embedding.
    'What is virtual memory?.',
    'How does virtual memory differ from physical memory?',
    'What is the purpose of virtual memory in a computer system?',
    'How is virtual memory implemented in modern operating systems?',
    'What is cache memory and what is its purpose?',
    'What are the main components of a hard disk drive?',
    'What is the difference between a hard disk drive and a solid-state drive (SSD)?',
    'How does a hard disk store and retrieve data?',
    'What is the cache replacement policy and why is it important?',
    'What are the different levels of cache typically found in modern processors?',
    'How does cache memory improve the performance of a computer system?',
]

# Presumably one embedding row per corpus sentence (the axis=1 norm below
# relies on a 2-D result).
raw_embeddings = embedder.encode(corpus)

# Scale every row to unit L2 norm; keepdims keeps the norms as a column so the
# division broadcasts row by row.
row_norms = np.linalg.norm(raw_embeddings, axis=1, keepdims=True)
corpus_embeddings = raw_embeddings / row_norms

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Cluster the normalized sentence embeddings into three groups.
# K-means starts from randomly chosen centroids, so without a fixed seed both
# the partition and the integer id given to each cluster can change between
# runs. random_state pins that. n_init is spelled out because its default
# differs across scikit-learn releases.
# NOTE: cluster ids are arbitrary labels, not topic identifiers; check any
# id -> topic mapping against the labels printed below.
clustering_model = KMeans(n_clusters=3, n_init=10, random_state=42)
clustering_model.fit(corpus_embeddings)
# labels_[i] is the cluster index assigned to corpus_embeddings[i].
cluster_assignment = clustering_model.labels_
print(cluster_assignment)

[1 0 0 0 0 1 2 2 2 1 1 1]




In [None]:
# K-means cluster ids are arbitrary integers: which group ends up as 0, 1 or 2
# depends on the centroid initialisation. Rather than hardcoding id -> topic
# (valid only for one particular run), name each cluster after a corpus
# sentence whose topic is known.
topic_anchor_rows = {
    'virtual memory': 1,  # corpus[1]: 'What is virtual memory?.'
    'cache': 0,           # corpus[0]: 'I want to learn more on cache.'
    'hard disk': 6,       # corpus[6]: 'What are the main components of a hard disk drive?'
}

# Maps cluster id (plain int) -> human-readable topic name.
topics = {
    int(cluster_assignment[row]): topic_name
    for topic_name, row in topic_anchor_rows.items()
}

# Two anchors landing in the same cluster means the clustering did not
# separate the topics; fail loudly instead of silently mislabelling a cluster.
if len(topics) != len(topic_anchor_rows):
    raise ValueError(
        'Anchor sentences for different topics share a cluster; '
        f'cannot name clusters from assignment {list(cluster_assignment)}'
    )

In [None]:
# Group the corpus sentences by the cluster each one was assigned to.
# Keys are cluster ids in order of first appearance; each value lists that
# cluster's sentences in corpus order.
clustered_sentences = {}
for row, label in enumerate(cluster_assignment):
    # setdefault creates the bucket the first time a cluster id is seen.
    clustered_sentences.setdefault(label, []).append(corpus[row])
clustered_sentences

{1: ['I want to learn more on cache.',
  'What is cache memory and what is its purpose?',
  'What is the cache replacement policy and why is it important?',
  'What are the different levels of cache typically found in modern processors?',
  'How does cache memory improve the performance of a computer system?'],
 0: ['What is virtual memory?.',
  'How does virtual memory differ from physical memory?',
  'What is the purpose of virtual memory in a computer system?',
  'How is virtual memory implemented in modern operating systems?'],
 2: ['What are the main components of a hard disk drive?',
  'What is the difference between a hard disk drive and a solid-state drive (SSD)?',
  'How does a hard disk store and retrieve data?']}

In [None]:
# Re-key the grouped sentences by topic name instead of numeric cluster id.
# dict.get is used, so a cluster id missing from `topics` lands under None.
actual_dict = {
    topics.get(cluster_id): sentences
    for cluster_id, sentences in clustered_sentences.items()
}

In [None]:
# Print each topic as a '*'-prefixed heading followed by its sentences, one
# per line, with a blank line after every topic.
for topic_name in actual_dict:
    print('* ', topic_name)
    for sentence in actual_dict[topic_name]:
        print(sentence)

    print()

*  cache
I want to learn more on cache.
What is cache memory and what is its purpose?
What is the cache replacement policy and why is it important?
What are the different levels of cache typically found in modern processors?
How does cache memory improve the performance of a computer system?

*  virtual memory
What is virtual memory?.
How does virtual memory differ from physical memory?
What is the purpose of virtual memory in a computer system?
How is virtual memory implemented in modern operating systems?

*  hard disk
What are the main components of a hard disk drive?
What is the difference between a hard disk drive and a solid-state drive (SSD)?
How does a hard disk store and retrieve data?



In [None]:
# Classify a new question with the fitted clustering model.
test_q = 'What is virtual memory and how does it work?'

# Encoding a single string is expected to give a 1-D vector; make it an
# explicit one-row batch, since predict() takes 2-D input.
encoded_test_q = embedder.encode(test_q).reshape(1, -1)
# Apply the same unit-L2 scaling that was applied to the corpus embeddings.
query_norm = np.linalg.norm(encoded_test_q, axis=1, keepdims=True)
encoded_test_q = encoded_test_q / query_norm
test_q_resulted_label = clustering_model.predict(encoded_test_q)

# Show the topic name of the predicted cluster.
topics[test_q_resulted_label[0]]

'virtual memory'