## **BERTopic: default settings**

In [None]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# Raw text of every 20 Newsgroups post (the 'all' subset), taken from the returned bunch.
docs = fetch_20newsgroups(subset='all').data

# Fit a BERTopic model with library defaults on the first 100 posts only and keep
# the per-document topic ids and probabilities that fit_transform hands back.
topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(docs[:100])


In [None]:
# Display the topic id assigned to each of the 100 fitted documents
# (-1 is the id BERTopic uses for documents it treats as outliers).
topics

[3,
 0,
 -1,
 0,
 0,
 -1,
 1,
 3,
 3,
 2,
 2,
 1,
 2,
 -1,
 2,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 3,
 1,
 1,
 0,
 0,
 2,
 -1,
 1,
 0,
 3,
 -1,
 0,
 0,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 3,
 0,
 0,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 0,
 -1,
 1,
 2,
 0,
 2,
 1,
 3,
 0,
 -1,
 -1,
 2,
 1,
 3,
 1,
 0,
 0,
 0,
 -1,
 -1,
 1,
 1,
 1,
 3,
 0,
 2,
 3,
 -1,
 -1,
 0,
 -1,
 -1,
 2,
 2,
 -1,
 -1,
 2,
 3,
 2,
 2,
 0,
 -1,
 0,
 0,
 3,
 1,
 0]

In [None]:
# Display the probabilities returned by fit_transform alongside the topic assignments.
probabilities

array([1.        , 1.        , 0.        , 1.        , 0.99392061,
       0.        , 1.        , 1.        , 0.9902732 , 1.        ,
       1.        , 1.        , 1.        , 0.        , 0.97242466,
       0.        , 0.86348717, 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 1.        ,
       1.        , 1.        , 0.69180451, 1.        , 1.        ,
       0.        , 1.        , 1.        , 1.        , 0.        ,
       0.94337058, 0.93586901, 1.        , 0.        , 0.9517817 ,
       0.90689897, 0.        , 0.        , 0.        , 1.        ,
       1.        , 1.        , 0.        , 0.94184772, 0.        ,
       0.        , 0.        , 1.        , 0.69517268, 0.        ,
       0.88935723, 1.        , 0.94343116, 1.        , 0.94982853,
       1.        , 0.76040913, 0.        , 0.        , 0.98247802,
       1.        , 1.        , 0.9166383 , 0.88837485, 1.        ,
       0.9865659 , 0.        , 0.        , 0.99278062, 1.     

In [None]:
# Collect both summaries from the fitted model, then print them in the same order
# as before: representative documents first, topic overview table second.
representative_docs = topic_model.get_representative_docs()
topic_overview = topic_model.get_topic_info()

print("docs: {}\n".format(representative_docs))
print(topic_overview)




   Topic  Count              Name  \
0     -1     32  -1_the_and_of_to   
1      0     24  0_the_to_for_and   
2      1     18    1_the_to_of_in   
3      2     14  2_the_of_to_that   
4      3     12   3_the_to_and_he   

                                     Representation  \
0  [the, and, of, to, in, you, that, from, for, it]   
1    [the, to, for, and, is, of, have, you, it, in]   
2    [the, to, of, in, is, that, and, it, for, you]   
3    [the, of, to, that, in, is, and, it, are, you]   
4     [the, to, and, he, in, is, of, be, they, for]   

                                 Representative_Docs  
0  [From: sera@zuma.UUCP (Serdar Argic)\nSubject:...  
1  [From: jcmorris@mbunix.mitre.org (Morris)\nSub...  
2  [From: robinson@cogsci.Berkeley.EDU (Michael R...  
3  [From: pharvey@quack.kfu.com (Paul Harvey)\nSu...  
4  [From: cs902043@ariel.yorku.ca (SHAWN LUDDINGT...  


In [None]:
# Build the per-topic word bar chart, limited to five words per topic, and render it.
bar_chart = topic_model.visualize_barchart(n_words=5)
bar_chart.show()

In [None]:
# Build BERTopic's topic map figure for the fitted model and render it.
map_chart = topic_model.visualize_topics()
map_chart.show()

## **BERTopic: explicit SentenceTransformer embedding model**

In [None]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer

# `docs` is reused from the first BERTopic cell rather than fetched again, so that
# cell must have been run before this one.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=sentence_model)
topics, probabilities = topic_model.fit_transform(docs[:500])


In [None]:
# Pull the representative documents and the topic overview out of the model first,
# then emit them in the original order (documents line, then the table).
representative_docs = topic_model.get_representative_docs()
topic_overview = topic_model.get_topic_info()

print("docs: {}\n".format(representative_docs))
print(topic_overview)


   Topic  Count             Name  \
0      0    449  0_the_to_of_and   
1      1     51  1_the_to_in_and   

                                   Representation  \
0  [the, to, of, and, in, is, that, it, you, for]   
1   [the, to, in, and, edu, that, of, is, it, he]   

                                 Representative_Docs  
0  [From: viking@iastate.edu (Dan Sorenson)\nSubj...  
1  [From: franjion@spot.Colorado.EDU (John Franji...  


In [None]:
# Reduce the number of topics, passing the same 500 documents the model was fitted on.
# NOTE(review): the topic table printed for this model lists only topics 0 and 1,
# so asking for 10 topics presumably changes nothing — confirm the intended target.
topic_model.reduce_topics(docs[:500], nr_topics=10)

In [None]:
# Build the per-topic word bar chart, limited to five words per topic, and render it.
bar_chart = topic_model.visualize_barchart(n_words=5)
bar_chart.show()

## **BERTopic: custom CountVectorizer and automatic topic reduction**

In [None]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# Term counter for the topic representations: unigrams and bigrams, with English
# stop words removed so that words like "the"/"and"/"of" cannot dominate the topics.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")


docs = fetch_20newsgroups(subset='all')['data']

# Bind the configured model to `topic_model`, the name used for fitting and by the
# reporting cell. Previously the configured instance was stored only in `model`
# while fit_transform was called on the stale `topic_model` from the preceding
# section, so none of these settings were ever applied.
topic_model = BERTopic(top_n_words=15,
                       nr_topics="auto",
                       embedding_model="all-MiniLM-L6-v2",
                       verbose=True,
                       vectorizer_model=vectorizer_model
                       )
# Keep the original `model` name available as an alias for the same instance.
model = topic_model
topics, probabilities = topic_model.fit_transform(docs[:100])






2023-12-08 15:05:50,230 - BERTopic - Embedding - Transforming documents to embeddings.
2023-12-08 15:06:20,477 - BERTopic - Embedding - Completed ✓
2023-12-08 15:06:20,482 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-08 15:06:24,941 - BERTopic - Dimensionality - Completed ✓
2023-12-08 15:06:24,945 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-08 15:06:24,959 - BERTopic - Cluster - Completed ✓


In [None]:
# Print the topic overview table of the model that fit_transform was last called on.
print(topic_model.get_topic_info())


   Topic  Count               Name  \
0     -1     26   -1_the_and_to_of   
1      0     22     0_the_in_to_of   
2      1     21   1_the_to_for_and   
3      2     19    2_the_of_and_to   
4      3     12  3_det_nyr_the_tor   

                                     Representation  \
0  [the, and, to, of, in, that, edu, from, it, you]   
1    [the, in, to, of, is, it, that, you, and, com]   
2    [the, to, for, and, is, of, it, you, have, in]   
3   [the, of, and, to, in, that, is, it, you, from]   
4  [det, nyr, the, tor, bos, mtl, chi, la, pit, to]   

                                 Representative_Docs  
0  [From: npet@bnr.ca (Nick Pettefar)\nSubject: R...  
1  [From: servalan@access.digex.com (Servalan)\nS...  
2  [From: jcmorris@mbunix.mitre.org (Morris)\nSub...  
3  [From: rodger@zeisler.lonestar.org (Rodger B. ...  
4  [From: cabanrf@wkuvx1.bitnet\nSubject: Re: My ...  


## **LDA**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Toy corpus of six one-sentence documents.
texts = [
    "The principles of physics explain the behavior of matter and energy in the universe.",
    "Python is a widely-used programming language in the field of computer science.",
    "Mathematics is essential for understanding the structure and patterns in our world.",
    "In computer science, algorithms play a crucial role in solving computational problems.",
    "Newton's laws of motion describe the relationship between a body and the forces acting on it.",
    "Artificial intelligence is a rapidly evolving field with applications in various industries.",
]

# Bag-of-words counts: English stop words removed, terms present in more than 85%
# of the documents ignored, vocabulary capped at 1000 terms.
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.85,
    max_features=1000,
)
X = vectorizer.fit_transform(texts)

# Three-component LDA with a fixed seed; fit_transform yields the matrix printed
# below, one row per document and one column per component.
lda_model = LatentDirichletAllocation(random_state=42, n_components=3)
lda_topic_matrix = lda_model.fit_transform(X)

print(lda_topic_matrix)


[[0.91575029 0.0421745  0.04207521]
 [0.92397515 0.03867478 0.03735007]
 [0.04784919 0.04802717 0.90412364]
 [0.93269355 0.03368767 0.03361878]
 [0.0418672  0.04202243 0.91611037]
 [0.03753095 0.9252278  0.03724125]]


## **SentenceTransformer**

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Toy corpus of six one-sentence documents.
texts = [
    "The principles of physics explain the behavior of matter and energy in the universe.",
    "Python is a widely-used programming language in the field of computer science.",
    "Mathematics is essential for understanding the structure and patterns in our world.",
    "In computer science, algorithms play a crucial role in solving computational problems.",
    "Newton's laws of motion describe the relationship between a body and the forces acting on it.",
    "Artificial intelligence is a rapidly evolving field with applications in various industries.",
]

# Encode every sentence with the pretrained sentence-embedding model.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(texts, show_progress_bar=True)

# Group the embeddings into three clusters with a fixed seed; fit() returns the
# estimator itself, so construction and fitting are chained.
kmeans = KMeans(random_state=42, n_clusters=3).fit(embeddings)

print(kmeans.labels_)


.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1 0 2 2 1 0]


## **Doc2Vec**

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus of six one-sentence documents.
texts = [
    "The principles of physics explain the behavior of matter and energy in the universe.",
    "Python is a widely-used programming language in the field of computer science.",
    "Mathematics is essential for understanding the structure and patterns in our world.",
    "In computer science, algorithms play a crucial role in solving computational problems.",
    "Newton's laws of motion describe the relationship between a body and the forces acting on it.",
    "Artificial intelligence is a rapidly evolving field with applications in various industries.",
]

# Whitespace-tokenise each text and tag it with its position in `texts` (as a string).
tagged_data = [
    TaggedDocument(words=sentence.split(), tags=[str(position)])
    for position, sentence in enumerate(texts)
]

# Configure the model, build its vocabulary from the tagged documents, then train
# for the number of epochs the model was configured with.
doc2vec_model = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=100,
)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(
    tagged_data,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

# Infer one vector per text from its whitespace tokens.
doc_vectors = [doc2vec_model.infer_vector(sentence.split()) for sentence in texts]

# Show the inferred vectors.
print(doc_vectors)


[array([-0.04468578, -0.03172525, -0.02336094,  0.02108237, -0.0043058 ,
       -0.04247472,  0.00817306,  0.05366248, -0.01677766, -0.01404194,
       -0.02098143, -0.01483755,  0.01972649,  0.00623768,  0.01866548,
       -0.01819881, -0.00574904,  0.02255074, -0.01387547,  0.01134918,
        0.00279183, -0.00150497,  0.01626148,  0.0009281 , -0.02040142,
       -0.00098944, -0.04620254, -0.02118087, -0.02455159, -0.02195866,
        0.03613842, -0.0016514 ,  0.0247794 , -0.01020949,  0.00130957,
        0.03362888, -0.02358495, -0.02751656, -0.02398572, -0.0220746 ,
       -0.00488604,  0.00876167, -0.00815369, -0.04173892, -0.01628377,
       -0.02182982,  0.00012938, -0.00464728,  0.00939182, -0.0081459 ,
       -0.00802651, -0.01891498, -0.01747006, -0.01129803, -0.00710436,
       -0.00338911,  0.02110584,  0.02394428, -0.02000761,  0.01171236,
        0.04611248,  0.01497415,  0.02490227,  0.01155229,  0.00208763,
        0.02899683,  0.00492817,  0.01917089, -0.026857  ,  0.0

In [None]:
# Imported here so the cell runs on a fresh kernel: pandas is not imported anywhere
# else in the notebook, and KMeans was only brought in by the SentenceTransformer cell.
import pandas as pd
from sklearn.cluster import KMeans

# Apply K-Means clustering to the Doc2Vec vectors inferred in the previous cell
num_topics = 3  # Specify the number of topics
kmeans = KMeans(n_clusters=num_topics, random_state=42)
kmeans.fit(doc_vectors)

# Assign cluster labels to the texts (one label per entry of `doc_vectors`)
cluster_labels = kmeans.labels_

# Display results as a two-column table: the text and the cluster it landed in
df = pd.DataFrame({'text': texts, 'cluster': cluster_labels})
print(df)

                                                text  cluster
0  The principles of physics explain the behavior...        1
1  Python is a widely-used programming language i...        2
2  Mathematics is essential for understanding the...        1
3  In computer science, algorithms play a crucial...        1
4  Newton's laws of motion describe the relations...        0
5  Artificial intelligence is a rapidly evolving ...        1
