# Unsupervised Learning

We will evaluate several topic modeling techniques:
- LDA
- NMF 
- Top2Vec
- Bertopic 
    - BERT classic (all-MiniLM-L6-v2)
    - openai
    - tweet-classification
    - climatebert
    - universal sentence encoder


For each of these I ran several tests with different parameters and different datasets, varying the number of topics.

Dataset:
- climate: 1669 preprocessed tweets 
- todo other

## Prepare Data

In [1]:
import os

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

from evaluation import Trainer, DataLoader

load_dotenv()

2023-03-28 16:14:30.055781: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alessiogandelli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Build the climate corpus in one pipeline: dump the documents to
# "climate.txt", then run the OCTIS preprocessing into the "climate" folder.
# `dataloader` ends up bound to whatever preprocess_octis() returns.
dataloader = (
    DataLoader(dataset="climate")
    .prepare_docs(save="climate.txt")
    .preprocess_octis(output_folder="climate")
)

Loading climate data
created vocab
5225
words filtering done


## NMF



In the next cell we train the model multiple times: each run sweeps `num_topics` from 10 to 50 in steps of 10, and the whole sweep is repeated 3 times with different random seeds.

In [None]:
# Train NMF once per random seed, sweeping num_topics over 10, 20, ..., 50.
# Every run's records are accumulated in `results`; previously `results` was
# rebound on each iteration, so only the last seed's scores reached the
# aggregation cell that is supposed to average over the 3 runs.
dataset = "climate"
custom = True

results = []
for i, random_state in enumerate([0, 21, 42]):
    # A fresh params dict per run, in case Trainer mutates it.
    params = {"num_topics": list(range(10, 60, 10)), "random_state": random_state}

    trainer = Trainer(dataset=dataset,
                      model_name="NMF",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)

    # train() returns a list of result dicts (one per trained model), so
    # extend keeps `results` flat, as the aggregation cell expects.
    results.extend(trainer.train(save=f"NMF_climate_{i+1}"))

Now we collect all the results in a dataframe, compute the average over the 3 runs, and save the result to a CSV file.

In [138]:
# Flatten the NMF result records into a dataframe, average the scores per
# number of topics, and write the averages to CSV.
columns = ['Dataset', 'model', 'nr_topics', 'npmi', 'umass', 'diversity', 'computation_time']

# dataset used and model (taken from the first record)
dataset = results[0]['Dataset']
model = results[0]['Model']

# Build all rows first and create the frame in one go: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
rows = [
    [
        test['Dataset'],
        test['Model'],
        test['Params']['num_topics'],
        test['Scores']['npmi'],
        test['Scores']['umass'],
        test['Scores']['diversity'],
        test['Computation Time'],
    ]
    for test in results
]
result_df = pd.DataFrame(rows, columns=columns)

# group by number of topics and take the mean over the repeated runs
result_df.set_index(['Dataset', 'model', 'nr_topics'], inplace=True)
a = result_df.groupby(['nr_topics']).mean()
a.reset_index(inplace=True)

# save the results in a file
# NOTE(review): the file name carries a 'bertopic' prefix although these are
# NMF scores - kept unchanged in case other code reads this path; confirm.
a['dataset'] = dataset
a['model'] = model
a[['dataset', 'model', 'nr_topics', 'npmi', 'umass', 'diversity', 'computation_time']].to_csv('bertopic'+'nmf'+'.csv', index=False)
        


## LDA

In [None]:
# Train LDA once per random seed, sweeping num_topics over 10, 20, ..., 50.
# Every run's records are accumulated in `results`; previously `results` was
# rebound on each iteration, so only the last seed's scores were kept and the
# "mean over the 3 tests" in the next cell averaged a single run.
dataset, custom = "climate", True

results = []
for i, random_state in enumerate([0, 21, 42]):
    # A fresh params dict per run, in case Trainer mutates it.
    params = {"num_topics": list(range(10, 60, 10)), "random_state": random_state}

    trainer = Trainer(dataset=dataset,
                      model_name="LDA",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)

    # train() returns a list of result dicts; extend keeps `results` flat.
    results.extend(trainer.train())

In [141]:
# Flatten the LDA result records into a dataframe, average the scores per
# number of topics, and write the averages to LDA.csv.
columns = ['Dataset', 'model', 'nr_topics', 'npmi', 'umass', 'diversity', 'computation_time']

# dataset used and model (taken from the first record)
dataset = results[0]['Dataset']
model = results[0]['Model']

# Build all rows first and create the frame in one go: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
rows = [
    [
        test['Dataset'],
        test['Model'],
        test['Params']['num_topics'],
        test['Scores']['npmi'],
        test['Scores']['umass'],
        test['Scores']['diversity'],
        test['Computation Time'],
    ]
    for test in results
]
result_df = pd.DataFrame(rows, columns=columns)

# group by number of topics and take the mean over the repeated runs
result_df.set_index(['Dataset', 'model', 'nr_topics'], inplace=True)
a = result_df.groupby(['nr_topics']).mean()
a.reset_index(inplace=True)

# save the results
a['dataset'] = dataset
a['model'] = model
a[['dataset', 'model', 'nr_topics', 'npmi', 'umass', 'diversity',
   'computation_time']].to_csv('LDA'+'.csv', index=False)

## Bertopic

 ### Data preparation 

Here we prepare the data for the BERTopic model: we convert the preprocessed corpus back into a list of strings.


In [15]:
%%capture
# Load the climate corpus and rebuild it as plain strings.
dataset = "climate"
custom = True
data_loader = DataLoader(dataset)

# Only the timestamps are kept from load_docs(); the documents themselves are
# taken from the OCTIS-preprocessed corpus below.
_, timestamps = data_loader.load_docs()

# Each corpus entry is a sequence of tokens; join them back into one
# space-separated string per document.
data = [" ".join(tokens) for tokens in data_loader.load_octis(custom).get_corpus()]

### evaluate 


Some functions to evaluate the models

In [134]:
def get_bertopic_result(model_name, embeddings = None, custom = False):
    """Run the BERTopic evaluation three times for one embedding source.

    Reads the notebook globals ``data`` (documents to embed) and ``dataset``
    (dataset name passed to ``Trainer``).

    Args:
        model_name: Name used in the progress messages. When ``custom`` is
            False it is also the sentence-transformers model that is loaded
            to embed ``data``.
        embeddings: Precomputed document embeddings. Only used when
            ``custom`` is True; otherwise it is replaced by the embeddings
            computed here.
        custom: True when ``embeddings`` are supplied by the caller.

    Returns:
        A list with three entries, one ``trainer.train()`` result per run.
    """
    display_name = model_name

    if custom:
        # Caller-provided embeddings: no embedding model name is passed on.
        embedding_model = None
    else:
        encoder = SentenceTransformer(display_name)
        embeddings = encoder.encode(data, show_progress_bar=True)
        print('embedded')
        embedding_model = display_name

    all_runs = []
    for run in range(3):
        # Parameter grid handed to the trainer, rebuilt for every run.
        params = {
            "embedding_model": embedding_model,
            "nr_topics": [(step + 1) * 10 for step in range(5)],  # 10, 20, 30, 40, 50 topics
            "min_topic_size": [5, 15],                            # 5, 15 documents per topic
            "verbose": True,
        }

        trainer = Trainer(
            dataset=dataset,
            model_name="BERTopic",
            params=params,
            bt_embeddings=embeddings,
            custom_dataset=True,
            verbose=False,
        )

        run_result = trainer.train()
        all_runs.append(run_result)
        print(f"Done with {display_name} {run + 1}")

    return all_runs

def clean_results(results, output_file="bertopicclimabert.csv"):
    """Tabulate BERTopic runs, save the averaged scores, return the raw table.

    Args:
        results: List of runs as returned by ``get_bertopic_result``; each
            run is a list of result dicts with the keys ``'Dataset'``,
            ``'Params'`` (``'embedding_model'``, ``'nr_topics'``,
            ``'min_topic_size'``), ``'Scores'`` (``'npmi'``, ``'umass'``,
            ``'diversity'``) and ``'Computation Time'``.
        output_file: Path of the CSV that receives the scores averaged per
            ``(nr_topics, min_topic_size)``. The default is the path that was
            previously hard-coded; pass a distinct path per model, otherwise
            each call overwrites the previous model's file.

    Returns:
        The per-run (not averaged) dataframe, indexed by
        ``Dataset, model, nr_topics, min_topic_size``.

    Raises:
        IndexError: If ``results`` or its first run is empty.
    """
    columns = ['Dataset', 'model', 'nr_topics', 'min_topic_size',
               'npmi', 'umass', 'diversity', 'computation_time']

    # dataset used and model (taken from the first record)
    dataset = results[0][0]['Dataset']
    model = results[0][0]['Params']['embedding_model']

    # Build all rows first and create the frame in one go: DataFrame.append
    # was removed in pandas 2.0, and row-wise append copies the frame each time.
    rows = [
        [
            test['Dataset'],
            test['Params']['embedding_model'],
            test['Params']['nr_topics'],
            test['Params']['min_topic_size'],
            test['Scores']['npmi'],
            test['Scores']['umass'],
            test['Scores']['diversity'],
            test['Computation Time'],
        ]
        for result in results
        for test in result
    ]
    result_df = pd.DataFrame(rows, columns=columns)

    # group by parameter combination and take the mean over the repeated runs
    result_df.set_index(['Dataset', 'model', 'nr_topics', 'min_topic_size'], inplace=True)
    averaged = result_df.groupby(['nr_topics', 'min_topic_size']).mean()
    averaged.reset_index(inplace=True)

    # save the results
    averaged['dataset'] = dataset
    averaged['model'] = model
    averaged[['dataset', 'model', 'nr_topics', 'min_topic_size', 'npmi', 'umass',
              'diversity', 'computation_time']].to_csv(output_file, index=False)

    return result_df

def get_openai_embeddings(texts, model="text-embedding-ada-002"):
    """Embed ``texts`` with the OpenAI embeddings endpoint.

    Args:
        texts: Iterable of strings to embed; all of them are sent in a
            single request.
        model: Name of the OpenAI embedding model.

    Returns:
        A NumPy array with one embedding vector per entry of the response's
        ``'data'`` list.
    """
    # Newlines are replaced by spaces before the texts are sent.
    single_line_texts = [text.replace("\n", " ") for text in texts]

    response = openai.Embedding.create(input = single_line_texts, model=model)
    vectors = [np.array(item['embedding']) for item in response['data']]
    return np.array(vectors)

### climatebert

In [None]:
# Evaluate BERTopic on embeddings from the ClimateBERT sentence-transformers
# checkpoint (three runs), then tabulate the scores and write them to CSV.
# NOTE(review): clean_results is called without a model-specific output path,
# so every model's call writes to the same CSV file - confirm this is intended.
climatebert_name = "climatebert/distilroberta-base-climate-f"
climatebert_results = get_bertopic_result(climatebert_name)
clean_results(climatebert_results)

### tweetclassification

In [None]:
# Evaluate BERTopic on embeddings from the tweet-classification
# sentence-transformers checkpoint (three runs), then tabulate the scores.
# NOTE(review): clean_results is called without a model-specific output path,
# so every model's call writes to the same CSV file - confirm this is intended.
tc_name = "louisbetsch/tweetclassification-bf-model"
tc_results = get_bertopic_result(tc_name)
clean_results(tc_results)

### bert base

In [None]:
# Evaluate BERTopic on all-MiniLM-L6-v2 embeddings (three runs), then tabulate
# the scores.
# NOTE(review): clean_results is called without a model-specific output path,
# so every model's call writes to the same CSV file - confirm this is intended.
bert_model_name = "all-MiniLM-L6-v2"
bert_results = get_bertopic_result(bert_model_name)
clean_results(bert_results)

### openai

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable
# (load_dotenv() is called in the first cell).
openai.api_key = os.getenv("OPENAI_API_KEY")
# Embed the whole corpus with OpenAI, then evaluate BERTopic on the precomputed
# vectors: custom=True makes get_bertopic_result skip its own encoding step.
openai_embeddings = get_openai_embeddings(data)
openai_results = get_bertopic_result("openai", embeddings=openai_embeddings, custom=True)
# NOTE(review): clean_results is called without a model-specific output path,
# so every model's call writes to the same CSV file - confirm this is intended.
clean_results(openai_results)

### Universal sentence encoder 

In [None]:
import tensorflow_hub
# Load the Universal Sentence Encoder (version 4) module from TF Hub.
embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Embed every document; assumes the module accepts the list of strings
# directly - TODO confirm.
use_embeddings = embedding_model(data)
# Convert the per-document outputs into a single NumPy array.
use_embeddings = np.array([emb.numpy() for emb in use_embeddings])
# custom=True: evaluate BERTopic on these precomputed embeddings.
use_results = get_bertopic_result("use", embeddings=use_embeddings, custom=True)
# NOTE(review): clean_results is called without a model-specific output path,
# so every model's call writes to the same CSV file - confirm this is intended.
clean_results(use_results)

## Top2Vec

In [147]:
# Train Top2Vec three times with the same settings. Every run's records are
# accumulated in `results`; previously `results` was rebound on each
# iteration, so only the last run was kept and the "mean over the 3 tests" in
# the aggregation cell averaged a single run.
dataset, custom = "climate", True

results = []
for i in range(3):
    # A fresh params dict per run, in case Trainer mutates it.
    params = {"nr_topics": list(range(10, 60, 10)),
              # "embedding_model": "all-MiniLM-L6-v2",
              "hdbscan_args": {'min_cluster_size': 15,
                               'metric': 'euclidean',
                               'cluster_selection_method': 'eom'}}

    trainer = Trainer(dataset=dataset,
                      custom_dataset=custom,
                      #custom_model=Top2Vec,
                      model_name="Top2Vec",
                      params=params,
                      verbose=False)

    # train() returns a list of result dicts; extend keeps `results` flat.
    results.extend(trainer.train())

2023-03-17 14:11:41,686 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2023-03-17 14:11:42,360 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2023-03-17 14:11:52,311 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2023-03-17 14:12:15,671 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2023-03-17 14:12:16,123 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics
2023-03-17 14:12:17,325 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2023-03-17 14:12:17,680 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2023-03-17 14:12:26,041 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Crea

In [148]:
# Inspect the raw Top2Vec result records (rendered as the cell output).
results

[{'Dataset': 'climate',
  'Dataset Size': 1669,
  'Model': 'Top2Vec',
  'Params': {'hdbscan_args': {'min_cluster_size': 15,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom'},
   'reduction': False,
   'nr_topics': 7},
  'Scores': {'npmi': -0.1587504863442093,
   'umass': -6.289935931655772,
   'diversity': 0.44285714285714284},
  'Computation Time': 57.5817129611969},
 {'Dataset': 'climate',
  'Dataset Size': 1669,
  'Model': 'Top2Vec',
  'Params': {'hdbscan_args': {'min_cluster_size': 15,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom'},
   'reduction': False,
   'nr_topics': 2},
  'Scores': {'npmi': -0.14258214992671217,
   'umass': -5.516094819047652,
   'diversity': 0.9},
  'Computation Time': 51.67424297332764},
 {'Dataset': 'climate',
  'Dataset Size': 1669,
  'Model': 'Top2Vec',
  'Params': {'hdbscan_args': {'min_cluster_size': 15,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom'},
   'reduction': False,
   'nr_topics': 6},
  'Sco

In [145]:
# Flatten the Top2Vec result records into a dataframe, average the scores per
# (nr_topics, min_topic_size), and write the averages to CSV.
columns = ['Dataset', 'model', 'nr_topics', 'min_topic_size', 'npmi', 'umass', 'diversity', 'computation_time']

# dataset used and model (taken from the first record)
dataset = results[0]['Dataset']
model = results[0]['Model']

# Build all rows first and create the frame in one go: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
# The 'min_topic_size' column is filled from HDBSCAN's min_cluster_size.
rows = [
    [
        test['Dataset'],
        test['Model'],
        test['Params']['nr_topics'],
        test['Params']['hdbscan_args']['min_cluster_size'],
        test['Scores']['npmi'],
        test['Scores']['umass'],
        test['Scores']['diversity'],
        test['Computation Time'],
    ]
    for test in results
]
result_df = pd.DataFrame(rows, columns=columns)

# group by parameter combination and take the mean over the repeated runs
result_df.set_index(['Dataset', 'model', 'nr_topics', 'min_topic_size'], inplace=True)
a = result_df.groupby(['nr_topics', 'min_topic_size']).mean()
a.reset_index(inplace=True)

# save the results
# NOTE(review): the file is named 'bertopicopenai.csv' although it holds
# Top2Vec scores - looks like a copy-paste leftover. Kept unchanged in case
# other code reads this path; confirm and rename.
a['dataset'] = dataset
a['model'] = model
a[['dataset', 'model', 'nr_topics', 'min_topic_size', 'npmi', 'umass', 'diversity',
   'computation_time']].to_csv('bertopic'+'openai'+'.csv', index=False)
        
