In [1]:
from contextlib import contextmanager
from typing import Optional
import tempfile

import pandas as pd
import os
import yaml

from autotm.base import AutoTM
from autotm.preprocessing.text_preprocessing import DataTransformer

### Prepare working directory

In [2]:
@contextmanager
def prepare_working_dir(working_dir: Optional[str] = None):
    """Yield a directory path to be used as the AutoTM working directory.

    Args:
        working_dir: Directory to use. When ``None``, a temporary directory
            whose name starts with ``autotm_wd_`` is created and removed
            again when the context exits. When a path is given, it is
            created if it does not exist yet and is left on disk afterwards.

    Yields:
        The path of the working directory.
    """
    if working_dir is None:
        with tempfile.TemporaryDirectory(prefix="autotm_wd_") as tmp_working_dir:
            yield tmp_working_dir
    else:
        # A caller-supplied directory may not exist yet; create it so the
        # yielded path is always usable. An empty string is passed through
        # unchanged, exactly as before.
        if working_dir:
            os.makedirs(working_dir, exist_ok=True)
        yield working_dir

In [3]:
# Quick check of the helper: called without an argument it yields a temporary
# directory, whose absolute path is printed here. The directory only exists
# inside this `with` block.
with prepare_working_dir() as wdir:
    print(f"Working directory {os.path.abspath(wdir)} for AutoTM")

Working directory /tmp/autotm_wd_4qc5tjnb for AutoTM


### Preprocess data using DataTransformer

In [4]:
# Build the preprocessing helper.
# NOTE(review): "text" is presumably the name of the dataset column that holds
# the raw documents — confirm against the DataTransformer documentation.
data_transformer = DataTransformer(col_to_process="text")

[nltk_data] Downloading package stopwords to /home/fhrzn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fhrzn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Call `.fit()` and `.transform()` sequentially.

In [5]:
# Step 1 of 2: fit the transformer on the sample Lenta corpus. The CSV path is
# relative to the current working directory.
data_transformer.fit('../data/sample_corpora/sample_dataset_lenta.csv')

In [6]:
# Step 2 of 2: run the transformation on the already fitted transformer.
# NOTE(review): presumably this populates `data_transformer.data`, which the
# model-fitting cell reads later — confirm.
data_transformer.transform()

Or perform both operations in a single call using `.fit_transform()`.

In [7]:
# Single-call alternative to the separate fit and transform cells, applied to
# the same sample CSV.
data_transformer.fit_transform('../data/sample_corpora/sample_dataset_lenta.csv')

### Define AutoTM instance

In [8]:
# Load the AutoTM settings from the YAML config; the resulting mapping is
# unpacked as keyword arguments when the AutoTM instance is created.
with open("../conf/config.yaml", mode="r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

Create the AutoTM instance, fit the model, and save it to disk

In [10]:
# Train inside a temporary working directory. The model itself is saved to a
# path relative to the current directory, so it stays available after the
# `with` block ends.
with prepare_working_dir() as wdir:
    model_name = "artm.model"

    autotm = AutoTM(**config, working_dir_path=wdir)
    autotm.fit(data_transformer.data)

    autotm.save(model_name, overwrite=True)
    print(f"Saving model to {os.path.abspath(model_name)}")

[nltk_data] Downloading package stopwords to /home/fhrzn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fhrzn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saved to /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/ppp.csv
Starting...
part 1/1


E0820 18:38:01.162883 96508 dictionary_operations.cc:381] Error at line 1, file /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/test_set_data_voc.txt. Expected format: <token> [<class_id>], dictionary will be gathered in random token order


/tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/cooc_df.txt is ready!
/tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/cooc_tf.txt is ready!
Calculating pPMI...
Calculating pPMI...
/tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/ppmi_tf.txt is ready!
/tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/ppmi_df.txt is ready!


E0820 18:38:03.655612 96508 dictionary_operations.cc:452] Error at line 1, file /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/ppmi_tf.txt. Number of values in all lines should be equal to 3, dictionary will be gathered without cooc info
2023-08-20 18:38:04,188 - GA_algo - INFO - Starting experiment: 1692531484
2023-08-20 18:38:04,189 - GA_algo - INFO - ALGORITHM PARAMS  number of individuals 4; number of fitness evals unlimited; number of early stopping iterations 500; crossover prob None
2023-08-20 18:38:04,191 - root - INFO - Calculating fitness...
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.13
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 732.72it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 786.60it/s]
0it [

Training is complete
Wow! all topics


10it [00:00, 968.97it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4795999886457885 0.3891480665191247


10it [00:00, 14060.69it/s]
0it [00:00, ?it/s]
10it [00:00, 4598.51it/s]
0it [00:00, ?it/s]
10it [00:00, 3711.77it/s]
0it [00:00, ?it/s]
10it [00:00, 1541.17it/s]
0it [00:00, ?it/s]
10it [00:00, 1273.63it/s]
0it [00:00, ?it/s]
10it [00:00, 1127.44it/s]
0it [00:00, ?it/s]
10it [00:00, 713.95it/s]
0it [00:00, ?it/s]
10it [00:00, 597.66it/s]
0it [00:00, ?it/s]
10it [00:00, 601.64it/s]
0it [00:00, ?it/s]
10it [00:00, 551.50it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.61
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.14
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
7it [00:00, 58023.97it/s]
3it [00:00, 41665.27it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
INFO:root:Building dictionary


Early stopping is triggered


7it [00:00, 62869.65it/s]
3it [00:00, 37449.14it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.0004019023762962918 0.0


7it [00:00, 42799.02it/s]
3it [00:00, 33644.15it/s]
7it [00:00, 49932.19it/s]
3it [00:00, 27900.03it/s]
7it [00:00, 40219.35it/s]
3it [00:00, 34379.54it/s]
7it [00:00, 44350.65it/s]
3it [00:00, 43842.90it/s]
7it [00:00, 42612.67it/s]
3it [00:00, 16666.11it/s]
7it [00:00, 39568.91it/s]
3it [00:00, 36684.87it/s]
7it [00:00, 39783.37it/s]
3it [00:00, 24059.11it/s]
7it [00:00, 31167.86it/s]
3it [00:00, 29059.84it/s]
7it [00:00, 10244.29it/s]
3it [00:00, 31536.12it/s]
7it [00:00, 34299.21it/s]
3it [00:00, 32513.98it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  0.37
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.15
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
8it [00:00, 7738.57it/s]
2it [00:00, 26715.31it

Early stopping is triggered


8it [00:00, 5260.14it/s]
2it [00:00, 19195.90it/s]
3it [00:00, 589.92it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.013668066683284527 0.0


8it [00:00, 15148.73it/s]
2it [00:00, 8120.63it/s]
8it [00:00, 7400.62it/s]
2it [00:00, 14513.16it/s]
8it [00:00, 6596.11it/s]
2it [00:00, 9218.25it/s]
8it [00:00, 5476.49it/s]
2it [00:00, 10965.50it/s]
8it [00:00, 4584.57it/s]
2it [00:00, 3746.59it/s]
8it [00:00, 2923.37it/s]
2it [00:00, 17476.27it/s]
8it [00:00, 5981.18it/s]
2it [00:00, 1510.37it/s]
8it [00:00, 6040.40it/s]
2it [00:00, 23045.63it/s]
8it [00:00, 6321.48it/s]
2it [00:00, 10010.27it/s]
8it [00:00, 4386.20it/s]
2it [00:00, 22857.24it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  0.69
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.15
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
7it [00:00, 79137.81it/s]
3it [00:00, 46091.25it/s]
0it [00:

Early stopping is triggered


7it [00:00, 43821.09it/s]
3it [00:00, 35951.18it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.0 0.0


7it [00:00, 39837.35it/s]
3it [00:00, 22429.43it/s]
7it [00:00, 27908.87it/s]
3it [00:00, 24867.42it/s]
7it [00:00, 4027.45it/s]
3it [00:00, 30030.82it/s]
7it [00:00, 43240.25it/s]
3it [00:00, 19691.57it/s]
7it [00:00, 40274.52it/s]
3it [00:00, 32939.56it/s]
7it [00:00, 47508.30it/s]
3it [00:00, 26105.63it/s]
7it [00:00, 50360.43it/s]
3it [00:00, 27715.67it/s]
7it [00:00, 40665.00it/s]
3it [00:00, 27473.61it/s]
7it [00:00, 26355.59it/s]
3it [00:00, 13386.08it/s]
7it [00:00, 24507.62it/s]
3it [00:00, 26105.63it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  0.74
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:07,823 - root - INFO - The fitness results have been obtained
2023-08-20 18:38:07,824 - GA_algo - INFO - POPULATION IS CREATED
2023-08-20 18:38:07,825 - GA_algo - INFO - ENTERING GENERATION 0
2023-08-20 18:38:07,826 - GA_algo - INFO - PAIRS ARE CREATED
2023-08-20 18:38:07,828 - GA_algo - INFO - CURRENT COUNTER: 9
2023-08-20 18:38:07,828 - root -

Early stopping is triggered


4it [00:00, 29641.72it/s]
6it [00:00, 48770.98it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.0 0


4it [00:00, 36002.61it/s]
6it [00:00, 46863.73it/s]
4it [00:00, 28630.06it/s]
6it [00:00, 43389.35it/s]
4it [00:00, 27869.13it/s]
6it [00:00, 62446.21it/s]
4it [00:00, 28102.54it/s]
6it [00:00, 26743.70it/s]
4it [00:00, 17962.76it/s]
6it [00:00, 57456.22it/s]
4it [00:00, 25890.77it/s]
6it [00:00, 33734.35it/s]
4it [00:00, 33756.97it/s]
6it [00:00, 48489.06it/s]
4it [00:00, 20485.00it/s]
6it [00:00, 44073.25it/s]
4it [00:00, 17067.36it/s]
6it [00:00, 55924.05it/s]
4it [00:00, 24070.61it/s]
6it [00:00, 55431.33it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  0.73
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.17
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
4it [00:00, 53773.13it/s]
6it [00:00, 66930.38i

Early stopping is triggered


4it [00:00, 38304.15it/s]
6it [00:00, 92862.82it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.0 0


4it [00:00, 30504.03it/s]
6it [00:00, 53773.13it/s]
4it [00:00, 28055.55it/s]
6it [00:00, 52980.68it/s]
4it [00:00, 26462.49it/s]
6it [00:00, 33376.42it/s]
4it [00:00, 43018.50it/s]
6it [00:00, 58936.36it/s]
4it [00:00, 31068.92it/s]
6it [00:00, 37008.56it/s]
4it [00:00, 35246.25it/s]
6it [00:00, 67288.30it/s]
4it [00:00, 35469.80it/s]
6it [00:00, 22919.69it/s]
4it [00:00, 12035.31it/s]
6it [00:00, 69711.42it/s]
4it [00:00, 28532.68it/s]
6it [00:00, 10123.02it/s]
4it [00:00, 36472.21it/s]
6it [00:00, 38304.15it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  0.76
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.15
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 637.63it/s]
0it [00:00, ?it/s]
0it

Training is complete
Wow! all topics


10it [00:00, 615.52it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4754919673078965 0.3891480665191247


10it [00:00, 10721.64it/s]
0it [00:00, ?it/s]
10it [00:00, 4833.26it/s]
0it [00:00, ?it/s]
10it [00:00, 4145.39it/s]
0it [00:00, ?it/s]
10it [00:00, 2021.55it/s]
0it [00:00, ?it/s]
10it [00:00, 1419.58it/s]
0it [00:00, ?it/s]
10it [00:00, 1327.19it/s]
0it [00:00, ?it/s]
10it [00:00, 870.93it/s]
0it [00:00, ?it/s]
10it [00:00, 547.27it/s]
0it [00:00, ?it/s]
10it [00:00, 422.75it/s]
0it [00:00, ?it/s]
10it [00:00, 364.12it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.61
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.15
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 890.85it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 713.17it/s]
0it [00:00, ?

Training is complete
Wow! all topics


10it [00:00, 553.78it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.47268861448595373 0.36618383723588754


10it [00:00, 10070.36it/s]
0it [00:00, ?it/s]
10it [00:00, 9027.77it/s]
0it [00:00, ?it/s]
10it [00:00, 2476.12it/s]
0it [00:00, ?it/s]
10it [00:00, 2426.84it/s]
0it [00:00, ?it/s]
10it [00:00, 1730.47it/s]
0it [00:00, ?it/s]
10it [00:00, 1181.03it/s]
0it [00:00, ?it/s]
10it [00:00, 747.10it/s]
0it [00:00, ?it/s]
10it [00:00, 884.50it/s]
0it [00:00, ?it/s]
10it [00:00, 552.65it/s]
0it [00:00, ?it/s]
10it [00:00, 492.45it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.87
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.15
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 800.06it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 591.46it/s]
0it [00:00, ?

Training is complete
Wow! all topics


10it [00:00, 623.00it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.47268861448595373 0.36618383723588754


10it [00:00, 9082.51it/s]
0it [00:00, ?it/s]
10it [00:00, 6241.52it/s]
0it [00:00, ?it/s]
10it [00:00, 3884.69it/s]
0it [00:00, ?it/s]
10it [00:00, 2947.92it/s]
0it [00:00, ?it/s]
10it [00:00, 1762.31it/s]
0it [00:00, ?it/s]
10it [00:00, 1131.85it/s]
0it [00:00, ?it/s]
10it [00:00, 966.30it/s]
0it [00:00, ?it/s]
10it [00:00, 897.91it/s]
0it [00:00, ?it/s]
10it [00:00, 495.87it/s]
0it [00:00, ?it/s]
10it [00:00, 495.80it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.89
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:15,067 - root - INFO - The fitness results have been obtained
2023-08-20 18:38:15,068 - GA_algo - INFO - ize of the new generation is 5
2023-08-20 18:38:15,069 - GA_algo - INFO - TIME OF THE FITNESS FUNCTION IN CROSSOVER: 7.240384817123413
2023-08-20 18:38:15,069 - GA_algo - INFO - CROSSOVER IS OVER
2023-08-20 18:38:15,329 - root - INFO - Calculating fitness...
INFO:root:Loading dataset entities
INFO:root:Reading dic

Training is complete
Wow! all topics


10it [00:00, 861.16it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4795999886457885 0.3891480665191247


10it [00:00, 21161.98it/s]
0it [00:00, ?it/s]
10it [00:00, 3921.74it/s]
0it [00:00, ?it/s]
10it [00:00, 5018.91it/s]
0it [00:00, ?it/s]
10it [00:00, 2873.99it/s]
0it [00:00, ?it/s]
10it [00:00, 1603.39it/s]
0it [00:00, ?it/s]
10it [00:00, 1145.61it/s]
0it [00:00, ?it/s]
10it [00:00, 1076.51it/s]
0it [00:00, ?it/s]
10it [00:00, 866.43it/s]
0it [00:00, ?it/s]
10it [00:00, 635.54it/s]
0it [00:00, ?it/s]
10it [00:00, 596.66it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.38
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.12
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
INFO:root:Fitness estimation took  0.13
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:roo

Training is complete
Wow! all topics


10it [00:00, 904.02it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.47268861448595373 0.36618383723588754


10it [00:00, 12264.05it/s]
0it [00:00, ?it/s]
10it [00:00, 7044.51it/s]
0it [00:00, ?it/s]
10it [00:00, 3952.04it/s]
0it [00:00, ?it/s]
10it [00:00, 2315.89it/s]
0it [00:00, ?it/s]
10it [00:00, 1476.19it/s]
0it [00:00, ?it/s]
10it [00:00, 1627.97it/s]
0it [00:00, ?it/s]
10it [00:00, 1152.09it/s]
0it [00:00, ?it/s]
10it [00:00, 706.28it/s]
0it [00:00, ?it/s]
10it [00:00, 741.66it/s]
0it [00:00, ?it/s]
10it [00:00, 721.58it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.53
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.12
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
8it [00:00, 14639.80it/s]
2it [00:00, 31895.85it/s]
3it [00:00, 809.71it/s]
0it [00:00, ?it/s]
INFO:root:Building dicti

Early stopping is triggered


8it [00:00, 8220.10it/s]
2it [00:00, 28435.96it/s]
3it [00:00, 656.45it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.013668066683284527 0.0


8it [00:00, 21902.37it/s]
2it [00:00, 4324.02it/s]
8it [00:00, 13046.05it/s]
2it [00:00, 22133.53it/s]
8it [00:00, 15427.33it/s]
2it [00:00, 13464.86it/s]
8it [00:00, 7913.78it/s]
2it [00:00, 30727.50it/s]
8it [00:00, 7468.16it/s]
2it [00:00, 20020.54it/s]
8it [00:00, 6729.73it/s]
2it [00:00, 33420.75it/s]
8it [00:00, 6089.73it/s]
2it [00:00, 24385.49it/s]
8it [00:00, 6528.10it/s]
2it [00:00, 20360.70it/s]
8it [00:00, 7740.35it/s]
2it [00:00, 26379.27it/s]
8it [00:00, 6821.39it/s]
2it [00:00, 20213.51it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  0.59
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:19,195 - root - INFO - The fitness results have been obtained
2023-08-20 18:38:19,196 - GA_algo - INFO - TIME OF THE FITNESS FUNCTION IN MUTATION: 3.866995334625244
2023-08-20 18:38:19,197 - GA_algo - INFO - MUTATION IS OVER
2023-08-20 18:38:19,199 - GA_algo - INFO - Population len 4. Best params so far: [0.0, 25.0, 46.35895265629014, 28.06078132351432

Training is complete
Wow! all topics


10it [00:00, 781.59it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4795999886457885 0.3891480665191247


10it [00:00, 9412.71it/s]
0it [00:00, ?it/s]
10it [00:00, 6626.07it/s]
0it [00:00, ?it/s]
10it [00:00, 3980.17it/s]
0it [00:00, ?it/s]
10it [00:00, 2403.47it/s]
0it [00:00, ?it/s]
10it [00:00, 2415.24it/s]
0it [00:00, ?it/s]
10it [00:00, 1539.02it/s]
0it [00:00, ?it/s]
10it [00:00, 1390.73it/s]
0it [00:00, ?it/s]
10it [00:00, 841.74it/s]
0it [00:00, ?it/s]
10it [00:00, 510.59it/s]
0it [00:00, ?it/s]
10it [00:00, 657.16it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.37
INFO:root:Logging params and artifacts to mlflow
INFO:root:Created experiment_None
INFO:root:Experiment exists, omitting creation
INFO:root:Logged params and artifacts to mlflow
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:20,755 - GA_algo - INFO - ENTERING GENERATION 1
2023-08-20 18:38:20,756 - GA_algo - INFO - PAIRS ARE CREATED
2023-08-20 18:38:20,757 - GA_algo - INFO - CURRENT COUNTER: 14
2023-08-20 18:38:20,757 - root - INFO - Calculating fitness...
INFO:r

Experiment run name: fitness-__noname__-31becdf3-6eb5-47e5-977b-c5c495472e95_tmp_0


INFO:root:Dataset entities initialization took  0.13
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
INFO:root:Fitness estimation took  0.14
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.12
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 975.81it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 728.25it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 684.93it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 762.77it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
INFO:root:Building dictionary


Training is complete
Wow! all topics


10it [00:00, 718.46it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.47398968892953997 0.3904442603148159


10it [00:00, 19803.14it/s]
0it [00:00, ?it/s]
10it [00:00, 7998.29it/s]
0it [00:00, ?it/s]
10it [00:00, 3852.58it/s]
0it [00:00, ?it/s]
10it [00:00, 2660.01it/s]
0it [00:00, ?it/s]
10it [00:00, 1645.86it/s]
0it [00:00, ?it/s]
10it [00:00, 1400.48it/s]
0it [00:00, ?it/s]
10it [00:00, 913.99it/s]
0it [00:00, ?it/s]
10it [00:00, 710.06it/s]
0it [00:00, ?it/s]
10it [00:00, 820.37it/s]
0it [00:00, ?it/s]
10it [00:00, 545.97it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.26
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:22,252 - root - INFO - The fitness results have been obtained
2023-08-20 18:38:22,253 - GA_algo - INFO - ize of the new generation is 2
2023-08-20 18:38:22,254 - GA_algo - INFO - TIME OF THE FITNESS FUNCTION IN CROSSOVER: 1.4963486194610596
2023-08-20 18:38:22,254 - GA_algo - INFO - CROSSOVER IS OVER
2023-08-20 18:38:22,461 - root - INFO - Calculating fitness...
INFO:root:Loading dataset entities
INFO:root:Reading d

Training is complete
Wow! all topics


10it [00:00, 770.47it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4795999886457885 0.3891480665191247


10it [00:00, 16250.69it/s]
0it [00:00, ?it/s]
10it [00:00, 6180.82it/s]
0it [00:00, ?it/s]
10it [00:00, 4818.27it/s]
0it [00:00, ?it/s]
10it [00:00, 2869.27it/s]
0it [00:00, ?it/s]
10it [00:00, 1917.75it/s]
0it [00:00, ?it/s]
10it [00:00, 1478.01it/s]
0it [00:00, ?it/s]
10it [00:00, 1080.23it/s]
0it [00:00, ?it/s]
10it [00:00, 1039.15it/s]
0it [00:00, ?it/s]
10it [00:00, 751.10it/s]
0it [00:00, ?it/s]
10it [00:00, 579.23it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.40
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.13
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 992.92it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 810.62it/s]
0it [00:00,

Training is complete
Wow! all topics


10it [00:00, 673.56it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.47398968892953997 0.3904442603148159


10it [00:00, 17932.04it/s]
0it [00:00, ?it/s]
10it [00:00, 9431.76it/s]
0it [00:00, ?it/s]
10it [00:00, 3642.47it/s]
0it [00:00, ?it/s]
10it [00:00, 3063.32it/s]
0it [00:00, ?it/s]
10it [00:00, 1479.26it/s]
0it [00:00, ?it/s]
10it [00:00, 1403.29it/s]
0it [00:00, ?it/s]
10it [00:00, 1173.89it/s]
0it [00:00, ?it/s]
10it [00:00, 737.03it/s]
0it [00:00, ?it/s]
10it [00:00, 730.47it/s]
0it [00:00, ?it/s]
10it [00:00, 688.04it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.25
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.12
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 545.59it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 970.84it/s]
0it [00:00, 

Training is complete
Wow! all topics


10it [00:00, 810.76it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.47268861448595373 0.36618383723588754


10it [00:00, 7601.13it/s]
0it [00:00, ?it/s]
10it [00:00, 10407.70it/s]
0it [00:00, ?it/s]
10it [00:00, 3912.96it/s]
0it [00:00, ?it/s]
10it [00:00, 2414.40it/s]
0it [00:00, ?it/s]
10it [00:00, 2389.78it/s]
0it [00:00, ?it/s]
10it [00:00, 1694.19it/s]
0it [00:00, ?it/s]
10it [00:00, 1667.91it/s]
0it [00:00, ?it/s]
10it [00:00, 880.18it/s]
0it [00:00, ?it/s]
10it [00:00, 791.71it/s]
0it [00:00, ?it/s]
10it [00:00, 830.56it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.57
INFO:root:Deleting bigartm logs: []
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt
INFO:root:Dataset entities initialization took  0.13
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
INFO:root:Fitness estimation took  0.14
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:27,044 - root - INFO - The

Training is complete
Wow! all topics


10it [00:00, 775.82it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4795999886457885 0.3891480665191247


10it [00:00, 13622.29it/s]
0it [00:00, ?it/s]
10it [00:00, 7401.28it/s]
0it [00:00, ?it/s]
10it [00:00, 4341.48it/s]
0it [00:00, ?it/s]
10it [00:00, 2460.15it/s]
0it [00:00, ?it/s]
10it [00:00, 1578.35it/s]
0it [00:00, ?it/s]
10it [00:00, 1499.47it/s]
0it [00:00, ?it/s]
10it [00:00, 1110.93it/s]
0it [00:00, ?it/s]
10it [00:00, 1043.33it/s]
0it [00:00, ?it/s]
10it [00:00, 625.74it/s]
0it [00:00, ?it/s]
10it [00:00, 585.25it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.36
INFO:root:Logging params and artifacts to mlflow
INFO:root:Created experiment_None
INFO:root:Experiment exists, omitting creation
INFO:root:Logged params and artifacts to mlflow
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:28,578 - GA_algo - INFO - Y: [0.8687480551649132, 0.8687480551649132]
2023-08-20 18:38:28,578 - root - INFO - Sending a best individual to be logged
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qde

Experiment run name: fitness-__noname__-bcfda32e-842f-4a46-bc37-b90f15f842da_tmp_1


INFO:root:Dataset entities initialization took  0.12
INFO:root:Using TM model: <class 'autotm.fitness.tm.TopicModel'> according to fitness name: default, topics count: 10
10it [00:00, 917.87it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 704.77it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 597.91it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 862.74it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
INFO:root:Building dictionary


Training is complete
Wow! all topics


10it [00:00, 573.05it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


COMPONENTS:  0.4795999886457885 0.3891480665191247


10it [00:00, 10760.14it/s]
0it [00:00, ?it/s]
10it [00:00, 5831.09it/s]
0it [00:00, ?it/s]
10it [00:00, 2924.90it/s]
0it [00:00, ?it/s]
10it [00:00, 2001.00it/s]
0it [00:00, ?it/s]
10it [00:00, 1754.57it/s]
0it [00:00, ?it/s]
10it [00:00, 1204.95it/s]
0it [00:00, ?it/s]
10it [00:00, 1017.79it/s]
0it [00:00, ?it/s]
10it [00:00, 686.01it/s]
0it [00:00, ?it/s]
10it [00:00, 593.11it/s]
0it [00:00, ?it/s]
10it [00:00, 610.76it/s]
0it [00:00, ?it/s]
  "switchP": np.nanmean(switchp_scores),
INFO:root:Fitness estimation took  1.44
INFO:root:Logging params and artifacts to mlflow
INFO:root:Created experiment_None
INFO:root:Experiment exists, omitting creation
INFO:root:Logged params and artifacts to mlflow
INFO:root:Deleting bigartm logs: []
2023-08-20 18:38:30,206 - GA_algo - INFO - Logged the best solution. Obtained fitness is 0.8687480551649132
INFO:root:Loading dataset entities
INFO:root:Reading dictionary from /tmp/autotm_wd_n57qdety/a1c8fd43-7ba1-482a-b2a4-e4876b057313/dictionary.txt


Experiment run name: fitness-__noname__-244ea494-dec5-4439-8ebf-43bc8130538f


INFO:root:Dataset entities initialization took  0.13
10it [00:00, 818.37it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 652.18it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 875.05it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
10it [00:00, 829.08it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Training is complete
Saving model to /home/fhrzn/Projects/AutoTM/examples/artm.model


Use trained model for prediction

In [11]:
# Reload the model from the file written by `autotm.save(...)` in the fitting
# cell; the bare expression on the last line displays the loaded object.
trained_model = AutoTM.load(model_name)
trained_model

In [12]:
# Run the reloaded model on the same preprocessed data that was used for fitting.
predicted = trained_model.predict(data_transformer.data)

Starting...


E0820 18:38:31.617317 97183 perplexity.cc:109] Perplexity was configured to use UnigramCollectionModel with dictionary 996b5d4a-2ae8-4af7-bb9f-7dacbdef5dc8. This dictionary can't be found.
E0820 18:38:31.617403 97183 perplexity.cc:109] Perplexity was configured to use UnigramCollectionModel with dictionary 996b5d4a-2ae8-4af7-bb9f-7dacbdef5dc8. This dictionary can't be found.
E0820 18:38:31.617444 97183 perplexity.cc:109] Perplexity was configured to use UnigramCollectionModel with dictionary 996b5d4a-2ae8-4af7-bb9f-7dacbdef5dc8. This dictionary can't be found.
E0820 18:38:31.617501 97183 perplexity.cc:109] Perplexity was configured to use UnigramCollectionModel with dictionary 996b5d4a-2ae8-4af7-bb9f-7dacbdef5dc8. This dictionary can't be found.
E0820 18:38:31.617537 97183 perplexity.cc:109] Perplexity was configured to use UnigramCollectionModel with dictionary 996b5d4a-2ae8-4af7-bb9f-7dacbdef5dc8. This dictionary can't be found.
E0820 18:38:31.617576 97183 perplexity.cc:109] Perplexi

In [13]:
# Preview the first rows of the prediction result.
# NOTE(review): presumably a pandas DataFrame with one `mainN` column per
# topic, judging by the displayed output — confirm.
predicted.head()

Unnamed: 0,main0,main1,main2,main3,main4,main5,main6,main7,main8,main9
0,0.386334,0.523874,0.00212,0.029499,0.000101,0.05668854,0.0002180499,0.001144,1.764675e-05,4e-06
1,8.8e-05,0.007041,8.3e-05,0.000178,0.004842,1.943518e-07,1.849505e-08,0.000281,5.671378e-07,0.987487
2,0.000113,0.000938,0.005827,2e-06,0.000947,0.002259208,4.46976e-05,0.989692,8.191225e-08,0.000178
3,0.125599,0.453699,0.000258,0.026937,0.00011,0.1684149,2.997085e-05,0.08381,0.137423,0.003719
4,0.721979,0.006053,0.036695,0.00326,0.000104,9.454819e-05,1.371012e-05,0.231175,7.566109e-07,0.000625


In [16]:
# Print each topic name followed by the list of words associated with it.
trained_model.print_topics()

main0
['год', 'миллион', 'свой', 'доллар', 'работа', 'компания', 'миллиард', 'программа', 'который', 'около', 'объект', 'использовать', 'составлять', 'это', 'получать', 'проект', 'общий', 'данные', 'рост', 'частность', 'процент', 'время', 'результат', 'сша', 'работать', 'служба', 'число', 'новый', 'некоторый', 'специалист', 'настоящий', 'половина', 'сумма', 'приводить', 'исследование', 'система', 'рубль', 'отмечать', 'помогать', 'оказываться', 'вырастать', 'мочь', 'газета', 'связь', 'федеральный', 'штат', 'ситуация', 'случай', 'проблема', 'нарушение']

main1
['год', 'рубль', 'матч', 'первый', 'команда', 'второй', 'доллар', 'дом', 'россия', 'комплекс', 'сообщать', 'это', 'составлять', 'москва', 'стоимость', 'который', 'получать', 'квартира', 'рынок', 'московский', 'время', 'счет', 'выступать', 'январь', 'занимать', 'мочь', 'лига', 'состояться', 'тысяча', 'месяц', 'май', 'клуб', 'вернуться', 'белый', 'данные', 'процент', 'миллион', 'октябрь', 'группа', 'начало', 'место', 'апрель', 'весь'