In [4]:
%matplotlib inline

import artm
import os
import glob
import pandas as pd
import numpy as np
import plotly.express as px
from dataclasses import dataclass, astuple
from typing import Dict, List, Tuple, Iterator, Optional
import functools
import itertools
import tqdm
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import json

# Number of top words kept per topic when building the topic word sets.
MAX_WORDS_PER_TOPIC = 20
# Upper bound on a document's `tokens_len` for it to be kept in the topic-to-text samples.
MAX_TEXT_LENGTH = 150
# Directory that is scanned for `experiment_*` sub-directories.
BASE_PATH = "models/experiments"

# Func: Reading data

In [5]:
class Datasets:
    """Registry of known datasets with lazy, cached loading from CSV."""

    # dataset name -> path of its processed CSV file
    datasets = {
        "lentaru": os.path.join('datasets', 'lenta_ru','lenta-ru-news_sample_10000_processed.csv'),
    }

    # dataset name -> DataFrame that has already been read from disk
    loaded_datasets = dict()

    @classmethod
    def dataset(cls, name: str) -> pd.DataFrame:
        """Return the DataFrame registered under ``name``, reading the CSV only once.

        The frame is stored in ``loaded_datasets`` on first access and the same
        object is returned on later calls, so callers must not mutate it in place.

        Raises:
            KeyError: if ``name`` is not a key of ``datasets``.
        """
        # An explicit membership test is required: ``dict.get(name, pd.read_csv(...))``
        # would evaluate the read eagerly on every call and never fill the cache.
        if name not in cls.loaded_datasets:
            cls.loaded_datasets[name] = pd.read_csv(cls.datasets[name])
        return cls.loaded_datasets[name]

@dataclass(frozen=True)
class TmModel:
    """Immutable bundle of one trained topic model and the data it was built on.

    Iterating over an instance yields the field values in declaration order,
    which allows ``exp_id, model_id, ... = tm`` style unpacking.
    """
    # experiment identifier (last field of the experiment directory name)
    exp_id: str
    # model identifier (suffix of the saved model path)
    model_id: str
    # key of the dataset in ``Datasets.datasets``
    dataset_name: str
    # the dataset itself
    data: pd.DataFrame
    # the loaded topic model object
    model: object
    # matrix loaded from the experiment's phi file
    topic_word_dist: np.ndarray
    # transposed matrix loaded from the experiment's theta file
    dtd: np.ndarray
    # topic name -> list of that topic's top tokens
    topics_dict: Dict[str, List[str]]

    def __iter__(self):
        """Yield the field values in declaration order without copying them."""
        # ``astuple`` is deliberately avoided: it deep-copies every field, which
        # duplicates the DataFrame, the model and both matrices on each unpacking.
        from dataclasses import fields
        return iter(tuple(getattr(self, f.name) for f in fields(self)))

    def __str__(self):
        """Short identifier of the model: ``TM_<exp_id>_<model_id>_<dataset_name>``."""
        return f"TM_{self.exp_id}_{self.model_id}_{self.dataset_name}"


# def read_data(exp_path: str) -> TmModel:
#     _, num = os.path.basename(p).split('_')[-1]

#     dataset_path = os.path.join('datasets', 'lenta_ru','lenta-ru-news_sample_10000_processed.csv')
#     model_path_pattern = os.path.join(exp_path, f'best_model/model_{num}_*')
    

#     phi_path = os.path.join(exp_path, 'matrixes/phi.npy')
#     theta_path =  os.path.join(exp_path, 'matrixes/theta.npy')

#     data = pd.read_csv(dataset_path)

#     model = artm.load_artm_model(model_path)

#     topic_word_dist = np.load(phi_path)
#     doc_topic_dist = np.load(theta_path)

#     topics_dict = model.score_tracker['TopTokensScore'].last_tokens
    
#     return TmModel(data, model, topic_word_dist, doc_topic_dist.T, topics_dict)

def read_exp_data(exp_path: str, model_ids: Optional[List[str]] = None) -> Iterator[TmModel]:
    """Lazily load the models saved in one experiment directory.

    The directory name must have the form
    ``experiment_<dataset>_S_<topic_count>_<exp_id>``. Models are looked up under
    ``best_model/model_<exp_id>_<model_id>``; a model is skipped when its metrics,
    phi or theta file is missing, or when its saved metrics have a falsy
    ``'all_topics'`` entry.

    Args:
        exp_path: path of the experiment directory (a trailing separator is tolerated).
        model_ids: if given, only models whose id is in this list are loaded.

    Yields:
        One ``TmModel`` per model that passed the checks above.

    Raises:
        ValueError: if the directory name does not consist of exactly five
            ``_``-separated fields.
        KeyError: if the dataset name is not registered in ``Datasets``.
    """
    # normpath strips a trailing path separator; without it basename() would return ''
    exp_name = os.path.basename(os.path.normpath(exp_path))

    # Example: experiment_<dataset>_S_<topic_count>_<exp_id>
    _, dataset_name, _, _topic_count, exp_id = exp_name.split('_')

    data = Datasets.dataset(dataset_name)

    model_path_pattern = os.path.join(exp_path, f'best_model/model_{exp_id}_*')

    for model_path in glob.glob(model_path_pattern):
        # the model id is whatever follows the last underscore of the path
        model_id = model_path.split('_')[-1]

        if model_ids is not None and model_id not in model_ids:
            continue

        metrics_path = os.path.join(exp_path, f'metrics/saved_metrics_{model_id}.pkl')
        phi_path = os.path.join(exp_path, f'matrixes/phi_{model_id}.npy')
        theta_path = os.path.join(exp_path, f'matrixes/theta_{model_id}.npy')

        if not all(os.path.exists(path) for path in (metrics_path, phi_path, theta_path)):
            continue

        # NOTE: pickle.load can execute arbitrary code; only use it on trusted experiment output.
        with open(metrics_path, 'rb') as f:
            metrics = pickle.load(f)

        if not metrics['all_topics']:
            continue

        model = artm.load_artm_model(model_path)
        topic_word_dist = np.load(phi_path)
        doc_topic_dist = np.load(theta_path)
        topics_dict = model.score_tracker['TopTokensScore'].last_tokens

        # theta is transposed before being stored as ``dtd``
        yield TmModel(exp_id, model_id, dataset_name, data, model, topic_word_dist, doc_topic_dist.T, topics_dict)

# Func: auxiliary functions

In [6]:
def make_align_topics(max_topics: int):
    """Create a function that pads a topic list to a fixed length.

    Args:
        max_topics: length of the arrays produced by the returned function.

    Returns:
        A function ``align_topics(topics)`` that returns a float array of length
        ``max_topics`` holding ``topics`` followed by ``NaN`` padding. It raises
        ``ValueError`` when ``topics`` is longer than ``max_topics``.
    """
    def align_topics(topics):
        # Start from an all-NaN array so that only the unused tail stays NaN;
        # the leading positions are then filled with the actual topics.
        el = np.full(max_topics, np.nan)
        el[:len(topics)] = topics
        return el
    return align_topics

def parse_topic_name(name):
    """Return the integer that follows the ``main`` / ``back`` prefix of a topic name."""
    # any name that does not start with 'main' is treated as a 'back' topic
    prefix = 'main' if name.startswith('main') else 'back'
    return int(name[len(prefix):])

def validate_text_samples(df: pd.DataFrame, tm: TmModel) -> None:
    """Assert that no row of ``df`` contains the same element twice.

    Every cell of ``df`` is expected to hold a list; the lists of one row are
    concatenated and checked for duplicates.

    Raises:
        AssertionError: with ``str(tm)`` as message if any row has a repeated element.
    """
    def _has_no_repeats(row) -> bool:
        flattened = list(itertools.chain.from_iterable(row))
        return len(flattened) == len(set(flattened))

    s = df.apply(_has_no_repeats, axis=1)

    assert all(s), str(tm)

# Func: Making task №1 and №2 of the markup

In [7]:
def make_sample_topics(tm: TmModel) -> pd.DataFrame:
    """Build one row per ``main*`` topic of the model with its top words.

    Returns:
        A DataFrame with the columns ``exp_id``, ``model_id``, ``dataset_name``,
        ``topic_id`` and ``wordset``; ``wordset`` is the space-joined list of the
        first ``MAX_WORDS_PER_TOPIC`` words of the topic.
    """
    records = []
    for topic_id, words in tm.topics_dict.items():
        # topics whose name does not start with 'main' are left out
        if not topic_id.startswith('main'):
            continue
        records.append({
            'exp_id': tm.exp_id,
            'model_id': tm.model_id,
            'dataset_name': tm.dataset_name,
            'topic_id': topic_id,
            'wordset': ' '.join(words[:MAX_WORDS_PER_TOPIC]),
        })
    return pd.DataFrame(records)

# Func: Making task №3 of the markup

In [8]:
def make_samples_topic2word(tm: TmModel) -> pd.DataFrame:
    """Build the word-to-topic markup samples for one model.

    For every document a word is drawn at random from the document's ten highest
    TF-IDF words, and a set of candidate topics is collected for it:

    * the two topics with the highest weight for the document,
    * up to two topics in which the word is most probable,
    * up to two of the most popular topics among the word's ten most probable ones,
    * one random topic from the remaining candidates.

    Returns:
        A DataFrame with one row per document and the columns ``doc``, ``text``,
        ``chosen_word``, ``top_topics_for_text``, ``most_prob_topics_for_word``,
        ``most_pop_topics_for_word``, ``random_topics``, ``all_topics``,
        ``topic_<i>`` (the top words of the i-th chosen topic), ``exp_id``,
        ``model_id`` and ``dataset_name``.

    Raises:
        ValueError: from ``np.random.choice`` when a document has no TF-IDF words
            or no topic is left for the random pick.
        KeyError: when the chosen word is missing from the model's phi index.
    """
    exp_id, model_id, dataset_name, data, model, _topic_word_dist, dtd, topics_dict = tm

    # prepare auxiliary structures
    main_topics_count = max(int(k[len('main'):]) for k in topics_dict.keys() if k.startswith('main')) + 1
    # main topics keep their own number, background topics are numbered after all main ones
    topic_num2words = {
        int(k[len('main'):]) if k.startswith('main') else main_topics_count + int(k[len('back'):]): v[:MAX_WORDS_PER_TOPIC]
        for k, v in topics_dict.items()
    }

    # keep only the columns of the main topics
    dtd = dtd[:, :main_topics_count]

    # assumed to be a DataFrame indexed by word with one column per topic -- TODO confirm
    phi_df = model.get_phi()

    # Two strongest topics of every document. The flip must be restricted to the
    # topic axis: flipping all axes would also reverse the document order and pair
    # each text with the topics of another document.
    top_topics = np.flip(np.argsort(dtd, axis=1), axis=1).astype('int')[:, :2]

    # prepare tfidf to be used for word choosing
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data['processed_text']).tocsr()
    feat_names = vectorizer.get_feature_names_out()

    def get_words_by_tfidf(doc_num):
        """Return up to 20 words of the document ordered by descending TF-IDF."""
        # CSR layout: the entries of row ``doc_num`` live in indptr[doc_num]:indptr[doc_num + 1]
        start, end = X.indptr[doc_num], X.indptr[doc_num + 1]
        columns = X.indices[start:end]
        weights = X.data[start:end]
        order = np.flip(np.argsort(weights))
        # ``order`` indexes into this document's own columns, not into the global nonzero list
        return [feat_names[columns[idx]] for idx in order[:20]]

    doc2words_by_tfidf = [get_words_by_tfidf(doc_num) for doc_num in range(data.shape[0])]

    # structures to select topics relative to a chosen word
    # per word: topic positions ordered from the most to the least probable
    word2probtopic = np.argsort(phi_df * -1, axis=1)

    # main topics ordered from the most to the least popular (mean weight over documents)
    topics_by_popularity = np.argsort(np.mean(dtd, axis=0))[::-1]
    word2most_probable_topics_df = word2probtopic.iloc[:, :10]

    def order_by_popularity(row):
        """Keep the word's ten most probable topics, most popular first."""
        candidates = set(row)
        return [el for el in topics_by_popularity if el in candidates][:10]

    word2poptopic = word2most_probable_topics_df.apply(order_by_popularity, axis=1)

    def choose_word_and_topics(doc_id: int, text: str, tpcs: List[int]) -> Dict[str, object]:
        """Pick a word of the document and the candidate topics shown with it."""
        already_chosen_tpcs = set(tpcs)

        # The word is drawn from the document's top TF-IDF words. Alternatives that
        # were tried before: a top word of the document's topics, or a random word
        # of the text.
        word = np.random.choice(doc2words_by_tfidf[doc_id][:10], size=1)[0]

        # the most probable topics this word belongs to, excluding already chosen ones
        most_prob_topics = [w for w in word2probtopic.loc[word] if w not in already_chosen_tpcs][:2]
        already_chosen_tpcs.update(most_prob_topics)

        # two most popular topics among the word's ten most probable ones,
        # excluding already chosen ones
        most_pop_topics = [w for w in word2poptopic.loc[word] if w not in already_chosen_tpcs][:2]
        already_chosen_tpcs.update(most_pop_topics)

        # whatever is left of the word's candidate topics
        rest_topics = [w for w in word2poptopic.loc[word] if w not in already_chosen_tpcs]

        random_topics = np.random.choice(rest_topics, size=1)[0]
        already_chosen_tpcs.add(random_topics)

        all_topics = [*tpcs, *most_prob_topics, *most_pop_topics, random_topics]

        assert len(all_topics) == len(set(all_topics)), str(tm)

        return {
            "doc": doc_id,
            "text": text,
            "chosen_word": word,
            "top_topics_for_text": tpcs,
            "most_prob_topics_for_word": most_prob_topics,
            "most_pop_topics_for_word": most_pop_topics,
            "random_topics": random_topics,
            "all_topics": all_topics
        }

    highlighted_word_df = pd.DataFrame([
        choose_word_and_topics(i, text, tpcs)
        for i, (text, tpcs) in enumerate(zip(data['processed_text'], top_topics))
    ])

    # one ``topic_<i>`` column per position of the longest ``all_topics`` list
    max_topics = highlighted_word_df['all_topics'].map(len).max()
    topic_cols = [f'topic_{i}' for i in range(max_topics)]
    highlighted_word_df[topic_cols] = highlighted_word_df['all_topics'].apply(make_align_topics(max_topics)).tolist()
    # replace topic numbers with the topic's top words
    for c in topic_cols:
        highlighted_word_df[c] = highlighted_word_df[c].apply(lambda tnum: topic_num2words[tnum])

    highlighted_word_df[['exp_id', 'model_id', 'dataset_name']] = exp_id, model_id, dataset_name

    return highlighted_word_df

# Func: Making task №4 of the markup 

In [9]:
def make_samples_topic2text(tm: TmModel) -> pd.DataFrame:
    """Build the text-to-topic markup samples for one model.

    For every document up to three of its strongest main topics (weight above
    0.01) are combined with randomly drawn topics from outside the document's
    top three, giving ``MAX_TOPIC_SAMPLES`` topics in total. Documents whose
    ``tokens_len`` exceeds ``MAX_TEXT_LENGTH`` are dropped.

    Returns:
        A DataFrame with the columns ``text``, ``tokens_len``, ``topics``,
        ``random_topics``, ``topic_0`` .. ``topic_4`` (the top words of each chosen
        topic), ``exp_id``, ``model_id`` and ``dataset_name``.

    Raises:
        ValueError: from ``rng.choice`` when fewer topics are available outside the
            top three than have to be drawn.
        AssertionError: when a document ends up with a repeated topic.
    """
    exp_id, model_id, dataset_name, data, _model, _topic_word_dist, dtd, topics_dict = tm

    # unseeded generator: the random topics differ between runs
    rng = np.random.default_rng()

    MAX_TOPIC_SAMPLES = 5
    topic_cols = [f'topic_{i}' for i in range(MAX_TOPIC_SAMPLES)]

    main_topics_count = max(int(k[len('main'):]) for k in topics_dict.keys() if k.startswith('main')) + 1
    # main topics keep their own number, background topics are numbered after all main ones
    topic_num2words = {int(k[len('main'):]) if k.startswith('main') else main_topics_count + int(k[len('back'):]): v[:20] for k,v in topics_dict.items()}

    # keep only the columns of the main topics
    dtd = dtd[:, :main_topics_count]
    # Topics of every document ordered by descending weight. The flip must be
    # restricted to the topic axis: flipping all axes would also reverse the
    # document order, so row ``i`` would no longer describe document ``i``.
    top_topics = np.flip(np.argsort(dtd, axis=1), axis=1).astype('int')

    def generate_options(i, top_topics_record):
        """Choose the strong topics and the random filler topics of document ``i``."""
        top_topics_row = [c_idx for c_idx in top_topics_record[:3] if dtd[i, c_idx] > 0.01]
        # fillers are drawn from outside the top three, even if some of those failed the threshold
        random_topics = rng.choice(top_topics_record[3:], size=MAX_TOPIC_SAMPLES - len(top_topics_row), shuffle=True, replace=False).tolist()

        return {
            "doc": i,
            "topics": top_topics_row,
            "random_topics": random_topics
        }

    doc2topics = pd.DataFrame([generate_options(i, topics) for i, topics in enumerate(top_topics)])
    doc2topics = doc2topics.set_index('doc')

    # joins on position: assumes ``data`` has the default 0..n-1 index -- TODO confirm
    markup_df = data.merge(doc2topics, left_index=True, right_index=True)
    markup_df = markup_df[['text', 'tokens_len', 'topics', 'random_topics']]
    # copy so that the column assignments below do not write into a slice of another frame
    markup_df = markup_df[markup_df['tokens_len'] <= MAX_TEXT_LENGTH].copy()
    markup_df[topic_cols] = (markup_df['topics'] + markup_df['random_topics']).apply(make_align_topics(MAX_TOPIC_SAMPLES)).tolist()
    markup_df[topic_cols] = markup_df[topic_cols].astype('int')

    # replace topic numbers with the topic's top words
    for c in topic_cols:
        markup_df[c] = markup_df[c].apply(lambda tnum: topic_num2words[tnum])

    markup_df[['exp_id', 'model_id', 'dataset_name']] = exp_id, model_id, dataset_name

    validate_text_samples(markup_df[['topics', 'random_topics']], tm)

    return markup_df

# Execute

In [10]:
def generate_samples_for_tasks(base_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Collect the samples of all three kinds from every experiment under ``base_path``.

    Every ``experiment_*`` directory is scanned and each model read from it
    contributes one frame per sample kind; the frames are concatenated per kind.

    Returns:
        ``(topics_df, topic2word_df, topic2text_df)``.

    Raises:
        ValueError: from ``pd.concat`` when no model was found at all.
    """
    collected = {"topics": [], "topic2word": [], "topic2text": []}

    experiment_dirs = glob.glob(os.path.join(base_path, "experiment_*"))
    for exp_path in tqdm.tqdm(experiment_dirs, desc="experiment"):
        for tm in tqdm.tqdm(read_exp_data(exp_path), desc="models"):
            collected["topics"].append(make_sample_topics(tm))
            collected["topic2word"].append(make_samples_topic2word(tm))
            collected["topic2text"].append(make_samples_topic2text(tm))

    topics_df = pd.concat(collected["topics"])
    topic2word_df = pd.concat(collected["topic2word"])
    topic2text_df = pd.concat(collected["topic2text"])

    return topics_df, topic2word_df, topic2text_df

# save samples for labeling in the JSON format required by Toloka
def write_topics_to_json(topics_df: pd.DataFrame):
    """Write ``topics_df`` to ``toloka_input_topics.json`` as a list of records.

    Every column name is prefixed with ``INPUT:`` in the written records.
    Side effect: a ``correct_bad_words`` column filled with ``None`` is added to
    the passed frame itself before exporting.
    """
    # added in place, so the caller's frame gains this column as well
    topics_df["correct_bad_words"] = None

    prefixed_df = topics_df.rename(columns=lambda c: f"INPUT:{c}")
    payload = json.dumps(prefixed_df.to_dict('records'))

    with open("toloka_input_topics.json", 'w') as f:
        f.write(payload)

In [11]:
# Build all three sample sets from every experiment found under BASE_PATH.
topics_df, topic2word_df, topic2text_df = generate_samples_for_tasks(BASE_PATH)

# Persist each sample set to its own parquet file in the working directory.
topics_df.to_parquet("topics_samples.parquet")
topic2word_df.to_parquet("topic2word_samples.parquet")
topic2text_df.to_parquet("topic2text_samples.parquet")

experiment:   0%|          | 0/4 [00:00<?, ?it/s]
models: 0it [00:00, ?it/s][A
models: 1it [00:17, 17.75s/it][A
experiment:  25%|██▌       | 1/4 [00:17<00:53, 17.75s/it]
models: 0it [00:00, ?it/s][A
models: 1it [00:19, 19.31s/it][A
models: 2it [00:34, 16.87s/it][A
models: 3it [00:49, 16.06s/it][A
models: 4it [01:05, 15.87s/it][A
models: 5it [01:23, 16.92s/it][A
models: 6it [01:40, 16.69s/it][A
experiment:  50%|█████     | 2/4 [01:57<02:12, 66.23s/it]
models: 0it [00:00, ?it/s][A
models: 1it [00:17, 17.07s/it][A
experiment:  75%|███████▌  | 3/4 [02:14<00:43, 43.78s/it]
models: 0it [00:00, ?it/s][A
models: 1it [00:15, 15.92s/it][A
experiment: 100%|██████████| 4/4 [02:30<00:00, 37.73s/it]


In [15]:
# Export the topic word sets as JSON records; this also adds a `correct_bad_words` column to topics_df.
write_topics_to_json(topics_df)

In [16]:
# Display the collected topic word sets.
topics_df

Unnamed: 0,exp_id,model_id,dataset_name,topic_id,wordset,correct_bad_words
0,1655995996,1655996024,lentaru,main0,который компания также свой мочь новый процент...,
1,1655995996,1655996024,lentaru,main1,матч команда сборная россия который чемпионат ...,
2,1655995996,1655996024,lentaru,main2,ахмедова узбекский прививка врач судебный фото...,
3,1655995996,1655996024,lentaru,main3,грузия грузинский военный осетия южный конфлик...,
4,1655995996,1655996024,lentaru,main4,нефть новый нефтяной расход газпром госпрограм...,
...,...,...,...,...,...,...
5,1656001268,1656001315,lentaru,main5,дело сотрудник сообщать область задерживать уг...,
6,1656001268,1656001315,lentaru,main6,президент украина глава заявлять партия минист...,
7,1656001268,1656001315,lentaru,main7,процент рубль доллар составлять тысяча миллиар...,
8,1656001268,1656001315,lentaru,main8,который проект миллион фильм новый тысяча такж...,


# Other

In [13]:
# diff = (np.max(tm.dtd, axis=1) - np.min(tm.dtd, axis=1))

# px.histogram(diff)

In [14]:
# px.box(diff)