In [1]:
import os
import warnings

import artm
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore")

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# (name, source folder, raw data format) of every collection in the benchmark.
_COLLECTION_SPECS = (
    ('nips', 'Nips/', 'bow_uci'),
    ('wiki-en', 'Wiki-En/', 'vowpal_wabbit'),
    ('pubmed', 'Pubmed/', 'bow_uci'),
    ('nytimes', 'Nytimes/', 'bow_uci'),
    ('lyrics', 'Lyrics/', 'vowpal_wabbit'),
    ('wiki-enru', 'Wiki-En-Ru/', 'vowpal_wabbit'),
)


def _collection_entry(name, path, data_format):
    """Build one collection record; batch folder and dictionary file sit under `path`."""
    entry = {}
    # Keys are inserted in a fixed order because the records are later dumped
    # row-by-row with DataFrame.from_dict(..., orient='index').
    entry['name'] = name
    entry['path'] = path
    entry['format'] = data_format
    entry['batch_folder'] = path + 'batches'
    entry['dictionary'] = path + 'dictionary.txt'
    return entry


all_data_list = list(_collection_entry(*spec) for spec in _COLLECTION_SPECS)

In [41]:
def data_prepare(data_dict):
    """Run `artm.BatchVectorizer` over one raw collection and save its dictionary as text.

    Args:
        data_dict: collection record with the keys 'name', 'path', 'format',
            'batch_folder' and 'dictionary'.

    Returns:
        None. The dictionary gathered from `data_dict['batch_folder']` is
        written to `data_dict['dictionary']`.
    """
    data_format = data_dict['format']
    if data_format == 'vowpal_wabbit':
        # Vowpal Wabbit input is the single file vw.<name>.txt inside 'path'.
        # Joining (instead of concatenating) keeps the path correct even when
        # 'path' is given without a trailing separator.
        data_path = os.path.join(data_dict['path'], 'vw.{}.txt'.format(data_dict['name']))
    else:
        # Every other format hands the folder itself to the vectorizer.
        data_path = data_dict['path']
    target_folder = data_dict['batch_folder']

    # The vectorizer object is not used afterwards; it is constructed for its
    # effect on target_folder (presumably the batches are written there, since
    # the gather call below reads that same folder -- TODO confirm).
    batch_vectorizer = artm.BatchVectorizer(
        data_path=data_path,
        data_format=data_format,
        collection_name=data_dict['name'],
        target_folder=target_folder,
    )

    dictionary = artm.Dictionary()
    dictionary.gather(data_path=target_folder)
    dictionary.save_text(dictionary_path=data_dict['dictionary'])


def data_offline_fit(data_dict, num_topics=20, num_processors=2, num_collection_passes=10):
    """Fit an ARTM model with `fit_offline` on prepared batches and export Phi.

    Args:
        data_dict: collection record with the keys 'name', 'path',
            'batch_folder' and 'dictionary'.
        num_topics: number of topics passed to `artm.ARTM`.
        num_processors: number of processors passed to `artm.ARTM`.
        num_collection_passes: number of passes given to `fit_offline`.

    Returns:
        A `(perplex_score, tokens_score)` tuple: the `.value` of the
        'PerplexityScore' tracker and the `.last_tokens` of the
        'top_tokens_score' tracker.

    Side effects:
        Prints the collection name with the last perplexity value and writes
        the Phi matrix to `<path>/phi.tsv` (tab-separated).
    """
    batch_vectorizer = artm.BatchVectorizer(
        data_path=data_dict['batch_folder'],
        data_format='batches',
    )
    dictionary = artm.Dictionary()
    dictionary.load_text(dictionary_path=data_dict['dictionary'])

    offline_model = artm.ARTM(
        num_topics=num_topics,
        dictionary=dictionary,
        num_processors=num_processors,
    )
    offline_model.scores.add(artm.PerplexityScore(
        name='PerplexityScore',
        dictionary=dictionary,
    ))
    offline_model.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    offline_model.fit_offline(
        batch_vectorizer=batch_vectorizer,
        num_collection_passes=num_collection_passes,
    )

    # Look the perplexity tracker up once; it provides both the full history
    # that is returned and the last value that is printed.
    perplexity_tracker = offline_model.score_tracker['PerplexityScore']
    perplex_score = perplexity_tracker.value
    tokens_score = offline_model.score_tracker['top_tokens_score'].last_tokens
    print("Collection - {}, Perplexity - {}".format(
        data_dict['name'],
        perplexity_tracker.last_value,
    ))

    # Joining keeps the output path correct even when 'path' has no trailing
    # separator.
    phi = offline_model.get_phi()
    phi.to_csv(os.path.join(data_dict['path'], 'phi.tsv'), sep='\t')

    return perplex_score, tokens_score

In [7]:
# Prepare every collection in turn and report each one as it finishes.
for data_dict in all_data_list:
    data_prepare(data_dict)
    done_message = '{} done'.format(data_dict['name'])
    print(done_message)

In [44]:
# Fit the offline model per collection, keep the scores on the collection
# record itself, and dump the whole record next to the collection data.
for data_dict in all_data_list:
    offline_scores = data_offline_fit(data_dict)
    data_dict['perplex_score'] = offline_scores[0]
    data_dict['tokens_score'] = offline_scores[1]

    offline_result_path = data_dict['path'] + 'offline_result.tsv'
    offline_result = pd.DataFrame.from_dict(data_dict, orient='index')
    offline_result.to_csv(offline_result_path, sep='\t', header=None)


Collection - nips, Perplexity - 1567.3600178254426

Collection - wiki-en, Perplexity - 6701.67167326759

Collection - pubmed, Perplexity - 3956.646160311604

Collection - nytimes, Perplexity - 5167.75077711719

Collection - lyrics, Perplexity - 535.1523577440564

Collection - wiki-enru, Perplexity - 6072.576865014042


# Смотрим на онлайн-обучение «из коробки»

In [3]:
def data_online_fit(data_dict, num_topics=20, num_processors=2):
    """Fit an ARTM model with `fit_online` and score it once more via `transform`.

    Args:
        data_dict: collection record with the keys 'batch_folder' and
            'dictionary'.
        num_topics: number of topics passed to `artm.ARTM`.
        num_processors: number of processors passed to `artm.ARTM`.

    Returns:
        A `(perplexity, fit_perplex_score, tokens_score)` tuple:
        `perplexity` is the value of the 'FinalPerplexity' score read after
        the `transform` call, `fit_perplex_score` is the `.value` of the
        'PerplexityScore' tracker after fitting, and `tokens_score` is the
        `.last_tokens` of the 'top_tokens_score' tracker.
    """
    batch_vectorizer = artm.BatchVectorizer(
        data_path=data_dict['batch_folder'],
        data_format='batches',
    )
    dictionary = artm.Dictionary()
    dictionary.load_text(dictionary_path=data_dict['dictionary'])

    online_model = artm.ARTM(
        num_topics=num_topics,
        dictionary=dictionary,
        num_processors=num_processors,
    )
    online_model.scores.add(artm.PerplexityScore(
        name='PerplexityScore',
        dictionary=dictionary,
    ))
    online_model.scores.add(artm.TopTokensScore(name='top_tokens_score'))

    # update_every is one twentieth of the batch count (floored); the "+ 1"
    # keeps it at least 1 for collections with fewer than 20 batches.
    online_model.fit_online(
        batch_vectorizer=batch_vectorizer,
        update_every=batch_vectorizer.num_batches // 20 + 1,
    )

    # Read the trackers before the extra score is registered below.
    fit_perplex_score = online_model.score_tracker['PerplexityScore'].value
    tokens_score = online_model.score_tracker['top_tokens_score'].last_tokens

    # A second perplexity score is registered after fitting and read back via
    # get_score once transform has run over the same batches;
    # theta_matrix_type=None is passed because the transform result is not used.
    online_model.scores.add(artm.PerplexityScore(
        name='FinalPerplexity',
        dictionary=dictionary,
    ))
    online_model.transform(
        batch_vectorizer=batch_vectorizer,
        theta_matrix_type=None,
    )
    perplexity = online_model.get_score('FinalPerplexity').value

    # Phi export is currently disabled:
    #phi = online_model.get_phi()
    #phi.to_csv(data_dict['path'] + 'phi_online1.tsv', sep='\t')

    return perplexity, fit_perplex_score, tokens_score

In [6]:
# Stringified copies of the collection records; the entries of all_data_list
# themselves are left untouched.
new_data_list = []
for data_dict in all_data_list:
    # The loop name is rebound to the copy, exactly one new dict per record.
    data_dict = dict((key, str(value)) for key, value in data_dict.items())
    new_data_list.append(data_dict)

In [4]:
# Fit the online model per collection, store the stringified scores on the
# collection record, and dump the whole record next to the collection data.
for data_dict in all_data_list:
    perplexity, fit_perplex_score, tokens_score = data_online_fit(data_dict)

    # Scores are stored as strings before the record is written out.
    data_dict['perplex_score_online_final'] = str(perplexity)
    data_dict['perplex_score_online0'] = str(fit_perplex_score)
    data_dict['tokens_score_online0'] = str(tokens_score)

    pd.DataFrame.from_dict(data_dict, orient='index').to_csv(
        data_dict['path'] + 'online_result_base.tsv',
        sep='\t',
        header=None,
    )
    # print() call form: same output as the former print statement, consistent
    # with the rest of the notebook, and valid on Python 3 as well.
    print(data_dict['name'] + ' done')

nips done
wiki-en done
pubmed done
nytimes done
lyrics done
wiki-enru done


In [24]:
# Re-read the saved online results and print the final perplexity of each
# collection.
for data_dict in all_data_list:
    result_path = data_dict['path'] + 'online_result_base.tsv'
    # The file has no header: column 0 holds the keys (used as the index) and
    # column 1 holds the values.
    result_frame = pd.read_csv(result_path, sep='\t', header=None, index_col=0)
    data_info = result_frame[1].to_dict()

    report_line = "Collection - {}, Perplexity - {}".format(
        data_info['name'],
        data_info['perplex_score_online_final'],
    )
    print(report_line)

Collection - nips, Perplexity - 2378.67793044
Collection - wiki-en, Perplexity - 8111.0757332
Collection - pubmed, Perplexity - 4854.35310672
Collection - nytimes, Perplexity - 6178.12057525
Collection - lyrics, Perplexity - 606.563782487
Collection - wiki-enru, Perplexity - 7426.74240867
