## Тематическая модель на данных Last.fm

Сначала настраиваем и подключаем нужные библиотеки

In [81]:
import os
import sys
# Root folder that contains the BigARTM checkout. It can be overridden with the
# BIGARTM_PROJECTS_HOME environment variable so the notebook is not tied to a
# single machine; when the variable is unset the original location is used.
HOME = os.environ.get('BIGARTM_PROJECTS_HOME', '/home/vovapolu/Projects/')
BIGARTM_PATH = os.path.join(HOME, 'bigartm/')
BIGARTM_BUILD_PATH = os.path.join(BIGARTM_PATH, 'build/')

# Make the BigARTM Python sources importable. The membership check keeps
# sys.path free of duplicates when this cell is executed more than once.
_bigartm_python_path = os.path.join(BIGARTM_PATH, 'src/python')
if _bigartm_python_path not in sys.path:
    sys.path.append(_bigartm_python_path)

# Location of the compiled shared library, exported through the environment
# (presumably read by the artm package when it loads the library - confirm).
os.environ['ARTM_SHARED_LIBRARY'] = os.path.join(BIGARTM_BUILD_PATH, 'src/artm/libartm.so')

In [92]:
import artm.messages_pb2, artm.library, uuid, glob, time
import csv, shutil


plays_file = 'usersha1-artmbid-artname-plays.tsv'
batch_path = 'batches'  # Folder with the batches

# Start from an empty batch folder. The existence check lets the cell run on a
# fresh checkout (rmtree raises when the folder is missing), and using
# batch_path keeps the cleanup in sync with the folder the batches are saved to.
if os.path.isdir(batch_path):
    shutil.rmtree(batch_path)

artist_id_to_name = {}  # Maps artist_id to the artist name
artists_idxs = {}  # Maps artist_id to its token index inside the current batch
artists = []  # Artist names (tokens) of the current batch

Читаем файл и строим батчи

In [83]:
last_user_id = ''
handled_users = 0

users_to_handle = 50000  # Number of users whose data is processed
users_in_batch = 1000  # Number of users stored in one batch

batch = None


def _save_batch(batch, tokens):
    """Attach the token dictionary to ``batch`` and save it into ``batch_path``.

    batch  -- artm.messages_pb2.Batch whose items are already filled in
    tokens -- artist names as UTF-8 byte strings, ordered by token index
    """
    for token in tokens:
        batch.token.append(token.decode('utf8'))
    artm.library.Library().SaveBatch(batch, batch_path)


with open(plays_file, 'rb') as tsvin:
    rows = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE)

    field = None

    for row in rows:
        user_id, artist_id, artist_name, plays = row

        # A change of user_id starts a new item; this relies on all rows of a
        # user being adjacent in the file (assumption - verify for other data).
        if user_id != last_user_id:
            # Stop once exactly users_to_handle users were read. The batch
            # that is still open is written by the flush after the loop.
            if handled_users >= users_to_handle:
                break

            # Every users_in_batch users the current batch is written out and
            # a new one is started with an empty, batch-local token dictionary.
            if handled_users % users_in_batch == 0:
                if batch is not None:
                    _save_batch(batch, artists)
                    artists = []
                    artists_idxs = {}
                batch = artm.messages_pb2.Batch()
                batch.id = str(uuid.uuid4())

            item = batch.item.add()
            item.id = handled_users
            field = item.field.add()

            last_user_id = user_id
            handled_users += 1

        # NOTE(review): rows with an empty artist_id would all share one token;
        # confirm whether the data set contains such rows.
        if artist_id not in artist_id_to_name:
            artist_id_to_name[artist_id] = artist_name
        if artist_id not in artists_idxs:
            artists_idxs[artist_id] = len(artists)
            artists.append(artist_name)

        field.token_id.append(artists_idxs[artist_id])
        field.token_count.append(int(plays))

# Flush the batch that is still open: it is reached both when the user limit
# stops the loop and when the file ends before a batch boundary. A batch is
# only ever created together with its first item, so it is never empty here.
if batch is not None:
    _save_batch(batch, artists)
    artists = []
    artists_idxs = {}

Запускаем BigARTM: настраиваем метрики и регуляризаторы, затем обучаем модель

In [84]:
# Create the BigARTM master component and configure it to use 4 processors.
master = artm.library.MasterComponent()
master_config = master.config()
master_config.processors_count = 4
master.Reconfigure()

# Paths of all batch files found in the batch folder.
batches = glob.glob("%s/*.batch" % batch_path)

# Topic layout: the first `background_topic_count` topics are background
# topics, every remaining topic is an objective one.
topic_count = 200
background_topic_count = 5


def _topic_label(index):
    """Return the name of topic number ``index``, prefixed with its kind."""
    prefix = "background" if index < background_topic_count else "objective"
    return prefix + " topic " + str(index)


all_topics = [_topic_label(index) for index in range(topic_count)]
background_topics = [label for position, label in enumerate(all_topics)
                     if position < background_topic_count]
objective_topics = [label for position, label in enumerate(all_topics)
                    if position >= background_topic_count]

# Configure scores: every Create*Score call registers a score on the master
# component; the returned objects are queried with GetValue(model) later on.
perplexity_score = master.CreatePerplexityScore()

# Sparsity of Theta and Phi, tracked separately for the objective and the
# background topic groups.
sparsity_theta_objective = master.CreateSparsityThetaScore(topic_names=objective_topics)
sparsity_phi_objective = master.CreateSparsityPhiScore(topic_names=objective_topics)
sparsity_theta_background = master.CreateSparsityThetaScore(topic_names=background_topics)
sparsity_phi_background = master.CreateSparsityPhiScore(topic_names=background_topics)

# Top tokens (artists) per topic: 20 per objective topic, library default
# count for the background topics.
top_tokens_score = master.CreateTopTokensScore(topic_names=objective_topics, num_tokens=20)
background_tokens = master.CreateTopTokensScore(topic_names=background_topics)

# Configure regularizers: one smooth/sparse regularizer per matrix (Theta, Phi)
# and per topic group, plus a Phi decorrelator for the objective topics.
theta_objective = master.CreateSmoothSparseThetaRegularizer(topic_names=objective_topics)
theta_background = master.CreateSmoothSparseThetaRegularizer(topic_names=background_topics)
phi_objective = master.CreateSmoothSparsePhiRegularizer(topic_names=objective_topics)
phi_background = master.CreateSmoothSparsePhiRegularizer(topic_names=background_topics)
decorrelator = master.CreateDecorrelatorPhiRegularizer(topic_names=objective_topics)

# (regularizer name, coefficient) pairs handed to ProcessBatches (Theta) and
# RegularizeModel (Phi). NOTE(review): presumably a negative coefficient
# sparsifies and a positive one smooths - confirm against the BigARTM docs.
theta_regularizers = {(theta_objective.name(), -0.1), (theta_background.name(), 0.1)}
phi_regularizers = {(phi_objective.name(), -0.1), (phi_background.name(), 0.1), (decorrelator.name(), 10000)}

# Initialize model
pwt_model = "pwt"
master.InitializeModel(model_name=pwt_model, batch_folder=batch_path, topic_names=all_topics)

# Perform iterations
for iteration in range(0, 10):
    print "Iteration", iteration
    start = time.clock()
    master.ProcessBatches(pwt_model, batches, "nwt", theta_regularizers, inner_iterations_count=20)
    master.RegularizeModel(pwt_model, "nwt", "rwt", phi_regularizers)
    master.NormalizeModel("nwt", pwt_model, "rwt")
    print "Perplexity = %.3f" % perplexity_score.GetValue(pwt_model).value,
    print "Phi objective sparsity = %.3f" % sparsity_phi_objective.GetValue(pwt_model).value,
    print "Theta objective sparsity = %.3f" % sparsity_theta_objective.GetValue(pwt_model).value
    print "Phi backgournd sparsity = %.3f" % sparsity_phi_background.GetValue(pwt_model).value,
    print "Theta background sparsity = %.3f" % sparsity_theta_background.GetValue(pwt_model).value
    finish = time.clock()
    print "%.1f s" % ((finish - start) / 4)

# Visualize top token in each topic and a snippet of theta matrix
print "Genres"
artm.library.Visualizers.PrintTopTokensScore(top_tokens_score.GetValue(pwt_model))
print "Background genres"
artm.library.Visualizers.PrintTopTokensScore(background_tokens.GetValue(pwt_model))

Iteration 0
Perplexity = 82996.726 Phi objective sparsity = 0.365 Theta objective sparsity = 0.213
Phi backgournd sparsity = 0.000 Theta background sparsity = 0.000
15.2 s
Iteration 1
Perplexity = 2526.611 Phi objective sparsity = 0.799 Theta objective sparsity = 0.791
Phi backgournd sparsity = 0.000 Theta background sparsity = 0.000
15.2 s
Iteration 2
Perplexity = 1010.856 Phi objective sparsity = 0.900 Theta objective sparsity = 0.904
Phi backgournd sparsity = 0.000 Theta background sparsity = 0.000
15.2 s
Iteration 3
Perplexity = 688.140 Phi objective sparsity = 0.934 Theta objective sparsity = 0.909
Phi backgournd sparsity = 0.000 Theta background sparsity = 0.000
15.0 s
Iteration 4
Perplexity = 567.366 Phi objective sparsity = 0.952 Theta objective sparsity = 0.908
Phi backgournd sparsity = 0.000 Theta background sparsity = 0.000
15.1 s
Iteration 5
Perplexity = 504.915 Phi objective sparsity = 0.962 Theta objective sparsity = 0.905
Phi backgournd sparsity = 0.000 Theta background 

In [None]:
main_musician = "green day"
top_matches_count = 10
top_matches = []
main_musician_ind = -1

#Long method

topic_model = master.GetTopicModel(model=pwt_model, topic_names={objective_topics[0]})
for i in xrange(len(topic_model.token)):
    musician = topic_model.token[i]
    if musician == main_musician:
        main_musician_ind = i
        break
            
if main_musician_ind == -1:
    print main_musician, "could not be found."
else:       
    print main_musician_ind
    for topic_name in objective_topics:
        topic_model = master.GetTopicModel(model=pwt_model, topic_names={topic_name})
        top_matches_topic = []
        
        main_musician_weight = topic_model.token_weights[main_musician_ind].value[0]
        if main_musician_weight < 1e-05:
            continue
        print topic_name
            
        for i in xrange(len(topic_model.token)):
            if i != main_musician_ind:
                another_musician_weight = topic_model.token_weights[i].value[0]
                top_matches_topic.append((another_musician_weight, topic_model.token[i]))
                top_matches_topic = sorted(top_matches_topic, reverse=True)[:top_matches_count]
            #if len(top_mathces_topic) == top_matches_count and top_matches_count[-1][0] > main_musician_weight:
            #    break
        
        print top_matches_topic
        
        for match in top_matches_topic:
            top_matches.append((match[0] * main_musician_weight, match[1]))

    for match in sorted(top_matches, reverse=True)[:top_matches_count]:
        print match


In [91]:
main_musician = "metallica"
top_matches_count = 20
top_matches = []

#Fast method

topics = []
now_topic = []
score = top_tokens_score.GetValue(pwt_model)
topic_index = -1
for i in xrange(model.num_entries):
    if score.topic_index[i] != topic_index:
        if topic_index != -1:
            topics.append(now_topic)
            now_topic = []
        topic_index = score.topic_index[i]
        
    now_topic.append((score.weight[i], score.token[i]))

topics.append(now_topic)

for topic in topics:
    for match in topic:
        if match[1] == main_musician:
            for top_match in topic:
                if top_match[1] != main_musician:
                    top_matches.append((top_match[0] * match[0], top_match[1]))
                    
for top_match in sorted(top_matches, reverse=True)[:top_matches_count]:
    print top_match


(0.010976274261382368, u'black sabbath')
(0.00400871377061679, u'black label society')
(0.003837842885826892, u'clutch')
(0.0038244757666363594, u'volbeat')
(0.00359758336603766, u'ozzy osbourne')
(0.0034691669362785504, u'in flames')
(0.0028705520770563875, u'down')
(0.0025500796903830114, u'monster magnet')
(0.002174447447864214, u'danko jones')
(0.0014703701117781132, u'danzig')
(0.0014523125168645068, u'bloodhound gang')
(0.0013676723767116206, u'corrosion of conformity')
(0.0013670532619872167, u'queen')
(0.0012225651248267966, u'mot\xf6rhead')
(0.001163979304014906, u'type o negative')
(0.0011130858090529008, u'ac/dc')
(0.0010967877781204727, u'pantera')
(0.0009095143416957691, u'mustasch')
(0.0008506828363727681, u'gwar')
(0.0008027985347726294, u'lamb of god')
