In [None]:
import artm

# NOTE(review): data_dir / data_file are assigned here but the BatchVectorizer call
# below hard-codes its own path ('data/Ec_Bu__be_vpw.txt') and does not read them.
data_dir='.\\data\\'
data_file='Ec_Bu__be_vpw.txt'

# ---- BigARTM: convert the Vowpal Wabbit file into batches ----
# The batches are written to the collection_batches3 folder; four modalities
# (class_ids) are requested: text, authors, bigrams and references.
batch_vectorizer = artm.BatchVectorizer(data_path='data/Ec_Bu__be_vpw.txt',
                                        data_format='vowpal_wabbit',
                                        target_folder='.\\collection_batches3\\',
                                        class_ids=['text', 'authors', 'bigrams', 'references']
                                       )

# Gather a dictionary from the batches folder and store it next to the batches,
# both through save() and as a text file through save_text().
main_dictionary = artm.Dictionary()
main_dictionary.gather(data_path='collection_batches3')
main_dictionary.save(dictionary_path='collection_batches3/main_dictionary')
main_dictionary.save_text(dictionary_path='collection_batches3/main_dictionary.txt')

In [None]:
import pandas as pd

# Inspect the dictionary text file produced above with pandas.
data_dir = '.\\collection_batches3\\'
data_file = 'main_dictionary.txt'
header = ['token', 'class_id', 'value', 'tf', 'df']

main_pd = pd.read_csv(
    data_dir + data_file,
    delimiter=',',
    names=header,
    encoding="utf8",
)

# The first two lines of the file are skipped for the working view
# (presumably the dictionary's own header lines -- TODO confirm).
main_pd_ = main_pd.iloc[2:]

print(main_pd.shape[0])
main_pd_.head()

In [None]:
# Make the document-frequency column numeric (missing values become 0) and order
# tokens from most to least frequent.
# The result is rebound to a fresh frame instead of being written in place:
# main_pd_ is a slice of main_pd, so `main_pd_['df'].fillna(..., inplace=True)` was a
# chained assignment (SettingWithCopyWarning; silently a no-op under copy-on-write),
# and in-place mutation made the cell depend on how many times it had been run.
main_pd_ = main_pd_.copy()
main_pd_['df'] = main_pd_['df'].fillna(0).astype('float')
main_pd_ = main_pd_.sort_values(by=['df'], ascending=False)

In [None]:
# Number of dictionary tokens per modality. The class_id values carry a leading
# space, exactly as they appear in the dictionary text file.
# NOTE(review): the first count is taken from main_pd_ and the other three from
# main_pd, exactly as in the original cell.
for caption, frame, modality in (('Число слов ', main_pd_, ' text'),
                                 ('Число биграмм ', main_pd, ' bigrams'),
                                 ('Число ссылок', main_pd, ' references'),
                                 ('Число авторов', main_pd, ' authors')):
    token_count = frame.loc[(frame.class_id == modality)].shape[0]
    print('%s %s' % (caption, token_count))

In [None]:
# Keep 'text' tokens whose document frequency is between 2 and 5000.
main_dictionary.filter(class_id='text', min_df=2, max_df=5000)
# Keep 'bigrams' tokens with df in [2, ...). Author's note: save a copy of the
# dictionary beforehand, because the filter applies its changes immediately.
main_dictionary.filter(class_id='bigrams', min_df=2)

In [None]:
# The max_df thresholds are chosen to drop the 'nan' token of each modality
# (author's note: 'nan' has df=635 among authors and df=12093 among references).
main_dictionary.filter(class_id='authors', max_df=630)
main_dictionary.filter(class_id='references', max_df=12000)

In [None]:
# Overwrite the text dump of the dictionary with the filtered version.
main_dictionary.save_text(dictionary_path='collection_batches3/main_dictionary.txt')

Построение модели

In [None]:
import artm

# -- Introduce domain (d) and background (b) topics.

def SetTopicsNum(d, b):
    """Build the topic-name lists for `d` domain and `b` background topics.

    Topics are numbered 1..d+b in one sequence: the first `d` are named
    "d<i>" and the remaining ones "b<i>", so background numbering continues
    after the domain topics (e.g. d=2, b=2 gives d1, d2, b3, b4).

    Returns:
        (all_topics, domain_topics, background_topics) -- three lists of
        names; all_topics lists the domain topics first.
    """
    indices = range(1, d + b + 1)
    domain_topics = ["d" + str(i) for i in indices if i <= d]
    background_topics = ["b" + str(i) for i in indices if i > d]
    all_topics = domain_topics + background_topics
    return all_topics, domain_topics, background_topics
# ---
# -- Prepare the model inputs: re-open the already built batches and reload the
#    filtered dictionary from its text dump.
batch_vectorizer = artm.BatchVectorizer(data_path='collection_batches3',
                                        data_format='batches')
main_dictionary = artm.Dictionary()
main_dictionary.load_text(dictionary_path='collection_batches3/main_dictionary.txt')

In [None]:
def SetScores(model):
    """Register the quality scores tracked during training on `model`.

    Adds, in this order: perplexity, Phi sparsity (bigrams), Theta sparsity,
    four top-token scores (bigrams, text, references, authors) and a topic
    kernel score (bigrams). Relies on the module-level `main_dictionary`.

    Note: every top-token score keeps 15 tokens even where the score name
    says "Top10"; the names are left unchanged because other cells look the
    scores up by name.
    """
    model.scores.add(artm.PerplexityScore(name='Perplexity_Score',
                                          dictionary=main_dictionary))
    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore_bigrams',
                                           class_id='bigrams'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))

    # (score name, modality) for each top-token score, in registration order.
    top_token_scores = (
        ('Bigrams_Top10_Tokens', 'bigrams'),
        ('Text_Top10_Tokens', 'text'),
        ('References_Top10', 'references'),
        ('Authors_Top15', 'authors'),
    )
    for score_name, modality in top_token_scores:
        model.scores.add(artm.TopTokensScore(name=score_name,
                                             num_tokens=15,
                                             dictionary=main_dictionary,
                                             class_id=modality))

    model.scores.add(artm.TopicKernelScore(name='DomainTopicKernelScore',
                                           probability_mass_threshold=0.25,
                                           class_id='bigrams'))

In [None]:
# reg_dict is an OrderedDict: key = regularizer name, value = its tau coefficient.
# Only the regularizers that are currently needed are added.

def SetRegularizers(model, reg_dict):
    """Attach the Phi and Theta smooth/sparse regularizers to `model`.

    The first entry of `reg_dict` names and parameterizes a
    SmoothSparsePhiRegularizer, the second a SmoothSparseThetaRegularizer.
    Both are restricted to the module-level `domain_topics`.

    Earlier experiments (decorrelation, background-topic smoothing) used the
    same positional convention with other regularizer classes.
    """
    entries = list(reg_dict.items())

    phi_name, phi_tau = entries[0]
    model.regularizers.add(artm.SmoothSparsePhiRegularizer(name=phi_name,
                                                           gamma=0,
                                                           topic_names=domain_topics,
                                                           tau=phi_tau))

    theta_name, theta_tau = entries[1]
    model.regularizers.add(artm.SmoothSparseThetaRegularizer(name=theta_name,
                                                             topic_names=domain_topics,
                                                             tau=theta_tau))


In [None]:
from collections import OrderedDict

results_dir='.\\final_tests4\\'

# Value lists to sweep; which list holds more than one value depends on what is tested.
top_num_list = [13]     # number of topics
tau_list1 = [-0.6]      # tau of the 'SparsePhi' regularizer
tau_list2 = [-2.81]     # tau of the 'SparseTheta' regularizer

# NOTE: this cell calls CallingResultsProcedures, which (with its helpers) is defined
# in cells further down the notebook; those cells must have been run first.
#
# Fix: t1 and t2 were used below without ever being defined (NameError on a fresh
# kernel). They are now taken from tau_list1 / tau_list2, one model per combination.
for top_num in top_num_list:
    for t1 in tau_list1:
        for t2 in tau_list2:

            # Model parameters. The 'text' modality is not used in this configuration.
            class_ids = {'bigrams': 1.0, 'authors': 1.0, 'references': 1.0}
            all_topics, domain_topics, background_topics = SetTopicsNum(top_num, 0)  # SetTopicsNum(d, b)
            reg_dict = OrderedDict([
                ('SparsePhi', t1),
                ('SparseTheta', t2),
            ])

            model = artm.ARTM(topic_names=all_topics,
                              dictionary=main_dictionary,
                              class_ids=class_ids,
                              cache_theta=True,
                              theta_columns_naming='title',
                              seed=2)
            model.initialize(dictionary=main_dictionary)

            SetScores(model)
            SetRegularizers(model, reg_dict)

            model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

            # Write topic files, plots, coherence and a summary row for this model.
            CallingResultsProcedures(0, model)
    #model.save(name_to_save,'p_wt')
    

Вызов функций для записи результатов

In [None]:
import matplotlib.pyplot as plt

def plot_figures(model, topics_directory):
    """Plot the training curves of `model` and save them as two PNG files.

    Figure 1 (scores_sparsity.png): perplexity on the left axis, Phi (bigrams)
    and Theta sparsity on the right axis.
    Figure 2 (scores_kernel.png): average kernel size on the left axis,
    average contrast and purity on the right axis.
    Both figures skip the first tracked value and carry a text box with the
    last values of the plotted scores.

    Args:
        model: model whose score_tracker holds 'Perplexity_Score',
            'SparsityPhiScore_bigrams', 'SparsityThetaScore' and
            'DomainTopicKernelScore'.
        topics_directory: path prefix (ending with a separator) for the PNGs.

    Returns:
        (last average kernel size, last average contrast, last average purity).
    """
    iterations = range(model.num_phi_updates)[1:]
    tracker = model.score_tracker
    box_style = dict(boxstyle='round', facecolor='wheat', alpha=0.3)

    # ---- perplexity and sparsity ----
    perplexity = tracker['Perplexity_Score']
    phi_sparsity = tracker['SparsityPhiScore_bigrams']
    theta_sparsity = tracker['SparsityThetaScore']

    fig, left_axis = plt.subplots()
    left_axis.plot(iterations, perplexity.value[1:], 'g-', linewidth=1, label=u"Перплексия")
    left_axis.set_xlabel(u'Номер итерации')
    left_axis.set_ylabel(u'Перплексия', color='g')

    right_axis = left_axis.twinx()
    right_axis.plot(iterations, phi_sparsity.value[1:], 'r:', linewidth=1,
                    label=u'Разреженность '+'$\\Phi$'+'-bigrams')
    right_axis.plot(iterations, theta_sparsity.value[1:], 'r-.', linewidth=1,
                    label=u'Разреженность '+'$\\Theta$')
    right_axis.set_ylabel(u'Доля', color='r')
    right_axis.legend(bbox_to_anchor=(1.10, 1), loc=2, borderaxespad=0.)

    sparsity_summary = u''.join([
        u'Перплексия: ', str(round(perplexity.last_value, 3)),
        u',  Разреженность ', '$\\Phi$', '-bigrams: ', str(round(phi_sparsity.last_value, 3)),
        u'\nРазреженность ', '$\\Theta$: ', str(round(theta_sparsity.last_value, 3)),
    ])
    # Positioned in figure coordinates, i.e. below the axes.
    left_axis.text(0.14, -0.16, sparsity_summary, transform=fig.transFigure, bbox=box_style)

    plt.grid(True)
    plt.savefig(topics_directory+'scores_sparsity.png', dpi=150, bbox_inches = 'tight')

    # ---- topic kernels ----
    kernel_score = tracker['DomainTopicKernelScore']

    fig, left_axis = plt.subplots()
    left_axis.plot(iterations, kernel_score.average_size[1:], 'g-', linewidth=1, label=u"Размер ядра")
    left_axis.set_xlabel(u'Номер итерации')
    left_axis.set_ylabel(u'Размер ядра', color='g')

    right_axis = left_axis.twinx()
    right_axis.plot(iterations, kernel_score.average_contrast[1:], 'r*', linewidth=1, label=u"Контраст")
    right_axis.plot(iterations, kernel_score.average_purity[1:], 'r--', linewidth=1, label=u"Чистота")
    right_axis.set_ylabel(u'Доля', color='r')
    right_axis.legend(bbox_to_anchor=(1.10, 1), loc=2, borderaxespad=0.)

    kernel = kernel_score.last_average_size
    last_average_contrast = kernel_score.last_average_contrast
    last_average_purity = kernel_score.last_average_purity

    kernel_summary = u''.join([
        u'Размер ядра: ', str(round(kernel, 3)),
        u',  Контраст: ', str(round(last_average_contrast, 3)),
        u',  Чистота: ', str(round(last_average_purity, 3)),
    ])
    left_axis.text(0.14, -0.07, kernel_summary, transform=fig.transFigure, bbox=box_style)

    plt.grid(True)
    plt.savefig(topics_directory+'scores_kernel.png', dpi=150, bbox_inches = 'tight')

    return kernel, last_average_contrast, last_average_purity

In [None]:
def CallingResultsProcedures(step, model):
    """Run the whole result-recording pipeline for a trained `model`.

    Reads the results table, writes the per-topic token files, saves the
    score plots, appends a summary row to the results table and finally
    appends the perplexity history to perplexities.txt in `results_dir`.

    Args:
        step: number of earlier training steps, passed through to
            WritingModelsResultsToFile.
        model: the trained model.
    """
    results_table, row_count = ReadTableResults()
    print(row_count)

    output_dir = CreateFilesWithTopics(row_count)
    kernel_size, contrast, purity = plot_figures(model, output_dir)
    WritingModelsResultsToFile(step, results_table, row_count, output_dir,
                               kernel_size, contrast, purity)

    # Append the perplexity values of this run as one new comma-separated line.
    with open(results_dir+"perplexities.txt", 'a') as f:
        f.write("\n")
        f.writelines("%s, " % perp for perp in model.score_tracker['Perplexity_Score'].value)

Запись итогов работы модели и когерентности в единый файл результатов

In [None]:
# The header of the results table was created in advance, e.g.:
#   TopicTrackerTable = pd.DataFrame(columns=['topics num', 'class_ids', 'steps num', 'strategy', 'perplexity',
#                                             'Phi-sparcity', 'Theta-sparcity', 'Kernel size', 'Contrast', 'Purity', 'Coherence'])
# The current table is read to find out how many rows it already has, so that the
# next record is written below them.

model_results_file = 'models_testing.csv'

def ReadTableResults():
    """Load the results table from `results_dir` and return it with its row count.

    Returns:
        (table, rows_num): the DataFrame read from
        results_dir + model_results_file and its number of rows.
    """
    current_table = pd.read_csv(results_dir + model_results_file)
    return current_table, current_table.shape[0]

In [None]:
# Create a new directory named "topic_tokens_<rows_num + 1>" and save the top
# tokens of every topic into separate files inside it.

import os

def CreateFilesWithTopics(rows_num):
    """Write the top bigram tokens of every topic to per-topic text files.

    Reads the module-level `model`, `all_topics` and `results_dir`. For each
    topic a file "<topic>_tokens.txt" is written with one "token, weight" line
    per top token (weight rounded to 5 digits).

    Args:
        rows_num: current number of rows in the results table; the directory
            is named after rows_num + 1.

    Returns:
        The directory path, ending with a backslash so that file names can be
        appended to it directly.
    """
    folder_name = "topic_tokens_" + str(rows_num + 1)
    topics_directory = results_dir + folder_name + "\\"
    if not os.path.exists(topics_directory):
        os.makedirs(topics_directory)

    top_tokens = model.score_tracker["Bigrams_Top10_Tokens"]
    text_type = type(u'')

    for topic_name in all_topics:
        data_file_name = topic_name + '_tokens.txt'
        with open(topics_directory + data_file_name, 'w') as f:
            for (item, weight) in zip(top_tokens.last_tokens[topic_name], top_tokens.last_weights[topic_name]):
                # Fix: the file is opened in byte mode, so tokens are written as
                # UTF-8 bytes, as the other export cells do. The previous
                # item.decode('utf8') produced unicode, which fails with
                # UnicodeEncodeError for any non-ASCII token.
                if isinstance(item, text_type):
                    item = item.encode('utf8')
                f.write(item + ', ' + str(round(weight, 5)) + '\n')
    return topics_directory

In [None]:
# ===  Coherence calculation  =======

import itertools
import numpy as np
import pandas as pd


data_dir = '.\\data\\'
file_name = 'Ec_Bu__be_p1_p2_filtered_preprocessed2.csv'

# Preprocessed corpus; malformed lines are skipped (error_bad_lines=False).
table_prep = pd.read_csv(
    data_dir + file_name,
    delimiter=',',
    quotechar='"',
    error_bad_lines=False,
    engine='python',
)

# Each document's bigrams arrive as one '|||'-separated string; turn it into a
# frozenset of stripped bigrams for fast membership and subset tests.
table_prep['bigrams'] = table_prep['bigrams'].apply(
    lambda cell: frozenset(bigram.strip() for bigram in cell.split('|||'))
)


def CoherenceCalculating(topics_directory):
    """Compute the PMI-based coherence of the topics stored in `topics_directory`.

    Every "d*.txt" file in the directory (except coherence.txt) is one topic;
    the part of each line before the first comma is a top token. Document
    counts of every token and every unordered token pair are taken over
    table_prep['bigrams']; a topic's coherence is the mean PMI over its pairs.
    Per-topic values and their average are written to coherence.txt in the
    same directory.

    Returns:
        (coherence, topics_dict): the average coherence over topics, and a
        dict mapping topic name (file name without ".txt") to a tuple
        (token -> count, pair -> (count, PMI), topic coherence).
    """
    topic_files = [file_name for file_name in os.listdir(topics_directory)
                   if file_name != 'coherence.txt'
                   and file_name.endswith(".txt")
                   and file_name.startswith("d")]
    print(len(topic_files))

    # topic name -> (token counts, pair counts, coherence placeholder)
    topics_dict = {}
    for topic_file in topic_files:
        with open(topics_directory + topic_file, 'r') as f:
            # each line is "token, weight"; only the token is needed
            topic_words = [line.split(',')[0] for line in f]

        word_counts = dict.fromkeys(topic_words, 0)
        pair_counts = dict.fromkeys(
            [frozenset(combo) for combo in itertools.combinations(topic_words, 2)], 0)

        # topic name = file name without ".txt"
        topics_dict[topic_file[:-4]] = (word_counts, pair_counts, 0.0)

    documents = table_prep['bigrams']

    # Count, per topic, the documents containing each token and each pair.
    # The dicts are updated in place, so the tuples need no re-packing.
    for document in documents:
        for word_counts, pair_counts, _ in topics_dict.values():
            for word in word_counts:
                if word in document:
                    word_counts[word] += 1
            for pair in pair_counts:
                if pair.issubset(document):
                    pair_counts[pair] += 1

    D = len(documents)

    # PMI per pair, averaged over the topic's pairs.
    for topic_name in topics_dict:
        word_counts, pair_counts, pmi_total = topics_dict[topic_name]
        for pair in pair_counts:
            pmi_value = calc_PMI(pair, pair_counts, word_counts, D)
            pmi_total += pmi_value
            # keep both the raw count and the PMI of the pair
            pair_counts[pair] = (pair_counts[pair], pmi_value)

        # number of pairs = k(k-1)/2
        topic_coherence = pmi_total / float(len(pair_counts))
        topics_dict[topic_name] = (word_counts, pair_counts, topic_coherence)

    # Save per-topic coherence and the overall average.
    coherence = 0.0
    with open(topics_directory + "coherence.txt", 'w') as f:
        for topic_name in topics_dict:
            topic_coherence = topics_dict[topic_name][2]
            coherence += topic_coherence
            f.write(topic_name + " " + str(topic_coherence) + "\n")

        coherence = coherence / float(len(topics_dict))
        f.write("Итоговая средняя когерентность = " + str(coherence))

    return coherence, topics_dict
       
def calc_PMI(pair, pairs, words, D, set_zero_count_by=1e-6):
    """Pointwise mutual information of a token pair: log(D * N12 / (N1 * N2)).

    Args:
        pair: two-element set of tokens.
        pairs: mapping pair -> number of documents containing both tokens (N12).
        words: mapping token -> number of documents containing it (N1, N2).
        D: number of documents in the corpus.
        set_zero_count_by: replacement for a zero token count; a zero pair
            count is replaced by its square so that the logarithm stays finite.

    Returns:
        The PMI value as computed by np.log.
    """
    def _nonzero(count, fallback):
        # substitute a small positive value for rare (zero) counts
        return fallback if count == 0 else count

    members = tuple(pair)

    joint_count = _nonzero(pairs[pair], set_zero_count_by * set_zero_count_by)
    first_count = _nonzero(words[members[0]], set_zero_count_by)
    second_count = _nonzero(words[members[1]], set_zero_count_by)

    return np.log(float(D * joint_count) / float(first_count * second_count))

In [None]:
def WritingModelsResultsToFile(steps, TopicTrackerTable_cur,rows_num, topics_directory, kernel, last_average_contrast, last_average_purity):
    """Append a summary row for the current model to the results table and save it.

    Reads the module-level `model`, `all_topics`, `results_dir` and
    `model_results_file`. The row holds: record number, number of topics,
    class_ids, number of steps, regularizer description with tau values,
    perplexity, Phi and Theta sparsity, kernel size, contrast, purity and
    coherence (computed here through CoherenceCalculating).

    Args:
        steps: number of earlier training steps added to the tracked ones.
        TopicTrackerTable_cur: results table as returned by ReadTableResults.
        rows_num: number of rows currently in that table.
        topics_directory: directory holding the per-topic token files.
        kernel, last_average_contrast, last_average_purity: last values of
            the kernel score, as returned by plot_figures.
    """
    def _tau_or_blank(reg_name):
        # Best effort by design: a regularizer that was not added to the model
        # is reported as an empty field. Each name is looked up on its own, so
        # one missing regularizer no longer blanks the others.
        try:
            return str("{:.2e}".format(model.regularizers[reg_name].tau))
        except Exception:
            return ""

    SparsePhi = _tau_or_blank('SparsePhi')
    SparseTheta = _tau_or_blank('SparseTheta')
    SmoothPhi_back = ""   # not used in the current configuration; kept so the line keeps three fields

    strategy_line = "; ".join([SparsePhi, SmoothPhi_back, SparseTheta])

    perpl = str("{:.2e}".format(model.score_tracker['Perplexity_Score'].last_value))
    coherence, topics_dict = CoherenceCalculating(topics_directory)

    TopicTrackerTable_cur.loc[rows_num+1] = [
        rows_num+1,
        len(all_topics),
        model.class_ids,                                                      # class_ids
        len(model.score_tracker['Perplexity_Score'].value)+steps,             # number of steps
        str(model.regularizers)+ ": " + strategy_line,
        perpl,
        round(model.score_tracker['SparsityPhiScore_bigrams'].last_value, 4),
        round(model.score_tracker['SparsityThetaScore'].last_value, 4),
        round(kernel,4),
        round(last_average_contrast,4),
        round(last_average_purity,4),
        round(coherence,4)
    ]
    # Write back to the same file ReadTableResults reads (name taken from the
    # shared constant instead of a second hard-coded literal).
    TopicTrackerTable_cur.to_csv(results_dir+model_results_file, encoding='utf-8', index=False)


Вывод результатов модели!

In [None]:
#  --  Print the top tokens of every topic  ---
# Other available scores: References_Top10, Text_Top10_Tokens, Authors_Top15.

top_tokens = model.score_tracker["Bigrams_Top10_Tokens"]

for topic_name in model.topic_names:
    print(topic_name)
    token_line_list = [token for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                                        top_tokens.last_weights[topic_name])]
    print(', '.join(token_line_list))

In [None]:
# Save the top authors of every topic together with their ids, so that they can
# be resolved later.
folder_name='authors\\'

def GetIdbyAuthorName(name):
    """Look up the id of an author token in the module-level `table_raw`.

    'authors_name' and 'authors_ids' hold space-separated, position-aligned
    lists; the id at the position of the exact name match is returned.

    Args:
        name: author token as unicode (e.g. u'Andreas_Löschel').

    Returns:
        The author id as a string, "" when no row lists exactly this name,
        or None when `name` is u'nan' (unchanged from the previous behaviour).
    """
    if name == u'nan':
        return None

    encoded_name = name.encode('utf8')

    # regex=False: the name is a literal, not a pattern (names may contain '.',
    # '(' etc.); na=False: rows without authors never match.
    mask = table_raw['authors_name'].str.contains(encoded_name, regex=False, na=False)
    candidates = table_raw.loc[mask, ['authors_name', 'authors_ids']]

    # A substring hit is not necessarily an exact name (e.g. a longer surname), so
    # every candidate row is checked rather than only the first one.
    for names_cell, ids_cell in zip(candidates['authors_name'], candidates['authors_ids']):
        authors_list = str(names_cell).split(' ')
        if encoded_name not in authors_list:
            continue
        ids_list = str(ids_cell).split(' ')
        position = authors_list.index(encoded_name)
        if position < len(ids_list):
            return ids_list[position]

    return ""


# NOTE(review): table_raw is loaded in a cell further down the notebook; that cell
# has to be run before this one.
table_raw['authors_name'] = table_raw['authors_name'].fillna("")

top_tokens = model.score_tracker["Authors_Top15"]

# One file per topic with "author, author_id" lines; the 'nan' token is skipped.
for topic_name in model.topic_names:
    topic_tokens = top_tokens.last_tokens[topic_name]
    topic_weights = top_tokens.last_weights[topic_name]
    with open(results_dir+folder_name+topic_name+'.txt', 'w') as f:
        for (token, weight) in zip(topic_tokens, topic_weights):
            if token == u'nan':
                continue
            author_id = GetIdbyAuthorName(token)  # e.g. Andreas_Löschel
            f.write(token.encode('utf-8')+', '+author_id.encode('utf8')+'\n')

In [None]:
# Save the top tokens of every topic to files so they can be resolved later.
# To export references instead, use the "References_Top10" score and 'references\\'.
folder_name = 'bigrams\\'
top_tokens = model.score_tracker["Bigrams_Top10_Tokens"]

# One file per topic with "token, weight" lines; the 'nan' token is skipped.
for topic_name in model.topic_names:
    topic_tokens = top_tokens.last_tokens[topic_name]
    topic_weights = top_tokens.last_weights[topic_name]
    with open(results_dir+folder_name+topic_name+'.txt', 'w') as f:
        for (token, weight) in zip(topic_tokens, topic_weights):
            if token == u'nan':
                continue
            f.write(token.encode('utf-8') +', '+str(round(weight,5))+'\n')
                #f.write(token.encode('utf-8') +'\n')

In [None]:
# Extract the Theta matrix from the model and transpose it (as a copy), so that
# topics become columns (later cells select columns such as 'd11').
theta = model.get_theta()
theta_tr = theta.transpose(copy=True)

In [None]:
# Display the transposed Theta matrix (last expression of the cell).
theta_tr

In [None]:
# Open the source file. Malformed lines are skipped (error_bad_lines=False).
# NOTE(review): this is the same file that is read into table_prep in the
# coherence cell.
import pandas as pd
data_dir='.\\data\\'
file_name='Ec_Bu__be_p1_p2_filtered_preprocessed2.csv'
#header=['_id', 'authors', 'fieldsOfStudy', 'id', 'inCitations', 'journalName', 'outCitations', 'paperAbstract', 'title','year']

table_raw = pd.read_csv(data_dir+file_name, delimiter=',', quotechar='"', error_bad_lines=False, engine='python')

In [None]:
# Display the raw table (last expression of the cell).
table_raw

In [None]:
# Index the raw table by '_id' so that it can be joined with the Theta table.
# The guard makes the cell safe to re-run: once '_id' has become the index it is no
# longer a column, and a second set_index('_id') would raise KeyError.
if '_id' in table_raw.columns:
    table_raw.set_index('_id', inplace=True)
table_raw.head()

In [None]:
# Join Theta with table_raw on their indexes and save the result as a
# ';'-separated CSV in results_dir.
table_theta=theta_tr.join(table_raw)
data_file='table_theta.csv'
table_theta.to_csv(results_dir+data_file, sep=';', encoding='utf8', index=True, header=True)

In [None]:
# Print and save the publications that belong to a topic.
import numpy as np

def GetTopicDocs(d, alpha):
    """Print and save the documents whose weight in topic 'd<d>' exceeds `alpha`.

    Uses the module-level `table_theta` and `results_dir`. The matching rows,
    sorted by topic weight in descending order, are printed as
    "authors (year). title" (rows without author names are not printed) and
    all of them are written to results_dir\\theta_docs\\theta_d<d>.csv.

    Args:
        d: number of the domain topic (column 'd<d>' of table_theta).
        alpha: lower threshold (exclusive) on the topic weight.
    """
    import math

    def isnan(value):
        # True only for values that convert to a float NaN; anything that is
        # not convertible (e.g. an ordinary name string) counts as "not NaN".
        try:
            return math.isnan(float(value))
        except (TypeError, ValueError):
            return False

    topic_name = 'd' + str(d)
    print('\n')
    print(topic_name)
    table_d = table_theta.sort_values(by=[topic_name], ascending=False)
    table_d_threshhold = table_d.loc[table_d[topic_name] > alpha]

    # ----  print some results  ------
    for index, row in table_d_threshhold.iterrows():
        authors_name = row['authors_name']

        if not isnan(authors_name):
            authors_name = ', '.join(authors_name.split())
            print(authors_name + " (" + str(int(row['year'])) + "). " + row['title'])
    # ---------

    results_dir_theta = results_dir + 'theta_docs\\'
    # Create the output folder on first use instead of failing in to_csv.
    if not os.path.exists(results_dir_theta):
        os.makedirs(results_dir_theta)
    table_d_threshhold.to_csv(results_dir_theta+'theta_'+topic_name+'.csv', sep=';', encoding='utf8', index=True, header=True)

GetTopicDocs(11, 0.9)