# Implementação

## Dados dos datasets

Dataset | Query | Documentos | Rel | % Rel
--- | --- | --- | --- | ---
Hall | Defect prediction | 8911 | 104 | 1.17
Radjenovic | Defect prediction metrics | 6000 | 48 | 0.80
Kitchenham | Literature review | 1704 | 45 | 2.64
Wahono | Defect prediction | 7002 | 62 | 0.89

## Bibliotecas

In [1]:
import os
import sys
#import datetime
from collections import OrderedDict
#from random import seed
#from random import randint
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
#from google.colab import drive

# Report the interpreter and library versions this run is using.
for label, version in (
    ("Python version", sys.version),
    ("Gensim version", gensim.__version__),
    ("Pandas version", pd.__version__),
):
    print(label)
    print(version)

# Fetch the NLTK stop-word corpus read by stopwords.words() during preprocessing.
nltk.download('stopwords')

Python version
3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 23:51:54) 
[GCC 7.3.0]
Gensim version
3.8.0
Pandas version
1.0.3


[nltk_data] Downloading package stopwords to /home/ballke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Números randômicos

In [2]:
# Fixed seeds handed to the LDA runs, hard-coded so that every execution of
# the notebook uses the same five values.
# NOTE(review): presumably these were drawn once with randint(0, 1000) and
# then frozen -- confirm with the author.
random_numbers = [534, 424, 826, 310, 983]

print(random_numbers)

[534, 424, 826, 310, 983]


## Tratamento do dataset

In [3]:
def get_corpus_for_docs(docs):
    """
    Preprocess the collection and build its bag-of-words representation.

    The last element of ``docs`` is treated as the search query. It goes
    through the same pipeline as the documents (lowercasing, tokenisation,
    stop-word removal, Porter stemming and bigram detection) and is split
    off before the dictionary is built, so it does not contribute tokens
    to the dictionary.

    Args:
        docs: Iterable (list, pandas Series, ...) of raw texts whose last
            element is the query text. The input object is not modified.

    Returns:
        A tuple ``(corpus, dictionary, docs, query)``: the bag-of-words
        vectors of the documents, the gensim ``Dictionary`` built from
        them, the processed token lists of the documents (query excluded)
        and the processed token list of the query.

    Raises:
        ValueError: If ``docs`` is empty, since there is no query to split off.
    """
    # Work on a private copy so the caller's container is left untouched.
    raw_texts = list(docs)
    if not raw_texts:
        raise ValueError("docs must contain at least the query text")

    # A set makes the per-token stop-word test O(1).
    stopwds = set(stopwords.words('english'))

    # Lowercase each text and split it into word tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    docs = [tokenizer.tokenize(str(text).lower()) for text in raw_texts]

    # Numeric tokens and one-character tokens are kept: no filter for them
    # is applied in this pipeline.

    # Remove stopwords.
    docs = [[token for token in doc if token not in stopwds] for doc in docs]

    # Porter stemmer.
    stemmer = PorterStemmer()
    docs = [[stemmer.stem(token) for token in doc] for doc in docs]

    # Detect bigrams over the whole collection (query included) and merge them.
    phrases = Phrases(docs, min_count=1, threshold=2)
    bigram = Phraser(phrases)
    docs = [bigram[doc] for doc in docs]

    # Text processing is done: separate the query from the documents.
    query = docs[-1]
    docs = docs[:-1]

    # Dictionary over the documents only. No rare/common token filtering is
    # applied here.
    dictionary = Dictionary(docs)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs, query

## Executa LDA e retorna TOP documentos

In [4]:
def _rank_documents_for_query(model, corpus, dictionary, query_tokens):
    """
    Rank every corpus document by topic affinity with a query.

    A document's score is the sum, over the topics the model assigns to the
    query, of (query topic probability * document topic probability).

    Returns:
        A tuple ``(query_topics, ids)``: the ``(topic_id, probability)``
        pairs the model returned for the query, and a numpy array with the
        document positions ordered from highest to lowest score.
    """
    query_topics = model.get_document_topics(dictionary.doc2bow(query_tokens))
    # Query topics sorted by score, highest first.
    sorted_topics = sorted(query_topics, key=lambda x: x[1], reverse=True)

    # Matrix with the query topics as columns and the documents as rows.
    # Approach taken from https://stackoverflow.com/a/56814624
    topic_matrix = pd.DataFrame(columns=[topic_id for topic_id, _ in sorted_topics])

    for doc_id, bow in enumerate(corpus):
        topic_matrix.loc[len(topic_matrix)] = 0  # new row filled with zeros
        topic_vector = OrderedDict(model[bow])  # topic_id -> probability lookup
        for topic_id, proba in sorted_topics:  # only the query topics matter
            topic_matrix.at[doc_id, topic_id] = proba * topic_vector.get(topic_id, 0)

    # Sum all columns per document and order the document positions from the
    # highest to the lowest total.
    docs = np.array(range(len(corpus)))
    topic_probas = topic_matrix.sum(axis=1)
    ids = docs[np.argsort(topic_probas[docs])[::-1]]
    return query_topics, ids


def get_top_documents(df, corpus, dictionary, query, num_topics, random_state, N=50):
    """
    Train an LDA (MALLET) model, expand the query and evaluate the ranking.

    The documents are first ranked against the original query. The words of
    the ``N`` best-ranked documents that share at least one topic with every
    query word are appended to the query, the documents are ranked again
    with the extended query, and that second ranking is evaluated against
    the ``label`` column of ``df``. Both rankings are printed.

    Args:
        df: Original dataset. Rows must line up positionally with ``corpus``
            and the ``label`` column must hold ``'yes'`` for relevant rows.
        corpus: Bag-of-words vectors of the documents.
        dictionary: gensim ``Dictionary`` the corpus was built with.
        query: Processed query tokens.
        num_topics: Number of LDA topics to train.
        random_state: Seed passed to MALLET as ``random_seed``.
        N: Number of top-ranked documents used to extend the query.

    Returns:
        A tuple ``(qry_topics, coherence, num_documents, ext_query_size,
        num_relevant, f_distance, top_10_true, top_20_true)``: number of
        topics returned for the extended query, c_v coherence of the model,
        number of ranked documents, length of the extended query, number of
        relevant documents in the whole ranking, 1-based position of the
        first relevant document (0 if there is none), and the number of
        relevant documents among the first 10 and first 20 positions.
    """
    # Reading any entry makes gensim populate ``dictionary.id2token``.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Train with the MALLET binary expected under the current working
    # directory, then convert the result to a native gensim LdaModel.
    path_to_mallet_binary = os.path.join(os.getcwd(), "mallet-2.0.8", "bin", "mallet")
    mallet_model = LdaMallet(
        path_to_mallet_binary,
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_seed=random_state
    )
    model = malletmodel2ldamodel(mallet_model)

    # Topic coherence; the texts are rebuilt from the bag-of-words vectors.
    texts = [[dictionary[word_id] for word_id, _ in doc] for doc in corpus]
    cm = CoherenceModel(model=model, corpus=corpus, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()

    # ----- Original query -----
    _, ids = _rank_documents_for_query(model, corpus, dictionary, query)
    print(ids)

    # ----- Query expansion -----
    # Element [2] of the per-word result is used as the per-word topic
    # weights (see the gensim get_document_topics documentation).
    query_per_word_topics = model.get_document_topics(dictionary.doc2bow(query), per_word_topics=True)
    query_per_word_topics = query_per_word_topics[2]
    query_per_word_topics = [OrderedDict(probas) for _, probas in query_per_word_topics]

    new_words_ids = set()
    for docID in ids[:N]:
        per_word_topics = model.get_document_topics(corpus[docID], per_word_topics=True)
        per_word_topics = per_word_topics[2]

        for wordID, topics_probas in per_word_topics:
            for topicID, proba in topics_probas:
                # Word weight for this topic times the weight of every query
                # word for the same topic; zero when any query word lacks it.
                result = proba
                for d in query_per_word_topics:
                    result *= d.get(topicID, 0)

                if result > 0:
                    new_words_ids.add(wordID)
                    break  # one shared topic is enough to accept the word

    new_query = query + [dictionary[wordID] for wordID in new_words_ids]

    # ----- Extended query -----
    extended_query_topics, ids = _rank_documents_for_query(model, corpus, dictionary, new_query)
    print(ids)

    # ----- Evaluation of the extended-query ranking -----
    num_relevant = 0
    f_distance = 0  # how many documents are reviewed until the first relevant one
    top_10_true = 0
    top_20_true = 0
    labels = df['label']

    for idx, doc in enumerate(ids):
        if labels.iloc[doc] != 'yes':
            continue
        num_relevant += 1
        if idx < 10:
            top_10_true += 1
        if idx < 20:
            top_20_true += 1
        if not f_distance:
            f_distance = idx + 1

    return len(extended_query_topics), coherence, len(ids), len(new_query), num_relevant, f_distance, top_10_true, top_20_true

## Combinações de teste

In [5]:
# Test inputs: each dataset name mapped to the search query run against it.
DATABASES = {
    'Kitchenham': 'Literature review',
    'Radjenovic': 'Defect prediction metrics',
    'Hall': 'Defect prediction',
    'Wahono': 'Defect prediction',
}

# Topic counts to evaluate: 10, 20, ..., 200.
NUM_TOPICS_FOR_TEST = range(10, 201, 10)

# Accumulator for the results: one independent list per output column.
results = {
    column: []
    for column in (
        'database',        # dataset name
        'query',           # processed query
        'num_topics',      # number of topics used to train the model
        'qry_topics',      # number of topics returned for the query
        'num_documents',   # total number of documents
        'rand_seed',       # random seed of the LDA model
        'coherence',       # coherence of the topics with the collection
        'ext_query_size',  # size of the extended query
        'num_relevant',    # relevant documents found in the full ranking
        'f_distance',      # distance to the first relevant document
        'top_10_true',     # relevant documents in the top 10
        'top_20_true',     # relevant documents in the top 20
    )
}

## Testes

In [6]:
# Run the experiment grid (dataset x seed x number of topics). Each run is
# echoed as a quoted CSV line and stored in ``results``.
print(','.join(['"{}"'.format(key) for key in results.keys()]))

for dbname, query_text in DATABASES.items():
    path = 'https://github.com/fastread/src/raw/master/workspace/data/{}.csv'.format(dbname)

    # Load the dataset.
    df = pd.read_csv(path, header=0, encoding='latin-1')

    # Not used in this cell; kept in case other cells read it.
    N_DOCS = df.shape[0]

    # A missing title or abstract would turn the whole concatenation into
    # NaN (later tokenised as the word "nan"), so blanks are used instead.
    base_docs = df['Document Title'].fillna('') + ' ' + df['Abstract'].fillna('')

    # Append the query as the last entry so it receives the same text
    # processing as the collection. pd.concat replaces Series.append, which
    # newer pandas releases no longer provide.
    base_docs = pd.concat([base_docs, pd.Series([query_text], index=[len(base_docs)])])

    corpus, dictionary, docs, query = get_corpus_for_docs(base_docs)
    processed_query = ' '.join(query)

    for rand_seed in random_numbers:
        for num_topics in NUM_TOPICS_FOR_TEST:
            qry_topics, coherence, num_documents, len_qry, num_relevant, f_distance, top_10_true, top_20_true = get_top_documents(df, corpus, dictionary, query, num_topics, rand_seed)

            record = {
                'database': dbname,
                'query': processed_query,
                'num_topics': num_topics,
                'qry_topics': qry_topics,
                'num_documents': num_documents,
                'rand_seed': rand_seed,
                'coherence': coherence,
                'ext_query_size': len_qry,
                'num_relevant': num_relevant,
                'f_distance': f_distance,
                'top_10_true': top_10_true,
                'top_20_true': top_20_true,
            }

            # The echoed line writes the coherence with a decimal comma.
            printable = dict(record, coherence=str(coherence).replace('.', ','))
            print(','.join(['"{}"'.format(printable[key]) for key in results]))

            for key in results:
                results[key].append(record[key])

"database","query","num_topics","qry_topics","num_documents","rand_seed","coherence","ext_query_size","num_relevant","f_distance","top_10_true","top_20_true"
[1394  787 1339 ...  639  641 1054]
[1394 1338 1339 ...  639  641 1054]
"Kitchenham","literatur_review","10","2","1704","534","0,4642631748470555","1074","45","22","0","0"
[1511  858   95 ...  641  639 1054]
[1511  858  151 ... 1290 1054  639]
"Kitchenham","literatur_review","20","3","1704","534","0,424896746006996","884","45","43","0","0"
[ 253  610 1154 ...  641 1054 1298]
[ 253  610 1154 ...  639  641 1298]
"Kitchenham","literatur_review","30","4","1704","534","0,3949953431593896","683","45","15","0","1"
[1507  536 1627 ... 1066  318   28]
[ 123 1507  536 ...  250 1374  318]
"Kitchenham","literatur_review","40","4","1704","534","0,3538862415113038","553","45","42","0","0"
[ 123 1448 1700 ...  855  221  620]
[ 123  208  158 ...  250 1263 1095]
"Kitchenham","literatur_review","50","2","1704","534","0,3408337181679666","284","45",

[ 988 1271  568 ...  135  310 1684]
[1339 1338 1596 ...  221 1216  851]
"Kitchenham","literatur_review","80","6","1704","826","0,3281701573319299","632","45","42","0","0"
[ 123  154  158 ...  568  988 1271]
[ 787 1394 1339 ... 1329  744  580]
"Kitchenham","literatur_review","90","2","1704","826","0,3367916898691543","298","45","39","0","0"
[ 123 1627  154 ...  630  624 1331]
[ 321  123  320 ...  598 1239 1299]
"Kitchenham","literatur_review","100","4","1704","826","0,33306303387715347","588","45","13","0","1"
[1339 1338  696 ... 1047 1048  851]
[1339 1338  696 ... 1433  652 1443]
"Kitchenham","literatur_review","110","5","1704","826","0,3345743811230972","405","45","15","0","2"
[ 603 1194  788 ...  616 1216  851]
[ 603 1194  788 ... 1473  630  851]
"Kitchenham","literatur_review","120","7","1704","826","0,33386860965029813","622","45","15","0","1"
[1611  274 1547 ...  630 1319    0]
[ 274  827 1611 ... 1319  995  362]
"Kitchenham","literatur_review","130","7","1704","826","0,3337451728

[ 154 1506  749 ... 1084 1085    0]
[ 154 1506  749 ...  960  964    0]
"Kitchenham","literatur_review","160","5","1704","983","0,34044953506508213","214","45","18","0","1"
[ 123  154 1506 ... 1086 1088    0]
[ 123  154 1506 ...  832  833  851]
"Kitchenham","literatur_review","170","3","1704","983","0,33560390409218444","159","45","9","1","1"
[ 127  696   33 ... 1103 1104  851]
[ 127  696   33 ...  784  783 1703]
"Kitchenham","literatur_review","180","6","1704","983","0,3482846079009217","221","45","3","2","3"
[ 696   33 1022 ... 1010 1011  851]
[ 696   33 1022 ...  941 1571  750]
"Kitchenham","literatur_review","190","3","1704","983","0,34759694834694094","168","45","2","1","4"
[ 696 1342  285 ... 1041 1042  851]
[ 696 1342  285 ...  901  905  780]
"Kitchenham","literatur_review","200","5","1704","983","0,35471283801866577","185","45","6","2","6"
[4707 3303 5773 ... 5485 4820 4819]
[4707 3303 5773 ... 5485 4820 4819]
"Radjenovic","defect_predict metric","10","1","6000","534","0,440505

[3362 5313 5257 ... 3672 3270 4820]
[3362 5313 5257 ...  635 5698 3672]
"Radjenovic","defect_predict metric","30","1","6000","826","0,41793987014166617","622","48","10","1","1"
[3448 1505 5138 ... 4552 1729 4183]
[2558 4768 3362 ... 4864 3604 4408]
"Radjenovic","defect_predict metric","40","1","6000","826","0,40228987646529496","360","48","13","0","1"
[3561  384 1121 ... 3404 5700 1302]
[3561  384 1121 ... 3404 5700 1302]
"Radjenovic","defect_predict metric","50","50","6000","826","0,39906255849898287","2","48","3","2","2"
[4078 2644 3478 ... 3031 4851 5189]
[4078 2644 3478 ... 3031 4851 5189]
"Radjenovic","defect_predict metric","60","60","6000","826","0,3848324993162728","2","48","2","1","1"
[4707 4552 4183 ... 2671  763 2674]
[4707 4552 4183 ... 2671  763 2674]
"Radjenovic","defect_predict metric","70","70","6000","826","0,38060053671997374","2","48","60","0","0"
[2644 3167 5773 ... 1118 3220 2120]
[2644 3167 5773 ... 1118 3220 2120]
"Radjenovic","defect_predict metric","80","80","6

[4768 4707 4769 ... 3560 3561    0]
[4768 4707 4769 ... 3561 3563    0]
"Radjenovic","defect_predict metric","100","2","6000","983","0,3567413182048953","2","48","22","0","0"
[4707 4769 4768 ... 3601 3603 2999]
[4707 4769 4768 ... 3601 3603 2999]
"Radjenovic","defect_predict metric","110","2","6000","983","0,3597319079303545","2","48","6","1","1"
[5773 3303 2644 ... 3667 3668    0]
[2644 4247 3172 ... 3819 3820    0]
"Radjenovic","defect_predict metric","120","1","6000","983","0,34212454876453874","311","48","1","5","8"
[4254 3167 1450 ... 3667 3669    0]
[4254 3167 1450 ... 3659 3664    0]
"Radjenovic","defect_predict metric","130","2","6000","983","0,3501281294292627","2","48","14","0","1"
[2644 3303 4248 ... 3732 3733    0]
[2644 3303 4248 ... 3732 3733    0]
"Radjenovic","defect_predict metric","140","2","6000","983","0,34656467171105465","2","48","1","5","8"
[3303 5773 4248 ... 3716 3717    0]
[3303 5773 4248 ... 3717 3718    0]
"Radjenovic","defect_predict metric","150","2","6000

[5373  314 1792 ... 5807 5808    0]
"Hall","defect_predict","190","1","8911","424","0,3884449202275799","315","104","1","7","13"
[1794 7137 3602 ... 5863 5864    0]
[1794 7137 3602 ... 5864 5865    0]
"Hall","defect_predict","200","1","8911","424","0,3950175081164113","409","104","1","7","13"
[1791 1871 4610 ... 3715 8856 1197]
[1791 1871 4610 ... 3715 8856 1197]
"Hall","defect_predict","10","1","8911","826","0,434351914129303","1080","104","1","4","5"
[7137 3602 8910 ... 3658 6725  224]
[7137 3602 8910 ...  958 2711  224]
"Hall","defect_predict","20","1","8911","826","0,45414256778691087","696","104","1","8","13"
[1791 7137 1794 ... 8111 5742  224]
[1791 7137  705 ... 7105  958 1114]
"Hall","defect_predict","30","1","8911","826","0,44873060563471595","812","104","1","6","12"
[6391 5372 4484 ... 7270 3718 4528]
[1791 7137 3602 ... 5559 5561 7997]
"Hall","defect_predict","40","1","8911","826","0,4570160855775491","604","104","1","9","15"
[5363 1841 3596 ...  247 1174 8330]
[ 705 8889 17

[1791 3602 7137 ... 7290 3764 2263]
[1791 3602 1794 ... 5662 5663    0]
"Hall","defect_predict","90","1","8911","983","0,4292597755977897","542","104","1","9","14"
[1791 7137 1794 ... 5698 5699    0]
[1791 7137 1794 ... 5697 5698    0]
"Hall","defect_predict","100","1","8911","983","0,427211052217685","515","104","1","9","15"
[7137 3602 1794 ... 5746 5747    0]
[7137 3602 1794 ... 5746 5747    0]
"Hall","defect_predict","110","1","8911","983","0,4194989445276877","510","104","1","8","15"
[1792 1797 5667 ... 5731 5733    0]
[1792 1797 5667 ... 5734 5735    0]
"Hall","defect_predict","120","1","8911","983","0,417285514613186","329","104","1","7","12"
[1791 1794 7137 ... 5751 5752    0]
[1791 1794 7137 ... 5751 5752    0]
"Hall","defect_predict","130","1","8911","983","0,4095280946705223","469","104","1","9","14"
[1794 3602 7137 ... 5791 5792    0]
[1794 3602 7137 ... 5792 5793    0]
"Hall","defect_predict","140","1","8911","983","0,40428392444991923","443","104","1","8","14"
[7137 3602 1

[1383 4643 1629 ... 4522 4523    0]
[1383 4643 1629 ... 4520 4521 3500]
"Wahono","defect_predict","190","1","7002","424","0,3457881134408704","300","62","6","1","3"
[4643 6800 1657 ... 4535 4536    0]
[4643 6800 1657 ... 4537 4538    0]
"Wahono","defect_predict","200","1","7002","424","0,3496500517494154","256","62","3","1","2"
[3797  491 3841 ... 6407 3413 3919]
[3797  491 3841 ... 6407 3413 3919]
"Wahono","defect_predict","10","1","7002","826","0,4771557439777971","906","62","8","1","2"
[3797 4294 2137 ... 6967 2025 3919]
[3797 4294 2137 ... 3920 3919 2025]
"Wahono","defect_predict","20","1","7002","826","0,4477477810135258","763","62","8","1","4"
[3797 2137 3972 ... 2025 3146 5387]
[3797 2137 3972 ... 2183 5497 2711]
"Wahono","defect_predict","30","1","7002","826","0,4319377243715549","669","62","3","1","3"
[ 186  158 4667 ...   38 2548  216]
[2137 3972 4294 ...  885  886 3500]
"Wahono","defect_predict","40","1","7002","826","0,4298287367649312","536","62","2","2","3"
[ 379 5684 343

[4643 4112 3646 ... 3186  113 2171]
[4643 1524 1657 ... 4050 4051 3500]
"Wahono","defect_predict","90","2","7002","983","0,38990008210467647","523","62","3","3","4"
[2137 3972 4448 ... 4315 4316 3500]
[2137 3972 4448 ... 4315 4316 3500]
"Wahono","defect_predict","100","1","7002","983","0,3714620744530989","518","62","2","2","4"
[3974 3196 2063 ... 4461 4462 3500]
[3974 3196 2063 ... 4459 4461 3500]
"Wahono","defect_predict","110","1","7002","983","0,3714508693674779","352","62","1","3","3"
[4643 1657 3797 ... 4330 4331    0]
[3797 3972 2206 ... 4330 4331    0]
"Wahono","defect_predict","120","2","7002","983","0,36640897257356664","562","62","2","4","5"
[4643 3974 1657 ... 4473 4474    0]
[4643 3974 1657 ... 4473 4474    0]
"Wahono","defect_predict","130","1","7002","983","0,3602357935950329","315","62","2","3","3"
[4643 4112 1657 ... 4370 4372    0]
[3797 3972 2206 ... 4372 4373    0]
"Wahono","defect_predict","140","2","7002","983","0,3537356907908867","525","62","2","3","7"
[4643 613

In [7]:
# Persist the collected results as a CSV file in the current working directory.
df_results = pd.DataFrame(results)
df_results.to_csv(os.path.join(os.getcwd(), "resultados_lda_mallet_v2.csv"))