# Implementação

## Dados dos datasets

Dataset | Query | Documentos | Rel | % Rel
--- | --- | --- | --- | ---
Hall | Defect prediction | 8911 | 104 | 1.17
Radjenovic | Defect prediction metrics | 6000 | 48 | 0.80
Kitchenham | Literature review | 1704 | 45 | 2.64
Wahono | Defect prediction | 7002 | 62 | 0.89

## Bibliotecas

In [1]:
import os
import sys
#import datetime
from collections import OrderedDict
#from random import seed
#from random import randint
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
#from google.colab import drive

# Report the interpreter and library versions this notebook runs with,
# one label line followed by one value line per component.
_versions = (
    ("Python version", sys.version),
    ("Gensim version", gensim.__version__),
    ("Pandas version", pd.__version__),
)
for _label, _version in _versions:
    print(_label)
    print(_version)

# Fetch the NLTK stopword lists used by the preprocessing step.
nltk.download('stopwords')

Python version
3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 23:51:54) 
[GCC 7.3.0]
Gensim version
3.8.0
Pandas version
1.0.3


[nltk_data] Downloading package stopwords to /home/ballke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Números randômicos

In [2]:
# Seeds handed to the LDA model as its random seed; the experiment sweep is
# repeated once per seed. The values are hard-coded; the commented-out code
# below is the random draw (five integers in [0, 1000]) they presumably came
# from -- TODO confirm.
# seed(datetime.datetime.now().second)

random_numbers = [534, 424, 826, 310, 983]

# for _ in range(5):
# 	value = randint(0, 1000)
# 	random_numbers.append(value)
 
print(random_numbers)

[534, 424, 826, 310, 983]


## Tratamento do dataset

In [3]:
def get_corpus_for_docs(docs):
    """
    Preprocess a document collection and build its bag-of-words corpus.

    Every entry is converted to ``str``, lowercased, split into word tokens,
    stripped of English stopwords and reduced with the Porter stemmer. The
    LAST entry of ``docs`` is taken to be the search query: it receives the
    same preprocessing as the documents but is kept out of the dictionary
    and of the corpus.

    The input is only read, never modified.

    Parameters
    ----------
    docs : iterable
        Raw texts (for example a list or a pandas Series), with the query
        as the final element.

    Returns
    -------
    corpus : list
        Bag-of-words vector (``Dictionary.doc2bow``) of each document.
    dictionary : gensim.corpora.Dictionary
        Token <-> id mapping built from the documents only.
    docs : list of list of str
        Preprocessed tokens of each document (query excluded).
    query : list of str
        Preprocessed tokens of the query.

    Raises
    ------
    IndexError
        If ``docs`` is empty, since there is no query entry to split off.
    """
    # A set makes the per-token stopword test O(1) instead of a list scan.
    stopwds = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()

    # NOTE: dropping purely numeric tokens, dropping one-character tokens and
    # bigram detection (Phrases(docs, min_count=1, threshold=2)) are
    # currently disabled in this pipeline.
    processed = [
        [
            stemmer.stem(token)
            for token in tokenizer.tokenize(str(doc).lower())
            if token not in stopwds
        ]
        for doc in docs
    ]

    # Split the query (last entry) from the actual documents.
    query = processed[-1]
    processed = processed[:-1]

    # Dictionary and bag-of-words vectors are built from the documents only.
    dictionary = Dictionary(processed)
    corpus = [dictionary.doc2bow(doc) for doc in processed]

    return corpus, dictionary, processed, query

## Executa LDA e retorna TOP documentos

In [4]:
def _rank_documents(model, corpus, query_bow):
    """Rank all documents of ``corpus`` against a bag-of-words query.

    A document's score is the sum, over the topics the model assigns to the
    query, of (query topic probability) * (document topic probability).

    Returns the raw query topic list and a NumPy array with the document
    positions ordered from the highest to the lowest score.
    """
    query_topics = model.get_document_topics(query_bow)
    # Most probable query topics first (this fixes the column order only).
    ordered_topics = sorted(query_topics, key=lambda x: x[1], reverse=True)

    # Rows are documents, columns are the query topics. The matrix is
    # allocated once; growing a DataFrame one row at a time copies the whole
    # frame on every insertion.
    weights = np.zeros((len(corpus), len(ordered_topics)))
    for doc_id, bow in enumerate(corpus):
        doc_topics = dict(model[bow])
        for column, (topic_id, proba) in enumerate(ordered_topics):
            weights[doc_id, column] = proba * doc_topics.get(topic_id, 0)

    scores = weights.sum(axis=1)
    ranked_ids = np.argsort(scores)[::-1]
    return query_topics, ranked_ids


def _expand_query(model, corpus, dictionary, query, doc_ids):
    """Extend ``query`` with words taken from the documents in ``doc_ids``.

    A document word is added when, for at least one topic, the product of
    its own per-word topic value and the per-word topic values of EVERY
    query word for that topic is greater than zero.

    Returns a new list: the original query tokens followed by the added
    words (each added word appears once).
    """
    query_phis = model.get_document_topics(
        dictionary.doc2bow(query), per_word_topics=True
    )[2]
    query_word_topics = [dict(phis) for _, phis in query_phis]

    new_word_ids = set()
    for doc_id in doc_ids:
        doc_phis = model.get_document_topics(
            corpus[doc_id], per_word_topics=True
        )[2]
        for word_id, topic_phis in doc_phis:
            for topic_id, phi in topic_phis:
                relevance = phi
                for word_topics in query_word_topics:
                    relevance *= word_topics.get(topic_id, 0)
                if relevance > 0:
                    new_word_ids.add(word_id)
                    break  # one qualifying topic is enough for this word

    return query + [dictionary[word_id] for word_id in new_word_ids]


def _evaluate_ranking(df, ids):
    """Compute the relevance metrics of a ranking.

    ``ids`` holds row positions of ``df`` in ranked order; a row is relevant
    when its ``label`` column equals ``'yes'``.

    Returns ``(num_relevant, f_distance, top_10_true, top_20_true)`` where
    ``f_distance`` is the 1-based rank of the first relevant document
    (0 when there is none).
    """
    relevant = (df['label'].iloc[ids] == 'yes').values
    num_relevant = int(relevant.sum())
    f_distance = int(relevant.argmax()) + 1 if num_relevant else 0
    top_10_true = int(relevant[:10].sum())
    top_20_true = int(relevant[:20].sum())
    return num_relevant, f_distance, top_10_true, top_20_true


def get_top_documents(df, corpus, dictionary, query, num_topics, random_state, N=50):
    """
    Train an LDA (MALLET) model, expand the query and evaluate the ranking.

    Steps:
      1. Train LDA with MALLET and convert it to a gensim LDA model.
      2. Compute the ``c_v`` topic coherence of the model.
      3. Rank all documents against the original query.
      4. Extend the query with words from the ``N`` best-ranked documents.
      5. Rank all documents again with the extended query and measure how
         the documents labelled as relevant are placed in that ranking.

    Both rankings are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Original dataset; row ``i`` must correspond to ``corpus[i]`` and the
        ``label`` column marks relevant rows with ``'yes'``.
    corpus : list
        Bag-of-words vectors of the documents.
    dictionary : gensim.corpora.Dictionary
        Dictionary the corpus was built with.
    query : list of str
        Preprocessed query tokens.
    num_topics : int
        Number of LDA topics.
    random_state : int
        Seed handed to MALLET as ``random_seed``.
    N : int, optional
        How many top-ranked documents feed the query expansion.

    Returns
    -------
    tuple
        ``(query_topic_count, coherence, num_documents, ext_query_size,
        num_relevant, f_distance, top_10_true, top_20_true)``:
        number of topics the model assigns to the extended query, ``c_v``
        coherence, number of ranked documents, token count of the extended
        query, relevant documents in the whole ranking, 1-based rank of the
        first relevant document (0 if none), and relevant documents among
        the first 10 and the first 20 positions.
    """
    # Accessing any item makes gensim populate the id -> token mapping.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Train the LDA model; the MALLET distribution is expected under the
    # current working directory.
    path_to_mallet_binary = os.path.join(os.getcwd(), "mallet-2.0.8", "bin", "mallet")
    mallet_model = LdaMallet(
        path_to_mallet_binary,
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_seed=random_state
    )
    model = malletmodel2ldamodel(mallet_model)

    # Topic coherence. The texts are rebuilt from the bag-of-words vectors,
    # i.e. one occurrence of each distinct token per document.
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
    cm = CoherenceModel(model=model, corpus=corpus, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()

    # Ranking for the original query.
    _, ids = _rank_documents(model, corpus, dictionary.doc2bow(query))
    print(ids)

    # Query expansion from the N best documents of the first ranking.
    new_query = _expand_query(model, corpus, dictionary, query, ids[:N])

    # Ranking for the extended query.
    initial_query_topics, ids = _rank_documents(
        model, corpus, dictionary.doc2bow(new_query)
    )
    print(ids)

    num_relevant, f_distance, top_10_true, top_20_true = _evaluate_ranking(df, ids)

    return len(initial_query_topics), coherence, len(ids), len(new_query), num_relevant, f_distance, top_10_true, top_20_true

## Combinações de teste

In [5]:
# Experiment inputs: dataset name -> search query, and the topic counts to try.
DATABASES = {
    'Kitchenham': 'Literature review',
    'Radjenovic': 'Defect prediction metrics',
    'Hall': 'Defect prediction',
    'Wahono': 'Defect prediction',
}
NUM_TOPICS_FOR_TEST = range(10, 201, 10)

# Accumulator for the experiment results: one independent list per column,
# in the order the columns are printed and exported.
results = {
    column: []
    for column in (
        'database',        # dataset name
        'query',           # preprocessed query
        'num_topics',      # number of topics the model was trained with
        'qry_topics',      # number of topics returned for the query
        'num_documents',   # total number of documents
        'rand_seed',       # random seed of the LDA model
        'coherence',       # topic coherence on the collection
        'ext_query_size',  # size of the extended query
        'num_relevant',    # relevant documents found in the whole ranking
        'f_distance',      # distance to the first relevant document
        'top_10_true',     # relevant documents in the top 10
        'top_20_true',     # relevant documents in the top 20
    )
}

## Tests

In [6]:
# Run the full experiment grid (dataset x seed x number of topics), echoing
# every result as a quoted CSV line and storing it in `results`.
print(','.join(['"{}"'.format(key) for key in results.keys()]))

for dbname, raw_query in DATABASES.items():
    path = 'https://github.com/fastread/src/raw/master/workspace/data/{}.csv'.format(dbname)

    # Load the dataset.
    df = pd.read_csv(path, header=0, encoding='latin-1')

    # One text per row: title + abstract. A missing value (NaN) on either
    # side would turn the whole concatenation into NaN, which preprocessing
    # then reads as the literal word "nan"; treat missing parts as empty.
    base_docs = df['Document Title'].fillna('') + ' ' + df['Abstract'].fillna('')

    # Append the query as the last entry so that it receives exactly the
    # same preprocessing as the collection. pd.concat is used because
    # Series.append no longer exists in pandas >= 2.0.
    query_entry = pd.Series([raw_query], index=[len(base_docs)])
    base_docs = pd.concat([base_docs, query_entry])

    corpus, dictionary, docs, query = get_corpus_for_docs(base_docs)
    query_text = ' '.join(query)

    for rand_seed in random_numbers:
        for num_topics in NUM_TOPICS_FOR_TEST:
            (
                qry_topics,
                coherence,
                num_documents,
                len_qry,
                num_relevant,
                f_distance,
                top_10_true,
                top_20_true,
            ) = get_top_documents(df, corpus, dictionary, query, num_topics, rand_seed)

            # (column, value) pairs in the same order as the header above.
            row = [
                ('database', dbname),
                ('query', query_text),
                ('num_topics', num_topics),
                ('qry_topics', qry_topics),
                ('num_documents', num_documents),
                ('rand_seed', rand_seed),
                ('coherence', coherence),
                ('ext_query_size', len_qry),
                ('num_relevant', num_relevant),
                ('f_distance', f_distance),
                ('top_10_true', top_10_true),
                ('top_20_true', top_20_true),
            ]

            # The printed line uses a decimal comma for the coherence value;
            # the stored results keep the raw values.
            cells = [
                str(value).replace('.', ',') if column == 'coherence' else str(value)
                for column, value in row
            ]
            print(','.join(['"{}"'.format(cell) for cell in cells]))

            for column, value in row:
                results[column].append(value)

"database","query","num_topics","qry_topics","num_documents","rand_seed","coherence","ext_query_size","num_relevant","f_distance","top_10_true","top_20_true"
[1394   98  787 ... 1209  382 1054]
[1394   98  787 ... 1262  508 1054]
"Kitchenham","literatur review","10","10","1704","534","0,5266431204622528","296","45","5","1","2"
[ 603 1194  788 ...  250 1054  639]
[ 603 1194  788 ...  250  639  641]
"Kitchenham","literatur review","20","15","1704","534","0,4968209642846326","220","45","28","0","0"
[ 105   98 1394 ... 1618  832 1615]
[  98  105 1394 ... 1298 1209 1297]
"Kitchenham","literatur review","30","10","1704","534","0,4596289240150061","169","45","14","0","2"
[ 321  320   99 ... 1466   65 1615]
[321 320  98 ... 430 704  65]
"Kitchenham","literatur review","40","4","1704","534","0,436567048040715","132","45","27","0","0"
[1577  578  858 ...  877  801  800]
[ 578 1577  858 ...  588  584 1703]
"Kitchenham","literatur review","50","2","1704","534","0,4199328054880019","133","45","17",

"Kitchenham","literatur review","80","4","1704","826","0,40263739354681666","97","45","21","0","0"
[  98  228  229 ...  157 1315 1567]
[  98  228  229 ... 1068 1069  851]
"Kitchenham","literatur review","90","1","1704","826","0,37232057818771314","81","45","19","0","1"
[ 321  320  229 ... 1076 1077  851]
[ 321  320  229 ... 1274  657  851]
"Kitchenham","literatur review","100","3","1704","826","0,3799265580929314","82","45","9","2","2"
[ 229  228   99 ... 1083 1084  851]
[ 228  229   99 ... 1157  621  851]
"Kitchenham","literatur review","110","2","1704","826","0,37431382741359986","79","45","15","0","2"
[ 229  228 1032 ... 1092 1093  851]
[ 228  229 1032 ... 1086 1089 1703]
"Kitchenham","literatur review","120","4","1704","826","0,3801790004142796","68","45","8","1","4"
[ 229  228   45 ... 1092 1093  851]
[ 228  229   45 ... 1279 1278  422]
"Kitchenham","literatur review","130","7","1704","826","0,37506431715989424","56","45","14","0","3"
[  98  228  229 ... 1099 1101  851]
[  98  228

[ 229  228 1032 ... 1080 1081  851]
[229 228  99 ... 608 607 851]
"Kitchenham","literatur review","170","5","1704","983","0,3705301788900762","102","45","16","0","1"
[  99  228  229 ... 1101 1102  851]
[  99  228  229 ...  610 1260  683]
"Kitchenham","literatur review","180","6","1704","983","0,37167493674658103","56","45","16","0","2"
[ 229  228 1032 ... 1108 1109  851]
[ 228  229 1032 ...  592  591  851]
"Kitchenham","literatur review","190","5","1704","983","0,3715634763156158","47","45","16","0","3"
[1032 1339 1338 ... 1103 1104  851]
[1032 1339 1338 ...  769  767  851]
"Kitchenham","literatur review","200","5","1704","983","0,3676026551548779","47","45","12","0","3"
[3303 1889 3756 ... 3300  322 5335]
[3303 1889 3756 ... 3300  322 5335]
"Radjenovic","defect predict metric","10","10","6000","534","0,44591857193355233","3","48","1","3","4"
[3448 5773 1723 ... 4819 3857 5485]
[3448 5773 1723 ... 4819 3857 5485]
"Radjenovic","defect predict metric","20","20","6000","534","0,4709203709

[2465 1121 1120 ... 3709 2416 5997]
"Radjenovic","defect predict metric","40","40","6000","826","0,4488490186937062","3","48","2","3","3"
[3303 1012 5875 ... 2609 1043 2416]
[3303 1012 5875 ... 2609 1043 2416]
"Radjenovic","defect predict metric","50","50","6000","826","0,43088673123296095","3","48","1","1","1"
[3303 5875 2042 ... 2965 2122 2674]
[3303 5875 2042 ... 2965 2122 2674]
"Radjenovic","defect predict metric","60","60","6000","826","0,41889224794297747","3","48","1","2","2"
[4248 5773 3303 ... 3220 2120 1118]
[4248 5773 3303 ... 3220 2120 1118]
"Radjenovic","defect predict metric","70","70","6000","826","0,41187844686768926","3","48","1","4","7"
[2042 3303 1842 ... 4381  276   49]
[2042 3303 1842 ... 4381  276   49]
"Radjenovic","defect predict metric","80","80","6000","826","0,40819526787623295","3","48","2","2","3"
[2042 5875 3303 ...  239   49   66]
[2042 5875 3303 ...   49  239   66]
"Radjenovic","defect predict metric","90","90","6000","826","0,3896736130428511","3","48",

[1842 2042 1751 ... 3410 3409    0]
"Radjenovic","defect predict metric","110","3","6000","983","0,3920784863912913","3","48","21","0","0"
[1842 2042 2525 ... 3455 3454 2999]
[1842 2042 2525 ... 3455 3454 2999]
"Radjenovic","defect predict metric","120","3","6000","983","0,38539444098764186","3","48","13","0","2"
[3172 5112 5950 ... 3444 3443    0]
[3172 5112 5950 ... 3444 3443    0]
"Radjenovic","defect predict metric","130","3","6000","983","0,38404843803347183","3","48","6","3","3"
[4254 5121 4248 ... 3481 3479    0]
[4254 5121 4248 ... 3479 3476    0]
"Radjenovic","defect predict metric","140","3","6000","983","0,3814166600411773","3","48","3","2","4"
[4254 5875 5838 ... 3514 3515    0]
[4254 5875 5838 ... 3512 3514    0]
"Radjenovic","defect predict metric","150","3","6000","983","0,3796934680019031","3","48","7","2","6"
[4248 2644 5838 ... 3531 3535    0]
[4248 2644 5838 ... 3531 3535    0]
"Radjenovic","defect predict metric","160","3","6000","983","0,3787296135988956","3","48",

[1871 3827 3619 ... 3342 8856 1586]
[1871 3619 1791 ...  232 8856 1586]
"Hall","defect predict","10","9","8911","826","0,49190071130129776","548","104","2","3","5"
[1791 3619 1588 ... 4997 8111  224]
[1791 3619 1588 ... 4997 8111  224]
"Hall","defect predict","20","20","8911","826","0,5128492697615947","2","104","1","4","6"
[3619  923 8515 ... 5742  224 4528]
[3619  923 8515 ... 5742  224 4528]
"Hall","defect predict","30","30","8911","826","0,5153153777075056","2","104","1","6","9"
[1791 8909 2623 ... 1046  566  713]
[1791 1793 7137 ... 3939 3748 2356]
"Hall","defect predict","40","5","8911","826","0,5099891123596603","231","104","1","9","14"
[1791  923  930 ... 1515 3963 7854]
[1791 7059  923 ... 3392 3391    0]
"Hall","defect predict","50","1","8911","826","0,5107215507417479","244","104","1","6","13"
[1791 1588 2341 ... 7929 5264 1131]
[1791 1588 2341 ... 7929 5264 1131]
"Hall","defect predict","60","60","8911","826","0,4860788455678425","2","104","1","5","9"
[1791 1588 2341 ... 48

"Hall","defect predict","110","2","8911","983","0,4668237203951257","2","104","3","6","13"
[1588 7137 2341 ... 5570 5572    0]
[1588 7137 2341 ... 5570 5572    0]
"Hall","defect predict","120","2","8911","983","0,4588550972902147","2","104","2","8","10"
[1792 1588 6462 ... 5587 5588    0]
[1792 1588 6462 ... 5588 5589    0]
"Hall","defect predict","130","2","8911","983","0,45880931795471913","2","104","1","3","7"
[1588 2341 1792 ... 5565 5566    0]
[1588 2341 1792 ... 5562 5564    0]
"Hall","defect predict","140","2","8911","983","0,4513453768752714","2","104","3","2","4"
[1588 2341 1789 ... 5649 5650    0]
[1588 2341 1789 ... 5649 5650    0]
"Hall","defect predict","150","2","8911","983","0,4454249572521998","2","104","3","6","11"
[1791 7137 1794 ... 5663 5665    0]
[1791 7137 1794 ... 5665 5666    0]
"Hall","defect predict","160","2","8911","983","0,4417822509798473","2","104","1","8","16"
[1588 2341 1797 ... 5651 5652    0]
[1588 2341 1797 ... 5649 5650    0]
"Hall","defect predict"

[3674 2116 5169 ... 5992 6967 3920]
[3674 2116 5169 ... 2397 6981 6042]
"Wahono","defect predict","20","16","7002","826","0,45872985673444927","242","62","6","2","2"
[3674  491 1524 ...  338   38 3146]
[3674  491 1524 ... 5264 5878 2139]
"Wahono","defect predict","30","9","7002","826","0,44340479109743863","214","62","4","2","3"
[3191 2298 3719 ... 3297 3324 1181]
[3191 2298 3719 ... 3297 3324 1181]
"Wahono","defect predict","40","40","7002","826","0,43303318242290734","2","62","1","3","4"
[ 614 2176 5101 ... 4153 2253 5721]
[ 614 2176 5101 ... 4153 2253 5721]
"Wahono","defect predict","50","50","7002","826","0,42093772180983763","2","62","9","1","1"
[2206 4700 3974 ... 3397 3969 2003]
[2206 4700 6278 ... 2462  922 5464]
"Wahono","defect predict","60","4","7002","826","0,41555782559791926","180","62","2","5","6"
[1524 6133 1657 ...  980 5903  774]
[1524 6133 1657 ...  980 5903  774]
"Wahono","defect predict","70","70","7002","826","0,4081076022523994","2","62","3","1","1"
[ 109 3674 46

[1524 1657 3646 ... 3643 6823 3500]
[1524 3646 2828 ... 3830 3831 3500]
"Wahono","defect predict","120","3","7002","983","0,3815300978556487","96","62","8","1","1"
[ 328   96  192 ... 3630 3631    0]
[ 328   96  192 ... 3630 3631    0]
"Wahono","defect predict","130","2","7002","983","0,3869957158249618","2","62","14","0","3"
[1524 2137 5684 ... 4045 4046 3500]
[1524 5684 2137 ... 4045 4046 3500]
"Wahono","defect predict","140","2","7002","983","0,38314763430973375","2","62","4","1","3"
[ 328  614   96 ... 3979 3980 3500]
[ 328  614   96 ... 3977 3978 3500]
"Wahono","defect predict","150","2","7002","983","0,38158707319811863","2","62","7","1","2"
[6800 3974 1524 ... 4334 4336    0]
[6800 3974 1524 ... 4334 4336    0]
"Wahono","defect predict","160","2","7002","983","0,3755068208481734","2","62","2","2","2"
[6800 1524 1657 ... 4356 4357    0]
[6800 1524 1657 ... 4356 4357    0]
"Wahono","defect predict","170","2","7002","983","0,37581816655885436","2","62","3","2","3"
[5684 6361 5800 .

In [7]:
# Persist the collected results as a CSV file in the current working
# directory (one row per experiment run, one column per key of `results`).
output_path = os.path.join(os.getcwd(), "resultados_lda_mallet.csv")
df_results = pd.DataFrame(results)
df_results.to_csv(output_path)