# Implementação

## Dados dos datasets

Dataset | Query | Docs | Rel | % Rel
--- | --- | --- | --- | ---
Hall | Defect prediction | 8911 | 104 | 1.17
Radjenovic | Defect prediction metrics | 6000 | 48 | 0.80
Kitchenham | Literature review | 1704 | 45 | 2.64
Wahono | Defect prediction | 7002 | 62 | 0.88

## Bibliotecas

In [1]:
import os
import sys
#import datetime
from collections import OrderedDict
#from random import seed
#from random import randint
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim.models.coherencemodel import CoherenceModel
#from google.colab import drive

# Report the interpreter and library versions the experiments ran with.
for label, version in (
    ("Python version", sys.version),
    ("Gensim version", gensim.__version__),
    ("Pandas version", pd.__version__),
):
    print(label)
    print(version)

# Fetch the NLTK stopword corpus used by the preprocessing step.
nltk.download('stopwords')

Python version
3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 23:51:54) 
[GCC 7.3.0]
Gensim version
3.8.0
Pandas version
1.0.3


[nltk_data] Downloading package stopwords to /home/ballke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Números randômicos

In [2]:
# Seeds handed to the LDA model, one per experiment repetition.
# They are hard-coded so that runs can be repeated; the values were
# originally drawn with randint(0, 1000) after seeding from the clock.
random_numbers = [534, 424, 826, 310, 983]

print(random_numbers)

[534, 424, 826, 310, 983]


## Tratamento do dataset

In [3]:
def get_corpus_for_docs(docs):
    """
    Preprocess raw documents and build their bag-of-words representation.

    Every document is converted with ``str()``, lowercased, split into runs
    of word characters, stripped of NLTK English stopwords and reduced to
    Porter stems. The collection passed in is left unmodified.

    Args:
        docs: iterable of raw documents (for example a list of strings or a
            pandas Series).

    Returns:
        A tuple ``(corpus, dictionary, docs)``: the list of ``doc2bow``
        vectors, the gensim ``Dictionary`` built from the processed tokens,
        and the list of processed token lists (one per input document).
    """
    # A set gives constant-time membership tests during stopword removal.
    stopwds = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()

    # Iterate over the values (not over positions) so that the caller's
    # collection is never overwritten and any iterable is accepted,
    # including a pandas Series whose index is not 0..n-1.
    processed = [
        [
            stemmer.stem(token)
            for token in tokenizer.tokenize(str(doc).lower())
            if token not in stopwds  # stopwords are dropped before stemming
        ]
        for doc in docs
    ]

    # No frequency-based filtering is applied: every remaining token goes
    # into the dictionary.
    dictionary = Dictionary(processed)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in processed]

    return corpus, dictionary, processed

## Executa LDA e retorna TOP documentos

In [4]:
def get_top_documents(df, corpus, dictionary, query, num_topics, random_state):
    """
    Train an LDA (Mallet) model and measure how well it ranks relevant docs.

    Documents are ranked, highest first, by the sum over the query's topics
    of (query topic probability * document topic probability).

    Args:
        df: original dataset; row ``i`` is looked up for ``corpus[i]`` and
            must have a ``'label'`` column whose value is ``'yes'`` for
            relevant documents.
        corpus: bag-of-words vectors of the collection.
        dictionary: gensim ``Dictionary`` matching ``corpus``.
        query: list of already preprocessed query tokens.
        num_topics: number of topics to train the model with.
        random_state: seed forwarded to Mallet as ``random_seed``.

    Returns:
        A tuple ``(qry_topics, coherence, num_documents, num_relevant,
        f_distance, top_10_true, top_20_true)``:
        number of topics the model returned for the query, ``c_v`` topic
        coherence, number of ranked documents, number of relevant documents
        in the whole ranking, 1-based rank of the first relevant document
        (0 when there is none), and number of relevant documents among the
        first 10 and first 20 ranked documents.
    """
    # Accessing any item forces the dictionary to populate `id2token`.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Train LDA model; the Mallet distribution is expected under the
    # current working directory.
    path_to_mallet_binary = os.path.join(os.getcwd(), "mallet-2.0.8", "bin", "mallet")
    mallet_model = LdaMallet(
        path_to_mallet_binary,
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_seed=random_state
    )
    model = malletmodel2ldamodel(mallet_model)

    # Topic coherence.
    # NOTE(review): `texts` is rebuilt from the bag-of-words vectors, so
    # token order and repetitions are lost; confirm this is acceptable for
    # the window-based 'c_v' measure.
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
    cm = CoherenceModel(model=model, corpus=corpus, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()

    # Query topics sorted by score, highest first.
    initial_query_topics = model[dictionary.doc2bow(query)]
    query_topics = list(sorted(initial_query_topics, key=lambda x: x[1], reverse=True))

    # Get documents with highest probability for each topic. Source: https://stackoverflow.com/a/56814624

    # Matrix with the query topics as columns and the documents as rows;
    # each cell holds query proba * document proba for that topic.
    topic_matrix = pd.DataFrame(columns=[x[0] for x in query_topics])

    # Loop over all the documents to group the probability of each topic.
    for docID in range(len(corpus)):
        topic_matrix.loc[len(topic_matrix)] = 0  # append a zero-filled row
        # Mapping topic id -> probability for quick lookup; topics missing
        # from the model's answer for this document count as 0.
        topic_vector = OrderedDict(model[corpus[docID]])
        for topicID, proba in query_topics:  # only the query topics are relevant
            topic_matrix.at[docID, topicID] = proba * topic_vector.get(topicID, 0)

    # Document ids.
    docs = np.array((range(len(corpus))))
    # Sum of all columns for each document.
    topic_probas = topic_matrix.sum(axis=1)
    # Document ids ordered from the highest to the lowest score.
    ids = docs[np.argsort(topic_probas[docs])[::-1]]

    # Walk the ranking once, reading only the label column instead of
    # materializing a full row per document.
    labels = df['label']
    num_relevant = 0
    f_distance = 0  # stays 0 when no relevant document exists
    top_10_true = 0
    top_20_true = 0

    for rank, doc in enumerate(ids, start=1):
        if labels.iloc[doc] != 'yes':
            continue
        num_relevant += 1
        if rank <= 10:
            top_10_true += 1
        if rank <= 20:
            top_20_true += 1
        if not f_distance:
            # How many documents must be reviewed to reach the first
            # relevant one.
            f_distance = rank

    return len(initial_query_topics), coherence, len(ids), num_relevant, f_distance, top_10_true, top_20_true

## Combinações de teste

In [5]:
# Test data: each database name mapped to the search query run against it.
DATABASES = {
    'Hall': 'Defect prediction',
    'Radjenovic': 'Defect prediction metrics',
    'Kitchenham': 'Literature review',
    'Wahono': 'Defect prediction',
}
# Topic counts to evaluate: 10, 20, ..., 200.
NUM_TOPICS_FOR_TEST = range(10, 201, 10)

# Accumulator for the experiment results: one list per column, one entry
# appended per (database, seed, number of topics) run.
results = {
    column: []
    for column in (
        'database',       # database name
        'query',          # query text
        'num_topics',     # number of topics the model was trained with
        'qry_topics',     # number of topics returned for the query
        'num_documents',  # total number of documents
        'rand_seed',      # random seed of the LDA model
        'coherence',      # coherence of the topics with the collection
        'num_relevant',   # relevant documents found in the whole ranking
        'f_distance',     # distance to the first relevant document
        'top_10_true',    # relevant documents in the top 10
        'top_20_true',    # relevant documents in the top 20
    )
}

## Testes

In [6]:
# Header of the CSV-like log; the column order follows the keys of `results`.
print(','.join('"{}"'.format(key) for key in results))

for dbname, query in DATABASES.items():
    path = 'https://github.com/fastread/src/raw/master/workspace/data/{}.csv'.format(dbname)

    # Load the dataset.
    df = pd.read_csv(path, header=0, encoding='latin-1')

    # A missing title or abstract would turn the whole concatenation into
    # NaN (later tokenized as the single word "nan"), discarding the field
    # that is present; treat missing fields as empty text instead.
    base_docs = df['Document Title'].fillna('') + ' ' + df['Abstract'].fillna('')

    corpus, dictionary, docs = get_corpus_for_docs(base_docs)
    # The query goes through the same preprocessing as the collection.
    _, _, processed_queries = get_corpus_for_docs([query])
    query_tokens = processed_queries[0]
    query_text = ' '.join(query_tokens)

    for rand_seed in random_numbers:
        for num_topics in NUM_TOPICS_FOR_TEST:
            (qry_topics, coherence, num_documents, num_relevant,
             f_distance, top_10_true, top_20_true) = get_top_documents(
                df, corpus, dictionary, query_tokens, num_topics, rand_seed)

            # Single source of truth for the row, keyed by column name, so
            # the printed line and the stored values cannot get out of sync.
            row = {
                'database': dbname,
                'query': query_text,
                'num_topics': num_topics,
                'qry_topics': qry_topics,
                'num_documents': num_documents,
                'rand_seed': rand_seed,
                'coherence': coherence,
                'num_relevant': num_relevant,
                'f_distance': f_distance,
                'top_10_true': top_10_true,
                'top_20_true': top_20_true,
            }

            # The log uses a decimal comma for the coherence value; the
            # stored value stays numeric.
            printable = dict(row, coherence=str(coherence).replace('.', ','))
            print(','.join('"{}"'.format(printable[key]) for key in results))

            for key in results:
                results[key].append(row[key])

"database","query","num_topics","qry_topics","num_documents","rand_seed","coherence","num_relevant","f_distance","top_10_true","top_20_true"
"Hall","defect predict","10","10","8911","534","0,4930575673558086","104","6","3","3"
"Hall","defect predict","20","20","8911","534","0,511850673698891","104","1","7","12"
"Hall","defect predict","30","30","8911","534","0,5126666526537219","104","1","6","12"
"Hall","defect predict","40","40","8911","534","0,5095368503637164","104","9","1","2"
"Hall","defect predict","50","50","8911","534","0,502004268107069","104","3","3","5"
"Hall","defect predict","60","60","8911","534","0,4970953181966392","104","1","6","8"
"Hall","defect predict","70","70","8911","534","0,4931751379837828","104","3","4","9"
"Hall","defect predict","80","80","8911","534","0,48119800536723556","104","2","5","9"
"Hall","defect predict","90","90","8911","534","0,4756499673918876","104","2","4","11"
"Hall","defect predict","100","2","8911","534","0,48042005287041023","104","3","6",

"Hall","defect predict","140","2","8911","983","0,4513453768752714","104","3","2","4"
"Hall","defect predict","150","2","8911","983","0,4454249572521998","104","3","6","11"
"Hall","defect predict","160","2","8911","983","0,4417822509798473","104","1","8","16"
"Hall","defect predict","170","2","8911","983","0,4332693367327892","104","3","4","8"
"Hall","defect predict","180","2","8911","983","0,4423539941687627","104","11","0","1"
"Hall","defect predict","190","2","8911","983","0,4393805422030183","104","1","6","8"
"Hall","defect predict","200","2","8911","983","0,4316628150738808","104","1","6","11"
"Radjenovic","defect predict metric","10","10","6000","534","0,44591857193355233","48","1","3","4"
"Radjenovic","defect predict metric","20","20","6000","534","0,47092037095150124","48","3","4","5"
"Radjenovic","defect predict metric","30","30","6000","534","0,45071845397437127","48","3","3","4"
"Radjenovic","defect predict metric","40","40","6000","534","0,44466608485120973","48","8","1","2

"Radjenovic","defect predict metric","180","3","6000","310","0,37676647634365945","48","7","2","4"
"Radjenovic","defect predict metric","190","3","6000","310","0,3731072171663354","48","5","2","5"
"Radjenovic","defect predict metric","200","3","6000","310","0,3749215075888832","48","2","3","5"
"Radjenovic","defect predict metric","10","10","6000","983","0,46972393523941475","48","4","3","3"
"Radjenovic","defect predict metric","20","20","6000","983","0,4748806501984836","48","1","2","3"
"Radjenovic","defect predict metric","30","30","6000","983","0,4568035176200304","48","5","1","3"
"Radjenovic","defect predict metric","40","40","6000","983","0,4537972462589167","48","5","1","3"
"Radjenovic","defect predict metric","50","50","6000","983","0,4353346479320671","48","56","0","0"
"Radjenovic","defect predict metric","60","60","6000","983","0,4162796602878766","48","5","3","4"
"Radjenovic","defect predict metric","70","70","6000","983","0,4064806026649031","48","28","0","0"
"Radjenovic","de

"Kitchenham","literatur review","40","40","1704","310","0,4331878515947333","45","16","0","1"
"Kitchenham","literatur review","50","50","1704","310","0,4135254879687744","45","3","2","2"
"Kitchenham","literatur review","60","60","1704","310","0,4045307337022382","45","19","0","1"
"Kitchenham","literatur review","70","70","1704","310","0,4025886280773143","45","18","0","1"
"Kitchenham","literatur review","80","80","1704","310","0,39473329575777943","45","12","0","1"
"Kitchenham","literatur review","90","90","1704","310","0,3957391908671737","45","25","0","0"
"Kitchenham","literatur review","100","1","1704","310","0,38367855049003063","45","17","0","1"
"Kitchenham","literatur review","110","1","1704","310","0,3753908960909754","45","22","0","0"
"Kitchenham","literatur review","120","1","1704","310","0,3727667676077152","45","17","0","2"
"Kitchenham","literatur review","130","1","1704","310","0,37483331442357987","45","15","0","2"
"Kitchenham","literatur review","140","1","1704","310","0,

"Wahono","defect predict","150","2","7002","826","0,37596689941544814","62","8","2","2"
"Wahono","defect predict","160","2","7002","826","0,37728158651497273","62","1","4","4"
"Wahono","defect predict","170","2","7002","826","0,37952191107365396","62","4","2","2"
"Wahono","defect predict","180","2","7002","826","0,374854882094777","62","3","3","5"
"Wahono","defect predict","190","2","7002","826","0,369942138988379","62","5","2","3"
"Wahono","defect predict","200","2","7002","826","0,3719262120798692","62","1","2","4"
"Wahono","defect predict","10","10","7002","310","0,46064367216708824","62","3","1","2"
"Wahono","defect predict","20","20","7002","310","0,46635621361495766","62","3","5","8"
"Wahono","defect predict","30","30","7002","310","0,4560161450659498","62","23","0","0"
"Wahono","defect predict","40","40","7002","310","0,4398931115871395","62","1","3","4"
"Wahono","defect predict","50","50","7002","310","0,4274062524383242","62","3","5","9"
"Wahono","defect predict","60","60","70

In [7]:
# Persist the collected results as a CSV file in the current working directory.
output_path = os.path.join(os.getcwd(), "resultados_lda_mallet.csv")
df_results = pd.DataFrame(results)
df_results.to_csv(output_path)