<a href="https://colab.research.google.com/github/ribeiroti/tcc/blob/master/testes_lda_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementação

## Dados dos datasets

Dataset | Query | \|Dataset\| | \|Rel\| | % Rel
--- | --- | --- | --- | ---
Hall | Defect prediction | 8911 | 104 | 1.17
Radjenovic | Defect prediction metrics | 6000 | 48 | 0.80
Kitchenham | Literature review | 1704 | 45 | 2.64
Wahono | Defect prediction | 7002 | 62 | 0.89

## Bibliotecas

In [1]:
import sys
import datetime
from collections import OrderedDict
from random import seed
from random import randint
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from google.colab import drive

# Report the interpreter and library versions this notebook is running with.
for title, version in (
    ("Python version", sys.version),
    ("Gensim version", gensim.__version__),
    ("Pandas version", pd.__version__),
):
    print(title)
    print(version)


# Fetch the NLTK 'stopwords' and 'wordnet' packages at runtime.
nltk.download('stopwords')
nltk.download('wordnet')

Python version
3.6.9 (default, Apr 18 2020, 01:56:04) 
[GCC 8.4.0]
Gensim version
3.6.0
Pandas version
1.0.3
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Números randômicos

In [2]:
# Clock-based seeding is disabled (presumably so that runs are repeatable).
# seed(datetime.datetime.now().second)

# Fixed list of seeds; printed below so the values appear in the cell output.
random_numbers = [534, 424, 826, 310, 983]

# Disabled generator: would append five random integers in [0, 1000] to the list.
# for _ in range(5):
# 	value = randint(0, 1000)
# 	random_numbers.append(value)
 
print(random_numbers)

[534, 424, 826, 310, 983]


## Tratamento do dataset

In [0]:
def get_corpus_for_docs(docs):
    """
    Tokenize a collection of documents and build its bag-of-words corpus.

    Every document is converted with ``str()``, lower-cased, split into runs
    of word characters and stripped of the NLTK English stopwords. No
    rare/common-token filtering is applied to the dictionary.

    The input collection is never modified: a new list of token lists is
    built and returned.

    Parameters
    ----------
    docs : iterable
        Raw documents (list, pandas Series, ...). Non-string values are
        stringified, so ``None`` becomes the token ``'none'``.

    Returns
    -------
    tuple
        ``(corpus, dictionary, tokenized_docs)`` where ``corpus`` is the list
        of ``dictionary.doc2bow`` vectors, ``dictionary`` is the gensim
        ``Dictionary`` built from the tokens, and ``tokenized_docs`` is the
        list of token lists (one per input document, same order).
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every single token.
    stopwds = frozenset(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')

    # Build a fresh list instead of assigning back into `docs`, so the
    # caller's list / Series keeps its raw text.
    tokenized_docs = [
        [token for token in tokenizer.tokenize(str(doc).lower()) if token not in stopwds]
        for doc in docs
    ]

    # Dictionary (token <-> id mapping) of the whole collection.
    dictionary = Dictionary(tokenized_docs)

    # Bag-of-words representation of each document.
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    return corpus, dictionary, tokenized_docs

## Executa LDA e retorna TOP documentos

In [0]:
def get_top_documents(df, corpus, dictionary, query, num_topics, random_state, top_n):
    """
    Train an LDA model, rank all documents by the query's topics and score the ranking.

    Despite the name, no documents are returned: the result is a tuple of
    metrics describing where the relevant documents (``df['label'] == 'yes'``)
    fall in the ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Original dataset; row ``i`` (by position) must describe ``corpus[i]``
        and the frame must have a ``label`` column.
    corpus : list
        Bag-of-words documents built with ``dictionary.doc2bow``.
    dictionary : gensim.corpora.Dictionary
        Dictionary the corpus was built with.
    query : list of str
        Pre-processed query tokens.
    num_topics : int
        Number of LDA topics to train.
    random_state : int
        Seed forwarded to ``LdaModel``.
    top_n : int
        How many of the query's highest-probability topics are used for the
        ranking; a falsy value (``0`` / ``None``) keeps all of them.

    Returns
    -------
    tuple
        ``(qry_topics, top_n_topics, coherence, num_documents, num_relevant,
        f_distance, top_10_true, top_20_true)``:
        number of topics the model returned for the query, number of topics
        actually used, ``c_v`` coherence of the model, number of ranked
        documents, number of relevant documents in the whole ranking,
        1-based rank of the first relevant document (``0`` when there is
        none), and relevant documents among the first 10 / first 20 ranks.
    """
    # Accessing any id forces the dictionary to build its lazy id -> token map.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Train LDA model.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        alpha='auto',
        eta='auto',
        num_topics=num_topics,
        random_state=random_state
    )

    # Topic coherence (c_v), computed on texts rebuilt from the bag-of-words ids.
    texts = [[dictionary[word_id] for word_id, _freq in doc] for doc in corpus]
    cm = CoherenceModel(model=model, corpus=corpus, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()

    # Query topics sorted by descending probability, optionally cut to top_n.
    initial_query_topics = model.get_document_topics(dictionary.doc2bow(query))
    query_topics = sorted(initial_query_topics, key=lambda pair: pair[1], reverse=True)
    if top_n:
        query_topics = query_topics[:top_n]
    topic_ids = [topic_id for topic_id, _proba in query_topics]

    # Score of a document = sum of its probabilities over the query topics.
    # A preallocated array replaces growing a DataFrame one row at a time,
    # which copied the whole frame on every document.
    scores = np.zeros(len(corpus), dtype=float)
    for doc_id, bow in enumerate(corpus):
        doc_topics = dict(model[bow])
        scores[doc_id] = sum(float(doc_topics.get(topic_id, 0.0)) for topic_id in topic_ids)

    # Document ids from highest to lowest score. The stable sort makes ties
    # deterministic: tied documents keep their original corpus order.
    ids = np.argsort(-scores, kind='stable')

    # Relevance flags in ranking order (positional lookup, like df.iloc).
    relevant_in_rank_order = df['label'].eq('yes').to_numpy()[ids]

    num_relevant = int(relevant_in_rank_order.sum())
    top_10_true = int(relevant_in_rank_order[:10].sum())
    top_20_true = int(relevant_in_rank_order[:20].sum())

    # How many documents must be reviewed until the first relevant one shows up.
    hit_positions = np.flatnonzero(relevant_in_rank_order)
    f_distance = int(hit_positions[0]) + 1 if hit_positions.size else 0

    top_n_topics = len(query_topics)

    return len(initial_query_topics), top_n_topics, coherence, int(ids.size), num_relevant, f_distance, top_10_true, top_20_true

## Combinações de teste

In [0]:
# Experiment grid: datasets (name -> search query), LDA topic counts and
# topic-selection modes.
DATABASES = {
    'Hall': 'Defect prediction',
    'Radjenovic': 'Defect prediction metrics',
    'Kitchenham': 'Literature review',
    'Wahono': 'Defect prediction',
}
NUM_TOPICS_FOR_TEST = range(10, 201, 10)
# Passed as `top_n` to get_top_documents: 0 keeps every query topic,
# 1 keeps only the highest-probability one.
FIXED_OPTIONS = [0, 1]

# Column-oriented store for the results: one list per column, one entry per run.
results = {
    column: []
    for column in (
        'database',         # dataset name
        'query',            # pre-processed query text
        'num_topics',       # number of topics the LDA model was trained with
        'qry_topics',       # number of topics the model returned for the query
        'top_n_topics',     # number of query topics used to rank the documents
        'fixed_n_topics',   # FIXED_OPTIONS value used for the run
        'num_documents',    # total number of ranked documents
        'rand_seed',        # random seed of the LDA model
        'coherence',        # topic coherence of the model on the collection
        'num_relevant',     # relevant documents found in the whole ranking
        'f_distance',       # rank of the first relevant document
        'top_10_true',      # relevant documents in the top 10
        'top_20_true',      # relevant documents in the top 20
    )
}

# Tests

In [6]:
# Start from empty columns so that re-running this cell (for instance after an
# interrupted run) does not append duplicate rows to `results`.
for column in results.values():
    column.clear()

for dbname, raw_query in DATABASES.items():
    path = 'https://github.com/fastread/src/raw/master/workspace/data/{}.csv'.format(dbname)

    # Load the dataset.
    df = pd.read_csv(path, header=0, encoding='latin-1')

    N_DOCS = df.shape[0]  # no limit on the number of documents

    # fillna('') first: str + NaN is NaN, so a missing abstract (or title)
    # would otherwise wipe the whole document and leave only the token 'nan'.
    # A plain list is handed to the preprocessing, which indexes documents
    # by position.
    base_docs = (df['Document Title'].fillna('') + ' ' + df['Abstract'].fillna('')).tolist()

    corpus, dictionary, docs = get_corpus_for_docs(base_docs)
    # The query goes through the same preprocessing as the collection.
    query = get_corpus_for_docs([raw_query])[2][0]
    query_text = ' '.join(query)
    column_width = len(query_text)

    header = ' | '.join([key.ljust(column_width) for key in results.keys()])
    print(header)
    print('='*len(header))

    for fixed in FIXED_OPTIONS:
        for rand_seed in random_numbers:
            for num_topics in NUM_TOPICS_FOR_TEST:
                (qry_topics, top_n_topics, coherence, num_documents, num_relevant,
                 f_distance, top_10_true, top_20_true) = get_top_documents(
                    df, corpus, dictionary, query, num_topics, rand_seed, fixed)

                row = {
                    'database': dbname,
                    'query': query_text,
                    'num_topics': num_topics,
                    'qry_topics': qry_topics,
                    'top_n_topics': top_n_topics,
                    'fixed_n_topics': fixed,
                    'num_documents': num_documents,
                    'rand_seed': rand_seed,
                    'coherence': coherence,
                    'num_relevant': num_relevant,
                    'f_distance': f_distance,
                    'top_10_true': top_10_true,
                    'top_20_true': top_20_true,
                }

                # Iterate over the keys of `results` so the printed columns and
                # the stored columns always follow the header; a column missing
                # from `row` fails loudly instead of leaving ragged lists.
                values = [
                    '%.6f' % row[key] if key == 'coherence' else str(row[key])
                    for key in results
                ]
                print(' | '.join([value.ljust(column_width) for value in values]))
                for key in results:
                    results[key].append(row[key])

database          | query             | num_topics        | qry_topics        | top_n_topics      | fixed_n_topics    | num_documents     | rand_seed         | coherence         | num_relevant      | f_distance        | top_10_true       | top_20_true      
Hall              | defect prediction | 10                | 10                | 10                | 0                 | 8911              | 534               | 0.463760          | 104               | 217               | 0                 | 0                


KeyboardInterrupt: ignored

In [0]:
# Mount Google Drive under ./drive (remounting if it is already mounted).
drive.mount('drive', force_remount=True)

# Persist the collected results as a CSV file inside the mounted Drive.
results_csv_path = 'drive/My Drive/Pureza Tópicos/resultados_lda_final.csv'
df_results = pd.DataFrame(results)
df_results.to_csv(results_csv_path)