<a href="https://colab.research.google.com/github/ribeiroti/tcc/blob/master/pureza_topicos_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementação

## Dados dos datasets

Dataset | Query | Docs | Rel | % Rel
--- | --- | --- | --- | ---
Hall | Defect prediction | 8911 | 104 | 1.17
Radjenovic | Defect prediction metrics | 6000 | 48 | 0.80
Kitchenham | Literature review | 1704 | 45 | 2.64
Wahono | Defect prediction | 7002 | 62 | 0.89

## Bibliotecas

In [0]:
import sys
import datetime
from collections import OrderedDict
from random import seed
from random import randint
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from google.colab import drive

# Report the interpreter and library versions in use, one label line followed
# by one value line each.
for label, version in (
    ("Python version", sys.version),
    ("Gensim version", gensim.__version__),
    ("Pandas version", pd.__version__),
):
    print(label)
    print(version)

# Fetch the NLTK stopword corpus read later via stopwords.words('english').
nltk.download('stopwords')

## Números randômicos

In [0]:
# Fixed seeds passed to the LDA model as random_state, one run per seed.
# Presumably they were drawn once with the disabled snippet below and then
# frozen so every execution uses the same seeds.
#
# seed(datetime.datetime.now().second)
# for _ in range(5):
#     random_numbers.append(randint(0, 1000))
random_numbers = [
    534,
    424,
    826,
    310,
    983,
]

print(random_numbers)

## Tratamento do dataset

In [0]:
def get_corpus_for_docs(docs):
    """
    Tokenize documents and build their gensim dictionary and bag-of-words corpus.

    Each document is converted with ``str()``, lower-cased, split on runs of
    word characters (``\\w+``) and stripped of English NLTK stopwords. No
    frequency-based filtering (rare/common tokens) is applied.

    The input is only read, never modified, so a list or a pandas Series with
    any index can be passed safely.

    Args:
        docs: Iterable of documents. Non-string items are stringified first,
            so a missing value such as NaN becomes the single token "nan".

    Returns:
        A tuple ``(corpus, dictionary, tokenized_docs)``:
        corpus is a list with one ``dictionary.doc2bow`` result per document,
        dictionary is the gensim ``Dictionary`` built from the tokens, and
        tokenized_docs is the list of token lists, in input order.
    """
    # A set gives O(1) membership tests; the NLTK list would be scanned
    # once per token.
    stop_words = set(stopwords.words('english'))

    tokenizer = RegexpTokenizer(r'\w+')

    # Build new token lists instead of overwriting the caller's container.
    tokenized_docs = [
        [
            token
            for token in tokenizer.tokenize(str(doc).lower())
            if token not in stop_words
        ]
        for doc in docs
    ]

    # Map every distinct token to an integer id.
    dictionary = Dictionary(tokenized_docs)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    return corpus, dictionary, tokenized_docs

## Executa LDA e contabiliza documentos por tópico

In [0]:
def get_top_documents(df, corpus, dictionary, query, num_topics, random_state):
    """
    Train an LDA model and count total and relevant documents per topic.

    Despite its name, this function does not rank or return documents. For
    every document it asks the trained model for the document's topics
    (``model.get_document_topics``) and, for each topic reported, adds one to
    that topic's document count and, when the document is labelled relevant,
    one to that topic's relevant count.

    Args:
        df: DataFrame with a 'label' column whose i-th row describes
            ``corpus[i]``. A document is relevant when its label is 'yes'.
        corpus: Bag-of-words documents used to train the model.
        dictionary: gensim ``Dictionary`` that produced ``corpus``.
        query: Unused; kept so existing callers keep working.
        num_topics: Number of LDA topics to train.
        random_state: Seed forwarded to ``LdaModel``.

    Returns:
        A tuple ``(total_docs, relevant_docs)`` of plain dicts keyed by topic
        id. Both dicts share the same keys; topics never reported for any
        document are absent.
    """
    # Accessing one item forces the lazily-built id2token mapping to load.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Train LDA model.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        alpha='auto',
        eta='auto',
        num_topics=num_topics,
        random_state=random_state
    )

    # Evaluate the relevance labels once, by position, instead of building a
    # row Series with df.iloc for every (document, topic) pair.
    is_relevant = (df['label'] == 'yes').tolist()

    total_docs = {}
    relevant_docs = {}

    # Group the documents by every topic the model assigns to them.
    for doc_id, doc in enumerate(corpus):
        relevant = int(is_relevant[doc_id])
        for topic_id, _probability in model.get_document_topics(doc):
            total_docs[topic_id] = total_docs.get(topic_id, 0) + 1
            # Always create the key, even when the increment is zero, so both
            # dicts expose the same topics.
            relevant_docs[topic_id] = relevant_docs.get(topic_id, 0) + relevant

    return total_docs, relevant_docs

## Combinações de teste

In [0]:
# Experiment grid: datasets (name -> search query), topic counts to try,
# and fixed options.
DATABASES = {
    'Hall': 'Defect prediction',
    'Radjenovic': 'Defect prediction metrics',
    'Kitchenham': 'Literature review',
    'Wahono': 'Defect prediction',
}
# 10, 20, ..., 200 topics.
NUM_TOPICS_FOR_TEST = range(10, 201, 10)
FIXED_OPTIONS = [0, 1]

# Tests

In [0]:
# Mount Google Drive once, before any model is trained: a mount failure then
# surfaces immediately instead of after the first full sweep, and the drive
# is not remounted for every seed.
drive.mount('drive', force_remount=True)

# One result column per possible topic id (0 .. largest topic count - 1).
topic_columns = range(max(NUM_TOPICS_FOR_TEST))

for dbname, query in DATABASES.items():
    path = 'https://github.com/fastread/src/raw/master/workspace/data/{}.csv'.format(dbname)

    # Load the dataset.
    df = pd.read_csv(path, header=0, encoding='latin-1')

    # NaN + str is NaN, so a missing title or abstract would otherwise wipe
    # out the whole document (it would later be tokenized as just "nan").
    base_docs = df['Document Title'].fillna('') + ' ' + df['Abstract'].fillna('')

    corpus, dictionary, docs = get_corpus_for_docs(base_docs)
    # The query receives the same preprocessing as the collection.
    _, _, tmp_query = get_corpus_for_docs([query])
    query = tmp_query[0]

    for rand_seed in random_numbers:
        # One {topic id: count} dict per tested number of topics.
        total_rows = []
        relevant_rows = []

        for num_topics in NUM_TOPICS_FOR_TEST:
            print(num_topics, dbname, rand_seed)
            d, r = get_top_documents(df, corpus, dictionary, query, num_topics, rand_seed)
            total_rows.append(d)
            relevant_rows.append(r)

        # Build each frame in one step (DataFrame.append no longer exists in
        # pandas >= 2.0), then transpose so rows are topic ids and columns
        # are the tested topic counts.
        total_docs = pd.DataFrame(
            total_rows, columns=topic_columns, index=NUM_TOPICS_FOR_TEST
        ).T
        relevant_docs = pd.DataFrame(
            relevant_rows, columns=topic_columns, index=NUM_TOPICS_FOR_TEST
        ).T

        total_docs.to_csv('drive/My Drive/Pureza Tópicos/{}_{}_{}.csv'.format('total_docs', dbname, rand_seed))
        relevant_docs.to_csv('drive/My Drive/Pureza Tópicos/{}_{}_{}.csv'.format('relevant_docs', dbname, rand_seed))