# Preparación del Dataset
---

- Se obtienen los textos directamente del dataset, se preprocesan y luego se persisten.

In [1]:
import pandas as pd
import os
from document import Document
import util.log as log
from multiprocessing import Process
from multiprocessing import Manager
from multiprocessing import Pool
import pandas as pd
from datetime import datetime
log.init_logger()
from util.exceptions import InvalidDocument

In [2]:
# Preprocessing steps handed to Document for every labelled file.
# NOTE(review): presumably each entry names a Document attribute to compute
# up front -- confirm against the Document class.
preprocess_list = [
    'lemmatized_string',
    'sentences',
    'preprocessed_sentences',
    'paragraphs',
    'preprocessed_paragraphs',
    'named_entities',
    'urls',
]

In [3]:
def process_file(file):
    """Preprocess one labelled document and return it as a dataset row.

    The file name is expected to have the form ``'<topic> $ <title>'``. The
    document is loaded from the module-level ``directory`` and built with the
    module-level ``preprocess_list``.

    Args:
        file: File name (not a full path) of a document inside ``directory``.

    Returns:
        A 23-element list: path, title, raw string, word count, type count,
        sentences, lemmatized/stemmed/simple-preprocessed strings, topic,
        named entities, the bigram/trigram variants, preprocessed sentences,
        paragraphs, preprocessed paragraphs and URLs.
        An empty list when the file name lacks the ``' $ '`` separator or
        when ``InvalidDocument`` is raised while reading the document.
    """
    log.info('Processing file: {}'.format(file))

    # Check the name before doing any work: an unexpected file in the folder
    # would otherwise raise IndexError inside a worker and abort the whole
    # Pool.map call instead of being counted as a failed document.
    name_parts = file.split(' $ ')
    if len(name_parts) < 2:
        log.info("Skipping file without ' $ ' separator: {}".format(file))
        return []
    topic = name_parts[0]
    title = name_parts[1]

    # os.path.join also works when `directory` has no trailing slash.
    path = os.path.join(directory, file)
    try:
        document = Document(path=path, preprocess_list=preprocess_list)
        # Attribute access stays inside the try block in case Document
        # computes any of these lazily and raises InvalidDocument then.
        return [
            path, title, document.string, document.word_count(),
            document.type_count(), document.sentences,
            document.lemmatized_string, document.stemmed_string,
            document.simple_preprocessed_string, topic,
            document.named_entities, document.bigrams, document.trigrams,
            document.lemmatized_bigrams, document.lemmatized_trigrams,
            document.stemmed_bigrams, document.stemmed_trigrams,
            document.simple_preprocessed_bigrams,
            document.simple_preprocessed_trigrams,
            document.preprocessed_sentences, document.paragraphs,
            document.preprocessed_paragraphs, document.urls,
        ]
    except InvalidDocument:
        # Leave a trace so failed documents can be identified afterwards.
        log.info('Invalid document, skipped: {}'.format(file))
        return []

In [4]:
# Run configuration for the labelled-dataset pass.
directory = '../labeled_dataset/'
number_of_cores_to_use = 10
corpora = []
dataframe_dataset = None

init_time = datetime.now()
files = os.listdir(directory)

# One task per file; process_file returns a row, or [] for a failed document.
with Pool(processes=number_of_cores_to_use) as pool:
    corpora = pool.map(process_file, files)

# Report totals, counting the [] placeholders as errored documents.
time = datetime.now() - init_time
log.info(
    '{} documents were processed. {} documents errored. Total time used: {}, Total cores used: {}'.format(
        len(corpora),
        corpora.count([]),
        str(time),
        number_of_cores_to_use,
    )
)

In [5]:
# Build the dataset frame from the rows produced by process_file.
# process_file returns [] for documents it could not process. Those
# placeholders are dropped here: they would otherwise become all-NaN rows,
# and if every document failed the constructor would raise because the data
# has 0 columns while 23 are declared.
dataframe_dataset = pd.DataFrame(
    data=[row for row in corpora if row],
    columns=[
        'original_path', 'title', 'string', 'word_count', 'type_count',
        'sentences', 'lemmatized_string', 'stemmed_string',
        'simple_preprocessed_string', 'topic', 'named_entities',
        'tokens_bigrams', 'tokens_trigrams', 'lemmatized_bigrams',
        'lemmatized_trigrams', 'stemmed_bigrams', 'stemmed_trigrams',
        'simple_preprocessed_bigrams', 'simple_preprocessed_trigrams',
        'preprocessed_sentences', 'paragraphs', 'preprocessed_paragraphs',
        'urls',
    ],
)

In [6]:
# Persist the processed dataset (the DataFrame index is written as well).
dataframe_dataset.to_csv(path_or_buf='../data/dataset.csv')

In [7]:
# Bare expression: the notebook renders the frame as the cell output.
dataframe_dataset

Unnamed: 0,original_path,title,string,word_count,type_count,sentences,lemmatized_string,stemmed_string,simple_preprocessed_string,topic,...,lemmatized_bigrams,lemmatized_trigrams,stemmed_bigrams,stemmed_trigrams,simple_preprocessed_bigrams,simple_preprocessed_trigrams,preprocessed_sentences,paragraphs,preprocessed_paragraphs,urls
0,../labeled_dataset/La larga cola $ TP2 - La La...,TP2 - La Larga Cola de Chris Anderson corto - ...,Universidad Tecnológica Nacional Facultad Reg...,1234,477,[ Universidad Tecnológica Nacional Facultad Re...,,,,La larga cola,...,,,,,,,,[ Universidad Tecnológica Nacional Facultad Re...,[sistema curso cuat marketing nuevo dr ayudant...,[]
1,../labeled_dataset/Sistemas emergentes $ TP_6_...,TP_6_Weiss_Gonzalo.pdf,TRABAJO PRÁCTICO N°6 Curso: K5052 Profesor: Al...,480,231,[TRABAJO PRÁCTICO N°6 Curso: K5052 Profesor: A...,,,,Sistemas emergentes,...,,,,,,,,[TRABAJO PRÁCTICO N°6 Curso: K5052 Profesor: A...,[trabajo curso fecha entregar nombre apellido ...,[]
2,../labeled_dataset/La sociedad de costo margin...,TP N° 5 – La sociedad de costo marginal cero -...,MARKETING EN INTERNET Y NUEVA ECONOMÍA Cátedra...,1031,444,[MARKETING EN INTERNET Y NUEVA ECONOMÍA Cátedr...,,,,La sociedad de costo marginal cero,...,,,,,,,,[MARKETING EN INTERNET Y NUEVA ECONOMÍA Cátedr...,[marketing nuevo ayudante maximiliano bracho t...,[]
3,../labeled_dataset/Wikinomics $ TP1-Franco Zan...,TP1-Franco Zanette.docx,Alumno: Franco Zanette (147. 074-7)Institución...,1180,472,"[Alumno: Franco Zanette (147., 074-7)Instituci...",,,,Wikinomics,...,,,,,,,,"[Alumno: Franco Zanette (147.074-7), Instituci...","[, frba, tp tp, fecha, , concepto, , concepto ...",[]
4,../labeled_dataset/Economia de experiencia $ T...,TP 3 (1).docx,TP 3¿Qué 3 elementos hacen resurgir con fuerza...,590,309,[TP 3¿Qué 3 elementos hacen resurgir con fuerz...,,,,Economia de experiencia,...,,,,,,,,"[TP 3, ¿Qué 3 elementos hacen resurgir con fue...","[tp, elemento hacer resurgir fuerza idea exper...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,../labeled_dataset/La sociedad de costo margin...,TP5-Gariglio.docx,PREGUNTAS TP5 - (Test de lectura sobre capítul...,698,330,[PREGUNTAS TP5 - (Test de lectura sobre capítu...,,,,La sociedad de costo marginal cero,...,,,,,,,,[PREGUNTAS TP5 - (Test de lectura sobre capítu...,"[pregunta lectura, , sociedad costo marginal c...",[]
302,../labeled_dataset/La sociedad de costo margin...,TP5 - La sociedad de MG Cero - Rifkin - Santia...,curso: K5071 - 2 Cuat. 2016Marketing en Intern...,745,336,"[curso: K5071 - 2 Cuat., 2016Marketing en Inte...",,,,La sociedad de costo marginal cero,...,,,,,,,,"[curso: K5071 - 2 Cuat. 2016, Marketing en Int...","[curso cuat, marketing nuevo, , dr, , ayudante...",[]
303,../labeled_dataset/Sistemas emergentes $ TP6 ...,TP6 - Sistemas emergentes - Joel Melamed.docx,"MarketingProfesor: Alejandro Prince, Hernan Bo...",434,222,"[MarketingProfesor: Alejandro Prince, Hernan B...",,,,Sistemas emergentes,...,,,,,,,,"[, , Marketing, Profesor: Alejandro Prince, He...","[, , marketing, , trabajo, , , , fecha entrega...",[]
304,../labeled_dataset/Economia de experiencia $ T...,TP 3 - Economía de Experiencia - Andrés Basso ...,The experience economy - Joseph PINE II y Jame...,771,380,[The experience economy - Joseph PINE II y Jam...,,,,Economia de experiencia,...,,,,,,,,[The experience economy - Joseph PINE II y Jam...,"[experience ii, pregunta trabajo, nota requeri...",[]


# Preparación de Libros

In [8]:
# Preprocessing steps handed to Document for every book; this rebinding
# replaces the list used for the labelled dataset earlier in the notebook.
# NOTE(review): presumably each entry names a Document attribute to compute
# up front -- confirm against the Document class.
preprocess_list = [
    'lemmatized_string',
    'sentences',
    'preprocessed_sentences',
    'paragraphs',
    'preprocessed_paragraphs',
]

In [9]:
def process_book(file):
    """Preprocess one book file and return it as a row for the books frame.

    The file name is expected to have the form ``'<topic> $ <title>'``. The
    book is loaded from the module-level ``directory`` and built with the
    module-level ``preprocess_list``.

    Args:
        file: File name (not a full path) of a book inside ``directory``.

    Returns:
        A 23-element list: path, title, raw string, word count, type count,
        sentences, lemmatized/stemmed/simple-preprocessed strings, topic,
        named entities, the bigram/trigram variants, preprocessed sentences,
        paragraphs, preprocessed paragraphs and URLs.
        An empty list when the file name lacks the ``' $ '`` separator or
        when ``InvalidDocument`` is raised while reading the book.
    """
    log.info('Processing file: {}'.format(file))

    # Check the name before doing any work: an unexpected file in the folder
    # would otherwise raise IndexError inside a worker and abort the whole
    # Pool.map call instead of being counted as a failed document.
    name_parts = file.split(' $ ')
    if len(name_parts) < 2:
        log.info("Skipping file without ' $ ' separator: {}".format(file))
        return []
    topic = name_parts[0]
    title = name_parts[1]

    # os.path.join also works when `directory` has no trailing slash.
    path = os.path.join(directory, file)
    try:
        document = Document(path=path, preprocess_list=preprocess_list)
        # Attribute access stays inside the try block in case Document
        # computes any of these lazily and raises InvalidDocument then.
        return [
            path, title, document.string, document.word_count(),
            document.type_count(), document.sentences,
            document.lemmatized_string, document.stemmed_string,
            document.simple_preprocessed_string, topic,
            document.named_entities, document.bigrams, document.trigrams,
            document.lemmatized_bigrams, document.lemmatized_trigrams,
            document.stemmed_bigrams, document.stemmed_trigrams,
            document.simple_preprocessed_bigrams,
            document.simple_preprocessed_trigrams,
            document.preprocessed_sentences, document.paragraphs,
            document.preprocessed_paragraphs, document.urls,
        ]
    except InvalidDocument:
        # Leave a trace so failed books can be identified afterwards.
        log.info('Invalid document, skipped: {}'.format(file))
        return []

In [10]:
# Run configuration for the books pass.
books = []
init_time = datetime.now()
directory = '../books/'
files = os.listdir(directory)
number_of_cores_to_use = 10
dataframe_books = None


# Use the book-specific worker defined for this section; the cell previously
# mapped process_file, leaving process_book unused.
with Pool(number_of_cores_to_use) as pool:
    books = pool.map(process_book, files)

# Report totals; [] entries are the files process_book could not process.
time = datetime.now() - init_time
log.info('{} documents were processed. {} documents errored. Total time used: {}, Total cores used: {}'.format(len(books), books.count([]), \
                                                                                                          str(time), number_of_cores_to_use))

In [14]:
# Build the books frame from the rows collected in `books`.
# Failed files are represented by [] in `books`. Those placeholders are
# dropped here: they would otherwise become all-NaN rows, and if every file
# failed the constructor would raise because the data has 0 columns while 23
# are declared.
# NOTE(review): several column names differ from the dataset frame
# ('document_title' vs 'title', 'lemmatized_text' vs 'lemmatized_string',
# ...). They are kept as-is because consumers of books.csv may rely on them.
dataframe_books = pd.DataFrame(
    data=[row for row in books if row],
    columns=[
        'original_path', 'document_title', 'string', 'word_count',
        'type_count', 'sentences', 'lemmatized_text', 'stemmed_text',
        'simple_preprocessed', 'topic', 'named_entities', 'tokens_bigrams',
        'tokens_trigrams', 'lemmatized_bigrams', 'lemmatized_trigrams',
        'stemmed_bigrams', 'stemmed_trigrams', 'simple_preprocessed_bigrams',
        'simple_preprocessed_trigrams', 'preprocessed_sentences',
        'paragraphs', 'preprocessed_paragraphs', 'urls',
    ],
)

In [15]:
# Persist the processed books (the DataFrame index is written as well).
dataframe_books.to_csv(path_or_buf='../data/books.csv')

In [None]:
# Bare expression: the notebook renders the frame as the cell output.
dataframe_books