# Limpieza de los datos

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path
import re
import nltk
nltk.download(['punkt','stopwords','wordnet'])
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
#import unidecode

[nltk_data] Downloading package punkt to /home/cmejia3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cmejia3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cmejia3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk import *
from nltk.corpus import *

In [3]:
# Root of the data tree: the current user's home directory, fully resolved.
base_path = Path("~").expanduser().resolve()
# Alternative root: the current working directory.
#base_path = Path.cwd().expanduser().resolve()

# Directory holding the input documents.
input_file_path = base_path.joinpath('datasets/salidas/')
# Directory for the processed documents.
datasetOut = base_path.joinpath("datasets/salidas_procesamiento/")
# Directory for the per-document frequency files.
datasetOut_freq = base_path.joinpath("datasets/salidas_freq/")

In [4]:
input_file_path

PosixPath('/home/cmejia3/datasets/salidas')

In [5]:
def lang_ratio(input):
    """Count, per language, how many distinct stop words occur in the text.

    Despite the name, the values are absolute counts, not ratios.

    Parameters
    ----------
    input : str
        Raw text to analyse.

    Returns
    -------
    dict
        Maps every language in ``stopwords.fileids()`` to the number of
        distinct lower-cased tokens of ``input`` that appear in that
        language's stop-word list.
    """
    # The token set is the same for every language, so build it once
    # instead of once per loop iteration.
    word_set = {token.lower() for token in wordpunct_tokenize(input)}

    counts = {}
    for language in stopwords.fileids():
        counts[language] = len(word_set.intersection(stopwords.words(language)))
    return counts

def detect_language(input):
    """Return the language with the highest stop-word count for the text.

    The scores come from ``lang_ratio``; when several languages share the
    top score, the first one in the score dictionary wins.
    """
    scores = lang_ratio(input)
    return max(scores, key=scores.get)

Conjunto de las stop words que serán eliminadas ya que no aportan valor:

In [6]:
stopWords = stopwords.words('english')

Palabras que consideramos stop words y que no están incluidas en el conjunto descargado:

In [7]:
listoStopWords = ['www','https','html','figure', 'chapter']

Se añaden las palabras que consideramos al conjunto principal

In [8]:
stopWords.extend(listoStopWords)

### Función principal

In [9]:
def clean_files(texto,stopWords):
    """Clean one document and return its normalised text, vocabulary and term counts.

    Processing order: every run of characters other than ASCII letters and
    digits becomes a single space, the text is lower-cased and split on
    whitespace, tokens are filtered, Porter-stemmed and finally lemmatized
    as verbs with WordNet.

    A token survives the filter when it is longer than one character, is not
    in ``stopWords``, and is either purely alphabetic or a number between 1800
    and the current year (so that years mentioned in the documents are kept).

    Parameters
    ----------
    texto : str
        Raw document text.
    stopWords : iterable of str
        Tokens to discard; compared against lower-cased tokens before stemming.

    Returns
    -------
    tuple
        ``(text, vocabulary, freq)``: the processed tokens joined by single
        spaces, the set of distinct processed tokens, and an
        ``nltk.FreqDist`` counting them.
    """
    # Membership is checked once per token; a set makes each check O(1)
    # instead of scanning the whole stop-word list.
    stop_set = set(stopWords)

    # Replace every run of non-alphanumeric ASCII characters with a space.
    # Accented letters are outside this range, so they are replaced too.
    texto = re.sub('[^A-Za-z0-9]+',' ',texto)

    # Lower-case and tokenize on whitespace.
    tokens = texto.lower().split()

    # Upper bound of the numeric tokens that are kept as years.
    currentYear = dt.datetime.now().year

    def _keep(w):
        # Drop one-character tokens and stop words.
        if len(w) <= 1 or w in stop_set:
            return False
        # Keep purely alphabetic tokens.
        if w.isalpha():
            return True
        # Keep numbers only when they fall in the accepted year range.
        return w.isnumeric() and 1800 <= int(w) <= currentYear

    tokens = [w for w in tokens if _keep(w)]

    # Stem first, then lemmatize the stem as a verb.
    ps = PorterStemmer()
    word_net_lemmatizar = WordNetLemmatizer()
    tokens = [word_net_lemmatizar.lemmatize(ps.stem(w), pos = "v") for w in tokens]

    # Processed text as a single string.
    to_return = ' '.join(tokens)

    # Vocabulary of this document.
    set_words = set(tokens)

    # Frequency of each processed token.
    freq = nltk.FreqDist(tokens)
    return to_return,set_words,freq

In [10]:
import csv

def saving_freq(fd,fileOut,cant):
    """Write the ``cant`` most common entries of ``fd`` to a CSV file.

    Parameters
    ----------
    fd : object with a ``most_common(n)`` method
        Frequency distribution, e.g. ``nltk.FreqDist`` or ``collections.Counter``.
    fileOut : str or path-like
        Destination file; overwritten if it already exists.
    cant : int
        Number of entries to write.

    Each row holds ``key, count``, most frequent first.
    """
    # newline='' is what the csv module requires to avoid blank rows on
    # some platforms; the context manager guarantees the file is flushed
    # and closed.
    with open(fileOut, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        for key, count in fd.most_common(cant):
            writer.writerow([key, count])

Inicialización de los conjuntos:

    - vocabulary: el conjunto de todas las palabras que contienen los documentos
    - results_text: la lista con los documentos ya procesados, usada para construir el bag of words
    - results_frecuency: para cada documento, las palabras que contiene y cuántas veces aparece cada una

In [11]:
vocabulary = set()
results_text = []
results_frecuency = []

In [12]:
indexFiles = []

# Make sure both output directories exist before writing into them.
datasetOut.mkdir(parents=True, exist_ok=True)
datasetOut_freq.mkdir(parents=True, exist_ok=True)

# Sorted so the document order (results_text / indexFiles) is the same on
# every run; glob() alone gives no ordering guarantee.
for f in sorted(input_file_path.glob('*.txt')):
    # The context manager closes the handle even if reading fails.
    with open(f, "r", encoding = 'utf-8') as input_file:
        texto = input_file.read()

    aux = detect_language(texto)
    if aux != 'english':
        # Non-English documents are reported and skipped.
        print(aux + ':' + str(f))
        continue

    text_cleanned,set_words,freq = clean_files(texto,stopWords)

    # f.stem removes only the final extension, so '1508.02340.txt' becomes
    # '1508.02340_freq.csv'. Splitting on the first '.' gave '1508_freq.csv',
    # which made documents sharing a prefix overwrite each other's file.
    path_freq = datasetOut_freq / (f.stem + '_freq.csv')
    saving_freq(freq,path_freq,50)

    # Write the cleaned text under the same file name as the input.
    out2 = datasetOut / f.name
    with open(out2, 'w', encoding='UTF-8') as outputFile2:
        outputFile2.write(text_cleanned)

    # In-place update avoids copying the whole vocabulary for every document.
    vocabulary.update(set_words)
    results_text.append(text_cleanned)
    # Per-document frequencies are only written to CSV above; they are not
    # accumulated in results_frecuency.
    indexFiles.append(str(out2))

german:/home/cmejia3/datasets/salidas/1508.02340.txt


In [13]:
len(vocabulary)

87073

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import vstack,save_npz,load_npz

Una vez construido el vocabulario, podemos construir el bag of words con la ayuda de la clase CountVectorizer de scikit-learn.

In [15]:
# Bag-of-words vectorizer over the fixed corpus vocabulary.
# - The vocabulary is passed as a sorted list so that column i is always the
#   i-th term in alphabetical order (scikit-learn sorts a set itself; doing
#   it here makes the column layout explicit).
# - max_features is not passed: scikit-learn ignores it whenever
#   `vocabulary` is supplied, so the matrix has one column per vocabulary
#   term regardless of that value.
# NOTE(review): stop_words='english' applies scikit-learn's own English list
# while tokenizing, so vocabulary terms that are also in that list presumably
# always count as zero - confirm this is intended.
vectorizer = CountVectorizer(
    analyzer="word",
    vocabulary=sorted(vocabulary),
    stop_words='english',
)

In [16]:
# Vectorize the first cleaned document; the result is a one-row matrix.
# The context manager closes the file as soon as it has been read.
with open(indexFiles[0], "r", encoding = 'utf-8') as input_file:
    texto = input_file.read()
train_data_features = vectorizer.fit_transform([texto])

In [17]:
# Vectorize the remaining documents and stack them under the existing rows.
# Rows are collected in a list and stacked once: calling vstack inside the
# loop re-copies the whole accumulated matrix for every document.
rows = [train_data_features]
for path in indexFiles[1:]:
    # The context manager closes each file as soon as it has been read.
    with open(path, "r", encoding = 'utf-8') as input_file:
        texto = input_file.read()
    # The vocabulary is fixed, so transform() produces the same counts as
    # fit_transform() without refitting on every document.
    rows.append(vectorizer.transform([texto]))
train_data_features = vstack(rows)

In [18]:
save_npz('sparse_matrix.npz', train_data_features)
#sparse_matrix = load_npz('sparse_matrix.npz')

In [19]:
vectorizer.vocabulary_

{'a': 0,
 'aa': 1,
 'aaa': 2,
 'aaaa': 3,
 'aaaaa': 4,
 'aaaaaa': 5,
 'aaaaaaa': 6,
 'aaaaaaaaaaaaaaaababa': 7,
 'aaaab': 8,
 'aaaababa': 9,
 'aaab': 10,
 'aaabbbbaaccccaaaacbbbcccbb': 11,
 'aaai': 12,
 'aaaiaaaipaperview': 13,
 'aaaibarri': 14,
 'aaaimit': 15,
 'aaak': 16,
 'aab': 17,
 'aaba': 18,
 'aabaabc': 19,
 'aabab': 20,
 'aababa': 21,
 'aababaababb': 22,
 'aabb': 23,
 'aabbb': 24,
 'aabc': 25,
 'aabcdaccaac': 26,
 'aabort': 27,
 'aacccgctcgt': 28,
 'aachen': 29,
 'aacn': 30,
 'aad': 31,
 'aadbk': 32,
 'aadom': 33,
 'aaecc': 34,
 'aaeccdoi': 35,
 'aaem': 36,
 'aaf': 37,
 'aag': 38,
 'aagaard': 39,
 'aagea': 40,
 'aah': 41,
 'aai': 42,
 'aaia': 43,
 'aaihaa': 44,
 'aain': 45,
 'aaini': 46,
 'aaixj': 47,
 'aaj': 48,
 'aajbb': 49,
 'aak': 50,
 'aakv': 51,
 'aalbersberg': 52,
 'aalborg': 53,
 'aalg': 54,
 'aall': 55,
 'aalto': 56,
 'aam': 57,
 'aama': 58,
 'aamarkov': 59,
 'aamodt': 60,
 'aan': 61,
 'aana': 62,
 'aanderaa': 63,
 'aank': 64,
 'aanstad': 65,
 'aanund': 66,
 'aar': 67,

In [20]:
# Write the vocabulary to disk, one term per line, in column order.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

with open('vocabulary.txt', 'w', encoding='utf-8') as f:
    for item in feature_names:
        f.write("%s\n" % item)

In [21]:
import pickle

# Persist the vectorizer so it can be reused without rebuilding it.
# The context manager makes sure the file is flushed and closed.
filename = 'BoW1.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(vectorizer, model_file)

# To restore it later (only from a trusted file: unpickling can execute
# arbitrary code):
#with open('BoW1.sav', 'rb') as model_file:
#    loaded_model = pickle.load(model_file)

In [24]:
print(train_data_features)

  (0, 300)	1
  (0, 305)	3
  (0, 306)	2
  (0, 310)	1
  (0, 333)	1
  (0, 430)	1
  (0, 477)	1
  (0, 480)	2
  (0, 563)	2
  (0, 628)	2
  (0, 725)	1
  (0, 806)	1
  (0, 826)	1
  (0, 912)	1
  (0, 1997)	1
  (0, 2058)	1
  (0, 2101)	1
  (0, 2430)	1
  (0, 2696)	1
  (0, 2743)	6
  (0, 2876)	1
  (0, 3076)	1
  (0, 3105)	1
  (0, 3108)	2
  (0, 3164)	2
  :	:
  (978, 79743)	1
  (978, 79803)	11
  (978, 79982)	1
  (978, 80292)	24
  (978, 80335)	1
  (978, 80418)	1
  (978, 80422)	11
  (978, 80438)	3
  (978, 80708)	1
  (978, 80973)	1
  (978, 81000)	1
  (978, 81125)	1
  (978, 81285)	1
  (978, 82345)	1
  (978, 82397)	3
  (978, 82804)	1
  (978, 82885)	1
  (978, 83010)	1
  (978, 83308)	5
  (978, 83651)	1
  (978, 83694)	1
  (978, 83843)	1
  (978, 84533)	2
  (978, 85707)	2
  (978, 86467)	1
