In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import nltk
nltk.download(['punkt','stopwords','wordnet'])
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /home/cmejia3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cmejia3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cmejia3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
# English stop-word list from the NLTK corpus; passed to clean_files to drop these tokens.
stopWords = stopwords.words('english')

In [46]:
def clean_files(texto,stopWords):
    """Normalize raw document text into stemmed, lemmatized tokens.

    Pipeline: every run of characters other than ASCII letters/digits is
    replaced by a space, the text is lowercased and split on whitespace,
    tokens are filtered, Porter-stemmed and finally lemmatized as verbs
    with WordNet.

    A token survives the filter when it has more than one character, is not
    in ``stopWords`` and is either purely alphabetic or exactly four
    characters long. The length-4 exception also lets digit-only or mixed
    tokens through (presumably to keep years such as "2005" -- confirm).

    Parameters
    ----------
    texto : str
        Raw text of one document.
    stopWords : iterable of str
        Words to discard; compared against the lowercased tokens.

    Returns
    -------
    tuple
        ``(joined, unique, freq)`` where ``joined`` is the processed tokens
        joined by single spaces, ``unique`` is the set of processed tokens
        and ``freq`` is an ``nltk.FreqDist`` counting them.
    """
    # Anything that is not an ASCII letter or digit becomes a separator.
    texto = re.sub('[^A-Za-z0-9]+',' ',texto)

    # Lowercase everything.
    texto = texto.lower()

    # Tokenize on whitespace.
    tokens = texto.split()

    # Hash-based lookup: membership is tested once per token, so a list
    # would make the filter O(len(tokens) * len(stopWords)).
    stop_set = frozenset(stopWords)

    # Keep tokens longer than one character that are alphabetic (or exactly
    # four characters long) and are not stop words.
    tokens = [
        w for w in tokens
        if len(w) > 1 and (w.isalpha() or len(w) == 4) and w not in stop_set
    ]

    # Stem first, then lemmatize the stem as a verb (same order as before,
    # done in a single pass over the tokens).
    ps = PorterStemmer()
    word_net_lemmatizar = WordNetLemmatizer()
    tokens = [word_net_lemmatizar.lemmatize(ps.stem(w), pos = "v") for w in tokens]

    to_return = ' '.join(tokens)
    set_words = set(tokens)
    freq = nltk.FreqDist(tokens)
    return to_return,set_words,freq

In [68]:
# Absolute path of the current user's home directory.
base_path = Path("~").expanduser().resolve()
# Directory that holds the plain-text papers to process.
input_file_path  = base_path / 'datasets/papers-txt'

In [69]:
# Accumulators filled while the input files are processed:
#   vocabulary        - every distinct processed token seen so far
#   results_text      - one cleaned text string per file
#   results_frecuency - one token-frequency object per file
vocabulary, results_text, results_frecuency = set(), list(), list()

In [71]:
# Clean every .txt file in the input directory and accumulate its cleaned
# text, its token frequencies and the global vocabulary.
# NOTE: this cell appends to results_text / results_frecuency, so re-running
# it without re-initialising those lists duplicates the documents.
# glob() yields files in a filesystem-dependent order; sorting keeps the
# document (row) order reproducible across runs and machines.
for f in sorted(input_file_path.glob('*.txt')):
    # The context manager closes the handle even if reading fails.
    with open(f, "r", encoding = 'iso-8859-1') as input_file:
        texto = input_file.read()
    text_cleanned,set_words,freq = clean_files(texto,stopWords)
    # In-place update avoids rebuilding the whole set on every iteration.
    vocabulary.update(set_words)
    results_text.append(text_cleanned)
    results_frecuency.append(freq)

In [72]:
# Number of distinct processed tokens collected across all files.
len(vocabulary)

6426

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
# Bag-of-words counts over the vocabulary collected from the cleaned files.
# `max_features` was removed: CountVectorizer ignores it whenever an explicit
# `vocabulary` is supplied, so it never capped the matrix at 5000 columns.
# `tokenizer=None` / `preprocessor=None` were the defaults and are omitted.
# sorted() makes the term -> column assignment explicit and deterministic.
# NOTE(review): stop_words='english' is still applied while analysing the
# already-stemmed text, so vocabulary entries that are also in scikit-learn's
# English stop list will presumably get all-zero columns -- confirm intended.
vectorizer = CountVectorizer(
    analyzer = "word",
    vocabulary = sorted(vocabulary),
    stop_words = 'english',
)
train_data_features = vectorizer.fit_transform(results_text)
#vectorizer.transform(["Machine learning is great"]).toarray()

In [78]:
# Print the sparse count matrix; each output line is "(row, column)  count" for a stored entry.
print(train_data_features)

  (0, 8)	1
  (0, 203)	1
  (0, 206)	1
  (0, 207)	1
  (0, 208)	3
  (0, 209)	1
  (0, 210)	1
  (0, 295)	1
  (0, 302)	1
  (0, 384)	1
  (0, 390)	1
  (0, 405)	1
  (0, 411)	2
  (0, 422)	5
  (0, 451)	14
  (0, 489)	2
  (0, 546)	1
  (0, 555)	3
  (0, 556)	2
  (0, 561)	1
  (0, 616)	1
  (0, 617)	1
  (0, 619)	4
  (0, 623)	1
  (0, 637)	1
  :	:
  (24, 6154)	1
  (24, 6155)	1
  (24, 6163)	1
  (24, 6169)	2
  (24, 6174)	6
  (24, 6210)	1
  (24, 6229)	1
  (24, 6232)	1
  (24, 6238)	23
  (24, 6241)	1
  (24, 6242)	2
  (24, 6254)	5
  (24, 6265)	12
  (24, 6266)	3
  (24, 6278)	2
  (24, 6286)	8
  (24, 6292)	4
  (24, 6306)	1
  (24, 6347)	1
  (24, 6349)	1
  (24, 6350)	2
  (24, 6369)	1
  (24, 6390)	2
  (24, 6400)	1
  (24, 6402)	1


In [77]:
# Mapping from each vocabulary term to its column index in the count matrix.
vectorizer.vocabulary_

{'0002': 0,
 '0041': 1,
 '0044': 2,
 '0045': 3,
 '0156': 4,
 '01ma': 5,
 '0200': 6,
 '0636': 7,
 '0704': 8,
 '0709': 9,
 '0710': 10,
 '074e': 11,
 '0803': 12,
 '0804': 13,
 '0806': 14,
 '0807': 15,
 '0808': 16,
 '0811': 17,
 '0812': 18,
 '0862': 19,
 '0900': 20,
 '0901': 21,
 '0902': 22,
 '0903': 23,
 '0904': 24,
 '0907': 25,
 '0910': 26,
 '0911': 27,
 '0dbm': 28,
 '0ih0': 29,
 '0n1n': 30,
 '0pex': 31,
 '1000': 32,
 '1002': 33,
 '1016': 34,
 '1025': 35,
 '1045': 36,
 '1060': 37,
 '1085': 38,
 '1095': 39,
 '1099': 40,
 '10m': 41,
 '10th': 42,
 '1104': 43,
 '1106': 44,
 '1108': 45,
 '1117': 46,
 '1123': 47,
 '114a': 48,
 '114b': 49,
 '114c': 50,
 '1150': 51,
 '115a': 52,
 '115b': 53,
 '11u1': 54,
 '1201': 55,
 '1212': 56,
 '1223': 57,
 '1229': 58,
 '1234': 59,
 '1236': 60,
 '1238': 61,
 '1245': 62,
 '1248': 63,
 '1261': 64,
 '1265': 65,
 '1267': 66,
 '1268': 67,
 '1269': 68,
 '1273': 69,
 '1275': 70,
 '1288': 71,
 '1289': 72,
 '12s0': 73,
 '1302': 74,
 '1306': 75,
 '1341': 76,
 '1343': 7

In [63]:
# Count of the term '0704' in the document stored at row 2.
# Rows of the matrix index documents and columns index vocabulary terms, so
# the vocabulary index must be used as the column, not as the row.
train_data_features[2, vectorizer.vocabulary_['0704']]

1

In [57]:
# List the terms in column order. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); use the new method
# when it exists and fall back to the old one on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# str() keeps the printed form a plain list of strings for either return type.
print([str(name) for name in feature_names])

['0704', '2000', '2003', '2004', '2005', '2006', '2007', '3378', '3788', 'abovement', 'abstract', 'account', 'acknowledg', 'actual', 'aep', 'al', 'along', 'also', 'although', 'amplif', 'analog', 'analogon', 'analyz', 'appli', 'applic', 'approach', 'apr', 'argu', 'arxiv', 'asiacrypt', 'assum', 'attain', 'avail', 'averag', 'base', 'basic', 'behav', 'believ', 'berri', 'bind', 'bjelakov', 'bori', 'breiman', 'call', 'cambridg', 'captur', 'case', 'central', 'ch', 'chapter', 'chuang', 'class', 'classic', 'clearli', 'close', 'com', 'commut', 'compact', 'comparison', 'complet', 'compress', 'comput', 'concentr', 'conclud', 'condit', 'connect', 'consequ', 'consid', 'constant', 'context', 'contrari', 'convent', 'converg', 'correspond', 'countabl', 'cover', 'cryptograph', 'data', 'defin', 'definit', 'denot', 'densiti', 'depend', 'dept', 'detail', 'diagon', 'differ', 'digit', 'dimens', 'direct', 'directli', 'discuss', 'diss', 'distanc', 'distribut', 'earli', 'eas', 'edit', 'efer', 'ei', 'eigenvalu',

In [60]:
# Token frequencies of the last file handled by the processing loop
# (the name `freq` is left over from the loop's final iteration).
freq

FreqDist({'0704': 1,
          '2000': 1,
          '2003': 1,
          '2004': 1,
          '2005': 3,
          '2006': 1,
          '2007': 1,
          '3378': 1,
          '3788': 1,
          'abovement': 1,
          'abstract': 1,
          'account': 1,
          'acknowledg': 2,
          'actual': 5,
          'aep': 14,
          'al': 2,
          'along': 1,
          'also': 8,
          'although': 1,
          'amplif': 1,
          'analog': 3,
          'analogon': 2,
          'analyz': 1,
          'appli': 1,
          'applic': 1,
          'approach': 4,
          'apr': 1,
          'argu': 1,
          'arxiv': 1,
          'asiacrypt': 1,
          'assum': 1,
          'attain': 1,
          'avail': 3,
          'averag': 4,
          'base': 8,
          'basic': 1,
          'behav': 1,
          'believ': 1,
          'berri': 2,
          'bind': 11,
          'bjelakov': 1,
          'bori': 1,
          'breiman': 1,
          'call': 2,
          'c