### Búsqueda de documentos por índice inverso

#### Búsqueda de documentos por palabras

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import  Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import nltk
nltk.download(['punkt','averaged_perceptron_tagger','wordnet'])

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.corpus.reader.wordnet import NOUN, VERB, ADV, ADJ

[nltk_data] Downloading package punkt to /home/vania/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vania/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/vania/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cargamos los datos

In [3]:
db = fetch_20newsgroups(remove=('headers','footers','quotes'))

In [4]:
len(db.data)

11314

In [5]:
db.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [6]:
morphy_tag = {
    'JJ' : ADJ,
    'JJR' : ADJ,
    'JJS' : ADJ,
    'VB' : VERB,
    'VBD' : VERB,
    'VBG' : VERB,
    'VBN' : VERB,
    'VBP' : VERB,
    'VBZ' : VERB,
    'RB' : ADV,
    'RBR' : ADV,
    'RBS' : ADV
}

def doc_a_tokens(doc):
    tagged = pos_tag(word_tokenize(doc.lower()))
    print(tagged)
    lemmatizer = WordNetLemmatizer()
    tokens = []
    for p,t in tagged:
        tokens.append(lemmatizer.lemmatize(p, pos=morphy_tag.get(t, NOUN)))

    return tokens

Guardamos el conjunto preprocesado como una lista de cadenas, una por documento

In [7]:
corpus = []
for d in db.data[:100]:
    d = d.replace('\n',' ').replace('\r',' ').replace('\t',' ')
    tokens = doc_a_tokens(d)
    corpus.append(' '.join(tokens))

[('i', 'NN'), ('was', 'VBD'), ('wondering', 'VBG'), ('if', 'IN'), ('anyone', 'NN'), ('out', 'IN'), ('there', 'RB'), ('could', 'MD'), ('enlighten', 'VB'), ('me', 'PRP'), ('on', 'IN'), ('this', 'DT'), ('car', 'NN'), ('i', 'NN'), ('saw', 'VBD'), ('the', 'DT'), ('other', 'JJ'), ('day', 'NN'), ('.', '.'), ('it', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('2-door', 'JJ'), ('sports', 'NNS'), ('car', 'NN'), (',', ','), ('looked', 'VBD'), ('to', 'TO'), ('be', 'VB'), ('from', 'IN'), ('the', 'DT'), ('late', 'JJ'), ('60s/', 'CD'), ('early', 'JJ'), ('70s', 'CD'), ('.', '.'), ('it', 'PRP'), ('was', 'VBD'), ('called', 'VBN'), ('a', 'DT'), ('bricklin', 'NN'), ('.', '.'), ('the', 'DT'), ('doors', 'NNS'), ('were', 'VBD'), ('really', 'RB'), ('small', 'JJ'), ('.', '.'), ('in', 'IN'), ('addition', 'NN'), (',', ','), ('the', 'DT'), ('front', 'NN'), ('bumper', 'NN'), ('was', 'VBD'), ('separate', 'JJ'), ('from', 'IN'), ('the', 'DT'), ('rest', 'NN'), ('of', 'IN'), ('the', 'DT'), ('body', 'NN'), ('.', '.'), ('this',

[('hello', 'NN'), (',', ','), ('i', 'NN'), ('am', 'VBP'), ('looking', 'VBG'), ('to', 'TO'), ('add', 'VB'), ('voice', 'NN'), ('input', 'NN'), ('capability', 'NN'), ('to', 'TO'), ('a', 'DT'), ('user', 'JJ'), ('interface', 'NN'), ('i', 'NN'), ('am', 'VBP'), ('developing', 'VBG'), ('on', 'IN'), ('an', 'DT'), ('hp730', 'NN'), ('(', '('), ('unix', 'JJ'), (')', ')'), ('workstation', 'NN'), ('.', '.'), ('i', 'NN'), ('would', 'MD'), ('greatly', 'RB'), ('appreciate', 'VB'), ('information', 'NN'), ('anyone', 'NN'), ('would', 'MD'), ('care', 'VB'), ('to', 'TO'), ('offer', 'VB'), ('about', 'IN'), ('voice', 'NN'), ('input', 'NN'), ('systems', 'NNS'), ('that', 'WDT'), ('are', 'VBP'), ('easily', 'RB'), ('accessible', 'JJ'), ('from', 'IN'), ('the', 'DT'), ('unix', 'JJ'), ('environment', 'NN'), ('.', '.'), ('the', 'DT'), ('names', 'NNS'), ('or', 'CC'), ('adresses', 'NNS'), ('of', 'IN'), ('applicable', 'JJ'), ('vendors', 'NNS'), (',', ','), ('as', 'RB'), ('well', 'RB'), ('as', 'IN'), ('any', 'DT'), ('exp

[('an', 'DT'), ('excellent', 'JJ'), ('automatic', 'NN'), ('can', 'MD'), ('be', 'VB'), ('found', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('subaru', 'NN'), ('legacy', 'NN'), ('.', '.'), ('it', 'PRP'), ('switches', 'VBZ'), ('to', 'TO'), ('``', '``'), ('sport', 'VB'), ("''", "''"), ('mode', 'NN'), ('when', 'WRB'), ('the', 'DT'), ('electronics', 'NNS'), ('figure', 'NN'), ('it', 'PRP'), (',', ','), ('not', 'RB'), ('when', 'WRB'), ('the', 'DT'), ('driver', 'NN'), ('sets', 'VBZ'), ('the', 'DT'), ('switch', 'NN'), ('..', 'NN'), ('which', 'WDT'), ('is', 'VBZ'), ('the', 'DT'), ('proper', 'JJ'), ('way', 'NN'), ('to', 'TO'), ('do', 'VB'), ('it', 'PRP'), (',', ','), ('imo', 'RB'), ('.', '.'), ('so', 'RB'), ('what', 'WP'), ('does', 'VBZ'), ('``', '``'), ('sport', 'VB'), ("''", "''"), ('mode', 'NN'), ('entail', 'NN'), ('?', '.'), ('several', 'JJ'), ('things', 'NNS'), (':', ':'), ('1', 'CD'), (')', ')'), ('revving', 'NN'), ('to', 'TO'), ('red', 'JJ'), ('line', 'NN'), ('(', '('), ('or', 'CC'), ('to', 'TO')

[('yo', 'NN'), ('!', '.'), ('watch', 'VB'), ('the', 'DT'), ('attributions', 'NNS'), ('--', ':'), ('i', 'NN'), ('did', 'VBD'), ("n't", 'RB'), ('say', 'VB'), ('that', 'IN'), ('!', '.'), ('again', 'RB'), (',', ','), ('this', 'DT'), ('is', 'VBZ'), ("n't", 'RB'), ('an', 'DT'), ('appropriate', 'JJ'), ('forum', 'NN'), ('for', 'IN'), ('discussions', 'NNS'), ('on', 'IN'), ('whether', 'IN'), ('you', 'PRP'), ('should', 'MD'), ('shoot', 'VB'), ('someone', 'NN'), ('for', 'IN'), ('property', 'NN'), ('damage/vandalism/theft', 'NN'), (',', ','), ('but', 'CC'), ('every', 'DT'), ('responsible', 'JJ'), ('gun', 'NN'), ('owner', 'NN'), ('realizes', 'VBZ'), ('that', 'IN'), ('there', 'EX'), ('are', 'VBP'), ('limits', 'NNS'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('punishment', 'NN'), ('must', 'MD'), ('fit', 'VB'), ('the', 'DT'), ('crime', 'NN'), ('.', '.'), ('i', 'JJ'), ('mean', 'VBP'), (',', ','), ('think', 'VBP'), ('about', 'IN'), ('it', 'PRP'), ('--', ':'), ('is', 'VBZ'), ('a', 'DT'), ('(', '('), ('re

In [8]:
corpus[0]

'i be wonder if anyone out there could enlighten me on this car i saw the other day . it be a 2-door sport car , look to be from the late 60s/ early 70 . it be call a bricklin . the door be really small . in addition , the front bumper be separate from the rest of the body . this be all i know . if anyone can tellme a model name , engine spec , year of production , where this car be make , history , or whatever info you have on this funky look car , please e-mail .'

Obtenemos las bolsas de palabras de los documentos preprocesados usando la clase CountVectorizer de scikit-learn

In [9]:
v = CountVectorizer(stop_words='english', max_features=5000, max_df=0.8)
bolsas = v.fit_transform(corpus)

In [10]:
v

CountVectorizer(max_df=0.8, max_features=5000, stop_words='english')

In [11]:
print('Componentes de primer documento: {0}'.format(corpus[0]))

Componentes de primer documento: i be wonder if anyone out there could enlighten me on this car i saw the other day . it be a 2-door sport car , look to be from the late 60s/ early 70 . it be call a bricklin . the door be really small . in addition , the front bumper be separate from the rest of the body . this be all i know . if anyone can tellme a model name , engine spec , year of production , where this car be make , history , or whatever info you have on this funky look car , please e-mail .


In [11]:
print('Bolsa de primer documento: [\n{0}]'.format(bolsas[0]))

Bolsa de primer documento: [
  (0, 3903)	1
  (0, 1412)	1
  (0, 820)	4
  (0, 3149)	1
  (0, 1147)	1
  (0, 1307)	2
  (0, 3354)	1
  (0, 2195)	2
  (0, 2118)	1
  (0, 269)	1
  (0, 1351)	1
  (0, 281)	1
  (0, 740)	1
  (0, 2930)	1
  (0, 3294)	1
  (0, 384)	1
  (0, 766)	1
  (0, 3213)	1
  (0, 3035)	1
  (0, 708)	1
  (0, 2095)	1
  (0, 3538)	1
  (0, 2369)	1
  (0, 1407)	1
  (0, 3338)	1
  (0, 3945)	1
  (0, 2829)	1
  (0, 2243)	1
  (0, 1820)	1
  (0, 1944)	1
  (0, 1643)	1
  (0, 2234)	1]


Definimos la clase para el índice inverso con un método para recuperar los documentos que contienen una lista de palabras

In [12]:
class IndiceInverso:
    def  __getitem__(self, idx):
        return self.ifs[idx]

    def __repr__(self):
        contenido = ['%d::%s' % (i, self.ifs[i]) for i in range(len(self.ifs))]
        return "<IFS :%s >" % ('\n'.join(contenido))

    def __str__(self):
        contenido = ['%d::%s' % (i, self.ifs[i]) for i in range(len(self.ifs))]
        return '\n'.join(contenido)

    def recupera(self, l):
        return Counter([j for (i,_) in l for j in self.ifs[i]])

    def from_csr(self, csr):
        self.ifs = [[] for _ in range(csr.shape[1])]
        coo = csr.tocoo()    
        print(coo)
        for i,j,v in zip(coo.row, coo.col, coo.data):
            self.ifs[j].append(i)

Instanciamos nuestra clase IndiceInverso y creamos la estructura a partir de nuestras bolsas de palabras

In [13]:
ifs = IndiceInverso()
ifs.from_csr(bolsas)

  (0, 3903)	1
  (0, 1412)	1
  (0, 820)	4
  (0, 3149)	1
  (0, 1147)	1
  (0, 1307)	2
  (0, 3354)	1
  (0, 2195)	2
  (0, 2118)	1
  (0, 269)	1
  (0, 1351)	1
  (0, 281)	1
  (0, 740)	1
  (0, 2930)	1
  (0, 3294)	1
  (0, 384)	1
  (0, 766)	1
  (0, 3213)	1
  (0, 3035)	1
  (0, 708)	1
  (0, 2095)	1
  (0, 3538)	1
  (0, 2369)	1
  (0, 1407)	1
  (0, 3338)	1
  :	:
  (99, 3201)	1
  (99, 1895)	1
  (99, 1114)	1
  (99, 814)	1
  (99, 1714)	2
  (99, 1922)	1
  (99, 2831)	2
  (99, 1616)	1
  (99, 1071)	1
  (99, 1250)	1
  (99, 670)	1
  (99, 2226)	1
  (99, 2026)	1
  (99, 3550)	1
  (99, 3343)	1
  (99, 509)	1
  (99, 3173)	1
  (99, 38)	1
  (99, 3243)	2
  (99, 3910)	2
  (99, 2090)	1
  (99, 1811)	3
  (99, 819)	2
  (99, 366)	1
  (99, 1041)	2


Definimos una función que convierta de arreglos dispersos CSR a listas de listas

In [15]:
def csr_to_ldb(csr):
    ldb = [[] for _ in range(csr.shape[0])]
    coo = csr.tocoo()    
    for i,j,v in zip(coo.row, coo.col, coo.data):
        ldb[i].append((j, v))

    return ldb



Generamos algunas consultas y calculamos sus bolsas de palabras



In [16]:
consultas = []
for c in ['nasa space mission satellite','government crime enforcement security']:
    tokens = doc_a_tokens(c)
    consultas.append(' '.join(tokens))

[('nasa', 'JJ'), ('space', 'NN'), ('mission', 'NN'), ('satellite', 'NN')]
[('government', 'NN'), ('crime', 'NN'), ('enforcement', 'NN'), ('security', 'NN')]


In [17]:
consultas

['nasa space mission satellite', 'government crime enforcement security']

In [18]:
v

CountVectorizer(max_df=0.8, max_features=5000, stop_words='english')

In [19]:
bc = v.transform(consultas)
cl = csr_to_ldb(bc)

In [20]:
cl

[[(2356, 1), (2437, 1), (3142, 1), (3328, 1)],
 [(1089, 1), (1406, 1), (1709, 1), (3193, 1)]]

Usamos el índice inverso para recuperar los documentos que contienen las palabras de la primera consulta ordenados por coincidencias y visualizamos el primer documento recuperado

In [21]:
recs = ifs.recupera(cl[0])
top = recs.most_common()[0]
recs.most_common()

[(59, 4), (13, 2), (67, 1), (49, 1)]

In [22]:
recs

Counter({13: 2, 59: 4, 67: 1, 49: 1})

In [26]:
db.data[top[0]]

'Archive-name: space/new_probes\nLast-modified: $Date: 93/04/01 14:39:17 $\n\nUPCOMING PLANETARY PROBES - MISSIONS AND SCHEDULES\n\n    Information on upcoming or currently active missions not mentioned below\n    would be welcome. Sources: NASA fact sheets, Cassini Mission Design\n    team, ISAS/NASDA launch schedules, press kits.\n\n\n    ASUKA (ASTRO-D) - ISAS (Japan) X-ray astronomy satellite, launched into\n    Earth orbit on 2/20/93. Equipped with large-area wide-wavelength (1-20\n    Angstrom) X-ray telescope, X-ray CCD cameras, and imaging gas\n    scintillation proportional counters.\n\n\n    CASSINI - Saturn orbiter and Titan atmosphere probe. Cassini is a joint\n    NASA/ESA project designed to accomplish an exploration of the Saturnian\n    system with its Cassini Saturn Orbiter and Huygens Titan Probe. Cassini\n    is scheduled for launch aboard a Titan IV/Centaur in October of 1997.\n    After gravity assists of Venus, Earth and Jupiter in a VVEJGA\n    trajectory, the sp



Repetimos el proceso anterior para la segunda consulta


In [23]:
recs = ifs.recupera(cl[1])
top = recs.most_common()[0]
print(recs.most_common())
print(db.data[top[0]])

[(70, 2), (37, 2), (71, 1), (80, 1), (55, 1), (67, 1), (76, 1), (17, 1), (38, 1)]

: Pardon me? Here is to an amherst-clown:
: 
: "Your three chiefs, Dro, Hamazasp and Kulkhandanian are the ringleaders
:  of the bands which have destroyed Tartar villages and have staged 
:  massacres in Zangezour, Surmali, Etchmiadzin, and Zangibasar. This is
:  intolerable.


Were you expecting a different response? Here is another one:

Source: K. S. Papazian, "Patriotism Perverted," Baikar Press, Boston, 1934, 
        (73 pages with Appendix).

p. 25 (third paragraph)

"Some real fighters sprang up from among the people, who struck terror
 into the hearts of the Turks."


"Within a few months after the war began, these Armenian guerrilla
 forces, operating in close coordination with the Russians, were
 savagely attacking Turkish cities, towns and villages in the east,
 massacring their inhabitants without mercy, while at the same time
 working to sabotage the Ottoman army's war effort by destroying

#### Búsqueda de documentos similares

Ahora vamos a realizar búsquedas de documentos similares a un documento de consulta.

Primero tomamos 1 documento que sirva de consulta y lo visualizamos

In [24]:
dc = db.data[0]
dc

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

Obtenemos su bolsa

In [25]:
tokens = doc_a_tokens(dc)
bolsa_dc = v.transform([' '.join(tokens)])
print('Componentes para consulta: {0}'.format(tokens))
print('Bolsa para consulta: [\n{0}]'.format(bolsa_dc))

[('i', 'NN'), ('was', 'VBD'), ('wondering', 'VBG'), ('if', 'IN'), ('anyone', 'NN'), ('out', 'IN'), ('there', 'RB'), ('could', 'MD'), ('enlighten', 'VB'), ('me', 'PRP'), ('on', 'IN'), ('this', 'DT'), ('car', 'NN'), ('i', 'NN'), ('saw', 'VBD'), ('the', 'DT'), ('other', 'JJ'), ('day', 'NN'), ('.', '.'), ('it', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('2-door', 'JJ'), ('sports', 'NNS'), ('car', 'NN'), (',', ','), ('looked', 'VBD'), ('to', 'TO'), ('be', 'VB'), ('from', 'IN'), ('the', 'DT'), ('late', 'JJ'), ('60s/', 'CD'), ('early', 'JJ'), ('70s', 'CD'), ('.', '.'), ('it', 'PRP'), ('was', 'VBD'), ('called', 'VBN'), ('a', 'DT'), ('bricklin', 'NN'), ('.', '.'), ('the', 'DT'), ('doors', 'NNS'), ('were', 'VBD'), ('really', 'RB'), ('small', 'JJ'), ('.', '.'), ('in', 'IN'), ('addition', 'NN'), (',', ','), ('the', 'DT'), ('front', 'NN'), ('bumper', 'NN'), ('was', 'VBD'), ('separate', 'JJ'), ('from', 'IN'), ('the', 'DT'), ('rest', 'NN'), ('of', 'IN'), ('the', 'DT'), ('body', 'NN'), ('.', '.'), ('this',

Definimos una función para hacer búsqueda por fuerza bruta dada una función de distancia o similitud

In [28]:
def fuerza_bruta(base, consulta, fd):
    medidas = np.zeros(base.shape[0])
    for i,x in enumerate(base):
        medidas[i] = fd(consulta, x)

    return medidas

Definimos la función para la similitud coseno

In [29]:
def similitud_coseno(x, y):
    x = x.toarray()[0]
    y = y.toarray()[0]
    pnorma = (np.sqrt(x @ x) * np.sqrt(y @ y))

    if pnorma > 0:
        return (x @ y) / pnorma
    else: 
        return np.nan 

In [30]:
bolsa_dc

<1x3970 sparse matrix of type '<class 'numpy.int64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [31]:
sims = fuerza_bruta(bolsas[1:], bolsa_dc, similitud_coseno)

In [32]:
print('Similitud máxima es {0} de documento {1}'.format(np.nanmax(sims), np.nanargmax(sims)+ 1))

Similitud máxima es 0.2527318481572156 de documento 17




Revisamos documento más similar



In [33]:
print(db.data[np.nanargmax(sims) + 1])

I recently posted an article asking what kind of rates single, male
drivers under 25 yrs old were paying on performance cars. Here's a summary of
the replies I received.
 
 
 
 
-------------------------------------------------------------------------------
 
I'm not under 25 anymore (but is 27 close enough).
 
1992 Dodge Stealth RT/Twin Turbo (300hp model).
No tickets, no accidents, own a house, have taken defensive driving 1,
airbag, abs, security alarm, single.
 
$1500/year  $500 decut. State Farm Insurance (this includes the additional $100
for the $1,000,000 umbrella policy over my car and house)  The base
policy is the standard $100,000 - $100,000 - $300,000 policy required in DE.
 
After 2nd defensive driving course it will be 5% less.
 
I bought the car in September 1992.  The company I was with (never had
and accident or ticket in 11 years) quoted me $2,500.
 
Hope this helps.
 
Steve Flynn
University of Delaware
 
    Kevin:
 
    (Hope I remembered your name correctly)...
 


Definimos la distancia euclidiana

In [34]:
def distancia_euclidiana(x, y):   
    x = x.toarray()[0]
    y = y.toarray()[0]
    return np.sqrt(np.sum((x - y)**2))

Repetimos el proceso anterior con la distancia euclidiana

In [35]:
euclids = fuerza_bruta(bolsas[1:], bolsa_dc, distancia_euclidiana)
print('Distancia mínima es {0} de documento {1}'.format(np.nanmin(euclids), np.nanargmin(euclids) + 1))

Distancia mínima es 7.280109889280518 de documento 12


Visualizamos el documento

In [36]:
print(db.data[np.nanargmin(euclids) + 1])

--

