In [1]:
import pickle
import pandas as pd
from math import log
import re
import nltk
nltk.download(['punkt','stopwords','wordnet','words'])
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import metapy

[nltk_data] Downloading package punkt to /home/cmejia3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cmejia3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cmejia3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/cmejia3/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Cargar el modelo de datos

In [2]:
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load 'estructuraDatos.sav' when it comes from a trusted source.
# The context manager closes the file handle as soon as the payload is read.
with open('estructuraDatos.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Indexed by matrix row number to obtain a file name (see encontrarDoc).
idexFiles = loaded_model['idexFiles']
# Exposes `vocabulary_`, which maps each term to its column in `matrix`.
vectorizer = loaded_model['vectorizer']
# Sparse term-count matrix; presumably one row per document and one column
# per vocabulary term -- TODO confirm against the notebook that built it.
matrix = loaded_model['matriz']
# Indexed by metapy document id to obtain a file name (see rankerMeta).
indexMeta = loaded_model['metapyIndex']

# Ranking solo por conteo, sin índice invertido

In [3]:
# Find the documents that contain a given word
def encontrarDoc(palabra):
    """Return the documents that contain ``palabra``, most frequent first.

    Parameters
    ----------
    palabra : str
        Term to look up. It must be a key of ``vectorizer.vocabulary_``.

    Returns
    -------
    pandas.DataFrame
        One row per matching document with the columns ``NombreArchivo`` and
        ``Frecuencia``, sorted by ``Frecuencia`` in descending order and
        indexed 0..n-1. When no document contains the term the frame is
        empty but still carries both columns.

    Raises
    ------
    KeyError
        If ``palabra`` is not in the vocabulary.
    """
    col = vectorizer.vocabulary_[palabra]
    columna = matrix[:, col]
    filas = columna.nonzero()[0].tolist()
    # The i-th non-zero row is paired with the i-th stored value, exactly as
    # the previous index-based loop did.
    registros = [
        {'NombreArchivo': idexFiles[fila], 'Frecuencia': frecuencia}
        for fila, frecuencia in zip(filas, columna.data)
    ]
    # Building the frame once avoids the quadratic concat-in-a-loop, and the
    # explicit columns keep sort_values working when there are no matches.
    resultado = pd.DataFrame(registros, columns=['NombreArchivo', 'Frecuencia'])
    return resultado.sort_values('Frecuencia', ascending=False).reset_index(drop=True)

# Construcción del índice invertido

In [4]:
def indice_invertido(dic):
    """Build the inverted index for every term in ``dic``.

    Parameters
    ----------
    dic : dict
        Maps each term to its column index in the module-level ``matrix``
        (the notebook passes ``vectorizer.vocabulary_``).

    Returns
    -------
    dict
        ``{term: {'IDF': float, 'Documentos': {file_name: [tf, doc_len]}}}``.
        ``tf`` is the value stored for the term in that document's row and
        ``doc_len`` is the sum of all values stored in that row.
        ``IDF`` is ``log((N + 1) / df)`` with ``N`` the number of rows of
        ``matrix`` and ``df`` the number of rows where the term is non-zero.
        A term found in no row is printed, gets an empty ``'Documentos'``
        dict and has no ``'IDF'`` key.
    """
    inv = {}
    N = matrix.shape[0]
    # A row total does not depend on the term being processed, so each one is
    # computed the first time it is needed and reused afterwards.
    longitudes = {}
    # Iterate over the mapping that was passed in rather than a global.
    for termino, columna_idx in dic.items():
        entrada = inv.setdefault(termino, {})
        # Column of the matrix holding this term's value in every document
        columna = matrix[:, columna_idx]
        # Row numbers of the documents that contain the term
        filas = columna.nonzero()[0].tolist()
        docs = {}
        if not filas:
            print(termino)
        else:
            entrada['IDF'] = log((N + 1) / len(filas))
            for fila, frecuencia in zip(filas, columna.data):
                if fila not in longitudes:
                    longitudes[fila] = matrix[fila, :].data.sum()
                docs.setdefault(idexFiles[fila], []).extend(
                    (frecuencia, longitudes[fila])
                )
        entrada['Documentos'] = docs
    return inv

In [5]:
# Build the inverted index once for the whole vocabulary. The query functions
# further down read `ind_inv` as a module-level global. While it runs,
# indice_invertido prints every term that appears in no document.
ind_inv = indice_invertido(vectorizer.vocabulary_)

across
all
almost
along
also
although
am
among
amongst
amount
an
and
anyhow
anyway
around
at
back
be
beforehand
behind
between
beyond
bill
both
bottom
call
can
cannot
cant
co
con
could
de
detail
do
down
due
eg
eight
either
eleven
enough
etc
even
ever
except
fifteen
fill
find
fire
first
five
for
former
found
four
front
full
further
get
give
go
have
he
here
herein
how
i
ie
if
in
inc
interest
it
keep
last
latter
least
less
ltd
may
me
might
mill
mine
move
much
must
name
neither
never
nevertheless
next
nine
no
none
nor
nothing
off
often
on
one
onto
or
other
out
over
own
part
per
put
rather
re
same
see
seem
serious
show
side
six
so
somehow
still
system
take
ten
then
therein
these
thick
thin
third
though
three
throughout
thru
top
toward
two
un
under
up
upon
us
via
well
where
wherein
whereupon
whether
whoever
whole
whose
will
with
within
without
would
yet


# Función de limpieza del query

In [6]:
stopWords = stopwords.words('english')
# Folds the accented vowels handled by this notebook to plain ASCII vowels.
_ACCENT_FOLD = str.maketrans('áàäéèëíìïóòöúùü', 'aaaeeeiiiooouuu')


def queryClean(texto):
    """Normalise a free-text query into a list of stemmed tokens.

    The text is lower-cased, accented vowels are folded to ASCII, and every
    character that is not an ASCII letter (digits and punctuation included)
    is replaced by a space. The result is split on whitespace; one-letter
    tokens and English stop words are dropped; the remaining tokens are
    lemmatised as verbs with WordNet and then stemmed with the Porter stemmer.

    Parameters
    ----------
    texto : str
        Raw query text.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    texto = texto.lower().translate(_ACCENT_FOLD)
    # Anything other than an ASCII letter becomes a separator.
    texto = re.sub('[^a-zA-Z]', ' ', texto)
    # A set gives constant-time membership tests; it is rebuilt from the
    # module-level list so later edits to `stopWords` are still honoured.
    stop_set = set(stopWords)
    # split() already collapses runs of spaces, and after the substitution
    # above every token is purely alphabetic, so no extra checks are needed.
    tokens = [w for w in texto.split() if len(w) > 1 and w not in stop_set]

    # Lemmatise (treating every token as a verb)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w, pos = "v") for w in tokens]

    # Stem
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in tokens]

# Ranking por Term Frequency

In [7]:
def queryTF(word,top):
    """Return the ``top`` postings of ``word`` ranked by raw term frequency.

    Each item is a ``(file_name, stats)`` pair taken from
    ``ind_inv[word]['Documentos']``. Items are ordered from highest to lowest
    by ``stats`` first and by file name second.
    """
    def _clave(par):
        nombre, stats = par
        return (stats, nombre)

    ranking = list(ind_inv[word]['Documentos'].items())
    ranking.sort(key=_clave, reverse=True)
    return ranking[:top]

# Ranking por Term Frequency / Doc Length

In [8]:
def queryTFDL(word,top):
    """Return the ``top`` postings of ``word`` ranked by tf / document length.

    For every document in ``ind_inv[word]['Documentos']`` the first stored
    value is divided by the second. Each ratio is kept in a one-element list,
    so the result is a list of ``(file_name, [ratio])`` pairs ordered from
    highest to lowest by ratio and then by file name.
    """
    postings = ind_inv[word]['Documentos']
    normalizado = {
        nombre: [stats[0] / stats[1]]
        for nombre, stats in postings.items()
    }
    ranking = list(normalizado.items())
    ranking.sort(key=lambda par: (par[1], par[0]), reverse=True)
    return ranking[:top]

# Ranking usando BM25

In [9]:
def cal_bm25(idf,frec,k,b,length,avgdl):
    """Compute the BM25 contribution of one term in one document.

    Parameters
    ----------
    idf : float
        Inverse document frequency of the term.
    frec : number
        Frequency of the term in the document.
    k, b : float
        BM25 saturation and length-normalisation parameters.
    length : number
        Length of the document.
    avgdl : number
        Average document length of the collection.

    Returns
    -------
    float
        ``idf * frec * (k + 1) / (frec + k * (1 - b + b * length / avgdl))``.
    """
    length_norm = 1 - b + b * length / avgdl
    numerator = frec * (k + 1)
    denominator = frec + k * length_norm
    return idf * (numerator / denominator)

In [10]:
def queryBM25(query, vocabulary, prom, k1, b, top):
    """Rank documents for a free-text query with BM25.

    The query is tokenised with ``queryClean``. For every token found in
    ``vocabulary`` the postings stored in the module-level ``ind_inv`` are
    scored with ``cal_bm25``, and the per-document scores are summed over all
    tokens.

    Parameters
    ----------
    query : str
        Raw query text.
    vocabulary : container
        Known terms. Tokens outside it are reported with a printed message
        and ignored.
    prom : float
        Average document length passed to ``cal_bm25``.
    k1, b : float
        BM25 parameters.
    top : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ranking`` (0-based position), ``NombreArchivo`` and
        ``BM25``, ordered by ``BM25`` descending and limited to ``top`` rows.
        When no token contributes a score the frame is empty but keeps these
        three columns.
    """
    registros = []
    for word in queryClean(query):
        if word not in vocabulary:
            print(f'{word} is not in the vocabulary')
            continue
        entrada = ind_inv[word]
        postings = entrada['Documentos']
        if not postings:
            # indice_invertido stores no 'IDF' for a term that occurs in no
            # document; such a term cannot add to any score, so skip it
            # instead of failing on the missing key.
            continue
        idf = entrada['IDF']
        for ruta, stats in postings.items():
            registros.append({
                # Keep only the file name from a backslash-separated path.
                'NombreArchivo': ruta.split('\\')[-1],
                'BM25': cal_bm25(idf, stats[0], k1, b, stats[1], prom),
            })

    if not registros:
        return pd.DataFrame(columns=['Ranking', 'NombreArchivo', 'BM25']).head(top)

    # One frame and one aggregation for the whole query, instead of a concat
    # per posting and a groupby per token.
    ranking = (
        pd.DataFrame(registros)
        .groupby('NombreArchivo')
        .agg({'BM25': 'sum'})
        .sort_values('BM25', ascending=False)
        .reset_index()
        .reset_index()
        .rename(columns={'index': 'Ranking'})
    )
    return ranking.head(top)

### Definición de parámetros para el BM25

In [11]:
# Number of results requested from each ranker.
top = 20
# Average document length handed to BM25 (the `avgdl` argument of cal_bm25).
# NOTE(review): hard-coded; presumably derived from the row totals of
# `matrix` -- confirm, and recompute if the corpus changes.
prom = 27544.226762002043
# BM25 parameters, used by both queryBM25 and the metapy ranker.
k1 = 1.2
b = 0.75
# Term -> column index mapping used to check whether a query token is known.
vocabulary = vectorizer.vocabulary_

# Metapy

In [12]:
# Create the metapy inverted index from the configuration in 'cranfield.toml'.
inv_idx = metapy.index.make_inverted_index('cranfield.toml')

In [13]:
# Summary statistics reported by the metapy index: document count,
# number of unique terms and average document length.
print(f'Total de documentos: {inv_idx.num_docs()}')
print(f'Cantidad de palabras únicas: {inv_idx.unique_terms()}')
print(f'Promedio de longitud de los documentos: {inv_idx.avg_doc_length()}')

Total de documentos: 980
Cantidad de palabras únicas: 51229
Promedio de longitud de los documentos: 3984.62646484375


# Metapy Ranking

In [14]:
def rankerMeta(top, querywords):
    """Rank documents for ``querywords`` with metapy's Okapi BM25 ranker.

    Uses the module-level ``k1`` and ``b`` parameters, the metapy index
    ``inv_idx``, and ``indexMeta`` to translate the identifiers returned by
    the ranker into file names.

    Parameters
    ----------
    top : int
        Number of results requested from the ranker.
    querywords : str
        Query text, passed to metapy as the document content.

    Returns
    -------
    pandas.DataFrame
        Columns ``RankingMeta`` (0-based position in the ranker's output),
        ``NombreArchivo`` and ``BM25_Meta``. When the ranker returns nothing
        the frame is empty but keeps these three columns.
    """
    ranker = metapy.index.OkapiBM25(k1 = k1, b = b)
    query = metapy.index.Document()
    query.content(querywords)
    top_docs = ranker.score(inv_idx, query, num_results=top)
    # Each result is indexed as (document id, score).
    registros = [
        {'NombreArchivo': indexMeta[doc[0]], 'BM25_Meta': doc[1]}
        for doc in top_docs
    ]
    # Build the frame once; explicit columns keep the schema stable even when
    # there are no results.
    metaresult = pd.DataFrame(registros, columns=['NombreArchivo', 'BM25_Meta'])
    return metaresult.reset_index().rename(columns={'index': 'RankingMeta'})

# Evaluación de los queries

In [15]:
def calculate_sens(queries,top):
    """Measure how much of the local BM25 ranking metapy also returns.

    For each query, the top results of ``queryBM25`` are left-joined with the
    top results of ``rankerMeta`` on ``NombreArchivo``. The score is the
    fraction of local results that also appear in metapy's results.

    Parameters
    ----------
    queries : iterable of str
        Queries to evaluate.
    top : int
        Number of results requested from each ranker.

    Returns
    -------
    pandas.DataFrame
        One row per query, indexed 0..n-1, with the columns ``Query`` and
        ``Sensibilidad``. ``Sensibilidad`` is NaN when the local ranking is
        empty for that query.
    """
    registros = []
    for query in queries:
        resultados = queryBM25(query, vocabulary, prom, k1, b,top)
        metares = rankerMeta(top, query)
        comparable = (
            len(resultados) > 0
            and 'NombreArchivo' in resultados.columns
            and 'NombreArchivo' in metares.columns
        )
        if comparable:
            merget = resultados.merge(metares, how = 'left', on = 'NombreArchivo')
            # After the left join RankingMeta is NaN for files that metapy
            # did not return, so `>= 0` is True only for shared files.
            sensibilidad = (merget['RankingMeta']>=0).sum()/len(merget)
        else:
            # Nothing to compare: avoid a failing merge or a division by zero.
            sensibilidad = float('nan')
        registros.append({'Query': query, 'Sensibilidad': sensibilidad})
    # Building the frame once gives a clean 0..n-1 index instead of the
    # repeated zeros left by concatenating one-row frames.
    return pd.DataFrame(registros, columns=['Query', 'Sensibilidad'])

In [16]:
# Sample queries used to compare the local BM25 ranking with metapy's.
queries = ["Data Science","Machine Learning", "Math","Computer Science","Algorithms in dynamic networks", "triangle free process"]
sensibilidad = calculate_sens(queries,top)

In [17]:
# Display the per-query overlap between the two rankers.
sensibilidad

Unnamed: 0,Query,Sensibilidad
0,Data Science,0.75
0,Machine Learning,0.95
0,Math,0.85
0,Computer Science,0.65
0,Algorithms in dynamic networks,0.9
0,triangle free process,0.75


In [26]:
# Single query inspected in detail in the cells that follow.
query = "Math"

In [27]:
# Run both rankers on the same query so their top results can be compared.
resultados = queryBM25(query, vocabulary, prom, k1, b,top)
metares = rankerMeta(top, query)

In [28]:
# Left join on file name: RankingMeta and BM25_Meta are NaN for files that
# are in the local ranking but not in metapy's results.
resultados.merge(metares, how = 'left', on = 'NombreArchivo')

Unnamed: 0,Ranking,NombreArchivo,BM25,RankingMeta,BM25_Meta
0,0,1502.02348.txt,5.732738,0.0,5.528401
1,1,1509.01347.txt,5.638492,1.0,5.394665
2,2,1409.3562.txt,5.563857,3.0,5.00259
3,3,1404.3186.txt,5.563733,2.0,5.123775
4,4,1506.07094.txt,5.367842,4.0,4.545814
5,5,1508.00315.txt,5.319576,5.0,4.534198
6,6,1401.6312.txt,5.230354,10.0,4.087492
7,7,1306.3261.txt,5.107487,8.0,4.249309
8,8,1506.08238.txt,5.088667,9.0,4.179452
9,9,1409.3176.txt,5.044031,13.0,3.948689
