In [2]:
import pickle
import pandas as pd
from math import log
import re
import nltk
nltk.download(['punkt','stopwords','wordnet','words'])
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import metapy

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# Cargar el modelo de datos

In [3]:
# Load the persisted data model.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'estructuraDatos.sav' when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('estructuraDatos.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Used below to translate a row number of `matrix` into a file name.
idexFiles = loaded_model['idexFiles']
# Its `vocabulary_` attribute maps each term to a column number of `matrix`.
vectorizer = loaded_model['vectorizer']
# Term-document count matrix (one row per document, one column per term).
matrix = loaded_model['matriz']
# Used below to translate a metapy document id into a file name.
indexMeta = loaded_model['metapyIndex']

# Ranking solo por conteo, sin índice invertido

In [4]:
# Find the documents that contain a particular word
def encontrarDoc(palabra):
    """Return the documents that contain ``palabra`` and its count in each.

    Parameters
    ----------
    palabra : str
        Term to look up; it must be a key of ``vectorizer.vocabulary_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``NombreArchivo`` and ``Frecuencia``, one row per document,
        sorted by ``Frecuencia`` in descending order. When no document
        contains the term the frame is empty but still has both columns.

    Raises
    ------
    KeyError
        If ``palabra`` is not in the vocabulary.
    """
    col = vectorizer.vocabulary_[palabra]
    # Column of the count matrix that belongs to this term.
    matx = matrix[:, col]
    filas = matx.nonzero()[0].tolist()
    # NOTE(review): pairing the rows from nonzero() with matx.data by position
    # assumes the sparse column stores no explicit zeros -- confirm.
    registros = [
        {'NombreArchivo': idexFiles[fila], 'Frecuencia': frecuencia}
        for fila, frecuencia in zip(filas, matx.data)
    ]
    # Building the frame once (with explicit columns) avoids the quadratic
    # concat-in-a-loop and keeps sort_values from failing on an empty result.
    dfresult = pd.DataFrame(registros, columns=['NombreArchivo', 'Frecuencia'])
    return dfresult.sort_values('Frecuencia', ascending=False)

# Construcción del índice invertido

In [5]:
def indice_invertido(dic):
    """Build an inverted index for the terms in ``dic``.

    Parameters
    ----------
    dic : dict
        Mapping ``term -> column number`` in the global ``matrix``
        (for example ``vectorizer.vocabulary_``).

    Returns
    -------
    dict
        ``{term: {'IDF': float, 'Documentos': {file_name: [tf, doc_len]}}}``
        where ``tf`` is the count of the term in the document and ``doc_len``
        is the sum of all counts stored in that document's row. Terms that
        appear in no document are printed, get an empty ``'Documentos'``
        mapping and have NO ``'IDF'`` key.
    """
    inv = {}
    N = matrix.shape[0]
    # Row sums are the same for every term, so compute each one only once.
    largo_doc = {}
    for k, v in dic.items():
        entrada = inv.setdefault(k, {})
        # Column with the counts of term k in every document.
        matx = matrix[:, v]
        # Row numbers of the documents that contain the term.
        lista = matx.nonzero()[0].tolist()
        docs = {}
        if len(lista) == 0:
            # Report vocabulary terms that never occur in the corpus.
            print(k)
        else:
            # IDF from the number of documents that contain the term.
            entrada['IDF'] = log((N + 1) / (len(lista)))
            for i, fila in enumerate(lista):
                if fila not in largo_doc:
                    # Total count of all terms stored in this document's row.
                    largo_doc[fila] = matrix[fila, :].data.sum()
                keys = docs.setdefault(idexFiles[fila], [])
                # Frequency of term k in this document.
                keys.append(matx.data[i])
                keys.append(largo_doc[fila])
        entrada['Documentos'] = docs
    return inv

In [6]:
# Build the inverted index once; the query functions below read it as a global.
# Terms printed by this call are vocabulary entries found in no document.
ind_inv = indice_invertido(vectorizer.vocabulary_)

across
all
almost
along
also
although
among
amongst
amount
and
anyhow
anyway
around
back
be
beforehand
behind
between
beyond
bill
both
bottom
call
can
cannot
cant
con
could
de
detail
do
eight
either
eleven
enough
even
except
fifteen
fill
find
fire
first
five
former
found
front
full
further
get
give
go
have
here
herein
how
in
interest
it
keep
last
latter
least
less
might
mine
move
must
name
neither
never
nevertheless
next
nor
nothing
off
often
one
onto
other
out
over
own
part
per
put
rather
same
see
seem
serious
show
side
somehow
still
system
take
ten
then
therein
these
thick
thin
third
though
three
throughout
thru
top
toward
two
under
upon
well
where
wherein
whereupon
whether
whoever
whole
whose
with
within
without
would


# Función de limpieza del query

In [7]:
stopWords = stopwords.words('english')
def queryClean(texto):
    """Turn a free-text query into a list of normalized, stemmed tokens.

    Steps: lowercase, replace accented vowels with their plain form, turn
    every character that is not an ASCII letter (digits included) into a
    space, split on whitespace, drop one-letter tokens and English stop
    words, lemmatize each token as a verb and finally apply Porter stemming.

    Parameters
    ----------
    texto : str
        Raw query text.

    Returns
    -------
    list of str
        Processed tokens in their original order.
    """
    texto = texto.lower()
    # Accented vowels are mapped to plain vowels before the ASCII-only filter,
    # otherwise that filter would split the word at the accent.
    sustituciones = (
        ('(á|à|ä)', 'a'),
        ('(é|è|ë)', 'e'),
        ('(í|ì|ï)', 'i'),
        ('(ó|ò|ö)', 'o'),
        ('(ú|ù|ü)', 'u'),
        ('[^a-zA-Z]', ' '),  # anything that is not a letter becomes a space
        (' +', ' '),         # collapse runs of spaces
    )
    for patron, reemplazo in sustituciones:
        texto = re.sub(patron, reemplazo, texto)

    lematizador = WordNetLemmatizer()
    stemmer = PorterStemmer()
    tokens = []
    for w in texto.split():
        if len(w) > 1 and w.isalpha() and w not in stopWords:
            # Lemmatize as a verb first, then stem the lemma.
            tokens.append(stemmer.stem(lematizador.lemmatize(w, pos = "v")))
    return tokens

# Ranking por Term Frequency

In [8]:
def queryTF(word,top):
    """Rank the documents containing ``word`` by raw term frequency.

    Parameters
    ----------
    word : str
        Term to look up in the global inverted index ``ind_inv``.
    top : int
        Number of leading results to return.

    Returns
    -------
    list of tuple
        ``(file_name, [tf, doc_len])`` pairs in descending order of the
        value list, with the file name as tie-breaker.
    """
    ranking = list(ind_inv[word]['Documentos'].items())
    ranking.sort(key=lambda par: (par[1], par[0]), reverse=True)
    return ranking[:top]

# Ranking por Term Frequency / Doc Length

In [9]:
def queryTFDL(word,top):
    """Rank the documents containing ``word`` by tf / document length.

    Parameters
    ----------
    word : str
        Term to look up in the global inverted index ``ind_inv``.
    top : int
        Number of leading results to return.

    Returns
    -------
    list of tuple
        ``(file_name, [tf / doc_len])`` pairs, highest ratio first, with the
        file name as tie-breaker.
    """
    documentos = ind_inv[word]['Documentos']
    # Each value is a one-element list, matching the shape callers receive.
    normalizado = {
        nombre: [valores[0] / valores[1]]
        for nombre, valores in documentos.items()
    }
    ranking = list(normalizado.items())
    ranking.sort(key=lambda par: (par[1], par[0]), reverse=True)
    return ranking[:top]

# Ranking usando BM25

In [10]:
def cal_bm25(idf,frec,k,b,length,avgdl):
    """Return the BM25 contribution of one term in one document.

    Parameters
    ----------
    idf : float
        Inverse document frequency of the term.
    frec : number
        Frequency of the term in the document.
    k : float
        Term-frequency saturation parameter.
    b : float
        Length-normalization parameter.
    length : number
        Length of the document.
    avgdl : float
        Average document length used for normalization.

    Returns
    -------
    float
        ``idf * frec*(k+1) / (frec + k*(1 - b + b*length/avgdl))``.
    """
    normalizacion = 1-b+b*length/avgdl
    numerador = frec*(k+1)
    denominador = frec+k*normalizacion
    return idf*(numerador/denominador)

In [11]:
def queryBM25(query, vocabulary, prom, k1, b, top):
    """Rank documents for a free-text query with BM25.

    The query is normalized with ``queryClean``. Every resulting term found
    in ``vocabulary`` contributes a BM25 score to each document listed for it
    in the global ``ind_inv``; scores are summed per file.

    Parameters
    ----------
    query : str
        Raw query text.
    vocabulary : container
        Terms known to the index; terms outside it are reported and skipped.
    prom : float
        Average document length (BM25 ``avgdl``).
    k1, b : float
        BM25 parameters.
    top : int
        Number of leading rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ranking`` (0-based position), ``NombreArchivo`` and
        ``BM25``, sorted by ``BM25`` in descending order. When no query term
        matches any document the frame is empty but keeps these columns, so
        callers can still merge on ``NombreArchivo``.
    """
    columnas = ['Ranking', 'NombreArchivo', 'BM25']
    registros = []
    for word in queryClean(query):
        if word not in vocabulary:
            print(f'{word} is not in the vocabulary')
            continue
        entrada = ind_inv[word]
        documentos = entrada['Documentos']
        if not documentos:
            # Vocabulary terms that occur in no document carry no 'IDF' key
            # in the index; they cannot contribute to any score.
            continue
        IDF = entrada['IDF']
        for ruta, v in documentos.items():
            registros.append({
                # Keep only the file name; accept both '\\' and '/' so the
                # index works regardless of the OS it was built on.
                'NombreArchivo': re.split(r'[\\/]', ruta)[-1],
                'Word': word,
                'BM25': cal_bm25(IDF, v[0], k1, b, v[1], prom),
            })
    if not registros:
        return pd.DataFrame(columns=columnas)
    # Aggregate once, after all terms were scored, instead of once per term.
    resultadoBm25 = (
        pd.DataFrame(registros)
        .groupby('NombreArchivo')
        .agg({'BM25': 'sum'})
        .sort_values('BM25', ascending=False)
        .reset_index()
        .reset_index()
        .rename(columns={'index': 'Ranking'})
    )
    return resultadoBm25.head(top)

### Definición de parámetros para el BM25

In [12]:
# Number of results requested from each ranker.
top = 20
# Average document length passed to BM25 as `avgdl`.
# NOTE(review): hard-coded value; presumably measured on the corpus behind
# `matrix` -- confirm it still matches the loaded model.
prom = 27544.226762002043
# BM25 parameters, shared by the hand-written ranker and the metapy ranker.
k1 = 1.2
b = 0.75
# Terms known to the index (term -> column number).
vocabulary = vectorizer.vocabulary_

# Metapy

In [13]:
# Create the metapy inverted index from the configuration in 'cranfield.toml'.
inv_idx = metapy.index.make_inverted_index('cranfield.toml')

In [14]:
# Summary statistics reported by the metapy index.
print(f'Total de documentos: {inv_idx.num_docs()}')
print(f'Cantidad de palabras únicas: {inv_idx.unique_terms()}')
print(f'Promedio de longitud de los documentos: {inv_idx.avg_doc_length()}')

Total de documentos: 980
Cantidad de palabras únicas: 51229
Promedio de longitud de los documentos: 3984.62646484375


# Metapy Ranking

In [15]:
def rankerMeta(top, querywords):
    """Rank documents for ``querywords`` with metapy's Okapi BM25.

    Uses the global ``inv_idx`` index, the global ``k1``/``b`` parameters and
    the global ``indexMeta`` mapping from metapy document id to file name.

    Parameters
    ----------
    top : int
        Number of results requested from the ranker.
    querywords : str
        Query text, passed to metapy unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``RankingMeta`` (0-based position), ``NombreArchivo`` and
        ``BM25_Meta``. The columns are present even when metapy returns no
        results, so callers can always merge on ``NombreArchivo``.
    """
    ranker = metapy.index.OkapiBM25(k1 = k1, b = b)
    query = metapy.index.Document()
    query.content(querywords)
    top_docs = ranker.score(inv_idx, query, num_results=top)
    # Each result is indexed as (document id, score).
    registros = [
        {'NombreArchivo': indexMeta[doc[0]], 'BM25_Meta': doc[1]}
        for doc in top_docs
    ]
    metaresult = pd.DataFrame(registros, columns=['NombreArchivo', 'BM25_Meta'])
    # The positional index becomes the rank column.
    metaresult = metaresult.reset_index().rename(columns={'index': 'RankingMeta'})
    return metaresult

# Evaluación de los queries

In [16]:
def calculate_sens(queries,top):
    """Measure, per query, how much the hand-written BM25 agrees with metapy.

    For each query, the value is the fraction of rows returned by
    ``queryBM25`` whose file also appears in the result of ``rankerMeta``.

    Parameters
    ----------
    queries : iterable of str
        Queries to evaluate.
    top : int
        Number of results requested from each ranker.

    Returns
    -------
    pandas.DataFrame
        Columns ``Query`` and ``Sensibilidad``, one row per query, indexed
        0..n-1. ``Sensibilidad`` is NaN when ``queryBM25`` returns no rows
        (the ratio is undefined) and 0.0 when metapy returns nothing to
        compare against.
    """
    registros = []
    for query in queries:
        resultados = queryBM25(query, vocabulary, prom, k1, b,top)
        if resultados.empty:
            # Nothing to compare: avoid merging a column-less frame and 0/0.
            sensibilidad = float('nan')
        else:
            metares = rankerMeta(top, query)
            if 'NombreArchivo' not in metares.columns:
                # metapy produced no rows, so no file can match.
                sensibilidad = 0.0
            else:
                merget = resultados.merge(metares, how = 'left', on = 'NombreArchivo')
                # Rows without a metapy match have NaN in 'RankingMeta'.
                sensibilidad = (merget['RankingMeta']>=0).sum()/len(merget)
        registros.append({'Query': query, 'Sensibilidad': sensibilidad})
    return pd.DataFrame(registros, columns=['Query', 'Sensibilidad'])

In [17]:
# Sample queries used to compare the hand-written BM25 with metapy's ranker.
queries = ["Data Science","Machine Learning", "Math","Computer Science","Algorithms in dynamic networks", "triangle free process"]
sensibilidad = calculate_sens(queries,top)

data is not in the vocabulary


In [18]:
# Display the per-query agreement table computed in the previous cell.
sensibilidad

Unnamed: 0,Query,Sensibilidad
0,Data Science,0.7
0,Machine Learning,0.95
0,Math,0.75
0,Computer Science,0.65
0,Algorithms in dynamic networks,0.85
0,triangle free process,0.75


In [19]:
# Query inspected in detail in the following cells.
query = "Math"

In [20]:
# Run both rankers on the same query so their results can be compared.
resultados = queryBM25(query, vocabulary, prom, k1, b,top)
metares = rankerMeta(top, query)

In [21]:
# Side by side: own BM25 ranking plus metapy's rank and score for the same
# file (NaN where the file is not among metapy's results).
resultados.merge(metares, how = 'left', on = 'NombreArchivo')

Unnamed: 0,Ranking,NombreArchivo,BM25,RankingMeta,BM25_Meta
0,0,1502.02348.txt,6.224341,0.0,5.528401
1,1,1509.01347.txt,6.108082,1.0,5.394665
2,2,1409.3562.txt,6.061366,3.0,5.00259
3,3,1404.3186.txt,6.050293,2.0,5.123775
4,4,1506.07094.txt,5.850258,4.0,4.545814
5,5,1508.00315.txt,5.799963,5.0,4.534198
6,6,1401.6312.txt,5.712589,10.0,4.087492
7,7,1306.3261.txt,5.567244,8.0,4.249309
8,8,1409.3176.txt,5.511779,13.0,3.948689
9,9,1509.02900.txt,5.489485,16.0,3.797037


In [22]:
# Per-document table; the cells below read its 'name_file' and 'cluster' columns.
gruposbydoc = pd.read_csv('docByCluster.csv')

## ¿Cuál documento quiere explorar más?
Indique su posición en el ranking (empezando en 0).

In [23]:
# Zero-based position, within `resultados`, of the document to explore.
documento = 3

In [24]:
# Cluster of the selected document, matched by file name.
# NOTE(review): `.values[0]` raises IndexError when the file name is missing
# from `gruposbydoc` -- confirm every ranked file has a cluster row.
cluster = gruposbydoc[gruposbydoc['name_file']==resultados['NombreArchivo'].iloc[documento]]['cluster'].values[0]

## Documentos cercanos

In [25]:
# List every document assigned to the same cluster as the selected one.
gruposbydoc[gruposbydoc['cluster']==cluster]

Unnamed: 0,identifier,title,description,subject,creator,combine_column,combine_cleaned,cuenta,cluster,name_file
79,http://arxiv.org/abs/1111.7013,A Taxation Policy for Maximizing Social Welfar...,We present a simple tatonnement process base...,Mathematics - Optimization and Control ; Compu...,"Kakhbod, Ali ; Koo, Joseph ; Teneketzis, Demos...",A Taxation Policy for Maximizing Social Welfar...,taxation policy maximize social welfare networ...,51,0,1111.7013.txt
104,http://arxiv.org/abs/1204.1846,Approximate Revenue Maximization with Multiple...,Maximizing the revenue from selling _more th...,Computer Science - Computer Science and Game T...,"Hart, Sergiu ; Nisan, Noam ;",Approximate Revenue Maximization with Multiple...,approximate revenue maximization multiple item...,88,0,1204.1846.txt
125,http://arxiv.org/abs/1208.6408,Java Source-code Clustering: Unifying Syntacti...,This is a companion draft to paper 'Software...,Computer Science - Software Engineering ; D.2....,"Misra, Janardan ; Kaulgud, Vikrant ; Titus, Ga...",Java Source-code Clustering: Unifying Syntacti...,java source code cluster unify syntactic seman...,57,0,1208.6408.txt
157,http://arxiv.org/abs/1301.1027,On online energy harvesting in multiple access...,We investigate performance limits of a multi...,Computer Science - Information Theory ;,"Khuzani, Masoud Badiei ; Mitran, Patrick ;",On online energy harvesting in multiple access...,online energy harvest multiple access communic...,126,0,1301.1027.txt
187,http://arxiv.org/abs/1304.6116,Selling Multiple Correlated Goods: Revenue Max...,"We consider the well known, and notoriously ...",Computer Science - Computer Science and Game T...,"Hart, Sergiu ; Nisan, Noam ;",Selling Multiple Correlated Goods: Revenue Max...,sell multiple correlate goods revenue maximiza...,109,0,1304.6116.txt
191,http://arxiv.org/abs/1305.2386,Disappointment in Social Choice Protocols,Social choice theory is a theoretical framew...,Computer Science - Multiagent Systems ; 91B14 ;,"Javidian, Mohammad Ali ; Ramezanian, Rasoul ;",Disappointment in Social Choice Protocols Soc...,disappointment social choice protocols social ...,55,0,1305.2386.txt
268,http://arxiv.org/abs/1311.2828,Private Matchings and Allocations,We consider a private variant of the classic...,Computer Science - Computer Science and Game T...,"Hsu, Justin ; Huang, Zhiyi ; Roth, Aaron ; Rou...",Private Matchings and Allocations We consider...,private match allocations consider private var...,150,0,1311.2828.txt
331,http://arxiv.org/abs/1403.1639,Optimal Patching in Clustered Malware Epidemics,Studies on the propagation of malware in mob...,Computer Science - Cryptography and Security ;...,"Eshghi, Soheil ; Khouzani, MHR. ; Sarkar, Sasw...",Optimal Patching in Clustered Malware Epidemic...,optimal patch cluster malware epidemics study ...,81,0,1403.1639.txt
332,http://arxiv.org/abs/1403.1642,Optimal Energy-Aware Epidemic Routing in DTNs,"In this work, we investigate the use of epid...",Computer Science - Systems and Control ; Compu...,"Eshghi, Soheil ; Khouzani, MHR. ; Sarkar, Sasw...",Optimal Energy-Aware Epidemic Routing in DTNs ...,optimal energy aware epidemic rout dtns work i...,93,0,1403.1642.txt
341,http://arxiv.org/abs/1403.5715,Mining Attribute-Based Access Control Policies...,Attribute-based access control (ABAC) provid...,Computer Science - Cryptography and Security ;...,"Xu, Zhongyuan ; Stoller, Scott D. ;",Mining Attribute-Based Access Control Policies...,mine attribute base access control policies lo...,57,0,1403.5715.txt


## Grupos de documentos

In [37]:
# Article-level cluster assignments, reduced to (file name, cluster label).
gruposdocsA = pd.read_csv('clustertable_articles.csv')
# The file name is the last path segment of the identifier plus '.txt'.
gruposdocsA['name_file'] = gruposdocsA['identifier'].map(
    lambda identificador: identificador.split('/')[-1]+'.txt'
)
gruposdocsA = gruposdocsA.rename(columns={'prediction': 'clusterA'})
gruposdocsA_merge = gruposdocsA.loc[:, ['name_file', 'clusterA']].copy()
# The full table is no longer needed once the two columns are copied.
del gruposdocsA

In [38]:
# LDA results per article, reduced to (file name, main topic).
ldaA = pd.read_csv('ldaresults_articles.csv')
# The file name is the last path segment of the identifier plus '.txt'.
ldaA['name_file'] = ldaA['identifier'].map(
    lambda identificador: identificador.split('/')[-1]+'.txt'
)
ldaA_merge = ldaA.loc[:, ['name_file', 'mainTopic']].copy()
# The full table is no longer needed once the two columns are copied.
del ldaA

In [40]:
# Attach the article-level cluster label and the LDA main topic to every
# document; outer joins keep files that appear in only one of the tables.
totalDocs = (
    gruposbydoc
    .merge(gruposdocsA_merge, how='outer', on='name_file')
    .merge(ldaA_merge, how='outer', on='name_file')
)
totalDocs

Unnamed: 0,identifier,title,description,subject,creator,combine_column,combine_cleaned,cuenta,cluster,name_file,clusterA,mainTopic
0,http://arxiv.org/abs/0704.3504,Smooth R\'enyi Entropy of Ergodic Quantum Info...,We prove that the average smooth Renyi entro...,Quantum Physics ; Computer Science - Informati...,"Schoenmakers, Berry ; Tjoelker, Jilles ; Tuyls...",Smooth R\'enyi Entropy of Ergodic Quantum Info...,smooth enyi entropy ergodic quantum informatio...,34,11,0704.3504.txt,1,10
1,http://arxiv.org/abs/0706.1402,Analyzing Design Process and Experiments on th...,"In the field of tutoring systems, investigat...",Computer Science - Computers and Society ; Com...,"Brust, Matthias R. ; Rothkugel, Steffen ;",Analyzing Design Process and Experiments on th...,analyze design process experiment anita generi...,100,10,0706.1402.txt,1,6
2,http://arxiv.org/abs/0710.0736,Colour image segmentation by the vector-valued...,We propose a new method for the numerical so...,Computer Science - Computer Vision and Pattern...,"Kay, David A ; Tomasi, Alessandro ;",Colour image segmentation by the vector-valued...,colour image segmentation vector value allen c...,69,11,0710.0736.txt,1,1
3,http://arxiv.org/abs/0803.2570,Unequal Error Protection: An Information Theor...,An information theoretic framework for unequ...,Computer Science - Information Theory ; Comput...,"Borade, Shashi ; Nakiboglu, Baris ; Zheng, Liz...",Unequal Error Protection: An Information Theor...,unequal error protection information theoretic...,75,5,0803.2570.txt,1,19
4,http://arxiv.org/abs/0808.0084,On the hitting times of quantum versus random ...,In this paper we define new Monte Carlo type...,Quantum Physics ; Computer Science - Data Stru...,"Magniez, Frederic ; Nayak, Ashwin ; Richter, P...",On the hitting times of quantum versus random ...,hit time quantum versus random walk paper defi...,120,11,0808.0084.txt,1,8
5,http://arxiv.org/abs/0811.1254,Coding Theory and Algebraic Combinatorics,This chapter introduces and elaborates on th...,Mathematics - Combinatorics ; Computer Science...,"Huber, Michael ;",Coding Theory and Algebraic Combinatorics Thi...,cod theory algebraic combinatorics chapter int...,62,5,0811.1254.txt,1,17
6,http://arxiv.org/abs/0811.2853,Generating Random Networks Without Short Cycles,Random graph generation is an important tool...,Computer Science - Data Structures and Algorit...,"Bayati, Mohsen ; Montanari, Andrea ; Saberi, A...",Generating Random Networks Without Short Cycle...,generate random network without short cycle ra...,107,3,0811.2853.txt,1,16
7,http://arxiv.org/abs/0812.2709,Variations on a theme by Schalkwijk and Kailath,Schalkwijk and Kailath (1966) developed a cl...,Computer Science - Information Theory ;,"Gallager, Robert G. ; Nakiboglu, Baris ;",Variations on a theme by Schalkwijk and Kailat...,variations theme schalkwijk kailath schalkwijk...,99,5,0812.2709.txt,1,10
8,http://arxiv.org/abs/0903.0197,Rotation Distance is Fixed-Parameter Tractable,Rotation distance between trees measures the...,Computer Science - Data Structures and Algorit...,"Cleary, Sean ; John, Katherine St. ;",Rotation Distance is Fixed-Parameter Tractable...,rotation distance fix parameter tractable rota...,48,4,0903.0197.txt,1,17
9,http://arxiv.org/abs/0903.0199,A Linear-Time Approximation Algorithm for Rota...,Rotation distance between rooted binary tree...,Computer Science - Data Structures and Algorit...,"Cleary, Sean ; John, Katherine St. ;",A Linear-Time Approximation Algorithm for Rota...,linear time approximation algorithm rotation d...,43,4,0903.0199.txt,1,17
