In [1]:
# Multiple outputs per cell in Jupyter 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%javascript
// Evitar autoscroll.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
import numpy as np
# from toolbox import *
import toolbox as tb
from skeleton import *
import pandas as pd
from gensim.models import FastText
%load_ext autoreload
%autoreload 2
import os
import tqdm
from microtc.utils import tweet_iterator
import datetime
from collections import Counter
import gc
from collections import defaultdict

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
class Tokens():
    tokenized_pointer=None
    
    def tokenize(self,json_file,npy_file=None,replace=False,showProgressEach=1000):
        self.npy_file=str(os.path.splitext(json_file)[0]+".npy") if not npy_file else npy_file

        if not replace and os.path.isfile(self.npy_file):
            print(f"** Replace is off. {os.path.abspath(self.npy_file)} already exists, then load.")
        else:
            tokenized_docs=[]
            self.N=0
            print(f"** Processing {json_file} ...")
            tx = datetime.datetime.now()
            for idx, tw in enumerate(tweet_iterator(json_file)):
                tb.show_progress(showProgressEach, tx, idx)
                twTokens = tb.process_line(tw['text']) # Tokenizando tweet.
                tokenized_docs.append(twTokens)
            self.N=idx+1
        
            maxLen=len(max(tokenized_docs))
            for i, doc in enumerate(tokenized_docs):
                tokenized_docs[i]=[doc[x] if x<len(doc) else '' for x in range(maxLen)]
            
            np.save(self.npy_file,tokenized_docs)
            del(tokenized_docs)
            gc.collect()
            print(f"** Processed {self.N} lines. Saved to {os.path.abspath(self.npy_file)}.")
        
        self.pointer=np.load(self.npy_file, mmap_mode='r')
        return self
    
    def get(self,n):
        return [x for x in self.pointer[n,:] if x != '']
    
    def getDocs(self,docs):
#         if isinstance(docs,int): docs=[docs]
        res=[]
        for n in docs:
            res.append(self.get(n))
        return res

In [13]:
tokens=Tokens().tokenize("data/geo-mx-2004_min.json",replace=True,showProgressEach=10000)

** Processing data/geo-mx-2004_min.json ...
2020-05-12 19:16:27.990884 :: 0.000193 - Processing item #0 
** Processed 10000 lines. Saved to /home/jovyan/public/B6_ProyectoIntegrador/data/geo-mx-2004_min.npy.


In [16]:
class Index():
    N = 0
    
    def computePostingLists(self, tokens, showProgressEach=1000):
        tx = datetime.datetime.now()
        DF = Counter({})
        TF = []
        
        print("** Counting TF & DF ...")
        for idx in range(tokens.N):
            tb.show_progress(showProgressEach,tx,idx)
            tok=tokens.get(idx)
            twCnt = Counter(tok) # Sumando frecuencias individiales.
            TF.append(twCnt)
            DF.update(list(twCnt.keys())) # Sumando frecuencias por documento (DF).
            
        self.N=idx+1 # Contabilizando documentos.
        self.postlists=defaultdict(list)
        print("** Calculating TF-IDF...")
        for i,c in enumerate(TF):
            tb.show_progress(showProgressEach,tx,i)
            for w,tf in c.items():
                self.postlists[w].append((tf*np.log2(self.N/(DF[w]+1)),i))
        del(TF)
        del(DF)
        gc.collect()
        
        print("** Sorting posting lists...")
        for idx,(w,p) in enumerate(self.postlists.items()):
            tb.show_progress(showProgressEach,tx,idx)
            p.sort()
            p=list(zip(*p))
            self.postlists[w]=[list(p[1]),list(p[0])]
        
        return self
    
    def getIdxs(self,word):
        return self.postlists[word][0]
    
    def getScores(self,word):
        return self.postlists[word][1]
    
    def a(self, q, isText=False):
        """Búsqueda AND."""
        if isText: q=tb.process_line(q)
        res = None
        for word in q:
            if not res:
                res = set(self.getIdxs(word))
            else:
                res = res.intersection(self.getIdxs(word))
        return list(sorted(res))

    def o(self, q, isText=False):
        """Búsqueda OR."""
        if isText: q=tb.process_line(q)
        res = set()
        for word in q:
            res = res.union(self.getIdxs(word))
        return list(sorted(res))    

In [17]:
index_=Index().computePostingLists(tokens,showProgressEach=10000)
# index_.postlists

** Counting TF & DF ...
2020-05-12 19:16:53.001421 :: 0.001253 - Processing item #0 
** Calculating TF-IDF...
2020-05-12 19:16:53.861872 :: 0.861704 - Processing item #0 
** Sorting posting lists...
2020-05-12 19:16:54.663991 :: 1.663823 - Processing item #0 
2020-05-12 19:16:54.758209 :: 1.758041 - Processing item #10000 


In [18]:
len(index_.postlists.keys())

10383

In [23]:
ds=index_.a(['hoy'])
print(ds)
print(tokens.getDocs(ds))

[3, 5, 45, 73, 100, 124, 151, 154, 169, 258, 270, 280, 300, 343, 358, 464, 473, 572, 608, 610, 612, 624, 635, 657, 667, 669, 781, 800, 919, 953, 990, 1000, 1003, 1019, 1039, 1106, 1109, 1122, 1154, 1162, 1189, 1213, 1215, 1217, 1300, 1306, 1310, 1406, 1451, 1490, 1545, 1592, 1595, 1771, 1877, 1915, 1933, 1981, 2017, 2115, 2244, 2276, 2401, 2482, 2561, 2571, 2584, 2585, 2637, 2641, 2696, 2741, 2777, 2800, 2825, 2835, 2854, 2892, 2916, 3053, 3141, 3163, 3171, 3210, 3231, 3246, 3276, 3371, 3374, 3378, 3389, 3434, 3490, 3505, 3515, 3643, 3659, 3676, 3719, 3787, 3789, 3792, 3856, 3988, 4026, 4041, 4047, 4073, 4201, 4245, 4282, 4355, 4432, 4449, 4605, 4613, 4638, 4825, 4886, 4956, 4957, 5087, 5157, 5164, 5169, 5179, 5216, 5247, 5263, 5267, 5279, 5463, 5468, 5544, 5575, 5576, 5792, 5827, 5876, 5967, 5992, 6011, 6036, 6052, 6069, 6102, 6173, 6197, 6211, 6238, 6280, 6295, 6336, 6355, 6393, 6402, 6406, 6447, 6458, 6500, 6503, 6509, 6515, 6563, 6571, 6586, 6612, 6623, 6626, 6641, 6707, 6742, 6743

In [None]:
gc.collect()
import sys
local_vars = list(locals().items())
tot=0
vars_=[]
for var, obj in local_vars:
    mem=sys.getsizeof(obj)
    tot+=mem
    vars_.append((mem,var))
print("Total =",tot)
vars_.sort(reverse=True)
print(vars_)