In [1]:
import numpy as np
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix,save_npz,load_npz
import time
import sys
import os
import re
import json
import string
from collections import Counter
import sqlite3
import glob
import matplotlib.pyplot as plt
%matplotlib qt
from mpl_toolkits.mplot3d import axes3d, Axes3D

In [2]:
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
stop_words=["a","abord","absolument","afin","ah","ai","aie","aient","aies","ailleurs","ainsi","ait","allaient","allo","allons","allô","alors","anterieur","anterieure","anterieures","apres","après","as","assez","attendu","au","aucun","aucune","aucuns","aujourd","aujourd'hui","aupres","auquel","aura","aurai","auraient","aurais","aurait","auras","aurez","auriez","aurions","aurons","auront","aussi","autre","autrefois","autrement","autres","autrui","aux","auxquelles","auxquels","avaient","avais","avait","avant","avec","avez","aviez","avions","avoir","avons","ayant","ayez","ayons","b","bah","bas","basee","bat","beau","beaucoup","bien","bigre","bon","boum","bravo","brrr","c","car","ce","ceci","cela","celle","celle-ci","celle-là","celles","celles-ci","celles-là","celui","celui-ci","celui-là","celà","cent","cependant","certain","certaine","certaines","certains","certes","ces","cet","cette","ceux","ceux-ci","ceux-là","chacun","chacune","chaque","cher","chers","chez","chiche","chut","chère","chères","ci","cinq","cinquantaine","cinquante","cinquantième","cinquième","clac","clic","combien","comme","comment","comparable","comparables","compris","concernant","contre","couic","crac","d","da","dans","de","debout","dedans","dehors","deja","delà","depuis","dernier","derniere","derriere","derrière","des","desormais","desquelles","desquels","dessous","dessus","deux","deuxième","deuxièmement","devant","devers","devra","devrait","different","differentes","differents","différent","différente","différentes","différents","dire","directe","directement","dit","dite","dits","divers","diverse","diverses","dix","dix-huit","dix-neuf","dix-sept","dixième","doit","doivent","donc","dont","dos","douze","douzième","dring","droite","du","duquel","durant","dès","début","désormais","e","effet","egale","egalement","egales","eh","elle","elle-même","elles","elles-mêmes","en","encore","enfin","entre","envers","environ","es","essai","est","et","etant","etc","etre","eu","eue","eues","euh","eurent","eus","eusse","eussent","eusses","eussiez","eussions","eut","eux","eux-mêmes","exactement","excepté","extenso","exterieur","eûmes","eût","eûtes","f","fais","faisaient","faisant","fait","faites","façon","feront","fi","flac","floc","fois","font","force","furent","fus","fusse","fussent","fusses","fussiez","fussions","fut","fûmes","fût","fûtes","g","gens","h","ha","haut","hein","hem","hep","hi","ho","holà","hop","hormis","hors","hou","houp","hue","hui","huit","huitième","hum","hurrah","hé","hélas","i","ici","il","ils","importe","j","je","jusqu","jusque","juste","k","l","la","laisser","laquelle","las","le","lequel","les","lesquelles","lesquels","leur","leurs","longtemps","lors","lorsque","lui","lui-meme","lui-même","là","lès","m","ma","maint","maintenant","mais","malgre","malgré","maximale","me","meme","memes","merci","mes","mien","mienne","miennes","miens","mille","mince","mine","minimale","moi","moi-meme","moi-même","moindres","moins","mon","mot","moyennant","multiple","multiples","même","mêmes","n","na","naturel","naturelle","naturelles","ne","neanmoins","necessaire","necessairement","neuf","neuvième","ni","nombreuses","nombreux","nommés","non","nos","notamment","notre","nous","nous-mêmes","nouveau","nouveaux","nul","néanmoins","nôtre","nôtres","o","oh","ohé","ollé","olé","on","ont","onze","onzième","ore","ou","ouf","ouias","oust","ouste","outre","ouvert","ouverte","ouverts","o|","où","p","paf","pan","par","parce","parfois","parle","parlent","parler","parmi","parole","parseme","partant","particulier","particulière","particulièrement","pas","passé","pendant","pense","permet","personne","personnes","peu","peut","peuvent","peux","pff","pfft","pfut","pif","pire","pièce","plein","plouf","plupart","plus","plusieurs","plutôt","possessif","possessifs","possible","possibles","pouah","pour","pourquoi","pourrais","pourrait","pouvait","prealable","precisement","premier","première","premièrement","pres","probable","probante","procedant","proche","près","psitt","pu","puis","puisque","pur","pure","q","qu","quand","quant","quant-à-soi","quanta","quarante","quatorze","quatre","quatre-vingt","quatrième","quatrièmement","que","quel","quelconque","quelle","quelles","quelqu'un","quelque","quelques","quels","qui","quiconque","quinze","quoi","quoique","r","rare","rarement","rares","relative","relativement","remarquable","rend","rendre","restant","reste","restent","restrictif","retour","revoici","revoilà","rien","s","sa","sacrebleu","sait","sans","sapristi","sauf","se","sein","seize","selon","semblable","semblaient","semble","semblent","sent","sept","septième","sera","serai","seraient","serais","serait","seras","serez","seriez","serions","serons","seront","ses","seul","seule","seulement","si","sien","sienne","siennes","siens","sinon","six","sixième","soi","soi-même","soient","sois","soit","soixante","sommes","son","sont","sous","souvent","soyez","soyons","specifique","specifiques","speculatif","stop","strictement","subtiles","suffisant","suffisante","suffit","suis","suit","suivant","suivante","suivantes","suivants","suivre","sujet","superpose","sur","surtout","t","ta","tac","tandis","tant","tardive","te","tel","telle","tellement","telles","tels","tenant","tend","tenir","tente","tes","tic","tien","tienne","tiennes","tiens","toc","toi","toi-même","ton","touchant","toujours","tous","tout","toute","toutefois","toutes","treize","trente","tres","trois","troisième","troisièmement","trop","très","tsoin","tsouin","tu","té","u","un","une","unes","uniformement","unique","uniques","uns","v","va","vais","valeur","vas","vers","via","vif","vifs","vingt","vivat","vive","vives","vlan","voici","voie","voient","voilà","vont","vos","votre","vous","vous-mêmes","vu","vé","vôtre","vôtres","w","x","y","z","zut","à","â","ça","ès","étaient","étais","était","étant","état","étiez","étions","été","étée","étées","étés","êtes","être","ô"]
stop_words+=["»","«","''"," ","–"]
stop_words=set(stop_words)

In [3]:
def remove_tag(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def get_counters(text):
    text_without_tag=remove_tag(text)
#     text_without_tag=''.join([i for i in text_without_tag if not i.isdigit()])
    text_split=text_without_tag.translate(table).lower().replace('\n', ' ').split(' ')
    text_without_tag=[i for i in text_split if i not in stop_words and i!=""]
#     bigrams=[text_without_tag[i]+" "+text_without_tag[i+1] for i in range(0,len(text_without_tag)-1)
#             if text_without_tag[i]!='' and text_without_tag[i+1]!='']
    
    counter_text=Counter(text_without_tag)
#     counter_bigrams=Counter(bigrams)
    counter_bigrams={}
    return counter_text,counter_bigrams


def clean_dataset(voc_dict,voc_dict_inverse,voc_dict_triplet):
    index_to_remove=set([key for key in voc_dict_triplet.keys() if len(voc_dict_triplet[key])==1])
    voc_dict={key:value for key,value in voc_dict.items() if value not in index_to_remove}
    voc_dict_inverse={key:value for key,value in voc_dict_inverse.items() if key not in index_to_remove}
    voc_dict_triplet={key:value for key,value in voc_dict_triplet.items() if key not in index_to_remove}
#     return voc_dict,voc_dict_inverse,voc_dict_triplet

# def get_matrix(voc_dict_triplet):
#     row=[]
#     column=[]
#     data=[]
#     index=0
#     new_voc_dict={}
#     for key,value in voc_dict_triplet.items():
#         new_voc_dict[voc_dict_inverse[key]]=index
#         for triplet in value:
#             row.append(triplet[0])
#             column.append(index)
#             data.append(triplet[1])
#         index+=1
#     return csc_matrix((data, (row, column)), shape=(np.max(row)+1,len(voc_dict_triplet))),new_voc_dict

def get_triplet(key):
    files_list=glob.glob('*data_*')
    json_list=[]
    for file in files_list:
        f = open(file)
        dic = json.load(f)
        json_list.append(dic)
        f.close()
    del json_list

def get_matrix(voc_dict_triplet):
    row=[]
    column=[]
    data=[]
    index=0
    new_voc_dict={}
    for key,value in voc_dict_triplet.items():
        new_voc_dict[key]=index
        for duo in value:
            for id_ in duo[0]:
                row.append(id_)
                column.append(index)
                data.append(duo[1])
        index+=1
    return csc_matrix((data, (row, column)), shape=(np.max(row)+1,len(voc_dict_triplet))),new_voc_dict

def increase_title_weights(B,M,dataset,new_voc_dict):
    B=B.tolil()
    for i in range(M.shape[0]):
        article_title=dataset[i][1]
        counter_text,counter_bigrams=get_counters(article_title)
        index=[new_voc_dict[key] for counter in [counter_text,counter_bigrams] for key in counter.keys() if key in new_voc_dict]
        for j in index:
            B[i,j]=10*B[i,j]
    B=B.tocsc()
    return B
    
def get_C(M,dataset,new_voc_dict):
    B=M.copy()
    B.data=np.log10(B.data)+1
    B=increase_title_weights(B,M,dataset,new_voc_dict)
    A=count/np.sum(M>0,axis=0)
    A.data=np.log10(A.data)
    C=B.multiply(A).astype("float16")
    del A
    del B
    return C

def get_text(adress,line,query):
    with open(adress) as f:
        lines = [line.rstrip('\n') for line in f]
    text=json.loads(lines[line])
    text=remove_tag(text['text'])
    text=highlight_text(text,query)
    return text
    
def highlight_text(text,query):
    for i in query.split(' '):
        text=text.replace(i, "<b>"+i+"</b>")
    return text
    

def get_closest_articles(C,M,new_voc_dict,dataset,query,k=50):
    counter_words,counter_bigrams=get_counters(query)
    keys=[]
    tf_idf=[]
    for counter in [counter_words,counter_bigrams]:
        for key,value in counter.items():
            if key in new_voc_dict:
                tf=1+np.log10(value/(len(counter_words)+len(counter_bigrams)))
                idf=np.log10(C.shape[0]/np.sum(M.getcol(new_voc_dict[key])))
                tf_idf.append(tf*idf)
                keys.append(new_voc_dict[key])
    dist = np.array((C[:,keys] + np.array(tf_idf).reshape(1,len(tf_idf))))
    dist = np.sum(dist**2, axis=1)
    argsort=np.argsort(dist)[::-1]
    return [dataset[argsort[i]] for i in range(k)]

In [4]:
# conn = sqlite3.connect('../Database/Database.db')
# cursor = conn.cursor()
# create_db="CREATE TABLE TMP(ID_WORD NUMERIC,KEY Text,ID_ARTICLE Numeric,);"
# cursor.execute(create_db)

In [5]:
path='/home/gabriel/Documents/MPRI/Web_Data_Management/wikiextractor-master/text/'
count=0
dataset={}
voc_dict_triplet={}
index=0

for w,i in enumerate(os.listdir(path)):
    for j in os.listdir(path+i):
        for filename in os.listdir(path+i+'/'+j):
            with open(path+i+'/'+j+'/'+filename) as f:
                lines = [line.rstrip('\n') for line in f]
            for line_index,line in enumerate(lines):
                a=json.loads(line) 
                if 'text' in a:
                    counter_text,counter_bigrams=get_counters(a['text'])
                    for counter in [counter_text]:
                        for key,value in counter.items():
                            if key not in voc_dict_triplet:
                                voc_dict_triplet[key]=[([count],value)]
                                index+=1
                            else:
                                add=False
                                for qq in voc_dict_triplet[key]:
                                    if qq[1]==value:
                                        qq[0].append(count)
                                        add=True
                                        break
                                if add==False:
                                    voc_dict_triplet[key].append(([count],value))
                                    
                    location=path+i+'/'+j+'/'+filename
                    dataset[count]=[a['id'],a['title'],a['url'],location,line_index]
                    count+=1
                        
                    if count%100000==0:
                        somme=lambda  x:sum([i[1]*len(i[0]) for i in x])
                        
                        quartile1=np.percentile([somme(value) for key,value in voc_dict_triplet.items()],70)
#                         quartile2=np.percentile([somme(value) for key,value in voc_dict_triplet.items()],99)
                        
                        index_to_remove=set([key for key,value in voc_dict_triplet.items() if (somme(value)<=quartile1)])
#                          or somme(value)>=quartile2)
#                         print(quartile1,quartile2)
                        print(quartile1)
#                         print(index_to_remove)
#                         assert(True==False)
                        voc_dict_triplet={key:value for key,value in voc_dict_triplet.items() if key not in index_to_remove}
                        
                        print(count,len(voc_dict_triplet),sys.getsizeof(voc_dict_triplet)/1024)

4.0
100000 117671 6144.09375
6.0
200000 129714 6144.09375
7.0
300000 146956 6144.09375
8.0
400000 160189 6144.09375
10.0
500000 161741 6144.09375
14.0
600000 151440 6144.09375
6.0
700000 184583 12288.09375
8.0
800000 199843 12288.09375
25.0
900000 147096 6144.09375
32.0
1000000 136979 6144.09375
26.0
1100000 137777 6144.09375
16.0
1200000 140099 6144.09375
11.0
1300000 144036 6144.09375
8.0
1400000 155217 6144.09375
12.0
1500000 155238 6144.09375
42.0
1600000 136667 6144.09375
5.0
1700000 172004 6144.09375
5.0
1800000 229944 12288.09375
26.0
1900000 148061 6144.09375
37.0
2000000 142784 6144.09375


In [6]:
somme=lambda  x:sum([i[1] for i in x])
quartile1=np.percentile([somme(value) for key,value in voc_dict_triplet.items()],30)
#                         quartile2=np.percentile([somme(value) for key,value in voc_dict_triplet.items()],99)

index_to_remove=set([key for key,value in voc_dict_triplet.items() if (somme(value)<=quartile1)])
#                          or somme(value)>=quartile2)
#                         print(quartile1,quartile2)
print(quartile1)
#                         print(index_to_remove)
#                         assert(True==False)
voc_dict_triplet={key:value for key,value in voc_dict_triplet.items() if key not in index_to_remove}
with open('data_'+str(count)+'.json', 'w') as fp:
    json.dump(voc_dict_triplet, fp)
voc_dict_triplet={key:[] for key in voc_dict_triplet.keys()}

0.0


In [25]:
tmp=max(voc_dict_triplet,key=lambda x:sys.getsizeof(voc_dict_triplet[x]))
# sys.getsizeof(voc_dict_triplet[tmp])
tmp

'né'

In [None]:
def sizeof_fmt(num, suffix='B'):
    ''' By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))

In [16]:
len(voc_dict_triplet),sys.getsizeof(voc_dict_triplet)

(484987, 25165920)

In [None]:
# f = open('../Database/dataset.json')
# dataset = json.load(f)
# f.close()
# dataset={int(key):[value[0],int(value[1]),value[2],value[3],value[4]] for key,value in dataset.items()}

In [None]:
# conn = sqlite3.connect('../Database/info.db')
# cursor = conn.cursor()
# create_db="CREATE TABLE Informations(ID NUMERIC,Title TEXT,URL TEXT,Location TEXT,Line_index NUMERIC);"
# cursor.execute(create_db)
# index="CREATE INDEX index_id ON Informations (ID);"
# cursor.execute(index)

In [6]:
M,new_voc_dict=get_matrix(voc_dict_triplet)
del voc_dict_triplet
print("Second Step")
C=get_C(M,dataset,new_voc_dict)

Second Step




In [None]:
# query="théorie des graphes et des cliques et des graphes complets et des graphes bipartis"
# get_closest_articles(C,M,new_voc_dict,dataset,query,k=10)

In [None]:
3+4

In [7]:
save_npz('../Database/C.npz', C)
save_npz('../Database/M.npz', M)
j = json.dumps(dataset)
f = open("../Database/dataset.json","w")
f.write(j)
f.close()
j = json.dumps(new_voc_dict)
f = open("../Database/new_voc_dict.json","w")
f.write(j)
f.close()

In [None]:
C = load_npz("C.npz")
M = load_npz("M.npz")
f = open('new_voc_dict.json')
new_voc_dict = json.load(f)
f.close()
f = open('dataset.json')
dataset = json.load(f)
f.close()a['id'],a['title'],a['url'],location,line_index
dataset={int(key):[int(value[0]),value[1],value[2],value[3],value[4]] for key,value in dataset.items()}

In [28]:
query="théorie des graphes et des cliques et des graphes complets et des graphes bipartis"
result_list=get_closest_articles(C,M,new_voc_dict,dataset,query,k=10)

In [29]:
u, s, vt = svds(C, k=3)

In [36]:
w=u*s.reshape(1,3)

In [39]:
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter3D(w[:1000,0],w[:1000,1],w[:1000,2])

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f8bcc149630>