In [1]:
import os
import json
import urllib.parse
import re
import numpy as np
from scipy.sparse import coo_matrix,csc_matrix
import operator
import time
import json

#### This notebook computes four things:

- the link dict: for each article id, the list of ids of the articles it links to
- the inverse link dict: for each article id, the list of ids of the articles that link to it
- the PageRank vector: one PageRank score per article
- the indices dict: a dictionary that maps each article id to a row/column index of the link matrix

In [2]:
def upper_sentence(sentence):
    """Return ``sentence`` with its first character upper-cased.

    Only the first character is touched; the remainder is copied as is.
    An empty input is returned unchanged.
    """
    size = len(sentence)
    if size == 0:
        return sentence
    first = sentence[0].upper()
    return first if size == 1 else first + sentence[1:]

def remove_tag(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own, and ``.``
    does not match a newline, so a ``<`` whose closing ``>`` is on another
    line is left in place.
    """
    return re.sub('<.*?>', '', text)

# transform the name of the links into id of article
def compute_name_to_id_of_article(title_dict_inverse,link_dic):
    """Replace the link titles stored in ``link_dic`` by article ids.

    Parameters
    ----------
    title_dict_inverse : dict
        Maps an article title to its article id.
    link_dic : dict
        Maps an article id to the list of titles it links to. It is rewritten
        in place: each list of titles becomes the list of matching ids, and
        titles missing from ``title_dict_inverse`` are dropped.

    Returns
    -------
    dict
        The same ``link_dic`` object, now holding ids instead of titles.

    Prints the number of unresolved titles followed by the number of resolved
    ones.
    """
    unresolved = 0
    resolved = 0
    # Work on the argument itself (not on a notebook-level global) so that the
    # function converts whatever dictionary the caller passes in.
    for article_id, linked_titles in link_dic.items():
        linked_ids = [
            title_dict_inverse[name]
            for name in linked_titles
            if name in title_dict_inverse
        ]
        resolved += len(linked_ids)
        unresolved += len(linked_titles) - len(linked_ids)
        # Re-assigning an existing key is safe while iterating over items().
        link_dic[article_id] = linked_ids
    print(unresolved, resolved)
    return link_dic

def clean_links(link_dict):
    """Remove duplicate links and self-links from every entry of ``link_dict``.

    Parameters
    ----------
    link_dict : dict
        Maps an article id to the list of article ids it links to. The
        dictionary is updated in place.

    Returns
    -------
    dict
        The same ``link_dict`` object. Each list keeps one copy of every
        target, in order of first occurrence, and never contains the
        article's own id.
    """
    for key, targets in link_dict.items():
        # dict.fromkeys dedupes while keeping insertion order, so the result
        # does not depend on the per-process hash seed the way list(set(...))
        # does for string ids.
        link_dict[key] = [target for target in dict.fromkeys(targets) if target != key]
    return link_dict

def construct_inverse_link_dict(link_dict):
    """Build the reverse of ``link_dict``.

    ``link_dict`` maps a source id to the list of ids it links to; the result
    maps each linked id to the list of source ids that reference it, in the
    order the sources are met. Ids that nobody links to get no entry.
    """
    incoming = {}
    for source, targets in link_dict.items():
        for target in targets:
            incoming.setdefault(target, []).append(source)
    return incoming

def get_column_id(title_dict):
    """Number the keys of ``title_dict`` from 0 in iteration order.

    The link matrix is addressed by these positions rather than by article
    ids, so both directions of the mapping are returned.

    Returns
    -------
    tuple of (dict, dict)
        ``(id -> position, position -> id)``.
    """
    id_to_position = {}
    position_to_id = {}
    for position, article_id in enumerate(title_dict):
        id_to_position[article_id] = position
        position_to_id[position] = article_id
    return id_to_position, position_to_id

def prepare_data_sparse(title_dict,link_dict,indices_dict):
    """Build the coordinate lists of the sparse link matrix.

    For every article ``i`` of ``title_dict`` and every article ``j`` listed
    in ``link_dict[i]``, one entry is produced at row ``indices_dict[j]`` and
    column ``indices_dict[i]`` with value ``1/len(link_dict[i])``. Articles
    with an empty link list contribute no entry.

    Returns
    -------
    tuple of (list, list, list)
        ``(row, column, data)``, three lists of equal length.
    """
    rows = []
    cols = []
    weights = []
    for source in title_dict:
        targets = link_dict[source]
        if not targets:
            # Nothing to emit, and 1/len(targets) must not be evaluated.
            continue
        share = 1/len(targets)
        for target in targets:
            rows.append(indices_dict[target])
            cols.append(indices_dict[source])
            weights.append(share)
    return rows,cols,weights

# main algorithm
def page_rank(title_dict,M,BETA=0.85,eps=1e-5,max_iter=1000):
    """Compute PageRank scores by power iteration.

    Parameters
    ----------
    title_dict : sized collection
        Only its length ``n`` (the number of articles) is used.
    M : matrix-like with a ``dot`` method
        ``n x n`` link matrix; ``M.dot(r)`` must return an ``(n, 1)`` array.
    BETA : float
        Damping factor: weight of the link-following term, the remaining
        ``1 - BETA`` being spread uniformly over all articles.
    eps : float
        The iteration stops once the 2-norm of the change of ``r`` between
        two successive iterations is no larger than ``eps``.
    max_iter : int
        Upper bound on the number of iterations, so that an unreachable
        ``eps`` cannot make the loop run forever. A ``RuntimeWarning`` is
        issued when the bound is hit.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n, 1)`` whose entries sum to 1.
    """
    import warnings

    n = len(title_dict)
    if n == 0:
        # No article: nothing to rank, and 1/n below would be undefined.
        return np.zeros((0, 1))

    # Uniform jump term; it is the same at every iteration.
    teleport = np.full((n, 1), (1 - BETA) / n)
    r = np.full((n, 1), 1 / n)
    # Start from a vector far from r so that the loop body runs at least once
    # (for n > 1).
    last_r = np.ones((n, 1))
    iterations = 0
    while np.linalg.norm(r - last_r, 2) > eps:
        if iterations >= max_iter:
            warnings.warn(
                "page_rank stopped after %d iterations without reaching eps=%g"
                % (max_iter, eps),
                RuntimeWarning,
            )
            break
        last_r = r
        r = BETA * M.dot(r) + teleport
        # r is a fresh array here, so the in-place division leaves last_r
        # untouched. Renormalising keeps the scores summing to 1 even when
        # some columns of M do not sum to 1.
        r /= np.sum(r)
        iterations += 1
    return r

def advice_next_article(title="None",id="None"):
    """List the articles linking to a given article, best PageRank first.

    The article is designated either by ``title`` (takes precedence) or by
    ``id``; the string ``"None"`` means "not provided". The function reads
    the notebook-level ``title_dict_inverse``, ``link_dict_inverse``,
    ``indices_dict`` and ``r``.

    Returns
    -------
    list or None
        A list of ``[article_id, score]`` pairs sorted by decreasing score,
        where ``score`` is the entry of ``r`` for that article. ``None`` is
        returned, after printing a message, when the article cannot be found
        or has no entry in ``link_dict_inverse``.
    """
    if title!="None":
        if title not in title_dict_inverse:
            print("Title not in the dict")
            return
        id=title_dict_inverse[title]
    elif id=="None":
        # Neither a title nor an id was supplied.
        print("Title or id required")
        return

    # An article nobody links to has no entry in the inverse dictionary.
    if id not in link_dict_inverse:
        print("Id not in dict")
        return

    list_input_links=link_dict_inverse[id]
    output=[[source,r[indices_dict[source]]] for source in list_input_links]
    output=sorted(output, key=lambda x: x[1],reverse=True)
    return output

In [4]:
# Main loading step: walk the extracted dump and, for each article, record its
# title and the list of titles it links to.
path='/home/gabriel/Documents/MPRI/Web_Data_Management/wikiextractor-master/text/'
title_list=[]
text_list=[]
title_dict={}            # article id -> title
title_dict_inverse={}    # title -> article id
text_dict={}
link_dict={}             # article id -> titles referenced by the article's href attributes
location_dict={}
wv_dict={}
www_adress={}
df=[]
count=0                  # number of lines read over all files
# Captures the target of every href attribute, quoted or not.
href_pattern=re.compile(r'href=[\'"]?([^\'" >]+)')
# The dump is laid out as path/<dir>/<subdir>/<file>, one JSON document per line.
# Directory listings are sorted so that articles are met in the same order on
# every run, which keeps the row/column numbering reproducible.
for i in sorted(os.listdir(path)):
    for j in sorted(os.listdir(os.path.join(path,i))):
        for filename in sorted(os.listdir(os.path.join(path,i,j))):
            # Explicit encoding: titles contain accented characters and must not
            # depend on the locale default. NOTE(review): assumes the dump is
            # UTF-8 - confirm against the extractor settings.
            with open(os.path.join(path,i,j,filename),encoding='utf-8') as f:
                for line in f:
                    a=json.loads(line)
                    if 'text' in a:
                        title_dict[a['id']]=a['title']
                        title_dict_inverse[a['title']]=a['id']
                        urls=href_pattern.findall(a['text'])
                        # Link targets are percent-encoded; decode them and
                        # capitalise the first letter before storing them.
                        title_list_iri=[upper_sentence(urllib.parse.unquote(url)) for url in urls]
                        link_dict[a['id']]=title_list_iri
                    count+=1

In [5]:
# At this point link_dict maps each article id to the list of *titles* found in
# the href attributes of that article, i.e. the articles it links to.

# Replace each list of titles by the list of matching article ids
# (titles with no known article are dropped; the two counts are printed).
link_dict=compute_name_to_id_of_article(title_dict_inverse,link_dict)

# Remove duplicate links and self-links from each article's list.
link_dict=clean_links(link_dict)

# Build the inverse dictionary: for each article id, the list of ids of the
# articles that link to it.
link_dict_inverse=construct_inverse_link_dict(link_dict)

# Give every article id a row/column index (the matrix is addressed by index, not by id).
indices_dict,indices_dict_inverse=get_column_id(title_dict)

# Coordinate lists of the link matrix: one entry per link, at
# (index of target, index of source), valued 1/(number of links of the source).
row,column,data=prepare_data_sparse(title_dict,link_dict,indices_dict)

# Assemble the square link matrix in sparse format, since almost all of its
# entries are 0.
M=csc_matrix((data, (row, column)), shape=(len(title_dict),len(title_dict)))

# Compute the PageRank vector (one score per matrix index).
r=page_rank(title_dict,M,BETA=0.85,eps=1e-5)

4566433 30694716


In [6]:
# Ratio of the two counts printed by compute_name_to_id_of_article:
# link titles with no matching article / link titles that were resolved.
4566433/30694716

0.1487693517020975

In [None]:
# NOTE(review): link_dict was already converted from titles to ids earlier in the
# notebook; running the conversion again would look those ids up as if they were
# titles - confirm this cell is still wanted.
compute_name_to_id_of_article(title_dict_inverse,link_dict)

In [8]:
# Order the matrix indices by decreasing PageRank, then print the 15 best
# articles as: title, number of links kept in link_dict, score.
s=list(range(len(r)))
s.sort(key=lambda k: r[k],reverse=True)
for i in range(15):
    top_id=indices_dict_inverse[s[i]]
    print(title_dict[top_id],len(link_dict[top_id]),r[s[i]])

Calendrier julien 138 [0.00524479]
Année (calendrier) 14 [0.0039354]
Jour 45 [0.00391385]
France 1346 [0.00381026]
États-Unis 793 [0.00365807]
Calendrier grégorien 89 [0.00332039]
Lettre dominicale 19 [0.00257525]
31 décembre 8 [0.00255353]
Année commune commençant un vendredi 5 [0.00210453]
Année commune commençant un lundi 5 [0.00189827]
Année commune commençant un mercredi 5 [0.00189555]
Année commune commençant un samedi 10 [0.00176511]
Année commune commençant un dimanche 5 [0.00172646]
Année commune commençant un jeudi 6 [0.00170424]
Année commune commençant un mardi 5 [0.00169258]


In [7]:
# Save the inverse link dictionary as JSON. The with-block closes the file
# even if the write fails.
j = json.dumps(link_dict_inverse)
with open("../Database/link_dict_inverse.json","w") as f:
    f.write(j)

In [5]:
# Save the PageRank vector in NumPy format and the id -> matrix index mapping
# as JSON. The with-block closes the file even if the write fails.
np.save("../Database/r.npy",r)
j = json.dumps(indices_dict)
with open("../Database/indices_dict.json","w") as f:
    f.write(j)