# PageRank

A vida é muito bonita, gostaria de viver para sempre.

- Eduardo Adame
- Rodrigo Pintucci

In [1]:
import numpy as np
import numpy.linalg as la
import pandas as pd
import requests
import re
from urllib.parse import urlparse, urldefrag
from bs4 import BeautifulSoup as bs

### Criando variáveis importantes.

In [2]:
# Module-level registry of the absolute page URLs that scraping_loop has
# already requested; scraping_loop reads and updates it.
all_links: set = set()

### Definindo funções.

In [3]:
def find_probs(url: str, timeout: float = 30.0):
    """Fetch one page and compute the probability of following each wiki link.

    The page at ``url`` is downloaded, every ``<a>`` element tagged
    ``rel="mw:WikiLink"`` is collected, URL fragments (``#section``) are
    stripped, and each distinct target receives the fraction of the page's
    wiki links that point to it.

    Parameters
    ----------
    url : str
        Address of the page to download. The probability column is named
        after it, with the prefix
        ``https://pt.wikipedia.org/api/rest_v1/page/html`` replaced by ``.``.
    timeout : float, optional
        Seconds ``requests`` waits for the server before raising instead of
        blocking indefinitely. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``link`` (one row per distinct target, fragment removed)
        and a column named after ``url`` holding the share of links pointing
        to that target. The frame is empty when the page has no wiki links.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds ``timeout``.
    """
    ################
    ### SCRAPING ###
    ################

    # Request the page. A timeout is mandatory here: without one a single
    # stalled connection would freeze the whole crawl. HTTP error responses
    # are still parsed like any other body, exactly as before.
    response = requests.get(url, timeout=timeout)

    # Build the parser from the returned HTML.
    soup = bs(response.text, 'lxml')

    # Collect every anchor that points to another wiki page.
    anchors = soup.find_all('a', attrs={'rel': 'mw:WikiLink'})

    # Keep only anchors that actually carry an href (a missing attribute comes
    # back as None and would end up as a non-string entry in the table), and
    # strip the fragment so '#section' variants count as the same target.
    targets = [
        urldefrag(href)[0]
        for href in (anchor.get('href') for anchor in anchors)
        if href is not None
    ]

    #####################
    ### PROBABILITIES ###
    #####################

    df = pd.DataFrame(targets, columns=['link'])

    # Count how many times the page references each target.
    df = df.groupby(['link']).size().reset_index(name='count')

    # Standardised column label derived from the page URL.
    title = url.replace('https://pt.wikipedia.org/api/rest_v1/page/html', '.')

    # Normalise the counts so the column sums to 1 for a page with links.
    df[title] = df['count'] / df['count'].sum()

    return df[['link', title]]

In [4]:
def scraping_loop(first_url: str):
    """Crawl ``first_url`` and the pages it links to, one level deep.

    ``find_probs`` is called for ``first_url`` and then once for every
    distinct page linked from it. The per-page probability columns are
    outer-joined on ``link`` into a single table.

    Parameters
    ----------
    first_url : str
        Absolute URL of the starting page.

    Returns
    -------
    pandas.DataFrame
        A ``link`` column plus one probability column per crawled page.
        Links a page does not reference are filled with 0.

    Notes
    -----
    The set of visited pages is kept local to each call, so calling the
    function again (for example when re-running the notebook cell) crawls the
    pages again instead of silently skipping everything recorded by an earlier
    call. The visited URLs are still added to the module-level ``all_links``
    set when the crawl finishes.
    """
    ##########################
    ### MAIN SCRAPING LOOP ###
    ##########################

    rest_prefix = 'https://pt.wikipedia.org/api/rest_v1/page/html/'

    # Pages requested during THIS call; the start page counts as visited.
    visited = {first_url}

    # Links (and their probabilities) found on the start page.
    links = find_probs(first_url)

    # Copy that accumulates the merged columns and is returned at the end.
    main_frame = links.copy()

    for link in links['link']:
        # Turn the relative link into an absolute URL. Only a leading './'
        # marks a relative wiki link; occurrences elsewhere in the title must
        # not be rewritten.
        if link.startswith('./'):
            full_link = rest_prefix + link[2:]
        else:
            full_link = link
        full_link = urldefrag(full_link)[0]

        # Skip pages already fetched in this crawl.
        if full_link in visited:
            continue
        visited.add(full_link)

        # Build this page's probability column and outer-join it on 'link'.
        probs = find_probs(full_link)
        main_frame = pd.merge(main_frame, probs, how='outer', on='link')

    # Publish what was crawled to the module-level registry.
    all_links.update(visited)

    return main_frame.fillna(0)


### Gerando dataset.

In [5]:
# Starting page of the crawl, addressed through the REST HTML endpoint that
# find_probs and scraping_loop expect as URL prefix.
url = 'https://pt.wikipedia.org/api/rest_v1/page/html/Álgebra_linear'

In [6]:
# Crawl the start page and every page it links to; this issues one HTTP
# request per page, so the cell is slow.
df2 = scraping_loop(first_url=url)

In [7]:
# Display the crawled table: one row per link, one probability column per page
# (pandas truncates the rendering of a frame this large).
df2

Unnamed: 0,link,./Álgebra_linear,./Ajuda:Controle_de_autoridade,./Anel_(matemática),./Análise_complexa,./Análise_funcional,./Análise_matemática,./Análise_matricial,./Análise_numérica,./Análise_não_padronizada,...,./Zero,./Álgebra,./Álgebra_abstrata,./Álgebra_booleana,./Álgebra_comutativa,./Álgebra_elementar,./Álgebra_linear_numérica,./Álgebra_multilinear,./Álgebra_não_linear,./Álgebra_universal
0,./Ajuda:Controle_de_autoridade,0.005319,0.033333,0.006369,0.007576,0.004926,0.003774,0.0,0.004831,0.000000,...,0.008403,0.005848,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
1,./Anel_(matemática),0.005319,0.000000,0.019108,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.011696,0.010929,0.012195,0.022222,0.008333,0.0,0.000000,0.0,0.024793
2,./Análise_complexa,0.005319,0.000000,0.000000,0.045455,0.009852,0.018868,0.0,0.004831,0.000000,...,0.000000,0.005848,0.005464,0.000000,0.022222,0.008333,0.0,0.013699,0.0,0.000000
3,./Análise_funcional,0.010638,0.000000,0.000000,0.007576,0.014778,0.018868,0.0,0.009662,0.000000,...,0.000000,0.005848,0.005464,0.000000,0.000000,0.008333,0.0,0.013699,0.0,0.000000
4,./Análise_matemática,0.005319,0.000000,0.000000,0.015152,0.019704,0.011321,0.0,0.009662,0.012658,...,0.000000,0.011696,0.005464,0.000000,0.000000,0.008333,0.0,0.013699,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7057,./William_Lawvere,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.016529
7058,./Álgebra_de_grafo,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.008264
7059,./Álgebra_de_termos,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.008264
7060,./Álgebra_simples,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.008264


In [8]:
# Sanity check: every page column that found at least one link sums to 1, so
# the grand total counts the pages that contributed links.
df2.drop(columns='link').sum().sum()

139.0

In [9]:
# Number of distinct link targets collected across all crawled pages.
df2.loc[:, 'link'].nunique()

7062

In [10]:
# Persist the table without the positional index so the crawl does not have to
# be repeated to reuse the data.
df2.to_csv(path_or_buf="my_wikipedia.csv", index=False)