# Outra forma de criar um dataset

Talvez a matriz tenha que ser quadrada. Vamos para o approach.

In [1]:
import numpy as np
import numpy.linalg as la
import pandas as pd
import requests
import re
from urllib.parse import urlparse, urldefrag
from bs4 import BeautifulSoup as bs

### A ideia se mantém a mesma, no geral.

In [2]:
all_links = set()

A diferença é que temos três funções. E que há uma filtragem dos links, como se a internet fosse limitada somente a esses sites.

In [3]:
def get_df(url: str):
    ################
    ### SCRAPING ###
    ################
    
    # Requisitando a página.
    raw_page = requests.get(url)
    
    # Adquirindo seu HTML.
    html_page = raw_page.text
    
    # Criando o objeto scrapper.
    soup = bs(html_page, 'lxml')
    
    # Capturando todos os links que levam à própria Wikipédia.
    links = soup.findAll('a', attrs= {'rel': 'mw:WikiLink'})
    
    # Criando lista de tuplas com links.
    data = [link.get('href') for link in links]
    
    # Criando dataframe a partir desses dados.
    df = pd.DataFrame(data, columns = ['link'])
    
    return df

In [4]:
def find_probs(url: str, df: pd.DataFrame): 
    #####################
    ### PROBABILIDADE ###
    #####################
    
    # Retirando fragmento da url
    df['link'] = df['link'].apply(lambda x: urldefrag(x)[0])
    
    # Contabilizando quantas referências a cada link o site possui. 
    df = df.groupby(['link']).size().reset_index(name='count')
    
    # Criando a identificação da coluna a partir da url (padronizar).
    title = url.replace('https://pt.wikipedia.org/api/rest_v1/page/html', '.')
    
    # Criando essa coluna de probabilidades identificada pelo site.
    df[title] = df['count']/df['count'].sum()
    
    return df[['link', title]]

In [29]:
def get_links(url: str, depth: int):
    #############################################
    ### CAPTURA TODOS OS LINKS PROFUNDIDADE 2 ###
    #############################################
    
    # Requisitando a página.
    raw_page = requests.get(url)
    
    # Adquirindo seu HTML.
    html_page = raw_page.text
    
    # Criando o objeto scrapper.
    soup = bs(html_page, 'lxml')
    
    # Capturando todos os links que levam à própria Wikipédia.
    links = soup.findAll('a', attrs= {'rel': 'mw:WikiLink'})
    
    # Criando set com links. (Removendo o #param)
    link_set = {urldefrag(link.get('href'))[0] for link in links}
    
    
    if depth > 0:
        for link in link_set:
            # Transformando link em absoluto
            full_link = link.replace('./', 'https://pt.wikipedia.org/api/rest_v1/page/html/') 
            # Recursividade
            new_link_set = get_links(full_link, depth -1)
            # Unindo os sets
            link_set = set.union(link_set, new_link_set)
        
        # Adicionando url original ao set
        title = url.replace('https://pt.wikipedia.org/api/rest_v1/page/html', '.')
        link_set.add(title)
        
        # Criando dataframe a partir desse set.
        link_filter = pd.DataFrame(link_set, columns = ['link'])

        return link_filter
    else:
        return link_set
    

In [6]:
def scraping_loop(first_url: str):
    ##################################
    ### LOOP PRINCIPAL DE SCRAPING ###
    ##################################
    
    # Referenciando ao set que contém todos os links
    global all_links
    
    # Adicionando o link principal a esse set
    all_links.add(first_url)
    
    # Encontra os links (e suas probabilidades) para a url inicial
    links_df = get_df(first_url)
    links = find_probs(first_url, links_df)
    
    # Cria uma cópia para ser retornada após merge
    main_frame = links.copy()
    
    # Itera entre os links da url principal
    for link in links['link']:
        
        # Transformando a url relativa para a absoluta (e removendo o #titulo).
        full_link = link.replace('./', 'https://pt.wikipedia.org/api/rest_v1/page/html/')
        full_link = urldefrag(full_link)[0]
        
        # Checa se o link já foi utilizado
        if full_link in all_links:
            continue
        else:
            # Adiciona ao set de links o link
            all_links.add(full_link)
            
            # Cria o df para esse link
            df = get_df(full_link)
            
            # Filtra os links para conter somente os de nossa internet
            df = df[df['link'].isin(links['link'])]
            
            # Cria a coluna (df) de probabilidades.
            probs = find_probs(full_link, df) 
            
            # Faz um outer join com o main_frame
            main_frame = pd.merge(main_frame, probs, how='outer', on='link')
            
    return main_frame.fillna(0)

In [7]:
def second_scraping_loop(first_url: str, link_filter: pd.DataFrame):
    ####################################
    ### LOOP PRINCIPAL DE SCRAPING 2 ###
    ####################################
    
    # Referenciando ao set que contém todos os links
    all_links = set()
    
    # Adicionando o link principal a esse set
    all_links.add(first_url)
    
    # Encontra os links (e suas probabilidades) para a url inicial
    links_df = get_df(first_url)
    links = find_probs(first_url, links_df)
    
    # Cria uma cópia para ser retornada após merge
    main_frame = links.copy()
    
    # Itera entre os links da url principal
    for link in link_filter['link']:
        
        # Transformando a url relativa para a absoluta.
        full_link = link.replace('./', 'https://pt.wikipedia.org/api/rest_v1/page/html/')
        
        # Checa se o link já foi utilizado
        if full_link in all_links:
            continue
        else:
            # Adiciona ao set de links o link
            all_links.add(full_link)
            
            # Cria o df para esse link
            df = get_df(full_link)
            
            # Filtra os links para conter somente os de nossa internet
            df = df[df['link'].isin(link_filter['link'])]
            
            # Cria a coluna (df) de probabilidades.
            probs = find_probs(full_link, df) 
            
            # Faz um outer join com o main_frame
            main_frame = pd.merge(main_frame, probs, how='outer', on='link')
            
    return main_frame.fillna(0)

In [30]:
url = 'https://pt.wikipedia.org/api/rest_v1/page/html/Álgebra_linear'

In [31]:
%%time
links = get_links(url, 1)

CPU times: user 11.4 s, sys: 129 ms, total: 11.5 s
Wall time: 29.9 s


In [32]:
links2 = links.copy()
links2

Unnamed: 0,link
0,./Modelagem_Computacional
1,./Banco_de_dados
2,./Sofistas
3,./Nababo_de_Bengala
4,./Escriba
...,...
7057,./Filosofia_da_matemática
7058,./Cossecante
7059,./Ciências
7060,./Espaço_nulo


In [34]:
%%time
flinks = []
for link in set(links['link']):
    full_link = link.replace('./', 'https://pt.wikipedia.org/wiki/')
    if requests.get(full_link).ok:
        flinks.append(link)
    else:
        continue

CPU times: user 2min 5s, sys: 5.62 s, total: 2min 10s
Wall time: 27min 11s


In [36]:
pd.DataFrame(flinks, columns = ['link'])

Unnamed: 0,link
0,./Banco_de_dados
1,./Sofistas
2,./Revista
3,./Grafo_bipartido_completo
4,./Conexão_de_Levi-Civita
...,...
5992,./401(k)
5993,./Omar_Khayyām
5994,./Filosofia_da_matemática
5995,./Século_XV_a.C.


In [None]:
%%time
df3 = second_scraping_loop(url, links)

In [10]:
links.to_csv("datasets/links.csv", index=False)

In [None]:
df3.to_csv("datasets/my_new_wikipedia.csv", index=False)

In [7]:
# df2 = scraping_loop(url)

In [8]:
df2

Unnamed: 0,link,./Álgebra_linear,./Ajuda:Controle_de_autoridade,./Anel_(matemática),./Análise_complexa,./Análise_funcional,./Análise_matemática,./Análise_matricial,./Análise_numérica,./Análise_não_padronizada,...,./Zero,./Álgebra,./Álgebra_abstrata,./Álgebra_booleana,./Álgebra_comutativa,./Álgebra_elementar,./Álgebra_linear_numérica,./Álgebra_multilinear,./Álgebra_não_linear,./Álgebra_universal
0,./Ajuda:Controle_de_autoridade,0.005319,0.2,0.018868,0.032258,0.010101,0.008696,0.0,0.010989,0.000000,...,0.0625,0.008850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,./Anel_(matemática),0.005319,0.0,0.056604,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0000,0.017699,0.019417,0.142857,0.035714,0.011236,0.000000,0.000000,0.0,0.142857
2,./Análise_complexa,0.005319,0.0,0.000000,0.096774,0.020202,0.043478,0.0,0.010989,0.000000,...,0.0000,0.008850,0.009709,0.000000,0.035714,0.011236,0.000000,0.016949,0.0,0.000000
3,./Análise_funcional,0.010638,0.0,0.000000,0.032258,0.030303,0.043478,0.0,0.021978,0.000000,...,0.0000,0.008850,0.009709,0.000000,0.000000,0.011236,0.000000,0.016949,0.0,0.000000
4,./Análise_matemática,0.005319,0.0,0.000000,0.064516,0.040404,0.026087,0.0,0.021978,0.142857,...,0.0000,0.017699,0.009709,0.000000,0.000000,0.011236,0.000000,0.016949,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,./Álgebra_linear,0.015957,0.0,0.037736,0.000000,0.020202,0.008696,0.0,0.010989,0.000000,...,0.0000,0.026549,0.029126,0.000000,0.035714,0.022472,0.036364,0.033898,0.2,0.000000
144,./Álgebra_linear_numérica,0.005319,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018182,0.000000,0.0,0.000000
145,./Álgebra_multilinear,0.005319,0.0,0.000000,0.000000,0.010101,0.008696,0.0,0.010989,0.000000,...,0.0000,0.008850,0.009709,0.000000,0.000000,0.011236,0.000000,0.016949,0.0,0.000000
146,./Álgebra_não_linear,0.005319,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [13]:
df2.drop(columns = ['link']).sum(axis=0).unique()

array([1., 1., 0., 1., 1.])

In [11]:
df2['link'].nunique()

148

In [14]:
# df2.to_csv("datasets/my_wikipedia_squared.csv", index=False)

In [17]:
my_array = df2.drop(columns = ['link']).to_numpy()

In [18]:
my_array.shape

(148, 148)