## Refrences
https://en.wikipedia.org/w/api.php


## Workstation

### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import wikipedia
import time
import json
from typing import List

### Class

In [2]:
BASE_WIKIPEDIA_URL = "https://en.wikipedia.org/"

class Article:
    def __init__(self, title: str):
        self.title, self.url_title = self.__check_title(title)
        self.url = self.__build_url()
        self.soup = self.__get_page_soup(self.url)

    def __check_title(self, title: str) -> str:
        valid_page_names = wikipedia.search(title, results=3)
        if title in valid_page_names:
            print(f"found {title} in {valid_page_names}")
            return title, title.replace(" ", "_")
        else:
            raise Exception(f"I could not find that exact page, is this what you were looking for? {valid_page_names}")

    def __build_url(self) -> str:
        return BASE_WIKIPEDIA_URL + "wiki/" + self.url_title

    def __get_page_soup(self, url: str) -> BeautifulSoup:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        return soup

    def what_links_here(self, num_of_links: int=7) -> List[str]:
        """
        Navigates to the 'What Links Here' section of your Article and gets all the article names. Does not get redirects.
        """
        links_list = []
        title = self.title.replace(" ", "+").replace("'", "&")
        what_links_here_url = f'{BASE_WIKIPEDIA_URL}w/index.php?title=Special:WhatLinksHere/{title}&namespace=0&hideredirs=1&limit=500'
        what_links_here_page_soup = self.__get_page_soup(what_links_here_url)
        soup_obj = what_links_here_page_soup.find(id="mw-whatlinkshere-list")
        for li in soup_obj.find_all("li")[:num_of_links]:
            links_list.append(li.find_all("a")[0].text)
        return links_list

    def categories(self) -> List[str]:
        """
        Parses HTML with BS4 to get all categories on the page.
        """
        catlinks = self.soup.find("div", attrs={"id": "mw-normal-catlinks"}).find_all("li")
        return [category.text for category in catlinks]

    def navbox_hierarchy(self) -> list:
        """
        Get all categories in the navbox section that have the V-T-E links to their left.
        """
        navbox_hierarchy = []
        navbox_titles = self.soup.findAll("th", attrs = {"class", "navbox-title"})
        if len(navbox_titles) == 0 :
            print(f"{self.title} page doesn't have a navbox hierarchy to scrape!")
            return None
        else:
            for navbox_title in navbox_titles:
                if len(navbox_title.findAll("div", attrs = {"class", "navbar plainlinks hlist navbar-mini"})) == 0:
                    continue

                navbox_category_item = navbox_title.findAll("div")[1].text
                navbox_hierarchy.append(navbox_category_item)
        return navbox_hierarchy
        
class ArticleConnections:
    def __init__(self, root_article_title: str, num_of_connections: int=5):
        self.root_article = Article(root_article_title)
        self.num_of_connections = num_of_connections

    def __write_to_file(self, content: dict, path: str) -> None:
        """
        Use when you want to save outputs to a file instead of running code again.
        """
        with open(path, mode="w") as f:
            f.write(json.dumps(content))
        print(f"writing your data to {path}")

    def categories_in_what_links_here(self) -> dict:
        """
        Gets categories from a number of pages that link to main page.
        """
        links_here = self.root_article.what_links_here(self.num_of_connections)
        categories = {}
        for related_page in links_here:
            page = Article(related_page)
            categories[related_page] = page.categories()
            time.sleep(2)
        self.__write_to_file(categories, f'outputs/{self.root_article.url_title}_{self.num_of_connections}.json')
        return categories

    def get_data(self) -> dict:
        """
        TODO
        """
        url_title = self.root_article.url_title

        related_links = self.root_article.what_links_here(self.num_of_connections)
        data = {}
        for title in related_links:
            page = Article(title)
            data[title] = {
                "categories": page.categories(),
                "navbox": page.navbox_hierarchy()
                }
        self.__write_to_file(data, f'outputs/get_data_{url_title}_{self.num_of_connections}.json')
        return data


    def analyze_categories_in_what_links_here(self, categories: dict) -> dict:
        """
        Take output from categories_in_what_links_here() and pass it into this method.
        """
        analysis = {}
        for page in categories:
            for category in categories[page]:
                if category not in analysis:
                    analysis[category] = {"count": 1, "pages": [page]}
                else:
                    analysis[category]["count"] += 1
                    analysis[category]["pages"].append(page)
        return analysis




In [5]:
connections = ArticleConnections("Pingala", 2)

found Pingala in ['Pingala', 'Nadi (yoga)', '0']


In [6]:
#this is not working yet, need to iron out issues with getting attributes from Article Class
connections.get_data()

found Ancient philosophy in ['Ancient philosophy', 'Ancient Greek philosophy', 'Cynicism (philosophy)']
found Binomial coefficient in ['Binomial coefficient', 'Gaussian binomial coefficient', 'Central binomial coefficient']
Binomial coefficient page doesn't have a navbox hierarchy to scrape!
writing your data to outputs/get_data_Pingala_2.json


{'Ancient philosophy': {'categories': ['Ancient philosophy'],
  'navbox': ['Philosophy', 'Ancient Greek schools of philosophy']},
 'Binomial coefficient': {'categories': ['Combinatorics',
   'Factorial and binomial topics',
   'Integer sequences',
   'Triangles of numbers',
   'Operations on numbers',
   'Articles with example Scheme (programming language) code'],
  'navbox': None}}

### Getting Started

In [3]:
root = Article("Pingala")

found Pingala in ['Pingala', 'Nadi (yoga)', '0']


In [58]:
root.navbox_hierarchy()

['Indian mathematics']

In [38]:
categories = root.categories_in_what_links_here(2)

found Complex analysis in ['Complex analysis', 'Residue (complex analysis)', 'Argument (complex analysis)']
found Complex number in ['Complex number', 'Absolute value', 'Split-complex number']
writing your data to outputs/Imaginary_number_2.json


In [33]:
root.analyze_categories_in_what_links_here(categories)

{'Complex analysis': {'count': 2,
  'pages': ['Complex analysis', 'Cauchy–Riemann equations']},
 'Composition algebras': {'count': 1, 'pages': ['Complex number']},
 'Complex numbers': {'count': 1, 'pages': ['Complex number']},
 'Control theory': {'count': 1, 'pages': ['Control theory']},
 'Control engineering': {'count': 1, 'pages': ['Control theory']},
 'Computer engineering': {'count': 1, 'pages': ['Control theory']},
 'Cybernetics': {'count': 1, 'pages': ['Control theory']},
 'Partial differential equations': {'count': 1,
  'pages': ['Cauchy–Riemann equations']},
 'Harmonic functions': {'count': 1, 'pages': ['Cauchy–Riemann equations']},
 'Bernhard Riemann': {'count': 1, 'pages': ['Cauchy–Riemann equations']},
 'Augustin-Louis Cauchy': {'count': 1, 'pages': ['Cauchy–Riemann equations']},
 'Definition': {'count': 1, 'pages': ['Definition']},
 'Philosophical logic': {'count': 1, 'pages': ['Definition']},
 'Philosophy of language': {'count': 1, 'pages': ['Definition']},
 'Semantics': {

In [46]:
#finds all the navbox category items on a page!
navbox_hierarchy = []
pages_without_navbox_hierarchy = []

navbox_titles = root.soup.findAll("th", attrs = {"class", "navbox-title"})
vte_links = root.soup.findAll("div", attrs = {"class", "navbar plainlinks hlist navbar-mini"})
if len(navbox_titles) == 0 or len(vte_links) == 0:
    print("page doesn't have a navbox hierarchy to scrape!")
    pages_without_navbox_hierarchy.append(root.title)
else:
    for navbox_title in navbox_titles:
        navbox_category_item = navbox_title.findAll("div")[1].text
        navbox_hierarchy.append(navbox_category_item)



pages_without_navbox_hierarchy

page doesn't have a navbox heirchy to scrape!


['Gerolamo Cardano']

In [11]:
navbox_list = soup.findAll("th", attrs = {"class", "navbox-title"})

In [17]:
navbox_list[1].findAll("div")[1].text

'Number systems'

In [19]:
#useful information for quickly explaining what an article is.
wikipedia.summary("Euler's formula", sentences=2)

"Euler's formula, named after Leonhard Euler, is a mathematical formula in complex analysis that establishes the fundamental relationship between the trigonometric functions and the complex exponential function. Euler's formula states that for any real number x:\n\nwhere e is the base of the natural logarithm, i is the imaginary unit, and cos and sin are the trigonometric functions cosine and sine respectively."

### Next Things

#### I have the categories associated to a wikipedia page. 

- https://www.wikidata.org/wiki/Wikidata:How_to_use_data_on_Wikimedia_projects
- https://www.wikidata.org/wiki/Wikidata:Data_access
    - great overview of what wikipedia tools I have to work with!
    - Wikimedia action API will be used because I can grab up to 50 articles at once!
        - link https://www.wikidata.org/w/api.php
        - better link https://www.mediawiki.org/wiki/API:Categories#
    - Linked data interface might be used because I can get the data needed in json
    - Wikipedia dumps may be useful in the future because I can host my own instance of wikipedia data and query it as much as I want.
- wikipedia python package https://wikipedia.readthedocs.io/en/latest/code.html#api

A key component will be analyzing wikipeida categories and or the category trees for where an article fits in the grand scheme of things. https://en.wikipedia.org/wiki/Help:Category

Possible algorithms to study
 - https://ceur-ws.org/Vol-735/paper8.pdf
 - https://medium.com/@RelcyEngineering/using-wikipedia-category-graph-for-semantic-understanding-5638c9897f8b
 

 

#### Possible algorithm
1. For a page, grab all categories and related categories from bottom of page.  first box is most specific, each category after that is higher level.  Note those categories and their structure somehow.  create a tree with this information.
2. loop through each page and grab the information.  count statistics on counts that overlap in each category.  do best to map them to parents/children.

note: wikidata may be able to get me the category information with is_instance of but it may be too much data to deal with and it would be better to stick to the categories.  The effect of this is that I would need to grab each html page but I will want to do this to search for the keywords related to students interests anyways.

### trying to associate each category to a higher-order category

In [None]:
# PROBLEM - BS4 IS NOT GETTING ALL THE PAGE DATA FOR THE CATEGORIES PAGE.  USE SELENIUM INSTEAD
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = "https://www.sofascore.com/pt/futebol/2018-09-18"
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=options)
driver.get(url)
time.sleep(3)
page = driver.page_source
driver.quit()
soup = BeautifulSoup(page, 'html.parser')
container = soup.find_all('div', attrs={
    'class':'js-event-list-tournament-events'})
print(container)

  driver = webdriver.Chrome(chrome_options=options)


WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home


In [5]:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

In [6]:
url = "https://en.m.wikipedia.org/w/index.php?hideprefix=20&mode=0&notranslations=&showcount=2&target=Mathematics&title=Special:CategoryTree"

In [32]:
driver = webdriver.Chrome(executable_path='C:/Users/calep/OneDrive/Documents/Projects/Wikipedia Analyzer/chromedriver_win32/chromedriver.exe')

  driver = webdriver.Chrome(executable_path='C:/Users/calep/OneDrive/Documents/Projects/Wikipedia Analyzer/chromedriver_win32/chromedriver.exe')


In [33]:
driver.get(url)

In [34]:
doms = driver.find_elements(By.XPATH, "//span[contains(@title, 'expand')]")
len(doms)

20

In [31]:
doms[0].click()
inner = doms[0].find_elements(By.XPATH, "//span[contains(@title, 'expand')]")
print(len(inner))
inner[0].click()

46


In [24]:
doms[0].get_attribute("innerHTML")

''

want to implement dfs exploration until I reach last page.

In [10]:
for idx in range(0, len(doms)):
    doms[idx].click()
    inner_doms = driver.find_elements(By.XPATH, "//span[contains(@title, 'expand')]")
    print(len(inner_doms))
    break

19


In [51]:
doms = driver.find_elements(By.XPATH, "//span[contains(@title, 'collapse')]")
len(doms)

97

In [52]:
for idx in range(1, len(doms)):
    time.sleep(200)
    doms[idx].click()

In [26]:
doms[0].click()

In [None]:
//span[contains(@class, 'CategoryTreeToggle') and (@title, "expand")] 

In [None]:
    page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())

<html class="client-js" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Category tree - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":true,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"014cc689-a6d4-4bd8-a0bb-c174ea906121","wgCSPNonce":false,"wgCanonicalNamespace":"Special","wgCanonicalSpecialPageName":"CategoryTree","wgNamespaceNumber":-1,"wgPageName":"Special:CategoryTree","wgTitle":"CategoryTree","wgCurRevisionId":0,"wgRevisionId":0,"wgArticleId":0,"wgIsArticle":false,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Special:CategoryTree","wgRelevantArticleId":0,"wgIsProbablyEditable":false,"wgRelevantPa

category tree section
- category tree item
- category tree children
    - category tree section
    - category tree section
    - category tree section
        - category tree item
        - category tree children
            - section
                - item
                - children
    - category tree section            