## Workstation

In [63]:
#Imports
import requests
from bs4 import BeautifulSoup
import re
import wikipedia
import time
import json
from typing import List

In [65]:
#Helper functions
def join_contents_from_json(file_names: List[str], new_output_file_name: str, rmv_old_files: bool=False):
    """
    Merge the dictionaries stored in several JSON files and save the result.

    Every path in ``file_names`` is loaded with ``json_to_dict`` and folded,
    in list order, into one dictionary; a key present in more than one file
    keeps the value from the last file that was folded in. The merged
    dictionary is written to ``new_output_file_name`` via
    ``write_data_to_json`` and is also returned.

    ``rmv_old_files`` is accepted but not acted on yet: when it is truthy a
    notice is printed and the input files are left untouched.
    """
    merged = {}
    # map() is lazy, so each file is still loaded right before it is merged.
    for loaded in map(json_to_dict, file_names):
        merged.update(loaded)
    write_data_to_json(merged, new_output_file_name)
    if not rmv_old_files:
        return merged
    print('have not implemented removing old files yet')
    return merged

def write_data_to_json(data: dict, new_output_file_name: str) -> None:
    """
    Serialize ``data`` as JSON into the file ``new_output_file_name``.

    The file is opened in write mode, so any existing contents are replaced.
    A short notice naming the file is printed once the write has finished.
    """
    out_file = open(new_output_file_name, mode="w")
    with out_file:
        json.dump(data, out_file)
    print(f"writing your data to {new_output_file_name}")

def json_to_dict(file_name: str):
    """
    Load the JSON file at ``file_name`` and return its parsed contents.

    The files written by this notebook hold a JSON object, so the result is a
    dictionary for them; any other valid JSON document is returned as whatever
    ``json.load`` produces for it.

    Raises:
        FileNotFoundError: if ``file_name`` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Open the path that was passed in, not a fixed file, so that callers
    # which merge several files really read each of them.
    with open(file_name, mode="r") as f:
        return json.load(f)

In [64]:
#Globals
# Root URL of English Wikipedia, including the trailing slash. Article and
# index.php URLs are built by appending a path directly to this string.
BASE_WIKIPEDIA_URL = "https://en.wikipedia.org/"


In [66]:
class Article:
    """
    A single English Wikipedia article, validated and downloaded on creation.

    Attributes:
        title: The article title exactly as it was passed in.
        url_title: ``title`` with spaces replaced by underscores.
        url: Address of the article page, built from ``BASE_WIKIPEDIA_URL``.
        soup: BeautifulSoup parse tree of the downloaded article page.
    """

    # Seconds to wait on Wikipedia before a request is abandoned, so a stalled
    # connection cannot hang a long scraping run forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, title: str):
        """
        Validate ``title`` against Wikipedia search, then fetch and parse the page.

        Raises:
            ValueError: if ``title`` is not among the top search results.
            requests.RequestException: if the page cannot be downloaded.
        """
        self.title, self.url_title = self.__check_title(title)
        self.url = self.__build_url()
        self.soup = self.__get_page_soup(self.url)

    def __check_title(self, title: str) -> tuple:
        """
        Confirm that ``title`` is an exact match among the top three search hits.

        Returns:
            A ``(title, url_title)`` pair, where ``url_title`` is ``title`` with
            spaces replaced by underscores.

        Raises:
            ValueError: if the exact title is not in the search results; the
                message lists the titles that were found instead.
        """
        valid_page_names = wikipedia.search(title, results=3)
        if title not in valid_page_names:
            # ValueError is a subclass of Exception, so callers that catch the
            # generic Exception keep working.
            raise ValueError(f"I could not find that exact page, is this what you were looking for? {valid_page_names}")
        print(f"found {title} in {valid_page_names}")
        return title, title.replace(" ", "_")

    def __build_url(self) -> str:
        """
        Return the article URL: the base URL, ``wiki/`` and the underscored title.
        """
        return BASE_WIKIPEDIA_URL + "wiki/" + self.url_title

    def __get_page_soup(self, url: str) -> BeautifulSoup:
        """
        Download ``url`` and return its HTML parsed with BeautifulSoup.

        Raises:
            requests.RequestException: on a timeout, a connection problem or an
                HTTP error status.
        """
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an error page rather than parsing it and returning
        # empty results that look like a page with no data.
        page.raise_for_status()
        return BeautifulSoup(page.content, "html.parser")

    def what_links_here(self, num_of_links_start_idx: int=0, num_of_links_end_idx: int=5) -> List[str]:
        """
        Navigates to the 'What Links Here' section of your Article and gets all the article names. Does not get redirects.

        Only one listing page with ``limit=500`` is requested, and the two index
        arguments slice that listing (start inclusive, end exclusive).

        Returns:
            The link text of the first anchor in each selected list item, or an
            empty list when the page contains no links list.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import quote_plus

        # Percent-encode the title so characters such as "'" or "&" cannot
        # break or truncate the query string; spaces still become "+".
        title = quote_plus(self.title)
        what_links_here_url = f'{BASE_WIKIPEDIA_URL}w/index.php?title=Special:WhatLinksHere/{title}&namespace=0&hideredirs=1&limit=500'
        what_links_here_page_soup = self.__get_page_soup(what_links_here_url)
        link_list = what_links_here_page_soup.find(id="mw-whatlinkshere-list")
        if link_list is None:
            # The list element is missing, e.g. when nothing links to the page.
            return []

        links_list = []
        for li in link_list.find_all("li")[num_of_links_start_idx:num_of_links_end_idx]:
            anchor = li.find("a")
            if anchor is not None:
                links_list.append(anchor.text)
        return links_list

    def categories(self) -> List[str]:
        """
        Parses HTML with BS4 to get all categories on the page.

        Returns:
            The text of every list item inside the ``mw-normal-catlinks`` div,
            or an empty list when the page has no such div.
        """
        catlinks = self.soup.find("div", attrs={"id": "mw-normal-catlinks"})
        if catlinks is None:
            return []
        return [category.text for category in catlinks.find_all("li")]

    def navbox_hierarchy(self) -> list:
        """
        Get all categories in the navbox section that have the V-T-E links to their left.

        Returns:
            The text of the second ``div`` of each navbox title cell that
            contains a mini navbar. Empty when the page has no navbox titles,
            in which case a notice is printed as well.
        """
        # attrs must be a dict mapping the attribute name to the wanted value.
        navbox_titles = self.soup.find_all("th", attrs={"class": "navbox-title"})
        if not navbox_titles:
            print(f"{self.title} page doesn't have a navbox hierarchy to scrape!")
            return []

        navbox_hierarchy = []
        for navbox_title in navbox_titles:
            # Skip title cells without the V-T-E mini navbar.
            if not navbox_title.find_all("div", attrs={"class": "navbar plainlinks hlist navbar-mini"}):
                continue

            divs = navbox_title.find_all("div")
            if len(divs) < 2:
                # No second div to read the navbox name from.
                continue
            navbox_hierarchy.append(divs[1].text)
        return navbox_hierarchy

In [67]:
class ArticleConnections:
    """
    Gathers and summarizes category/navbox data for pages that link to a root article.

    Attributes:
        root_article: The ``Article`` built from the title given at construction.
    """

    def __init__(self, root_article_title: str):
        self.root_article = Article(root_article_title)

    def __write_to_file(self, content: dict, path: str) -> None:
        """
        Use when you want to save outputs to a file instead of running code again.

        Writes ``content`` as JSON to ``path``, replacing any existing file, and
        creates the parent directory first when it does not exist yet.
        """
        # Local import keeps this block self-contained.
        import os

        parent_dir = os.path.dirname(path)
        if parent_dir:
            # Without this, a missing output folder raises only after every
            # page has been scraped, and the collected data is lost.
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, mode="w") as f:
            json.dump(content, f)
        print(f"writing your data to {path}")

    def get_data(self, start_idx: int, end_idx: int) -> dict:
        """
        Loops through desired number of connections based on Article.what_links_here() and gets categories and navbox information from each page.

        ``start_idx`` and ``end_idx`` are passed straight to
        ``Article.what_links_here()``. The result is also saved to
        ``outputs/get_data_<url_title>_<start_idx>-<end_idx>.json``.

        Returns:
            A dict mapping each linked page title to
            ``{"categories": [...], "navbox": [...]}``.
        """
        url_title = self.root_article.url_title

        related_links = self.root_article.what_links_here(start_idx, end_idx)
        data = {}
        for title in related_links:
            page = Article(title)
            data[title] = {
                "categories": page.categories(),
                "navbox": page.navbox_hierarchy()
                }
        self.__write_to_file(data, f'outputs/get_data_{url_title}_{start_idx}-{end_idx}.json')
        return data

    @staticmethod
    def _tally_labels(data: dict, field: str) -> list:
        """
        Count how often each label under ``data[page][field]`` occurs and on which pages.

        Returns:
            ``(label, {"count": int, "pages": [page, ...]})`` tuples sorted by
            descending count; labels with equal counts keep first-seen order.
        """
        tally = {}
        for page, page_info in data.items():
            for label in page_info[field]:
                entry = tally.setdefault(label, {"count": 0, "pages": []})
                entry["count"] += 1
                entry["pages"].append(page)
        return sorted(tally.items(), key=lambda item: item[1]["count"], reverse=True)

    def analyze_categories(self, data: dict) -> dict:
        """
        Parses data from get_data to see analytics of categories and navbox data.

        Args:
            data: Mapping of page title to a dict with ``"categories"`` and
                ``"navbox"`` lists, as returned by ``get_data``.

        Returns:
            ``{"category": [...], "navbox": [...]}`` where each value is a list
            of ``(label, {"count": ..., "pages": [...]})`` tuples ordered from
            most to least frequent.
        """
        return {
            "category": self._tally_labels(data, "categories"),
            "navbox": self._tally_labels(data, "navbox"),
        }

In [58]:
# Build the helper rooted at the "Imaginary number" article; constructing it
# validates the title and downloads that article's page.
connections = ArticleConnections("Imaginary number")

found Imaginary number in ['Imaginary number', 'Complex number', 'Imaginary']


In [59]:
# Scrape categories and navbox titles for entries 100-104 of the root
# article's "What links here" list (the end index is exclusive) and save them
# under outputs/.
connections.get_data(100, 105)

found CCR and CAR algebras in ['CCR and CAR algebras', 'Clifford algebra', 'Weyl algebra']
CCR and CAR algebras page doesn't have a navbox hierarchy to scrape!
found Schwarzschild geodesics in ['Schwarzschild geodesics', 'Geodesics in general relativity', 'Eddington–Finkelstein coordinates']
Schwarzschild geodesics page doesn't have a navbox hierarchy to scrape!
found Schwarz lemma in ['Schwarz lemma', 'Schwarz–Ahlfors–Pick theorem', 'Riemann mapping theorem']
Schwarz lemma page doesn't have a navbox hierarchy to scrape!
found List of Ig Nobel Prize winners in ['List of Ig Nobel Prize winners', 'Ig Nobel Prize', 'Rectal foreign body']
List of Ig Nobel Prize winners page doesn't have a navbox hierarchy to scrape!
found Euler's three-body problem in ["Euler's three-body problem", 'Three-body problem', 'Three-body problem (disambiguation)']
Euler's three-body problem page doesn't have a navbox hierarchy to scrape!
writing your data to outputs/get_data_Imaginary_number_100-105.json


{'CCR and CAR algebras': {'categories': ['Quantum field theory',
   'Axiomatic quantum field theory',
   'Functional analysis',
   'Algebras',
   'C*-algebras'],
  'navbox': []},
 'Schwarzschild geodesics': {'categories': ['Exact solutions in general relativity'],
  'navbox': []},
 'Schwarz lemma': {'categories': ['Riemann surfaces',
   'Lemmas in analysis',
   'Theorems in complex analysis'],
  'navbox': []},
 'List of Ig Nobel Prize winners': {'categories': ['Lists of award winners',
   'Ig Nobel Prize',
   'Science and technology award winners'],
  'navbox': []},
 "Euler's three-body problem": {'categories': ['Orbits'], 'navbox': []}}

In [62]:
# Combine previously saved get_data outputs into one JSON file; the merged
# dictionary is also returned and echoed below.
join_contents_from_json(["outputs/get_data_Imaginary_number_100.json", "outputs/get_data_Imaginary_number_100-105.json"], "outputs/get_data_Imaginary_number_0-105.json")

writing your data to outputs/get_data_Imaginary_number_0-105.json


{'Complex analysis': {'categories': ['Complex analysis'],
  'navbox': ['Major topics in mathematical analysis']},
 'Complex number': {'categories': ['Composition algebras', 'Complex numbers'],
  'navbox': ['Complex numbers', 'Number systems']},
 'Control theory': {'categories': ['Control theory',
   'Control engineering',
   'Computer engineering',
   'Cybernetics'],
  'navbox': ['Control theory',
   'Subfields of and cyberneticians involved in cybernetics',
   'Systems science',
   'Mathematics']},
 'Cauchy–Riemann equations': {'categories': ['Partial differential equations',
   'Complex analysis',
   'Harmonic functions',
   'Bernhard Riemann',
   'Augustin-Louis Cauchy'],
  'navbox': ['Bernhard Riemann']},
 'Definition': {'categories': ['Definition',
   'Philosophical logic',
   'Philosophy of language',
   'Semantics',
   'Linguistics terminology',
   'Mathematical terminology',
   'Concepts in logic',
   'Lexicography',
   'Meaning (philosophy of language)'],
  'navbox': ['Definit

#### Code to Implement and Remember

In [19]:
# Useful for quickly explaining what an article is: ask the wikipedia package
# for a two-sentence summary of the page.
wikipedia.summary("Euler's formula", sentences=2)

"Euler's formula, named after Leonhard Euler, is a mathematical formula in complex analysis that establishes the fundamental relationship between the trigonometric functions and the complex exponential function. Euler's formula states that for any real number x:\n\nwhere e is the base of the natural logarithm, i is the imaginary unit, and cos and sin are the trigonometric functions cosine and sine respectively."

### Possible Readings/things to study

- https://www.wikidata.org/wiki/Wikidata:How_to_use_data_on_Wikimedia_projects
- https://www.wikidata.org/wiki/Wikidata:Data_access
    - great overview of what wikipedia tools I have to work with!
    - Wikimedia action API will be used because I can grab up to 50 articles at once!
        - link https://www.wikidata.org/w/api.php
        - better link https://www.mediawiki.org/wiki/API:Categories#
    - Linked data interface might be used because I can get the data needed in json
    - Wikipedia dumps may be useful in the future because I can host my own instance of wikipedia data and query it as much as I want.
- wikipedia python package https://wikipedia.readthedocs.io/en/latest/code.html#api

A key component will be analyzing Wikipedia categories and/or the category trees to see where an article fits in the grand scheme of things. https://en.wikipedia.org/wiki/Help:Category

Possible algorithms to study
 - https://ceur-ws.org/Vol-735/paper8.pdf
 - https://medium.com/@RelcyEngineering/using-wikipedia-category-graph-for-semantic-understanding-5638c9897f8b
 

 

#### Possible algorithm
1. For a page, grab all categories and related categories from the bottom of the page. The first box is the most specific; each category after that is higher level. Record those categories and their structure somehow, and create a tree from this information.
2. Loop through each page and grab the same information. Collect statistics on how often categories overlap across pages, and do the best job possible of mapping them to parents/children.

Note: Wikidata may be able to give me the category information through its "instance of" (`is_instance of`) statements, but that may be too much data to deal with, so it would be better to stick to the categories. The consequence is that I would need to fetch each HTML page, but I will want to do that anyway in order to search for keywords related to students' interests.