## Workstation

In [14]:
#Imports
import requests
from bs4 import BeautifulSoup
import re
import wikipedia
import time
import json
from typing import List

In [15]:
#Helper functions
def join_contents_from_json(file_names: List[str], new_output_file_name: str, rmv_old_files: bool=False):
    """
    take two output files from ArticleConnections.get_data() and join them to one file
    """
    new_dict = {}
    for file_name in file_names:
        data = json_to_dict(file_name)
        new_dict.update(data)
    write_data_to_json(new_dict, new_output_file_name)
    if rmv_old_files:
        print('have not implemented removing old files yet')
    return new_dict

def write_data_to_json(data: dict, new_output_file_name: str) -> None:
    """
    Take dictionary to write it to a json file.
    """
    with open(new_output_file_name, mode="w") as f:
        json.dump(data, f)
    print(f"writing your data to {new_output_file_name}")

def json_to_dict(file_name):
    """
    Get dictionary obj from json file.
    """
    with open("outputs/get_data_Imaginary_number_100.json", mode="r") as f:
        return json.load(f)

In [28]:
#Globals
# BASE_WIKIPEDIA_URL = "https://simple.wikipedia.org/" #for simple wikipedia
BASE_WIKIPEDIA_URL = "https://en.wikipedia.org/" #for regular wikipedia



In [70]:
class Article:
    def __init__(self, title: str):
        self.title, self.url_title = self.__check_title(title)
        self.url = self.__build_url()
        self.soup = self.__get_page_soup(self.url)

    def __check_title(self, title: str) -> str:
        valid_page_names = wikipedia.search(title, results=3)
        if title in valid_page_names:
            return title, title.replace(" ", "_")
        else:
            raise Exception(f"I could not find that exact page, is this what you were looking for? {valid_page_names}")

    def __build_url(self) -> str:
        return BASE_WIKIPEDIA_URL + "wiki/" + self.url_title
        
    def __get_page_soup(self, url: str) -> BeautifulSoup:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        return soup

    def what_links_here(self, num_of_links_start_idx: int=0, num_of_links_end_idx: int=5) -> List[str]:
        """
        Navigates to the 'What Links Here' section of your Article and gets all the article names. Does not get redirects.
        """
        links_list = []
        title = self.title.replace(" ", "+").replace("'", "&")
        what_links_here_url = f'{BASE_WIKIPEDIA_URL}w/index.php?title=Special:WhatLinksHere/{title}&namespace=0&hideredirs=0&limit=500'
        what_links_here_page_soup = self.__get_page_soup(what_links_here_url)
        soup_obj = what_links_here_page_soup.find(id="mw-whatlinkshere-list")
        for li in soup_obj.find_all("li")[num_of_links_start_idx:num_of_links_end_idx]:
            # if you find a redirect link skip it, but it will capture all the sub-links under the redirect
            if li.find("a", attrs={"class": "mw-redirect"}):
                continue
            links_list.append(li.find_all("a")[0].text)
        return links_list

    def categories(self) -> List[str]:
        """
        Parses HTML with BS4 to get all categories on the page.
        """
        catlinks = self.soup.find("div", attrs={"id": "mw-normal-catlinks"}).find_all("li")
        return [category.text for category in catlinks]

    def navbox_hierarchy(self) -> list:
        """
        Get all categories in the navbox section that have the V-T-E links to their left.
        """
        navbox_hierarchy = []
        navbox_titles = self.soup.findAll("th", attrs = {"class", "navbox-title"})
        if len(navbox_titles) == 0 :
            print(f"{self.title} page doesn't have a navbox hierarchy to scrape!")
            return []
        else:
            for navbox_title in navbox_titles:
                if len(navbox_title.findAll("div", attrs = {"class", "navbar plainlinks hlist navbar-mini"})) == 0:
                    continue

                navbox_category_item = navbox_title.findAll("div")[1].text
                navbox_hierarchy.append(navbox_category_item)
        return navbox_hierarchy

    def html_near_link(self, root_title: str) -> List[str]:
        """
        Gets html from Article object where root_title is the name of the article.
        """
        #idea while len of html content is < n characters, start with text that has element where the connection is grab one element above then one below.
        html = []
        a_tags = self.soup.find_all("a", {"title": root_title})
        for idx in range(len(a_tags)):
            #todo: check if parent.name is b or i tag and account for that information somehow
            if a_tags[idx].parent.name == "p":
                soup = a_tags[idx].parent
                html.append(str(soup))
        return html

    def examples_near_link(self, root_title: str):
        """
        Gets examples from Article object where root_title is the name of the article that is usually the core subject user is interested in understanding better.
        """
        data = []
        a_tags = self.soup.find_all("a", {"title": root_title})
        for idx in range(len(a_tags)):
            #todo: check if parent.name is b or i tag and account for that information somehow
            if a_tags[idx].parent.name == "p":
                soup = a_tags[idx].parent
                #todo: account for alt text when there are images and spans.  Idea: remove all span/img tags prior to analyzing
                print(soup)
                text_example = " ".join(soup.stripped_strings)
                if len(text_example) > 500:
                    split_text = text_example.split(".")
                    location = None
                    for sen in split_text:
                        if a_tags[idx].text in sen:
                            location = split_text.index(sen)
                            break
                    if location is None:
                        print("sentence not found!!! On tag", a_tags[idx], " with title: ", root_title)
                    data.append('.'.join(split_text[min(abs(location - 2), 0): min(len(split_text), location + 2)]))
                else:
                    # print('the length of your examples is ', len(text_example), ' characters!')    
                    data.append(text_example)
        return data

In [83]:
class ArticleConnections:
    def __init__(self, root_article_title: str):
        self.root_article = Article(root_article_title)
        print(f'found {len(self.root_article.what_links_here(0, 500))} related links to scan!')


    def __write_to_file(self, content: dict, path: str) -> None:
        """
        Use when you want to save outputs to a file instead of running code again.
        """
        with open(path, mode="w") as f:
            json.dump(content, f)
        print(f"writing your data to {path}")

    def get_data(self, start_idx: int, end_idx: int) -> dict:
        """
        Loops through desired number of connections based on Article.what_links_here() and gets categories and navbox information from a_tags[idx] page.
        """
        url_title = self.root_article.url_title

        related_links = self.root_article.what_links_here(start_idx, end_idx)
        print(f'found {len(related_links)} related links to scan!')
        data = {}
        for title in related_links:
            page = Article(title)
            data[title] = {
                "categories": page.categories(),
                "navbox": page.navbox_hierarchy(),
                "examples": page.html_near_link(self.root_article.title)
                }
        self.__write_to_file(data, f'outputs/get_data_{url_title}_{start_idx}-{end_idx}.json')
        return data

    def analyze_categories(self, data: dict) -> dict:
        """
        Parces data from get_data to see analytics of categories and navbox data
        returns dict -> list -> tuple
        """
        navbox_analysis = {}
        category_analysis = {}
        for page in data:
            for category in data[page]["categories"]:
                if category not in category_analysis:
                    category_analysis[category] = {"count": 1, "pages": {}}
                    category_analysis[category]["pages"][page] = data[page]["examples"]
                else:
                    category_analysis[category]["count"] += 1
                    category_analysis[category]["pages"][page] = data[page]["examples"]
            for navbox in data[page]["navbox"]:
                if navbox not in navbox_analysis:
                    navbox_analysis[navbox] = {"count": 1, "pages": {}}
                    navbox_analysis[navbox]["pages"][page] = data[page]["examples"]
                else:
                    navbox_analysis[navbox]["count"] += 1
                    navbox_analysis[navbox]["pages"][page] = data[page]["examples"]
        return {
            "category": sorted(category_analysis.items(), key=lambda x:x[1]["count"], reverse=True),
            "navbox": sorted(navbox_analysis.items(), key=lambda x:x[1]["count"], reverse=True)
        }

In [58]:
root = Article("Imaginary number")
links = root.what_links_here(0,5)
links

['Complex analysis',
 'Complex number',
 'Control theory',
 'Cauchy–Riemann equations',
 'Definition']

In [61]:
from IPython.core.display import display, HTML

  from IPython.core.display import display, HTML


In [59]:
connection1 = Article("Complex analysis")
connection1.html_near_link(root.title)

[<p>For any complex function, the values <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math alttext="{\displaystyle z}" xmlns="http://www.w3.org/1998/Math/MathML">
 <semantics>
 <mrow class="MJX-TeXAtom-ORD">
 <mstyle displaystyle="true" scriptlevel="0">
 <mi>z</mi>
 </mstyle>
 </mrow>
 <annotation encoding="application/x-tex">{\displaystyle z}</annotation>
 </semantics>
 </math></span><img alt="z" aria-hidden="true" class="mwe-math-fallback-image-inline" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bf368e72c009decd9b6686ee84a375632e11de98" style="vertical-align: -0.338ex; width:1.088ex; height:1.676ex;"/></span> from the domain and their images <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math alttext="{\displaystyle f(z)}" xmlns="http://www.w3.org/1998/Math/MathML">
 <semantics>
 <mrow class="MJX-TeXAtom-ORD">
 <mstyle displaystyle="tr

In [63]:
connection2 = Article("Complex number")
for html in connection2.html_near_link(root.title):
    display(HTML(str(html)))

In [56]:
from IPython.core.display import display, HTML
display(HTML(
    """
    <p>For any complex function, the values <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math alttext="{\displaystyle z}" xmlns="http://www.w3.org/1998/Math/MathML">
<semantics>
<mrow class="MJX-TeXAtom-ORD">
<mstyle displaystyle="true" scriptlevel="0">
<mi>z</mi>
</mstyle>
</mrow>
<annotation encoding="application/x-tex">{\displaystyle z}</annotation>
</semantics>
</math></span><img alt="z" aria-hidden="true" class="mwe-math-fallback-image-inline" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bf368e72c009decd9b6686ee84a375632e11de98" style="vertical-align: -0.338ex; width:1.088ex; height:1.676ex;"/></span> from the domain and their images <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math alttext="{\displaystyle f(z)}" xmlns="http://www.w3.org/1998/Math/MathML">
<semantics>
<mrow class="MJX-TeXAtom-ORD">
<mstyle displaystyle="true" scriptlevel="0">
<mi>f</mi>
<mo stretchy="false">(</mo>
<mi>z</mi>
<mo stretchy="false">)</mo>
</mstyle>
</mrow>
<annotation encoding="application/x-tex">{\displaystyle f(z)}</annotation>
</semantics>
</math></span><img alt="f(z)" aria-hidden="true" class="mwe-math-fallback-image-inline" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d8dd568d570b390c337c0a911f0a1c5c214e8240" style="vertical-align: -0.838ex; width:4.176ex; height:2.843ex;"/></span> in the range may be separated into <a href="/wiki/Real_number" title="Real number">real</a> and <a href="/wiki/Imaginary_number" title="Imaginary number">imaginary</a> parts:
</p>
    """
))

  from IPython.core.display import display, HTML


In [None]:
root.examples_near_link("Imaginary number")

In [84]:
connections = ArticleConnections("Imaginary number")

found 223 related links to scan!


In [81]:
data = connections.get_data(0,16)

found 15 related links to scan!
Physical quantity page doesn't have a navbox hierarchy to scrape!
writing your data to outputs/get_data_Imaginary_number_0-16.json


In [85]:
data

analyzed_data = connections.analyze_categories(data)
analyzed_data

{'category': [('Complex analysis',
   {'count': 2,
    'pages': {'Complex analysis': ['<p>For any complex function, the values <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math alttext="{\\displaystyle z}" xmlns="http://www.w3.org/1998/Math/MathML">\n<semantics>\n<mrow class="MJX-TeXAtom-ORD">\n<mstyle displaystyle="true" scriptlevel="0">\n<mi>z</mi>\n</mstyle>\n</mrow>\n<annotation encoding="application/x-tex">{\\displaystyle z}</annotation>\n</semantics>\n</math></span><img alt="z" aria-hidden="true" class="mwe-math-fallback-image-inline" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bf368e72c009decd9b6686ee84a375632e11de98" style="vertical-align: -0.338ex; width:1.088ex; height:1.676ex;"/></span> from the domain and their images <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math alttext="{\\displaystyle f(z)}" xmlns="http://www.w3.org/

In [115]:
html = """"""
for category in analyzed_data["category"]:
    category_name = category[0]
    category_data = category[1]
    html += f'<h1>{category_name} has {category_data["count"]} connections!</h1>'
    for page in category_data["pages"]:
        # print('the page is', page)
        html += f'<h3>{page}</h3>'
        for example in category_data["pages"][page]:
            # print('the example is ', example)
            html += f'<p>{example}</p>'


        
    html += f'<h3></h3>'
    # print(f'the category is {category_name} and there are {category_data["count"]} connections to it!')

display(HTML(html))

In [105]:
s = 'abc'
s += 'def'
s

'abcdef'

In [9]:
imag_nums_0_200 = join_contents_from_json(["outputs/get_data_Imaginary_number_100.json", "outputs/get_data_Imaginary_number_100-105.json", "get_data_Imaginary_number_105-200"], "outputs/get_data_Imaginary_number_0-200.json")

writing your data to outputs/get_data_Imaginary_number_0-200.json


In [10]:
connections.analyze_categories(imag_nums_0_200)

{'category': [('Complex analysis',
   {'count': 8,
    'pages': ['Complex analysis',
     'Cauchy–Riemann equations',
     'Laurent series',
     'Zeros and poles',
     'Winding number',
     'Complex plane',
     'Isolated singularity',
     'Contour integration']}),
  ('Theorems in complex analysis',
   {'count': 8,
    'pages': ["Euler's formula",
     'Riemann mapping theorem',
     "Cauchy's integral theorem",
     "Cauchy's integral formula",
     'Residue theorem',
     "Morera's theorem",
     'Analyticity of holomorphic functions',
     "Rouché's theorem"]}),
  ('Complex numbers',
   {'count': 6,
    'pages': ['Complex number',
     'Imaginary number',
     'Imaginary unit',
     'Complex conjugate',
     'Complex plane',
     'Quater-imaginary base']}),
  ('Physical quantities',
   {'count': 5,
    'pages': ['Mass',
     'Refractive index',
     'Propagation constant',
     'Electrical impedance',
     'Admittance']}),
  ('Analytic functions',
   {'count': 4,
    'pages': ['

#### Code to Implement and Remember

In [19]:
#useful information for quickly explaining what an article is.
wikipedia.summary("Euler's formula", sentences=2)

"Euler's formula, named after Leonhard Euler, is a mathematical formula in complex analysis that establishes the fundamental relationship between the trigonometric functions and the complex exponential function. Euler's formula states that for any real number x:\n\nwhere e is the base of the natural logarithm, i is the imaginary unit, and cos and sin are the trigonometric functions cosine and sine respectively."

### Possible Readings/things to study

- https://www.wikidata.org/wiki/Wikidata:How_to_use_data_on_Wikimedia_projects
- https://www.wikidata.org/wiki/Wikidata:Data_access
    - great overview of what wikipedia tools I have to work with!
    - Wikimedia action API will be used because I can grab up to 50 articles at once!
        - link https://www.wikidata.org/w/api.php
        - better link https://www.mediawiki.org/wiki/API:Categories#
    - Linked data interface might be used because I can get the data needed in json
    - Wikipedia dumps may be useful in the future because I can host my own instance of wikipedia data and query it as much as I want.
- wikipedia python package https://wikipedia.readthedocs.io/en/latest/code.html#api

A key component will be analyzing wikipeida categories and or the category trees for where an article fits in the grand scheme of things. https://en.wikipedia.org/wiki/Help:Category

Possible algorithms to study
 - https://ceur-ws.org/Vol-735/paper8.pdf
 - https://medium.com/@RelcyEngineering/using-wikipedia-category-graph-for-semantic-understanding-5638c9897f8b
 

 

#### Possible algorithm
1. For a page, grab all categories and related categories from bottom of page.  first box is most specific, each category after that is higher level.  Note those categories and their structure somehow.  create a tree with this information.
2. loop through each page and grab the information.  count statistics on counts that overlap in each category.  do best to map them to parents/children.

note: wikidata may be able to get me the category information with is_instance of but it may be too much data to deal with and it would be better to stick to the categories.  The effect of this is that I would need to grab each html page but I will want to do this to search for the keywords related to students interests anyways.