In [1]:
import requests, bs4
from swampy import structshape
from typing import List
# Pages that make up each language reference. Both names are tuples, so the
# scrapers can index into or iterate over them.
r_urls = ("https://cran.r-project.org/doc/manuals/r-release/R-lang.html",)
p_urls = (
    "https://docs.python.org/3/reference/lexical_analysis.html",
    "https://docs.python.org/3/reference/datamodel.html",
    "https://docs.python.org/3/reference/executionmodel.html",
    "https://docs.python.org/3/reference/import.html",
    "https://docs.python.org/3/reference/expressions.html",
    "https://docs.python.org/3/reference/simple_stmts.html",
    "https://docs.python.org/3/reference/compound_stmts.html",
    "https://docs.python.org/3/reference/toplevel_components.html",
)

In [2]:
def get_page_data(url, timeout=10.0):
    """Download ``url`` and parse the response body into a soup object.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Without it the request may wait forever on
            an unresponsive server.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``html.parser`` backend,
        or ``None`` when the response status is not 200 OK.
    """
    res = requests.get(url, timeout=timeout)
    if res.status_code != requests.codes.OK:
        # Signal the failed download explicitly; callers must check for None.
        return None
    return bs4.BeautifulSoup(res.text, 'html.parser')

In [3]:
def scrape_r() -> str:
    """Scrape the R language definition and return its paragraph text.

    Fetches the first URL in ``r_urls``, joins the text of every ``<p>`` that
    is a direct child of a ``<div>`` (each preceded by one space) and removes
    the fragments listed in ``stop_words``.

    Returns:
        The concatenated, stop-word-free paragraph text.

    Raises:
        RuntimeError: If ``get_page_data`` returned ``None`` for the page.
    """
    url = r_urls[0]
    r_soup = get_page_data(url)
    if r_soup is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(f"Could not download the R language definition from {url}")

    # @NOTE: <code> sections are not part of the result; r_soup.select('code')
    # yields them should they be wanted for an analysis.
    str_res = "".join(" " + t.text for t in r_soup.select('div > p'))

    # Remove stop words. The longest phrase comes first because the shorter
    # ones are substrings of it and would otherwise break it apart.
    stop_words = "[]", "Next: , Previous: , Up:", "Previous: , Up:", "Next:"
    for sw in stop_words:
        str_res = str_res.replace(sw, '')

    # @TODO: Convert several spaces to one space
    return str_res

In [4]:
def scrape_python() -> str:
    """Scrape the Python language reference and return its paragraph text.

    Visits every URL in ``p_urls`` in order and joins the text of all ``<p>``
    elements, each preceded by one space.

    Returns:
        The concatenated paragraph text of all pages.

    Raises:
        RuntimeError: If ``get_page_data`` returned ``None`` for any page.
    """
    parts = []
    for url in p_urls:
        p_soup = get_page_data(url)
        if p_soup is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise RuntimeError(f"Could not download the Python reference page {url}")
        parts.extend(" " + t.text for t in p_soup.select('p'))
    str_res = "".join(parts)

    # Remove stop words (none defined yet). This has to be a tuple of phrases:
    # looping over a plain string would strip single characters instead.
    stop_words = ()
    for sw in stop_words:
        str_res = str_res.replace(sw, '')

    # @TODO: Convert several spaces to one space
    return str_res

In [5]:
import spacy, numpy as np

In [6]:
# Download both language references and keep their extracted text.
# Each call performs HTTP requests, so this cell needs network access.
r_text = scrape_r()
py_text = scrape_python()

In [7]:
# Character counts of the scraped texts, in the order (R, Python).
len(r_text), len(py_text)


(125173, 285444)

In [8]:
# Load the small English spaCy pipeline (tokenizer, tagger, parser and NER).
nlp = spacy.load("en_core_web_sm")

# One inner list per processed document, in the order [R, Python].
nouns = []
verbs = []
for text in [r_text, py_text]:
    doc = nlp(text)

    # Noun phrases exactly as they occur in the text.
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    nouns.append(noun_phrases)

    # Lemma of every token tagged as a verb.
    verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    verbs.append(verb_lemmas)

    # Named entities (entity.text, entity.label_ for entity in doc.ents) are
    # not collected here.

In [9]:
# Nested shape of both results. The previous leading `len(nouns),len(verbs)`
# expression was dropped: a cell only displays its last expression, so its
# value was computed and thrown away.
structshape.structshape(nouns), structshape.structshape(verbs)

('list of (list of 5496 str, list of 12000 str)',
 'list of (list of 2366 str, list of 6027 str)')

In [10]:
def clean_list_of_strs(t : List[str]) -> List[str]:
    """Normalise phrases and drop duplicates, keeping first-seen order.

    For every string, newlines become spaces, the articles "the", "The" and
    "A" (each followed by a space) are removed, and surrounding whitespace is
    stripped. Articles are matched only at a word boundary, so words that
    merely end in those letters ("bathe water", "DNA sequence") stay intact.

    Args:
        t: Phrases to clean. The list itself is left unmodified.

    Returns:
        A new list with the cleaned phrases, each appearing once, in the
        order of their first occurrence.
    """
    import re  # local import: the module is not imported at file level

    article = re.compile(r'\b(?:the|The|A) ')
    # Replace newlines first so an article followed by a line break matches.
    cleaned = (article.sub('', s.replace('\n', ' ')).strip() for s in t)
    # dict keys keep insertion order, giving an order-preserving O(n) dedup.
    return list(dict.fromkeys(cleaned))

In [11]:
# Clean the noun and verb list of every document in place. Both outer lists
# hold one entry per document, so they are walked in lockstep.
for doc_idx, (noun_phrases, verb_lemmas) in enumerate(zip(nouns, verbs)):
    nouns[doc_idx] = clean_list_of_strs(noun_phrases)
    verbs[doc_idx] = clean_list_of_strs(verb_lemmas)

In [12]:
# Show the cleaned noun phrases of the first document (the R text).
print(nouns[0])



In [13]:
# Find nouns contained in both languages; the result follows the order of the
# R list (nouns[0]).
# A set gives constant-time membership tests instead of rescanning the whole
# Python list (nouns[1]) for every R noun.
python_nouns = set(nouns[1])
both_langs = [n for n in nouns[0] if n in python_nouns]
print(both_langs)



In [16]:
# Number of cleaned, de-duplicated noun phrases per document, as (R, Python).
len(nouns[0]), len(nouns[1])


(2580, 5232)