In [1]:
import os

Get the list of scikit-learn documentation hyperlinks from `Book_of_Code.ipynb`.

In [2]:
import re
# Collect every scikit-learn documentation URL that appears as a Markdown
# link target "(https://scikit-learn...)" in the notebook file.
urls = []
# Raw string: "\(" is not a valid escape in a normal string literal.
sklearn_link_pattern = re.compile(r"\((https://scikit-learn.*?)\)")
# The file is read as plain text, line by line; UTF-8 is set explicitly so the
# result does not depend on the platform's default encoding.
with open('Book_of_Code.ipynb', encoding='utf-8') as f:
  for line in f:
    # findall returns all matches, so a line holding several links
    # contributes every one of them (and a line with none contributes nothing).
    urls.extend(sklearn_link_pattern.findall(line))

for url in urls:
  print(url)

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html?highlight=dictvectorizer
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidfvectorizer
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html?highlight=variancethreshold
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html?highlight=selectkbest
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html?highlight=selectpercentile
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.GenericUnivariateSelect.html?highlight=genericunivariateselect
https://scikit-learn.org/stable/modules/gene

In [3]:
def underline(text):
  """Return ``text`` wrapped in a single pair of asterisks.

  The value is interpolated with default string formatting, so non-string
  arguments are accepted as well.
  """
  # Earlier variant kept for reference (ANSI escapes instead of asterisks):
  #   f"\033[4m{text}\033[0m"
  return "*{}*".format(text)

In [4]:
import json
# from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
import requests
import re

In [5]:
def _format_entries(name_nodes, desc_nodes):
  """Pair name/description nodes into '**name** : description' strings."""
  return [
    f'**{name_node.text_content()}** : {desc_node.text_content()}'
    for name_node, desc_node in zip(name_nodes, desc_nodes)
  ]


def extract_data(url, timeout=30):
  """Download one documentation page and scrape its API summary.

  Parameters
  ----------
  url : str
    Address of the page. The class/function name is taken from the last
    dotted component before ".html" in this URL.
  timeout : float, optional
    Seconds passed to ``requests.get`` as its timeout, so a stalled
    connection cannot block the notebook forever.

  Returns
  -------
  tuple
    ``(class_name, signature, parameters, attrib_return, attributes, methods)``
    where ``parameters`` and ``methods`` are lists of Markdown-formatted
    strings, ``attrib_return`` is the stripped text of the header that
    precedes the second field (or None when there is no such field), and
    ``attributes`` is the list of entries of that field (or None).

  Raises
  ------
  requests.HTTPError
    If the server answers with an error status.
  IndexError
    If the URL does not contain a "<name>.html" component or the page has
    no class/function signature element.
  """
  response = requests.get(url, timeout=timeout)
  # Fail here with a clear HTTP error rather than later while parsing an error page.
  response.raise_for_status()

  # Raw string: "\." is not a valid escape in a normal string literal.
  class_name = re.findall(r'.+\.(.+)\.html', url)[0]

  tree = html.fromstring(response.content)

  signature_xpath = "//dl[@class='py class' or @class='py function']/dt[@class='sig sig-object py']"

  # The <dl> that owns the "Parameters" header; its first <dd> lists the
  # parameters and its second <dd> lists either attributes or return values.
  field_list_xpath = signature_xpath + "//following-sibling::dd" + "/dl/dt[contains(text(), 'Parameters')]/.."

  param_name_xpath = field_list_xpath + "/dd[1]/dl/dt/strong"
  param_desc_xpath = field_list_xpath + "/dd[1]/dl/dt/span"

  # Header (<dt>) immediately preceding the second <dd>.
  attrib_or_return = field_list_xpath + "/dd[2]/dl/dt/strong/ancestor::dd[position()=1]/preceding-sibling::dt[position()=1]"

  attrib_name_xpath = field_list_xpath + "/dd[2]/dl/dt/strong"
  attrib_desc_xpath = field_list_xpath + "/dd[2]/dl/dt/span"

  method_name_xpath = "//tbody/tr/td[1]"
  method_desc_xpath = "//tbody/tr/td[2]"

  # Signature
  signature_nodes = tree.xpath(signature_xpath)
  if not signature_nodes:
    raise IndexError(f"No class/function signature found at {url}")
  # Drops the last 9 characters of the signature text -- presumably the
  # trailing "[source]" link text plus the permalink symbol; TODO confirm.
  signature = signature_nodes[0].text_content()[:-9]

  # Parameters
  parameters = _format_entries(tree.xpath(param_name_xpath), tree.xpath(param_desc_xpath))
  if not parameters:
    print("No parameters found!")

  # Is the second field an attribute list or a return-value list?
  header_nodes = tree.xpath(attrib_or_return)
  if header_nodes:
    attrib_return = header_nodes[0].text_content().strip()
    attributes = _format_entries(tree.xpath(attrib_name_xpath), tree.xpath(attrib_desc_xpath))
    if not attributes:
      print("No attributes/return values found!")
  else:
    attrib_return = None
    attributes = None
    print("No return value or attributes!")

  # Methods
  methods = []
  for method_name, method_desc in zip(tree.xpath(method_name_xpath), tree.xpath(method_desc_xpath)):
    escaped_name = method_name.text_content().replace('*', '\\*')  # specifically to handle markdown.
    methods.append(f'**{escaped_name}** : {method_desc.text_content()}')
  if not methods:
    print("No methods found!")

  return class_name, signature, parameters, attrib_return, attributes, methods

In [6]:
def print_to_file(f, url, class_name, signature, parameters, attrib_return, attributes, methods):
  """Write one Markdown section for a scraped API entry to ``f``.

  Parameters
  ----------
  f : writable text file object
    Destination; everything this function emits is written to it.
  url : str
    Link target used for the section heading.
  class_name : str
    Text of the section heading.
  signature : str
    Signature line printed under the heading.
  parameters : list of str
    Pre-formatted parameter entries, one bullet each.
  attrib_return : str or None
    "Returns" selects the "List of return values:" heading; any other
    value selects "List of attributes:". When None, no entries are listed
    under that heading.
  attributes : list of str or None
    Pre-formatted attribute / return-value entries.
  methods : list of str
    Pre-formatted method entries, one bullet each.
  """
  print(f'\n\n## [{underline(class_name)}]({url})', file=f)

  print(f'  {signature}', file=f)
  print(f'\n  #### {underline("List of parameters:")}', file=f)
  for parameter in parameters or ():
    print(f'  - {parameter}', file=f)

  # If attrib_return is "Returns", then argument "attributes" is actually "return" values.
  if attrib_return == "Returns":
    print(f'\n  #### {underline("List of return values:")}', file=f)
  else:
    print(f'\n  #### {underline("List of attributes:")}', file=f)

  if attrib_return is not None:
    for attribute in attributes or ():
      print(f'  - {attribute}', file=f)

  print(f'\n  #### {underline("List of methods:")}', file=f)
  for method in methods or ():
    print(f'  - {method}', file=f)

  # __file__ is undefined when running inside a notebook kernel; fall back to
  # the notebook's name in that case. Either way the footer goes to ``f``.
  try:
    generator = os.path.basename(__file__)
  except NameError:
    generator = 'sklearn_apis.ipynb'
  print(f'\n\n<sub><sup>Auto-generated by {generator}</sup></sub>', file=f)

In [7]:
# url = 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html'
# url = 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html'
# url = 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html'

# Scrape every collected URL and write its Markdown section to sklearn_apis.md.
# The with-block closes the file even if a page fails to download or parse;
# UTF-8 is set explicitly so non-ASCII text is written the same way on every platform.
with open("sklearn_apis.md", "w", encoding="utf-8") as f:
  for url in urls:
    print(url)  # progress indicator
    class_name, signature, parameters, attrib_return, attributes, methods = extract_data(url)
    print_to_file(f, url, class_name, signature, parameters, attrib_return, attributes, methods)

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html?highlight=dictvectorizer
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
No return value or attributes!
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidfvectorizer
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html?highlight=variancethreshold
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html?highlight=selectkbest
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html?highlight=selectpercentile
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.GenericUnivariateSelect.html?highlight=genericunivariateselect
https://sciki