In [161]:
import pandas as pd
import numpy as np
import requests
import random
import IPython.display
from bs4 import BeautifulSoup

In [162]:
# Pages to scrape for quiz material. Each entry is a 2-tuple of
# (page URL, CSS selector): get_all_questions() unpacks the pair and
# get_questions() downloads the URL and uses the selector to pick the
# page's main section element.
QUIZ_CATEGORIES = [
    ('https://pandas.pydata.org/docs/reference/general_functions.html', '#general-functions'),
    ('https://pandas.pydata.org/docs/reference/frame.html', '#dataframe'),
    ('https://pandas.pydata.org/docs/reference/series.html', '#series'),
]

In [163]:
def get_quiz_data(main_section):
    """Yield one quiz record per usable table row found under ``main_section``.

    Parameters
    ----------
    main_section
        Parsed HTML element (a BeautifulSoup tag in this notebook) exposing
        ``.h1``, ``.select()`` and, on its descendants, ``.h2``, ``.find()``
        and ``.find_all()``.

    Yields
    ------
    dict
        Keys ``category`` (text of the ``<h1>``), ``section`` (text of the
        enclosing ``<section>``'s ``<h2>``), ``method`` (the link's ``title``),
        ``method_url`` (the link's ``href``) and ``question`` (text of the
        row's second ``<td>``).

    Notes
    -----
    Sections without an ``<h2>`` are skipped. Rows that do not have the
    expected link / second cell are skipped one at a time, so a single
    malformed row no longer discards the remaining rows of its section.
    """
    # Headings are stripped of leading/trailing "#" (presumably the permalink
    # marker appended to header text by the docs theme -- TODO confirm).
    category = main_section.h1.text.strip("#")
    for section in main_section.select("section"):
        heading = section.h2
        if heading is None:
            # No heading to name this section with; nothing useful to emit.
            continue
        section_name = heading.text.strip("#")
        for tr in section.select("tr"):
            try:
                # Look the link up once; it supplies both the name and the URL.
                link = tr.find(class_=["reference", "internal"])
                record = {
                    "category": category,
                    "section": section_name,
                    "method": link['title'],
                    "method_url": link['href'],
                    "question": tr.find_all("td")[1].text,
                }
            except (AttributeError, IndexError, KeyError, TypeError):
                # Row has no link, no title/href, or fewer than two cells:
                # it is not quiz material, so skip just this row.
                continue
            # Yield outside the try so errors raised by the consumer are
            # never swallowed by the row-level handler.
            yield record


def get_questions(api_url, section_id, timeout=30):
    """Download one reference page and yield its quiz rows as a DataFrame.

    Parameters
    ----------
    api_url : str
        URL of the HTML page to download.
    section_id : str
        CSS selector identifying the page's main section; the first match
        is passed to ``get_quiz_data``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Yields
    ------
    pandas.DataFrame
        A single frame built from the records of ``get_quiz_data``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If nothing on the page matches ``section_id``.

    Notes
    -----
    This is a generator: no request is made until it is iterated.
    """
    response = requests.get(api_url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's guessed-parser warning).
    soup = BeautifulSoup(response.content, "html.parser")
    matches = soup.select(section_id)
    if not matches:
        raise IndexError(f"no element matching {section_id!r} found at {api_url}")
    yield pd.DataFrame(get_quiz_data(matches[0]))


def get_all_questions():
    """Yield the question DataFrame(s) for every entry in ``QUIZ_CATEGORIES``.

    Entries are processed in list order; each one is unpacked into a page URL
    and a section selector and delegated to ``get_questions``.
    """
    for entry in QUIZ_CATEGORIES:
        page_url, anchor = entry
        yield from get_questions(page_url, anchor)


def question_df():
    """Build the combined table of quiz questions.

    Concatenates every frame produced by ``get_all_questions``, removes rows
    whose ``question`` text contains the literal ``"DEPRECATED"``, drops any
    row with a missing value, and returns the result.

    Returns
    -------
    pandas.DataFrame
        The filtered questions with a unique, contiguous 0..n-1 index.
    """
    # ignore_index: each per-page frame starts at 0, so a plain concat would
    # leave repeated index labels in the combined frame.
    questions = pd.concat(get_all_questions(), ignore_index=True)
    # let's not ask deprecated questions; na=False keeps the mask strictly
    # boolean even if a question is missing (such rows are dropped below).
    is_deprecated = questions['question'].str.contains("DEPRECATED", regex=False, na=False)
    return questions.loc[~is_deprecated].dropna().reset_index(drop=True)

In [165]:
# Scrape the pages listed in QUIZ_CATEGORIES (performs network requests) and
# keep the resulting question table in `df`.
df = question_df()
# Persist the table as CSV at a path relative to the current working
# directory; with to_csv's default index=True the index is written as the
# first column.
df.to_csv("PandasMethods.csv")