### Import libraries

In [1]:
from pathlib import Path
from urllib.request import urlopen, Request

import pandas as pd
from bs4 import BeautifulSoup

### Scrape the Pandas website to get the URLs and titles of all the chapters

In [2]:
# Download the pandas user-guide index page and parse it, so that the sidebar
# links to every chapter can be collected from `soup`.
url = "https://pandas.pydata.org/docs/user_guide/index.html#user-guide"
# Open the response as a context manager so the HTTP connection is closed
# once BeautifulSoup has read the page.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Find the div element(s) with class `bd-toc-item navbar-nav`.
container_div = soup.find_all("div", class_="bd-toc-item navbar-nav")

# For each of those divs, gather its 'a' tags with class `reference internal`;
# all_links ends up holding one result set per matching div.
all_links = [
    div.find_all("a", class_="reference internal")
    for div in container_div
]

### Put the contents into a dataframe

In [4]:
# Put the links into a dataframe and extract the important information.
# Only the links found in the first matching container are used.
link_tags = all_links[0]

# Read the href and the link text directly from the parsed tags instead of
# running regular expressions over their HTML string form: a regex such as
# 'href=(.*?)>' also captures any attribute that follows href, and
# '>(.*?)</a>' returns nested markup / escaped entities rather than the text.
df = pd.DataFrame({
    'links': [str(tag) for tag in link_tags],         # raw HTML of each <a> tag
    'href': [tag.get('href') for tag in link_tags],   # None when the attribute is missing
    'link_text': [tag.get_text() for tag in link_tags],
})

In [5]:
# Preview the first rows to check the extracted `href` and `link_text` columns.
df.head()

Unnamed: 0,links,href,link_text
0,"<a class=""reference internal"" href=""10min.html...",10min.html,10 minutes to pandas
1,"<a class=""reference internal"" href=""dsintro.ht...",dsintro.html,Intro to data structures
2,"<a class=""reference internal"" href=""basics.htm...",basics.html,Essential basic functionality
3,"<a class=""reference internal"" href=""io.html"">I...",io.html,"IO tools (text, CSV, HDF5, …)"
4,"<a class=""reference internal"" href=""pyarrow.ht...",pyarrow.html,PyArrow Functionality


### Define a function to get the contents from all the chapters using their URL

In [6]:
# function to get content out of the links
def get_content(href, base_url="https://pandas.pydata.org/docs/user_guide/"):
    """Download one user-guide chapter and return its article content.

    Parameters
    ----------
    href : str
        Chapter page, appended to ``base_url`` (for example ``"10min.html"``).
    base_url : str, optional
        Prefix of the chapter URLs. Defaults to the pandas user-guide root.

    Returns
    -------
    content : bs4 element
        The ``article`` element with class ``bd-article``, after its
        ``highlight-ipython notranslate`` divs and ``table`` tables have been
        removed.
    text : str
        The text of ``content`` as a single line: text fragments are stripped
        and joined, and line breaks are replaced by spaces.

    Raises
    ------
    ValueError
        If the page contains no ``article`` element with class ``bd-article``.
    """
    url = base_url + href
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')

    # Find the content
    content = soup.find("article", class_="bd-article")
    if content is None:
        # Fail with an explicit message instead of an AttributeError on None below.
        raise ValueError(f"No <article class='bd-article'> element found at {url}")

    # Find all div elements with the class 'highlight-ipython notranslate'
    # within the article and remove from content
    for div in content.find_all('div', class_='highlight-ipython notranslate'):
        div.decompose()

    # Find all table elements with the class 'table'
    # within the article and remove from content
    for table in content.find_all('table', class_='table'):
        table.decompose()

    # Get the remaining content of the article element and flatten it
    # onto one line.
    text = content.get_text(separator="\n", strip=True)
    text = text.strip().replace("\n", " ").replace("\r", " ")

    return content, text

### Use the above function to scrape the contents and save them for future use

In [7]:
# Scrape every chapter listed in `df` and keep one record per chapter:
# [full url, link text, parsed article, flattened article text].
contents = []
for href, title in zip(df['href'], df['link_text']):
    article, article_text = get_content(href)
    contents.append([
        'https://pandas.pydata.org/docs/user_guide/' + href,
        title,
        article,
        article_text,
    ])

In [8]:
# Turn the scraped records into a dataframe, naming each field of a record.
columns = [
    'url',
    'link_text',
    'content',
    'content_text',
]
contents_df = pd.DataFrame(data=contents, columns=columns)

In [9]:
# Save the dataframe to csv for future use.
output_path = Path('../data/pandas_user_guide.csv')
# Writing fails if the target folder is missing, so create it first
# (no-op when it already exists).
output_path.parent.mkdir(parents=True, exist_ok=True)
contents_df.to_csv(output_path, index=False)