<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Scrape-URLs" data-toc-modified-id="Scrape-URLs-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Scrape URLs</a></span><ul class="toc-item"><li><span><a href="#Base-Page" data-toc-modified-id="Base-Page-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Base Page</a></span></li><li><span><a href="#Collect-all-URLs-for-each-report" data-toc-modified-id="Collect-all-URLs-for-each-report-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Collect all URLs for each report</a></span><ul class="toc-item"><li><span><a href="#Sanity-Check" data-toc-modified-id="Sanity-Check-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Sanity Check</a></span></li></ul></li></ul></li><li><span><a href="#Download-and-extract-files" data-toc-modified-id="Download-and-extract-files-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Download and extract files</a></span></li></ul></div>

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Scrape URLs
## Base Page

In [2]:
# Landing page on cms.gov; its navigation menu is scraped below for the links to each report page.
base_url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData'

In [3]:
# Fetch the landing page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the real content.
r = requests.get(base_url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [4]:
# Locate the navigation block, then gather the link target of every menu entry.
left_menu = soup.find("div", {"id": 'block-cmsmainnavigation'})
reports = left_menu.findAll("li", {"class": 'menu-item'})
report_urls = [
    menu_item.find("a")['href']
    for menu_item in reports
]
# Preview the first few (site-relative) links.
report_urls[:5]

['/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract-Plan-State-County',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Plan',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-State']

In [5]:
# Sanity check: the navigation menu is expected to list 26 report pages.
# This only displays the count; nothing is asserted.
len(report_urls)

26

## Collect all URLs for each report

In [6]:
def get_page_soup(url, params=None, timeout=30):
    """Fetch ``url`` and return the response body parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters. Defaults to ``{'items_per_page': 100}``;
        a fresh dict is built on every call so the default can never be
        mutated between calls.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    if params is None:
        params = {'items_per_page': 100}
    r = requests.get(url, params=params, timeout=timeout)
    # Fail here instead of handing an error page to the extractors.
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')

def extract_num_entries(soup):
    """Return the entry count read from the page's summary span.

    The text of the first ``<span class="ds-l-sm-col--12">`` is stripped and
    split on single spaces; the second-to-last token is converted to ``int``.
    """
    summary_span = soup.find('span', {'class': 'ds-l-sm-col--12'})
    tokens = summary_span.text.strip().split(' ')
    # presumably the text ends like "... <count> entries" - verify against the live page
    return int(tokens[-2])
    
def extract_monthly_report_urls(soup):
    """Collect period, title and link for every row of the page's report table.

    Rows are taken from the first ``<tbody>`` in ``soup``. In each row the
    first cell supplies the title text and the link target, and the second
    cell supplies the report period. Cell text is stored as-is (not stripped).

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed report listing page.

    Returns
    -------
    pandas.DataFrame
        Columns ``report_period``, ``report_title``, ``report_url``; one row
        per ``<tr>``.
    """
    # extract table rows (searched on the whole document; the former
    # 'view-content' lookup was never used and has been dropped)
    rows = soup.find('tbody').findAll('tr')

    # local data storage object
    reports = {'report_period': [],
               'report_title': [],
               'report_url': []}

    # extract row wise data
    for row in rows:
        cells = row.findAll('td')
        title_cell, period_cell = cells[0], cells[1]
        reports['report_period'].append(period_cell.text)
        reports['report_title'].append(title_cell.text)
        reports['report_url'].append(title_cell.a['href'])

    # return as DataFrame
    return pd.DataFrame(reports)

def extract_single_file_download(soup):
    """Collect title and link for every file entry on a single-download page.

    Each ``<div class="media--view-mode-file-list">`` becomes one row. These
    pages carry no report period, so ``report_period`` is NaN for every row.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed single-download page.

    Returns
    -------
    pandas.DataFrame
        Columns ``report_period``, ``report_title``, ``report_url``.
    """
    # extract file entries
    rows = soup.findAll('div', {'class': 'media--view-mode-file-list'})

    # local data storage object
    reports = {'report_period': [],
               'report_title': [],
               'report_url': []}

    # extract row wise data
    for row in rows:
        # Append one NaN per row so all three lists stay the same length,
        # instead of replacing the list with a scalar and relying on pandas
        # to broadcast it.
        reports['report_period'].append(np.nan)
        reports['report_title'].append(row.text.strip())
        reports['report_url'].append(row.a['href'])

    # return as DataFrame
    return pd.DataFrame(reports)

def extract_page_title(soup):
    """Return the stripped text of the element whose id is ``skipNavTarget``."""
    title_element = soup.find(id='skipNavTarget')
    title_text = title_element.text
    return title_text.strip()

def extract_description(soup):
    """Return the text of the first ``<p>`` inside the main content block.

    The block is the ``<div id="block-cms-drupal-global-content">``; the
    paragraph is reached through that div's first nested ``<div>``. The text
    is returned unstripped.
    """
    content_block = soup.find('div', {'id': 'block-cms-drupal-global-content'})
    first_paragraph = content_block.div.p
    return first_paragraph.text

def is_single_file_download(soup):
    """Report whether the page contains an ``<h2 class="field__label">``.

    Returns ``True`` when such a heading is found and ``False`` otherwise;
    callers use this to pick the single-download extractor.
    """
    label_heading = soup.find('h2', {'class': 'field__label'})
    return bool(label_heading)

def get_all_pages(url):
    """Collect every report link listed under ``url`` into one DataFrame.

    Single-download pages are read with ``extract_single_file_download``.
    Paginated listings are walked page by page (100 items per page) until
    the number of collected rows reaches the count reported by
    ``extract_num_entries``.

    Parameters
    ----------
    url : str
        Absolute URL of a report page.

    Returns
    -------
    pandas.DataFrame
        The extracted rows plus ``description``, ``page_title`` and
        ``page_url`` columns. Description and title are read from the last
        page that yielded rows. Row labels are not reset, so they repeat
        across pages.
    """
    import warnings

    # a page can be selected with the `page` and `items_per_page` params
    params = {'items_per_page': 100}
    soup = get_page_soup(url, params)

    # test for single download
    if is_single_file_download(soup):
        df = extract_single_file_download(soup)
    else:
        # get number of entries
        num_entries = extract_num_entries(soup)

        # get first page; keep the frames in a list and concatenate once
        frames = [extract_monthly_report_urls(soup)]
        num_rows = len(frames[0])

        # 0 indexed page id
        page = 0
        while num_rows < num_entries:
            page += 1
            params['page'] = page
            next_soup = get_page_soup(url, params)
            next_frame = extract_monthly_report_urls(next_soup)

            # A page without rows means the listing holds fewer rows than it
            # reports; stop instead of requesting pages forever.
            if len(next_frame) == 0:
                warnings.warn(
                    'Expected %d entries for %s but found only %d'
                    % (num_entries, url, num_rows)
                )
                break

            soup = next_soup
            frames.append(next_frame)
            num_rows += len(next_frame)

        df = pd.concat(frames)

    df['description'] = extract_description(soup)
    df['page_title'] = extract_page_title(soup)
    df['page_url'] = url
    return df

In [8]:
# Get a separate DataFrame of report links for every report page.
dfs = []

for idx, report_url in enumerate(report_urls):

    # the scraped links are site-relative, so prefix the host
    url = 'https://www.cms.gov' + report_url
    print(idx, url)

    # Scrape each page exactly once; the previous version called
    # get_all_pages twice per URL and discarded the first result.
    pages = get_all_pages(url)
    dfs.append(pages)

0 https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report
1 https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract
2 https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract-Plan-State-County
3 https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Plan
4 https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-State
5 https://www.cms.gov/research-statistics-data-systems/medicare-advantagepart-d-contract-and-enrollment-data/monthly-online-enrollment-center-oec-report
6 https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Month

In [9]:
# Stack the per-page frames into a single table. The index is not reset,
# so row labels restart for each source frame (see the repeated 0, 1, 2, ... in the output).
df = pd.concat(dfs)
df

Unnamed: 0,report_period,report_title,report_url,description,page_title,page_url
0,2021-05,Contract Summary,/research-statistics-data-and-systemsstatistic...,Monthly Contract and Enrollment Summary Report,Monthly Contract and Enrollment Summary Report,https://www.cms.gov/Research-Statistics-Data-a...
1,2021-04,Contract Summary,/research-statistics-data-and-systemsstatistic...,Monthly Contract and Enrollment Summary Report,Monthly Contract and Enrollment Summary Report,https://www.cms.gov/Research-Statistics-Data-a...
2,2021-03,Contract Summary,/research-statistics-data-and-systemsstatistic...,Monthly Contract and Enrollment Summary Report,Monthly Contract and Enrollment Summary Report,https://www.cms.gov/Research-Statistics-Data-a...
3,2021-02,Contract Summary,/research-statistics-data-and-systemsstatistic...,Monthly Contract and Enrollment Summary Report,Monthly Contract and Enrollment Summary Report,https://www.cms.gov/Research-Statistics-Data-a...
4,2021-01,Contract Summary,/research-statistics-data-and-systemsstatistic...,Monthly Contract and Enrollment Summary Report,Monthly Contract and Enrollment Summary Report,https://www.cms.gov/Research-Statistics-Data-a...
...,...,...,...,...,...,...
1,,CAP Summary Report - Updated 12/05/2012 (PDF),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
2,,CAP Detail Data File - Abbreviated Version - U...,/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
3,,CAP Detail Report - Updated 12/05/2012 (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
4,,CAP Read-Me File (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...


### Sanity Check

In [10]:
# Number of distinct report pages represented in the combined table.
df['page_url'].nunique()

26

In [11]:
# Number of distinct page titles; compare with the distinct page URL count.
df['page_title'].nunique()

26

In [12]:
# Number of distinct report titles across all pages.
df['report_title'].nunique()

119

**TODO:** Investigate why there are so many distinct report titles (119 across 26 report pages) and what that means for storing these reports in a database.

In [13]:
# Rows without a report period, i.e. those produced by extract_single_file_download.
df[df['report_period'].isna()]

Unnamed: 0,report_period,report_title,report_url,description,page_title,page_url
0,,MPF-OEC-Enrollment-Data (ZIP),/files/zip/mpf-oec-enrollment-data.zip,This dataset provides the total number of enro...,Monthly Online Enrollment Center (OEC) Report,https://www.cms.gov/research-statistics-data-s...
0,,MA Plan Directory as of May 2021 (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,"Plan Directory for MA, Cost, PACE, and Demo Or...",MA Plan Directory,https://www.cms.gov/Research-Statistics-Data-a...
0,,PDP Plan Directory as of May 2021 (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,Title: PDP Plan Directory,PDP Plan Directory,https://www.cms.gov/Research-Statistics-Data-a...
0,,MA Claims Processing Contacts as of May 2021 (...,/Research-Statistics-Data-and-Systems/Statisti...,Title: MA Claims Processing Contacts,MA Claims Processing Contacts,https://www.cms.gov/Research-Statistics-Data-a...
0,,CAP Detail Data File - Updated 12/05/2012 (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
1,,CAP Summary Report - Updated 12/05/2012 (PDF),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
2,,CAP Detail Data File - Abbreviated Version - U...,/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
3,,CAP Detail Report - Updated 12/05/2012 (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
4,,CAP Read-Me File (ZIP),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...
5,,CAP Overview (PDF),/Research-Statistics-Data-and-Systems/Statisti...,Title: Corrective Action Plans,Corrective Action Plans,https://www.cms.gov/Research-Statistics-Data-a...


# Download and extract files

Download and extract files in a single step to avoid overtaxing the server.