<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Scrape-URLs" data-toc-modified-id="Scrape-URLs-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Scrape URLs</a></span><ul class="toc-item"><li><span><a href="#Base-Page" data-toc-modified-id="Base-Page-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Base Page</a></span></li><li><span><a href="#Collect-all-URLs-for-each-report" data-toc-modified-id="Collect-all-URLs-for-each-report-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Collect all URLs for each report</a></span><ul class="toc-item"><li><span><a href="#Sanity-Check" data-toc-modified-id="Sanity-Check-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Sanity Check</a></span></li></ul></li></ul></li><li><span><a href="#Download-and-extract-files" data-toc-modified-id="Download-and-extract-files-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Download and extract files</a></span></li><li><span><a href="#File-Types" data-toc-modified-id="File-Types-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>File Types</a></span><ul class="toc-item"><li><span><a href="#zip-files" data-toc-modified-id="zip-files-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>zip files</a></span></li><li><span><a href="#txt-files" data-toc-modified-id="txt-files-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>txt files</a></span></li></ul></li></ul></div>

In [1]:
from bs4 import BeautifulSoup
import requests
import requests_cache

import pandas as pd
import numpy as np
import os

from urllib.parse import urljoin
from tqdm import tqdm

import time
import zipfile

import matplotlib.pyplot as plt
import seaborn as sns

from glob import glob

In [2]:
# start requests cache:
requests_cache.install_cache('cms_cache')

# Scrape URLs
## Base Page

In [3]:
# base_url = 'https://www.cms.gov/'
start_url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData'

In [4]:
r = requests.get(start_url)
soup = BeautifulSoup(r.text, 'html.parser')

In [5]:
left_menu = soup.find("div", {"id": 'block-cmsmainnavigation'})
reports = left_menu.findAll("li", {"class": 'menu-item'})
report_urls = [r.find("a")['href'] for r in reports]
report_urls[:5]

['/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract-Plan-State-County',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Plan',
 '/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-State']

In [6]:
# verify that there are 26 report urls
len(report_urls)

26

## Collect all URLs for each report

In [7]:
def get_page_soup(url, params = {'items_per_page':100}): 
    r = requests.get(url,params=params)    
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract_num_entries(soup):
    #regex, or BS?
    entries_text = soup.find('span', {'class': 'ds-l-sm-col--12'}).text.strip()
    num_entries = entries_text.split(' ')[-2]    
    return int(num_entries)

def extract_monthly_report_urls(soup):
    
    # extract table rows
    table = soup.find('div', {'class':'view-content'})
    body = soup.find('tbody')
    rows = body.findAll('tr')
    
    # local data storage object
    reports = []
    
    # extract row wise data
    for row in rows:
        cells = row.findAll('td')
        
        report = {'report_period':cells[1].text,
                  'report_title':cells[0].text,
                  'report_url':cells[0].a['href']}
        reports.append(report)
    
    return reports

def extract_downloads(soup):
    # extract table rows
    rows = soup.findAll('div', {'class':'media--view-mode-file-list'})
    
    # local data storage object
    reports = {'report_period':[],
               'report_title':[],
               'download_url':[]}
    
    # extract row wise data
    for row in rows:

        reports['report_period'] = np.nan
        reports['report_title'].append(row.text.strip())
        reports['download_url'].append(row.a['href'])
    
    # return as DataFrame
    df = pd.DataFrame(reports)
    
    return df

def extract_page_title(soup):
    return soup.find(id='skipNavTarget').text.strip()

def extract_description(soup):
    return soup.find('div', {'id':'block-cms-drupal-global-content'}).div.p.text

def is_download_page(soup):
    if soup.find('h2', {'class':'field__label'}):
        return True
    return False

def is_links_list_pate(soup):
    
    if soup.find('span', {'class': 'ds-l-sm-col--12'}):
        return True
    
    return False

def get_all_pages(url, recursion=0, verbose=True):
    # page can be select with `page` and `items_per_page` params
    
    if verbose:
        print('Accessing', url)
        print('--> recursion:', recursion)

    params = {'items_per_page':100}
    soup = get_page_soup(url, params)
    
    # test for download page
    #if is_download_page(soup):
    if not is_links_list_pate(soup):
        if verbose:
            print('--> extracting downloads')
        
        df = extract_downloads(soup)
        df['download_page_url'] = url
        return df
    else:
        if verbose:
            print('--> getting links to download pages')
        # get number of entries
        num_entries = extract_num_entries(soup)

        # get first page
        reports = extract_monthly_report_urls(soup)    

        # recurse to get all pages of download urls
        # 0 indexed page id
        page = 0
        while len(reports) < num_entries:
            page += 1
            params['page'] = page
            soup = get_page_soup(url, params)
            reports += extract_monthly_report_urls(soup)
                
        dfs = []
        for report in reports:
            next_url = urljoin(start_url, report['report_url'])
            dfs.append(get_all_pages(next_url, recursion+1, verbose))
            
        df = pd.concat(dfs)
        
        df['description'] = extract_description(soup)
        df['page_title'] = extract_page_title(soup)
        df['page_url'] = url
        return df

In [8]:
test_df = get_all_pages('https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/SNP-HEDIS-Public-Use-Files', verbose=False)

In [9]:
test_df = get_all_pages('https://www.cms.gov/research-statistics-data-and-systemsstatistics-trends-and-reportsmcradvpartdenroldatamonthly/contract-summary-2020-12', verbose=True)

Accessing https://www.cms.gov/research-statistics-data-and-systemsstatistics-trends-and-reportsmcradvpartdenroldatamonthly/contract-summary-2020-12
--> recursion: 0
--> extracting downloads


In [10]:
test_df.head()

Unnamed: 0,report_period,report_title,download_url,download_page_url
0,,Monthly Contract Summary Report – December 202...,/files/zip/monthly-contract-summary-report-dec...,https://www.cms.gov/research-statistics-data-a...


In [11]:
# get separate dataframes of all the report urls.
dfs = []

In [12]:
for report_url in tqdm(report_urls):
    url=urljoin(base_url,report_url)
    dfs.append(get_all_pages(url, verbose=False))
    time.sleep(1)

  0%|          | 0/26 [00:00<?, ?it/s]


NameError: name 'base_url' is not defined

In [None]:
visited_urls = []
for df in dfs:
    visited_urls += df.

In [None]:
df = pd.concat(dfs).reset_index(drop=True)
df.head()

### Sanity Check

In [None]:
df['page_url'].nunique()

In [None]:
df['page_title'].nunique()

In [None]:
df['download_url'].nunique()

I need to look into why there are so many report titles and what that means for storing these reports in a database.

In [None]:
os.makedirs('data', exist_ok=True)
df.to_csv('data/cms-file-links.csv', index=False)

# Download and extract files

Download and extract files in a single step to avoid overtaxing the server.

In [None]:
# Test download first file.
df = pd.read_csv('data/cms-file-links.csv')
df.head()

In [None]:
def pop_filename_from_url(url):
    return url.split('/')[-1]

def download_file(url, save_path='files', verbose=False):
    
    # create files dir if it doesn't exist
    os.makedirs(save_path, exist_ok=True)
    
    filename = pop_filename_from_url(url)
    file_path = os.path.join(save_path, filename)
    
    if os.path.exists(file_path):
        if verbose:
            print(filename, 'already exists')
    else:
        r = requests.get(url)
        
        # error if we didn't get the file
        r.raise_for_status()
        
        file = open(file_path, "wb")
        file.write(r.content)
        file.close()
        if verbose:
            print(filename, 'downloaded')

In [None]:
DOWNLOAD_FILES_PATH = 'files'

In [None]:
for url in tqdm(df.download_url):
    url = urljoin(start_url, url)
    download_file(url, DOWNLOAD_FILES_PATH)

In [None]:
len(os.listdir(DOWNLOAD_FILES_PATH))

There appear to be two filename errors that repeat:
- files ending in zip-0 (this could indicate multi-part zip files)
- extension missing `.` separator

Below we deal with these by creating a custom splitext function.

In [None]:
def is_zip(filename):
    base, ext = os.path.splitext(filename) 
    
    if ext == '.zip' or ext == '.zip-0':
        return True
    elif filename.endswith('zip'):
        return True
    else:
        return False
    
def splitext(filename):

    base, ext = os.path.splitext(filename)
    
    # missing .
    if ext == '' and filename.endswith('zip'):
        ext = '.' + filename[-3:]
        return filename[:-4], ext 
    
    if ext == '.zip-0':
        ext = '.zip'
    return base, ext

In [None]:
EXTRACTED_ZIP_FILES_PATH = 'zip_extract'

# unzip
notzip = []

for url in tqdm(df.download_url):
    filename = pop_filename_from_url(url)
    
    report_name, ext = splitext(filename) 
        
    # print(report_name, ext)
    if ext == '.zip':
        extract_path = os.path.join(EXTRACTED_ZIP_FILES_PATH, report_name)

        # print(f'unzipping {filename} to {extract_path}')
        
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
            
            file_path = os.path.join(DOWNLOAD_FILES_PATH, filename)
            
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
#         else:
#             print(filename, 'already extracted')
    else:
        # treat as extracted file and copy to extracted folders dir
        os.copy(file_path, extract_path)
        notzip.append(filename)

In [None]:
notzip

# File Types

In [None]:
file_type_counts = {}

for (root,dirs,files) in os.walk(EXTRACTED_ZIP_FILES_PATH):
    for file in files:
        base, ext = os.path.splitext(file)
        ext = ext.lower()
        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

In [None]:
file_type_counts

In [None]:
ft_counts = pd.DataFrame([file_type_counts.keys(), file_type_counts.values()]).T
ft_counts.columns = ['ext', 'count']
ft_counts

In [None]:

sns.barplot(x='count',
            y='ext',
            data=ft_counts.sort_values('count'),
            color='#E8AE68')

plt.title('Number of files by type (from CMS)')


## zip files
were the zip files in other zip files?

In [None]:
zips = glob(os.path.join(EXTRACTED_ZIP_FILES_PATH, '**/*.zip'), recursive=True)
zips

In [None]:
for z in zips:
    path, filename = os.path.split(z)
    print(os.listdir(path))

Some files seem to have alternative versions with alt in the filename.  Some exploration is required to find out how the alternative files differ.

See how many `alt` files there are

In [None]:
alts = glob(os.path.join(EXTRACTED_ZIP_FILES_PATH, '**Alt**/*.*'), recursive=True)
alts[:5]

In [None]:
# number of alternative files
len(alts)

## txt files
Are the text files delimited tables?

In [None]:
txts = glob(os.path.join(EXTRACTED_ZIP_FILES_PATH, '**/*.txt'), recursive=True)

In [None]:
txt = np.random.choice(txts)
print(txt)
pd.read_csv(np.random.choice(txts), delimiter='\t')

In [None]:
# Most txt files seem to be content descriptions
pd.read_csv('zip_extract/Monthly-Enrollment-by-Contract-April-2008./readme_monthly_report_by_Contract.txt', delimiter='\t')

In [None]:
# some txt files contain data
pd.read_csv('zip_extract/PBP-Benefits-2016./pbp_Section_D_opts.txt', delimiter='\t')

In [None]:
os.listdir('zip_extract/PBP-Benefits-2016.')

In [None]:
# maybe the sas files are not sas files?
pd.read_sas('zip_extract/PBP-Benefits-2016./pbp_Section_D_opt.sas', format='sas7bdat' )