# VBA Files

### Pulling Excel Files from the Website

#### The steps of this are going to be:

#### Pulling the files to my computer from 2014 to present at https://www.benefits.va.gov/reports/detailed_claims_data.asp and
#### historical files at https://www.benefits.va.gov/REPORTS/mmwr/historical/{year}/index.asp from 2004 to 2013

#### Reading in the table that's initially the only table

#### Cleaning it to add file information and standardize the columns

#### Analytics

### Libraries/Setup

In [18]:
import pandas as pd
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
import win32com.client
os.chdir(r'C:\Users\admin\Documents\Python Scripts\veteranLawData')

### Get File URLs From By-Year (Archive) Pages

In [10]:
def pullFilesByYear(year):
    """Collect spreadsheet links from one year's historical archive page.

    Parameters
    ----------
    year : int or str
        Year interpolated into the archive index URL.

    Returns
    -------
    list of str
        Every anchor ``href`` on the page that contains the substring
        "xls", exactly as written in the page markup.
    """
    url = f'https://www.benefits.va.gov/REPORTS/mmwr/historical/{str(year)}/index.asp'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    filesToGet = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None; skip them explicitly rather
        # than letting a blanket except swallow the resulting TypeError.
        if isinstance(href, str) and "xls" in href:
            filesToGet.append(href)
    return filesToGet

# Archive index pages are scraped for 2004 through 2013 inclusive.
yearRange = [*range(2004, 2014)]
# One list of hrefs per year, kept separate so the per-year counts can be checked.
filesList = [pullFilesByYear(archiveYear) for archiveYear in yearRange]
# Single flat list of every archive href, in year order.
flatFileList = [href for yearLinks in filesList for href in yearLinks]

### Get File URLs From Main, Current Page

In [11]:
def pullFilesFromSite():
    """Collect spreadsheet links from the current detailed-claims page.

    Returns
    -------
    list of str
        Every anchor ``href`` on the page that contains the substring
        "xls", exactly as written in the page markup.
    """
    url = 'https://www.benefits.va.gov/reports/detailed_claims_data.asp'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    filesToGet = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None; skip them explicitly rather
        # than letting a blanket except swallow the resulting TypeError.
        if isinstance(href, str) and "xls" in href:
            filesToGet.append(href)
    return filesToGet

# Spreadsheet hrefs scraped from the current detailed-claims page (the non-archive source).
filesList2=pullFilesFromSite()
    

### Check We Have The Right Number From By-Year Archive Pages: 2008 Is Legitimately Missing Three, I Checked

In [12]:
# Print how many links were found for each archive year, one count per line.
for yearLinks in filesList:
    linkCount = len(yearLinks)
    print(linkCount)

52
52
52
52
49
52
52
52
52
53


### Check We Have The Right Number from Top, Recent Page

In [13]:
# Keep the second underscore-delimited piece of each href that mentions 2018
# and whose piece contains "18".
files2018 = []
for href in filesList2:
    if "2018" not in href:
        continue
    datePiece = href.split("_")[1]
    if "18" in datePiece:
        files2018.append(datePiece)
len(files2018)

54

### This Looks Pretty Good, We Might Need to Go Back and Check At Some Point

In [14]:
# Order the pieces lexicographically so gaps in the weekly sequence are easy to spot.
files2018 = sorted(files2018)
files2018

['01-01-18.xlsx',
 '01-08-18.xlsx',
 '01-16-18.xlsx',
 '01-22-18.xlsx',
 '01-29-18.xlsx',
 '02-05-18.xlsx',
 '02-12-18.xlsx',
 '02-19-18.xlsx',
 '02-26-18.xlsx',
 '03-05-18.xlsx',
 '03-12-18.xlsx',
 '03-19-18.xlsx',
 '03-26-18.xlsx',
 '04-02-18.xlsx',
 '04-09-18.xlsx',
 '04-16-18.xlsx',
 '04-23-18.xlsx',
 '04-28-18.xlsx',
 '05-07-18.xlsx',
 '05-14-18.xlsx',
 '05-21-18.xlsx',
 '05-29-18.xlsx',
 '06-04-18.xlsx',
 '06-11-18.xlsx',
 '06-18-18.xlsx',
 '06-25-18.xlsx',
 '07-09-18.xlsx',
 '07-14-18.xlsx',
 '07-23-18.xlsx',
 '07-30-18.xlsx',
 '08-06-18.xlsx',
 '08-13-18.xlsx',
 '08-20-2018.xlsx',
 '08-27-2018.xlsx',
 '09-04-18.xlsx',
 '09-10-18.xlsx',
 '09-17-18.xlsx',
 '09-24-18.xlsx',
 '10-01-18.xlsx',
 '10-08-18',
 '10-15-18',
 '10-22-18',
 '10-29-18',
 '11-05-18',
 '11-13-18',
 '11-19-18',
 '11-26-18.xlsx',
 '12-03-18.xlsx',
 '12-10-18.xlsx',
 '12-17-18.xlsx',
 '12-18-17.xlsx',
 '12-24-18.xlsx',
 '12-31-18.xlsx',
 '7-2-18.xlsx']

### Append The Lists of Files

In [15]:
# Archive hrefs first, then the hrefs from the current page.
allList = [*flatFileList, *filesList2]

### Pull The Files

In [16]:
#there are two different formats for how the file URLs are structured; this picks up both
#this takes the longest to run, and I don't need to run it every time because I already have the files

def pullFile(url):
    """Download one spreadsheet into the current working directory.

    Parameters
    ----------
    url : str
        An href scraped from the site, either site-relative
        (``/REPORTS/...``) or already absolute (``https://...``).

    Returns
    -------
    str or None
        The local file name (last path segment of the URL) on success,
        or None when the download or write failed; in that case the
        href is printed so it can be retried by hand.
    """
    from urllib.parse import urljoin

    # urljoin prefixes site-relative hrefs with the host and leaves absolute
    # URLs untouched, so both link formats go through a single request.
    cleanLink = urljoin('https://www.benefits.va.gov', url)
    try:
        r = requests.get(cleanLink, allow_redirects=True)
        name = cleanLink.split("/")[-1]
        # Context manager closes the handle even if the write fails.
        with open(name, 'wb') as outFile:
            outFile.write(r.content)
    except Exception:
        # Best-effort batch download: report the failing link and carry on.
        print(url)
        return None
    return name

#listOfFileNames=list(map(pullFile, allList))

### Read Them Into A Dictionary: There are a bunch of different types

In [24]:
#there are a few different formats for the content we want, identifiable by tab names 
#and whether it errors out for being encrypted
def getEncyptedFiles(fileName):
    """Read the aggregate table from a workbook by driving Excel over COM.

    Used for workbooks that cannot be read directly with pandas.

    Parameters
    ----------
    fileName : str
        Workbook name, resolved against the current working directory
        (an absolute path is used as-is).

    Returns
    -------
    pandas.DataFrame
        Cells B9:R80 of the 'Traditional Aggregate' sheet, or of
        'TA-Regional Office' when the former is missing. The first row of
        that range is used for the column labels and also remains as the
        first data row.
    """
    fullFileName = os.path.join(os.getcwd(), fileName)
    excel = win32com.client.Dispatch('Excel.Application')
    workbook = excel.Workbooks.open(fullFileName)
    try:
        try:
            sheet = workbook.WorkSheets('Traditional Aggregate')
        except Exception:
            # Some workbooks name the same tab differently.
            sheet = workbook.WorkSheets(r'TA-Regional Office')
        content = sheet.Range(sheet.Cells(9, 2), sheet.Cells(80, 18)).Value
        # Transfer content to pandas dataframe
        df = pd.DataFrame(list(content))
        df.columns = df.iloc[0]
    finally:
        # Always release the workbook, even when the sheet lookup or range
        # read fails, so it is not left open in the Excel instance.
        # NOTE(review): Close(True) asks Excel to save changes -- confirm
        # that is intended for what is otherwise a read-only extraction.
        workbook.Close(True)
    return df

def readBasicFile(fileName):
    """Read the aggregate table from one downloaded workbook.

    The workbook is first read with pandas. Sheet choice, in priority
    order: 'Traditional Aggregate' (header on row index 7), then
    'Final Aggregate' (header on row index 7), then the only sheet when
    the workbook has exactly one. If pandas cannot open the file, or none
    of those cases applies, the Excel COM reader ``getEncyptedFiles`` is
    tried instead.

    Parameters
    ----------
    fileName : str
        Path of the workbook to read.

    Returns
    -------
    pandas.DataFrame or None
        The table that was read, or None when both readers failed.
        Prints "worked1" / "worked2" to show which reader succeeded and
        "failed on <fileName>" when neither did.
    """
    df = None
    try:
        # Open the workbook once and close the handle when done.
        with pd.ExcelFile(fileName) as xl:
            sheetNames = xl.sheet_names
            if 'Traditional Aggregate' in sheetNames:
                df = xl.parse(sheet_name='Traditional Aggregate', header=7)
            elif 'Final Aggregate' in sheetNames:
                df = xl.parse(sheet_name='Final Aggregate', header=7)
            elif len(sheetNames) == 1:
                df = xl.parse()
    except Exception:
        # Unreadable by pandas (e.g. protected workbook): use the COM reader.
        df = None

    if df is not None:
        print("worked1")
        return df

    try:
        df = getEncyptedFiles(fileName)
    except Exception:
        print(f"failed on {fileName}")
        return None
    print("worked2")
    return df
        
# Every entry in the working directory whose name contains "xls" (covers .xls and .xlsx).
downloadedFiles = list(filter(lambda entryName: "xls" in entryName, os.listdir()))
len(downloadedFiles)

796

In [32]:
#we're going to randomly pull 50 and see how it goes
import random

# random.sample raises ValueError when asked for more items than exist,
# so cap the sample at the number of files actually present.
sampleSize = min(50, len(downloadedFiles))
myRandoms = random.sample(range(len(downloadedFiles)), sampleSize)
myRandomFileNames = [downloadedFiles[i] for i in myRandoms]

In [33]:
# Read each sampled workbook; entries are None where no reader succeeded.
myRandomDFs = [readBasicFile(sampledName) for sampledName in myRandomFileNames]

worked1
worked1
worked1
worked1
worked1
worked1
worked2
worked1
worked2
worked1
worked1
worked2
worked1
worked1
worked1
worked2
worked1
worked1
worked2
worked1
worked1
worked1
worked1
worked1
worked1
worked2
worked1
worked1
worked1
worked2
worked1
worked2
worked1
worked2
worked1
worked1
worked2
worked1
worked2
worked1
worked1
worked1
worked1
worked1
worked1
worked1
worked1
worked1
worked1
worked1
worked1
worked2
worked1
worked2
worked1
worked1
worked1
worked1
worked1
worked1
worked2


In [35]:
# Map each sampled file name to the frame that was read from it.
dictionaryOfFiles = {
    sampledName: frame
    for sampledName, frame in zip(myRandomFileNames, myRandomDFs)
}

### Append Into Master DF

In [36]:


# Stack every successfully-read frame into one master frame, tagging each
# row with the file it came from. Files that cannot be processed are
# reported and skipped.
masterDF=pd.DataFrame()
listOfPhrasesToFind=["As of", "For the Week"]
for key, df in dictionaryOfFiles.items():
    if df is None:
        # No frame was produced for this file, so there is nothing to append.
        print(f'failed on {key}')
        continue
    try:
        df['fileName']=key
        # NOTE: the substring tests below raise TypeError when a column label
        # is not a string, which sends the file to the except branch.
        foundPhrase=[i for i in df.columns if any(substring in i for substring in listOfPhrasesToFind)]
        dropPhrase=[i for i in df.columns if "Percent" in i]
        #print(len(foundPhrase))
        #df["forTheWeek"]=df[foundPhrase[0]]
        #df=df.drop([foundPhrase[0]], axis=1)
        #df=df.drop(dropPhrase, axis=1)
        # pd.concat is the supported replacement for the removed DataFrame.append.
        masterDF=pd.concat([masterDF, df], sort=True)
        #print(key)
    except Exception:
        print(f'failed on {key}')
        

failed on MMWR_2015-05-25.xlsx
failed on MMWL_06-30-14.xls
failed on 111213.xls
failed on MMWR_2015-08-24_b.xlsx
failed on MMWR_2015-12-21_b.xlsx
failed on MMWR_2015-01-26.xlsx
failed on MMWL_11-17-14.xls
failed on MMWR_08-13-18.xlsx
failed on MMWR_10-2-2017.xlsx
failed on MMWR 6-24-17.xlsx
failed on MMWR 4-22-17.xlsx
failed on MMWR_2015-10-05_b.xlsx
failed on MMWR_09-04-18.xlsx
failed on MMWR_8-28-17.xlsx
failed on MMWR_2015-12-28_b.xlsx
failed on MMWR_2015-07-27_b.xlsx


#### I see, looking at these, that what's happening is that I'm reading the wrong tabs. This is the next thing to look at

#### I think part of the issue is that it's not just that the tab name changes from file to file:

#### the same tab name may appear across multiple worksheets, and sometimes it's the right one and sometimes it isn't.

#### We have to decide: do we want to write increasingly elaborate if/then logic, or do we want to read in all the tabs and tell it to look for

#### specific kinds of content?

### Let's look at the ones that are reading in - there's a lot to clean, but this looks the way we would want it to

In [39]:
# Display the combined frame's column labels.
masterDF.columns

Index([' Pending', ' Pending.1', ' Rating Cases Pending',
       ' Rating Claims Pending', '* Scorecard  Non-Rating Cases Pending',
       '* Scorecard  Non-Rating Claims Pending', 'Accrued  ', 'Appeals',
       'As of: \nApril 07, 2007\n  ', 'As of: \nApril 12, 2008\n  ',
       'As of: \nApril 19, 2008\n  ', 'As of: \nApril 28, 2007\n  ',
       'As of: \nApril 29, 2006\n  ', 'As of: \nFebruary 03, 2007\n  ',
       'As of: \nFebruary 17, 2007\n  ', 'As of: \nJan 3, 2009\n  ',
       'As of: \nJanuary 22, 2005\n  ', 'As of: \nJanuary 31,2009\n  ',
       'As of: \nJuly 14, 2007\n  ', 'As of: \nJuly 28, 2007\n  ',
       'As of: \nJune 20, 2009', 'As of: \nJune 3, 2006\n  ',
       'As of: \nMarch 31, 2007\n  ', 'As of: \nMarch 7, 2009\n  ',
       'As of: \nMay 15, 2009', 'As of: \nMay 28, 2005\n  ',
       'As of: \nMay 6, 2006\n  ', 'As of: \nSeptember 12,  2009',
       'As of: \nSeptember 16, 2006\n  ', 'Award Adjustment ', 'Burial  ',
       'C&P Claims over 180 Days ', 'C&P Cla

### We want to see similar lengths of the dataframe by fileName to make sure we're not cutting anything off

In [43]:
# Rows contributed by each file, then how many files share each row count.
rowsPerFile = masterDF['fileName'].value_counts()
rowsPerFile.value_counts().sort_index()

70    10
71     3
72     5
73     1
77     1
78     2
89     5
92     7
Name: fileName, dtype: int64

### Let's look at a 70 just to see if it's cutting anything off

In [57]:
# Row count per source file, sorted ascending; take the first (fewest rows).
rowCountByFile = masterDF.groupby(['fileName']).size()
firstShortFile = rowCountByFile.sort_values().index[0]
firstShortFile

'091806.xls'

In [59]:
# Rows from that file only, dropping columns that are entirely empty for it.
isShortFile = masterDF['fileName'] == firstShortFile
masterDF.loc[isShortFile].dropna(how='all', axis=1)

Unnamed: 0,"As of: September 16, 2006",C&P Claims over 180 Days in WIPP,COEs Issued,Date of Oldest Pending COE,Education Work Items Pending,Guarantees Pending,IVMs Pending (EP 154 and EP 314),Non-Rating Cases Pending over 180 Days,Percent Pending over 180 Days,Percent Pending over 180 Days.1,Percent Pending over 180 Days.2,Pre-Discharge Claims,Rating Cases Pending over 180 Days,SOC's,Scorecard Non-Rating Cases Pending,Scorecard Rating Cases Pending,Total Appeals Requiring Adjudicative Action,Total C&P Pending in Wipp,VACOLS Appeals,fileName
0,USA,148500.000000,00:00:00,2006-09-14 00:00:00,131144,61.00,16944,47441.000000,0.231762,0.301684,0.249365,3621,89763.000000,18861.000000,157254.000000,387306.000000,133599,595512.000000,152460.000000,091806.xls
1,Last Week,151140.000000,00:00:00,2006-09-05 00:00:00,130566,50.00,17146,48220.000000,0.235399,0.303536,0.252600,3423,91639.000000,17868.000000,158861.000000,389293.000000,133852,598338.000000,151720.000000,091806.xls
2,Change from Last Week,-2640.000000,00:00:00,NaT,578,11.00,-202,-779.000000,-0.003636,-0.0018519,-0.003234,198,-1876.000000,993.000000,-1607.000000,-1987.000000,-253,-2826.000000,740.000000,091806.xls
3,Percent Change,-0.017467,,NaT,0.00442688,0.22,-0.0117812,-0.016155,,,,0.057844,-0.020472,0.055574,-0.010116,-0.005104,-0.00189015,-0.004723,0.004877,091806.xls
4,Last Year,98077.000000,00:00:00,2005-09-12 00:00:00,113162,823.00,14511,18101.000000,0.209433,0.154126,0.194092,8906,73487.000000,21913.000000,117443.000000,350885.000000,130158,505311.000000,152071.000000,091806.xls
5,EASTERN AREA,,,NaT,,,,,,,,,,,,,,,,091806.xls
6,Baltimore,1691.000000,,NaT,,,8,613.000000,0.213000,0.343,0.245000,23,960.000000,206.000000,1789.000000,4499.000000,1383,6895.000000,1589.000000,091806.xls
7,Boston,1462.000000,,NaT,,,2,182.000000,0.248000,0.154,0.228000,0,1196.000000,328.000000,1180.000000,4830.000000,1749,6403.000000,2077.000000,091806.xls
8,Buffalo,1841.000000,,NaT,38352,,8,358.000000,0.264000,0.19,0.245000,17,1357.000000,197.000000,1880.000000,5139.000000,1130,7516.000000,1327.000000,091806.xls
9,Cleveland,5976.000000,,NaT,,46.00,1,1375.000000,0.292000,0.333,0.300000,14,4408.000000,778.000000,4130.000000,15075.000000,2973,19899.000000,3751.000000,091806.xls


### that looks good, and I confirmed from looking at the file that there aren't additional cities being cut off at the bottom 