# VBA Files

### Pulling Excel Files from the Website

In [1]:
# There are weekly files from 2014 to present on https://www.benefits.va.gov/reports/detailed_claims_data.asp

# There are historical files at https://www.benefits.va.gov/REPORTS/mmwr/historical/{year}/index.asp from 2004 to 2013

# Presumably the first step will be just pulling all of them

### Libraries/Setup

In [2]:
import pandas as pd
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
# Work from the project data folder so downloaded workbooks are written to, and
# later read from, one place. Set VETERAN_LAW_DATA_DIR to run the notebook on
# another machine; the original hard-coded location remains the default.
os.chdir(os.environ.get('VETERAN_LAW_DATA_DIR',
                        r'C:\Users\admin\Documents\Python Scripts\veteranLawData'))

### Get File URLs

In [3]:
def pullTheFiles(year):
    """Return the href of every Excel link on one year's historical index page.

    Parameters
    ----------
    year : int or str
        Year inserted into the historical-reports index URL.

    Returns
    -------
    list of str
        The ``href`` values, exactly as written in the page, that contain
        "xls" (so ".xlsx" links match too), in page order.
    """
    url= f'https://www.benefits.va.gov/REPORTS/mmwr/historical/{str(year)}/index.asp'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    filesToGet=[]
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href give None; skip those explicitly rather than
        # hiding every possible error behind a bare ``except: pass``.
        if isinstance(href, str) and "xls" in href:
            filesToGet.append(href)
    return filesToGet

# Years 2004 through 2013 inclusive (the range end is exclusive).
yearRange = [*range(2004, 2014)]
# One list of hrefs per year, kept nested so the per-year counts can be checked.
filesList = [pullTheFiles(yr) for yr in yearRange]
# Every href in a single flat list, preserving year order.
flatFileList = []
for yearly in filesList:
    flatFileList.extend(yearly)

### Check We Have The Right Number: 2008 Is Legitimately Missing Three, I Checked

In [4]:
# Print the number of links found for each year, one count per line.
for linkCount in map(len, filesList):
    print(linkCount)

52
52
52
52
49
52
52
52
52
53


### Pull The Files

In [4]:
def pullFile(url):
    """Download one workbook into the current directory and return its file name.

    Parameters
    ----------
    url : str
        Href that is appended directly to ``https://www.benefits.va.gov``.

    Returns
    -------
    str
        The last path segment of the full URL, which is also the name of the
        local file that was written.
    """
    cleanLink='https://www.benefits.va.gov'+url
    r = requests.get(cleanLink, allow_redirects=True)
    name=cleanLink.split("/")[-1]
    # The context manager guarantees the handle is flushed and closed; the bare
    # open(...).write(...) form left that to garbage collection.
    # NOTE(review): the HTTP status is not checked, so an error page would be
    # saved under the workbook's name -- confirm whether that is acceptable.
    with open(name, 'wb') as fileHandle:
        fileHandle.write(r.content)
    return name

# Download every collected link; keep the local file names in the same order.
filesNameList = [pullFile(href) for href in flatFileList]

### Read Them Into A Dictionary

In [5]:
def readBasicFile(fileName):
    """Read a workbook with pandas' default settings.

    Parameters
    ----------
    fileName : str
        Path of the Excel file to read.

    Returns
    -------
    pandas.DataFrame or None
        Whatever ``pd.read_excel(fileName)`` returns, or ``None`` when the read
        raises. A failure is reported on stdout together with its cause.
    """
    try:
        return pd.read_excel(fileName)
    except Exception as err:
        # Catch Exception rather than using a bare except so Ctrl-C
        # (KeyboardInterrupt) still stops a long batch, and report the reason so
        # failures can be diagnosed from the log.
        print(f"failed on {fileName}: {type(err).__name__}: {err}")
        return None

# Parse each downloaded workbook, then key every result by its file name.
listOfDFs = [readBasicFile(workbook) for workbook in filesNameList]
dictionaryOfFiles = {workbook: frame for workbook, frame in zip(filesNameList, listOfDFs)}

failed on 102113.xls
failed on 102813.xls
failed on 110413.xls
failed on 111213.xls
failed on 111813.xls
failed on 112513.xls
failed on 120213.xls
failed on 120913.xls
failed on 121613.xls
failed on 122313.xls
failed on 123013.xls


### Append Into Master DF

In [6]:
# Stack every workbook that has a recognisable week/region column into one table.
# Each frame is tagged with its source file name, the week/region column is renamed
# to "forTheWeek", and every column whose name contains "Percent" is dropped.
listOfPhrasesToFind=["As of", "For the Week"]  # loop-invariant, so defined once
framesToStack=[]
for key, rawDF in dictionaryOfFiles.items():
    try:
        if rawDF is None:
            raise ValueError("workbook could not be read")
        # Work on a copy so re-running this cell never sees columns added by a
        # previous run to the frames cached in dictionaryOfFiles.
        df=rawDF.copy()
        df['fileName']=key
        foundPhrase=[i for i in df.columns if any(substring in i for substring in listOfPhrasesToFind)]
        dropPhrase=[i for i in df.columns if "Percent" in i]
        # foundPhrase[0] raises IndexError when no column matches; that file is
        # then reported below and skipped.
        df["forTheWeek"]=df[foundPhrase[0]]
        df=df.drop([foundPhrase[0]], axis=1)
        df=df.drop(dropPhrase, axis=1)
        framesToStack.append(df)
    except Exception as err:
        # Catch Exception, not a bare except, so an interrupt still stops the
        # loop; include the reason so each skipped file can be diagnosed.
        print(f'failed on {key}: {type(err).__name__}: {err}')

# DataFrame.append was removed in pandas 2.0, and appending inside the loop re-copied
# the whole table on every pass; concatenate once instead.
masterDF=pd.concat(framesToStack, sort=True) if framesToStack else pd.DataFrame()

failed on NewMMWL_100509.xls
failed on 101309.xls
failed on 101909.xls
failed on 102609.xls
failed on 110209.xls
failed on 110909.xls
failed on 111609.xls
failed on 112309.xls
failed on 113009.xls
failed on 120709.xls
failed on 121409.xls
failed on 122109.xls
failed on 122809.xls
failed on 010410.xlsx
failed on 040510.xlsx
failed on 070610.xlsx
failed on 100410.xlsx
failed on 011110.xlsx
failed on 041210.xlsx
failed on 071210.xlsx
failed on 101210.xlsx
failed on 011910.xlsx
failed on 041910.xlsx
failed on 071910.xlsx
failed on 101810.xlsx
failed on 012510.xlsx
failed on 042610.xlsx
failed on 072610.xlsx
failed on 102510.xlsx
failed on 020110.xlsx
failed on 050310.xlsx
failed on 080210.xlsx
failed on 110110.xlsx
failed on 020810.xlsx
failed on 051010.xlsx
failed on 080910.xlsx
failed on 110810.xlsx
failed on 021610.xlsx
failed on 051710.xlsx
failed on 081610.xlsx
failed on 111510.xlsx
failed on 022210.xlsx
failed on 052410.xlsx
failed on 082310.xlsx
failed on 112210.xlsx
failed on 03011

In [7]:
# List the combined column names; names that differ only by spacing or
# capitalisation appear as separate columns here.
masterDF.columns

Index([' ', ' Rating Cases Pending', ' Rating Claims Pending',
       '* Scorecard  Non-Rating Cases Pending',
       '* Scorecard  Non-Rating Claims Pending', 'C&P Claims over 180 Days ',
       'C&P Claims over 180 Days in WIPP', 'C&P Work Items over 180 Days ',
       'COE', 'COEs Issued', 'Date of Oldest Pending COE',
       'Education Cases Pending ', 'Education Work Items Pending ',
       'Guarantees Pending', 'IVMs Pending (EP 154 and EP 314)',
       'Non-Rating Cases Pending over 180 Days',
       'Non-Rating Claims Pending over 180 Days', 'Pre-Discharge Claims',
       'Rating Cases Pending', 'Rating Cases Pending over 180 Days',
       'Rating Claims Pending over 180 Days', 'SOC's',
       'Scorecard Non-Rating Cases Pending', 'Scorecard Rating Cases Pending',
       'Total Appeals Requiring Adjudicative Action ',
       'Total Appeals Requiring Adjudicative action ', 'Total C&P Pending ',
       'Total C&P Pending in Wipp', 'Total C&P Work Items Pending ',
       'Unnamed:

### Status: the 2004–2009 files appear to load correctly. After that we need to look at the format of the 2009 worksheets and modify the reader accordingly; the 2004–2009 group may also need some column cleaning. This is a good stopping point.

### Are we missing lines? I spot-checked the first and last files and they looked good. The value counts for the regional variable look odd, but the overall row counts per file don't vary much.

In [23]:
# Number of rows each source workbook contributed to masterDF, indexed by file
# name in order of first appearance.
# Series.append was removed in pandas 2.0, and the old loop re-filtered the whole
# table once per file; a single groupby count yields the same lengths.
# rename_axis(None) keeps the index unnamed, as it was when built by hand.
seriesOfLength=masterDF.groupby('fileName', sort=False).size().rename_axis(None)

In [24]:
# Tally how many files share each row count.
seriesOfLength.value_counts()

71    84
72    83
70    78
68    24
78    15
73    11
dtype: int64

In [25]:
# List the files that contributed exactly 68 rows.
seriesOfLength.loc[seriesOfLength==68]

010504.xls    68
040504.xls    68
012004.xls    68
041204.xls    68
041904.xls    68
012604.xls    68
042604.xls    68
020204.xls    68
050304.xls    68
020904.xls    68
051004.xls    68
021704.xls    68
051704.xls    68
022304.xls    68
052404.xls    68
030104.xls    68
060104.xls    68
030804.xls    68
060704.xls    68
031504.xls    68
061404.xls    68
032204.xls    68
062104.xls    68
032904.xls    68
dtype: int64

### Looking at One With Count 68 : Looks Good

In [27]:
# Show the rows that came from one of the 68-row files.
masterDF.loc[masterDF['fileName']=='021704.xls']

Unnamed: 0,Unnamed: 1,Rating Cases Pending,Rating Claims Pending,* Scorecard Non-Rating Cases Pending,* Scorecard Non-Rating Claims Pending,C&P Claims over 180 Days,C&P Claims over 180 Days in WIPP,C&P Work Items over 180 Days,COE,COEs Issued,...,Total Appeals Requiring Adjudicative Action,Total Appeals Requiring Adjudicative action,Total C&P Pending,Total C&P Pending in Wipp,Total C&P Work Items Pending,Unnamed: 18,VACOLS Appeals,VACOLS Appeals includes AMC & Travel Board,fileName,forTheWeek
0,NaT,,,,,,102909.000000,,NaT,1906-11-01 00:00:00,...,,124637.000000,,478130.000000,,,,145668.000000,021704.xls,USA
1,NaT,,,,,,104386.000000,,NaT,1906-11-01 00:00:00,...,,125006.000000,,470643.000000,,,,145132.000000,021704.xls,Last Week
2,NaT,,,,,,-1477.000000,,NaT,00:00:00,...,,-369.000000,,7487.000000,,,,536.000000,021704.xls,Change from Last Week
3,NaT,,,,,,-0.014149,,NaT,,...,,-0.002952,,0.015908,,,,0.003693,021704.xls,Percent Change
4,NaT,,,,,,118461.000000,,NaT,1906-11-01 00:00:00,...,,97678.000000,,472859.000000,,,,121596.000000,021704.xls,Last Year
5,NaT,,,,,,,,NaT,,...,,,,,,,,,021704.xls,EASTERN AREA
6,NaT,,,,,,1890.000000,,NaT,,...,,1165.000000,,6161.000000,,,,1466.000000,021704.xls,Baltimore Regional Office
7,NaT,,,,,,2757.000000,,NaT,,...,,1568.000000,,7641.000000,,,,1806.000000,021704.xls,Boston Regional Office
8,NaT,,,,,,2147.000000,,NaT,,...,,2040.000000,,6959.000000,,,,2319.000000,021704.xls,Buffalo Regional Office
9,NaT,,,,,,3610.000000,,NaT,,...,,3110.000000,,12994.000000,,,,3960.000000,021704.xls,Cleveland Regional Office


### Now Let's Look at the Next Template // first failure to read in this way

#### The first failure examined here is 101309.xls (the failure log above also lists NewMMWL_100509.xls just before it). The workbook has two tabs; the second is called "Final Aggregate", and it differs from the earlier files in having more spacing and header rows above the data.

In [30]:
# Re-read one workbook that the basic reader could not process, this time
# targeting the "Final Aggregate" sheet explicitly.
fileName="101309.xls"
# header=7 takes the column names from the eighth row of the sheet (0-indexed).
# NOTE(review): 7 was found by inspecting this one file -- confirm it holds for
# the other workbooks that use this layout.
df=pd.read_excel(fileName, sheet_name='Final Aggregate', header=7)

In [31]:
# Display the parsed "Final Aggregate" sheet to check the header row and columns.
df

Unnamed: 0,Unnamed: 1,Claims Pending,Pending over 125 days,Percent Pending over 125 days,Pending,Pending over 125 days.1,Percent Pending over 125 days.1,Pending.2,Pending over 125 days.2,Percent Pending over 125 days.2,Pending.1,Pending over 125 days.3,Percent Pending over 125 days.3,Claims Pending.1,Claims Pending.2,Pending.1.1
,USA,393195,144017,0.366274,147439,46948,0.318423,22479,9040,0.402153,28425,15381,0.541108,3391,446,175257
,EASTERN AREA,83069,33642,0.404989,27495,7737,0.281397,5674,2828,0.498414,5074,2994,0.590067,618,101,31335
,Baltimore,8647,4387,0.507344,3424,1949,0.569217,1979,1239,0.626074,473,308,0.651163,29,14,2117
,Boston,4274,1203,0.281469,896,173,0.19308,460,121,0.263043,309,199,0.644013,56,5,2170
,Buffalo,5172,2324,0.449343,1982,348,0.17558,281,98,0.348754,241,99,0.410788,114,4,762
,Cleveland,10208,4055,0.397237,2372,300,0.126476,457,99,0.21663,476,293,0.615546,7,0,6098
,Detroit,11541,5577,0.483234,2792,546,0.195559,408,190,0.465686,960,697,0.726042,13,0,4040
,Hartford,1561,404,0.258808,968,58,0.0599174,108,50,0.462963,78,37,0.474359,3,5,1054
,Indianapolis,10353,4714,0.455327,4303,1700,0.395073,473,296,0.625793,613,221,0.360522,33,15,2841
,Manchester,1264,409,0.323576,423,50,0.118203,2,0,0,103,38,0.368932,52,0,664


In [None]:
### This is working fine; the open question is whether we can hardcode header=7 or need to detect the header row more flexibly.