Update scraper.py
blablupcom committed May 31, 2018
1 parent cefe65c · commit a9fcc24
Showing 1 changed file with 23 additions and 11 deletions.
scraper.py: 34 changes (23 additions & 11 deletions)
@@ -85,28 +85,40 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "E0931_ABC_gov"
-url = "https://www.allerdale.gov.uk/en/about-council/budget-and-spending/spending-council/"
+entity_id = "E2632_BDC_gov"
+url = "https://www.broadland.gov.uk/info/200197/spending_and_transparency/339/council_spending_over_250"
 errors = 0
 data = []
 
 
 #### READ HTML 1.0
 
 html = urllib2.urlopen(url)
-soup = BeautifulSoup(html, "lxml")
+soup = BeautifulSoup(html, 'lxml')
 
 #### SCRAPE DATA
 
-links = soup.find_all('a')
+links = soup.find('div', 'editor').find_all('a', href=True)
 for link in links:
-    file_name = link.text
-    if 'Spending' in file_name and '.csv' in link['href']:
-        url = link['href']
-        csvYr = file_name.replace('Spending ', '').strip()[-4:]
-        csvMth = file_name.replace('Spending ', '').strip()[:3]
-        csvMth = convert_mth_strings(csvMth.upper())
-        data.append([csvYr, csvMth, url])
+    if 'http' not in link['href']:
+        year_url = 'https://www.broadland.gov.uk' + link['href']
+    else:
+        year_url = link['href']
+    year_html = urllib2.urlopen(year_url)
+    year_soup = BeautifulSoup(year_html, 'lxml')
+    blocks = year_soup.find_all('span', 'download-listing__file-tag download-listing__file-tag--type')
+    for block in blocks:
+        if 'CSV' in block.text:
+            url = block.find_next('a')['href']
+            if 'http' not in url:
+                url = 'https://www.broadland.gov.uk' + url
+            else:
+                url = url
+            file_name = block.find_next('a')['aria-label']
+            csvMth = file_name.split()[-2][:3]
+            csvYr = file_name.split()[-1]
+            csvMth = convert_mth_strings(csvMth.upper())
+            data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
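In summary, the rewritten scraper no longer matches CSV links on the landing page by their 'Spending' link text; it now walks every link inside the page's 'editor' div, loads each linked year page, and keeps the download entries whose file-type tag says CSV, taking the month and year from the download link's aria-label. Below is a minimal sketch of the URL and aria-label handling; the BASE, href, and aria_label values are made up for illustration, the real ones come from the Broadland pages.

# Minimal sketch of the new link handling, with hypothetical inputs.
# In the real scraper, href and aria_label come from BeautifulSoup tags
# on the Broadland download pages.
BASE = 'https://www.broadland.gov.uk'

href = '/download/123/spending_over_250_september_2017.csv'  # hypothetical relative href
aria_label = 'Council spending over 250 September 2017'      # hypothetical aria-label text

# Relative links get the site root prepended, as in the commit above.
url = href if 'http' in href else BASE + href

# Month and year are the last two words of the aria-label; the month is
# cut to three letters and upper-cased before convert_mth_strings sees it.
csvMth = aria_label.split()[-2][:3].upper()   # 'SEP'
csvYr = aria_label.split()[-1]                # '2017'

print([csvYr, csvMth, url])
# ['2017', 'SEP', 'https://www.broadland.gov.uk/download/123/spending_over_250_september_2017.csv']

The three-letter abbreviation produced this way is what the scraper passes to convert_mth_strings before appending [csvYr, csvMth, url] to data.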
