Update scraper.py

blablupcom · Jul 24, 2018 · 7461f95 · 7461f95
1 parent ec4e42b
commit 7461f95
Showing 1 changed file with 20 additions and 24 deletions.
diff --git a/scraper.py b/scraper.py
@@ -44,8 +44,7 @@ def validateURL(url):
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
             r = urllib2.urlopen(url)
-        sourceFilename = r.headers.get('Content-Disposition')
-
+        sourceFilename = r.geturl()
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
@@ -85,8 +84,8 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "E1232_CBC_gov"
-url = "https://www.dorsetforyou.gov.uk/your-council/about-your-council/budgets-and-spending/open-data-and-transparency/payments-to-suppliers-christchurch-borough-council.aspx"
+entity_id = "E1732_EHDC_gov"
+url = "https://www.easthants.gov.uk/transparency"
 errors = 0
 data = []
 
@@ -98,26 +97,23 @@ def convert_mth_strings ( mth_string ):
 
 #### SCRAPE DATA
 
-links = soup.find('main', id='main').find_all('li')
-for link in links:
-    if 'http' not in link.find('a')['href']:
-        url = 'https://www.dorsetforyou.gov.uk/' + link.find('a')['href'][1:]
-    else:
-        url = link.find('a')['href'][1:]
-    if '.xlsx' in url or '.xls' in url or '.csv' in url:
-        file_name = link.text.strip()
-        csvYr = link.text.strip()[-4:]
-        if 'Q4' in file_name:
-            csvMth = 'Q1'
-        if 'Q3' in file_name:
-            csvMth = 'Q4'
-        if 'Q2' in file_name:
-            csvMth = 'Q3'
-        if 'Q1' in file_name:
-            csvMth = 'Q2'
-        csvMth = convert_mth_strings(csvMth.upper())
-        data.append([csvYr, csvMth, url])
-
+ul_blocks = soup.find('div', attrs = {'property': 'content:encoded'}).find_all('ul')
+for ul_block in ul_blocks:
+    li_blocks = ul_block.find_all('li')
+    for li_block in li_blocks:
+        link = li_block.find('a', href=True)
+        if link:
+            url = link['href']
+            if '-csv-' in url:
+                file_name = link.text
+                if 'http' not in url:
+                    url = 'https://www.easthants.gov.uk'+link['href']
+                else:
+                    url = link['href']
+                csvMth = file_name[:3]
+                csvYr = file_name.split()[1]
+                csvMth = convert_mth_strings(csvMth.upper())
+                data.append([csvYr, csvMth, url])
 
 #### STORE DATA 1.0