
Commit

Update scraper.py
blablupcom committed May 30, 2018
1 parent bca30d4 commit 251fd6b
Showing 1 changed file with 26 additions and 33 deletions.
scraper.py: 59 changes (26 additions & 33 deletions)
@@ -9,6 +9,7 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
 
+
 #### FUNCTIONS 1.0
 
 def validateFilename(filename):
@@ -81,52 +82,44 @@ def convert_mth_strings ( mth_string ):
         mth_string = mth_string.replace(k, v)
     return mth_string
 
+
 #### VARIABLES 1.0
 
-entity_id = "E1821_WCC_gov"
-url = "http://www.worcestershire.gov.uk/info/20024/council_finance/331/payments_to_commercial_suppliers_over_500_and_government_procurement_card_transactions"
+entity_id = "E2231_ABC_gov"
+url = "https://www.ashford.gov.uk/transparency/expenditure/supplier-invoices/"
 errors = 0
 data = []
 
+
 #### READ HTML 1.0
 
 html = urllib2.urlopen(url)
 soup = BeautifulSoup(html, 'lxml')
 
+
 #### SCRAPE DATA
 
-block = soup.find('div',{'id':'content'})
-links = block.findAll('a', text=re.compile('View Reports'))
-
+links = soup.find('section', attrs = {'class': 'main-article-area-7'}).find_all('a', href=True)
 for link in links:
-    if 'http' not in link['href']:
-        suburl = 'http://www.worcestershire.gov.uk' + link['href']
-    else:
-        suburl = link['href']
-    if 'payments_to_commercial_suppliers_over' in suburl:
-        html2 = urllib2.urlopen(suburl)
-        soup2 = BeautifulSoup(html2, 'lxml')
-        block = soup2.find('ul', {'class':'item-list item-list__rich'})
-        sublinks = block.findAll('a', href=True)
-        for sublink in sublinks:
-            filePageUrl = sublink['href']
-            title = sublink.encode_contents(formatter='html').replace('&nbsp;',' ')
-            title = title.upper().strip()
-            html3 = urllib2.urlopen(filePageUrl)
-            soup3 = BeautifulSoup(html3, 'lxml')
-            block = soup3.find('main',{'class':'main-content'})
-            filelinks = block.findAll('a', href=True)
-            for filelink in filelinks:
-                fileurl = filelink['href']
-                if 'Download' in filelink.text:
-                    csvYr = title.split(' ')[-1]
-                    csvMth = title.split(' ')[-2][:3]
-                    if ' - ' not in title:
-                        csvYr = title.split()[1]
-                        csvMth = title.split()[0][:3]
-                    csvMth = convert_mth_strings(csvMth)
-                    data.append([csvYr, csvMth, fileurl])
+    if ('.xlsx' in link['href'] or '.csv' in link['href'] or '.xlx' in link['href']) and ('Supplier Spend' in link.text or 'Supplier spend' in link.text):
+        if 'http' not in link['href']:
+            url = 'https://www.ashford.gov.uk'+link['href']
+        else:
+            url = link['href']
+        file_name = link.text.strip()
+        if 'April - June' in file_name:
+            csvMth = 'Q2'
+        if 'July - Sept' in file_name:
+            csvMth = 'Q3'
+        if 'October - December' in file_name or 'Oct - Dec' in file_name:
+            csvMth = 'Q4'
+        if 'Jan - Mar' in file_name:
+            csvMth = 'Q1'
+        if ' - ' not in file_name:
+            csvMth = 'Y1'
+        csvYr = file_name.split('/')[0][-4:]
+        csvMth = convert_mth_strings(csvMth.upper())
+        data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
 
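The new Ashford loop derives a quarter code from the date range in the link text and a year from the text before the "/". A short trace with a hypothetical file name (the real link text on the Ashford page may differ):

    # Hypothetical link text; actual titles on the Ashford page may vary.
    file_name = 'Supplier spend 2017/18 April - June'

    file_name.split('/')[0][-4:]   # -> '2017', used as csvYr
    'April - June' in file_name    # -> True, so csvMth is set to 'Q2'
    # Row appended to data: ['2017', 'Q2', url]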

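Only the tail of convert_mth_strings is visible in this diff; the replace(k, v) call suggests it loops over a month-abbreviation mapping. A minimal sketch of such a helper, assuming a dict of three-letter abbreviations to two-digit month numbers (the actual body is not shown in the hunk):

    def convert_mth_strings(mth_string):
        # Hypothetical reconstruction: swap three-letter month abbreviations
        # for two-digit month numbers; codes like 'Q2' or 'Y1' pass through unchanged.
        months = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
                  'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
                  'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
        for k, v in months.items():
            mth_string = mth_string.replace(k, v)
        return mth_string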
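The script imports urllib2, so it targets Python 2. If it were ported to Python 3, the READ HTML step would use urllib.request instead; a minimal sketch under that assumption:

    # Python 3 equivalent of the READ HTML block (sketch only; the committed script is Python 2).
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    html = urlopen(url)                 # url as set in the VARIABLES section
    soup = BeautifulSoup(html, 'lxml')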