Skip to content

Commit

Permalink
create scraper
Browse files (browse the repository at this point in the history)
  • Loading branch information
blablupcom committed Jun 10, 2015
1 parent e50c736 commit 6bfd964
Show file tree
Hide file tree
Showing 11 changed files with 452 additions and 18 deletions.
1 change: 1 addition & 0 deletions .idea/.name

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/ncl-tenders.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions .idea/scopes/scope_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

395 changes: 395 additions & 0 deletions .idea/workspace.xml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion dn16b.csv

This file was deleted.

7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,3 +1,4 @@
pandas
scraperwiki == 0.4.1
beautifulsoup4
-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki
lxml==3.4.4
cssselect==0.9.1
beautifulsoup4
30 changes: 16 additions & 14 deletions scraper.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
import sys
reload(sys) # Reload does the trick!
sys.setdefaultencoding('UTF8')
from datetime import datetime
import urllib
#import BeautifulSoup
from bs4 import BeautifulSoup
from bs4 import NavigableString
import csv
import time
from pandas import read_csv
import scraperwiki

def get_links_list (source_url):
Expand Down Expand Up @@ -148,12 +148,12 @@ def get_attachments (tender_soup):

todays_date = str(datetime.now())
portals = [
#'https://www.londontenders.org/procontract/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.bluelight.gov.uk/procontract/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.londontenders.org/procontract/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
#'https://www.bluelight.gov.uk/procontract/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.fxplustenders.org/procontract/fxplus/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.lppsourcing.org/procontract/lpp/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.advantageswtenders.co.uk/procontract/advantage/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
## 'https://www.bankofenglandtenders.co.uk/procontract/BankOfEngland/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.bankofenglandtenders.co.uk/procontract/BankOfEngland/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.channelislandtenders.com/procontract/channelislands/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.eastmidstenders.org/procontract/emp/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
# 'https://www.eastridingcontractsfinder.co.uk/procontract/eastriding/supplier.nsf/frm_planner_search_results?OpenForm&contains=&cats=&order_by=DATE&all_opps=CHECK&org_id=ALL',
Expand Down Expand Up @@ -185,10 +185,12 @@ def get_attachments (tender_soup):
]


df = read_csv("dn16b.csv") # use pandas to open csv
saved_urls = df['tender_url'].values.tolist() # convert column to list

resultFile = open("dn16b.csv",'a')
# saved_urls = df.values.tolist() # convert column to list
# df = read_csv("dn16b.csv") # use pandas to open csv
# saved_urls = df['tender_url'].values.tolist() # convert column to list
#
# resultFile = open("dn16b.csv",'a')


for portal in portals:
Expand Down Expand Up @@ -234,11 +236,12 @@ def get_attachments (tender_soup):

attach_list = []
attach_list = get_attachments(tender_soup)
scraperwiki.sqlite.save(unique_keys=['l'], data={"l":unicode(link), "tender_id": unicode(tender_id), "buyer": unicode(buyer), "title" : unicode(title), "categories": unicode(categories), "contact_name": unicode(contact_name), "contact_phone": unicode(contact_phone), "contact_addr": unicode(contact_addr), "contact_email": unicode(contact_email), "contract_start": contract_start, "contract_end": contract_end, "eoi_start": eoi_start, "eoi_end": eoi_end, "est_value": unicode(est_value), "contract_duration": unicode(contract_duration), "extension_duration": unicode(extension_duration), "extension_iterations": unicode(extension_iterations), "summary": unicode(summary), "attach_list": unicode(attach_list),"d": todays_date })

csv_row = [link, tender_id,buyer,title,summary,categories,contact_name,contact_phone,contact_email,contact_addr,contract_start,contract_end,eoi_start,eoi_end,est_value,contract_duration,extension_duration,extension_iterations,attach_list]

wr = csv.writer(resultFile, quoting=csv.QUOTE_ALL, delimiter=',')
wr.writerow(csv_row)
# csv_row = [link, tender_id,buyer,title,summary,categories,contact_name,contact_phone,contact_email,contact_addr,contract_start,contract_end,eoi_start,eoi_end,est_value,contract_duration,extension_duration,extension_iterations,attach_list]
#
# wr = csv.writer(resultFile, quoting=csv.QUOTE_ALL, delimiter=',')
# wr.writerow(csv_row)



Expand All @@ -249,4 +252,3 @@ def get_attachments (tender_soup):




0 comments on commit 6bfd964

Please sign in to comment.