Commit

first commit
blablupcom committed Nov 25, 2015
1 parent 90837a5 commit ed65042
Showing 1 changed file with 8 additions and 10 deletions.
scraper.py: 18 changes (8 additions & 10 deletions)
@@ -1,17 +1,14 @@
# -*- coding: utf-8 -*-

#### IMPORTS 1.0
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import os
import re
import scraperwiki
import urllib2
from datetime import datetime
from bs4 import BeautifulSoup
import requests
from dateutil.parser import parse


#### FUNCTIONS 1.0

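The reload(sys) / sys.setdefaultencoding('utf-8') lines in the import block above are a Python 2 idiom: site.py deletes setdefaultencoding at interpreter startup, and reloading sys restores it so the default encoding used for implicit str/unicode conversion can be switched from ASCII to UTF-8. A standalone sketch of the effect, assuming Python 2 (illustrative only, not part of this commit):

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# sys module has to be reloaded to expose it again before changing the default.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# With the default ASCII codec this concatenation raises UnicodeDecodeError;
# with UTF-8 the byte string decodes cleanly and the result is u'caf\xe9 menu'.
print ('caf\xc3\xa9' + u' menu')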
@@ -41,19 +38,20 @@ def validateFilename(filename):

def validateURL(url):
    try:
        r = requests.get(url, allow_redirects=True, timeout=20)
        r = urllib2.urlopen(url)
        count = 1
        while r.status_code == 500 and count < 4:
        while r.getcode() == 500 and count < 4:
            print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
            count += 1
            r = requests.get(url, allow_redirects=True, timeout=20)
            r = urllib2.urlopen(url)
        sourceFilename = r.headers.get('Content-Disposition')

        if sourceFilename:
            ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
        else:
            ext = os.path.splitext(url)[1]
        validURL = r.status_code == 200
        validFiletype = ext in ['.csv', '.xls', '.xlsx']
        validURL = r.getcode() == 200
        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
        return validURL, validFiletype
    except:
        print ("Error validating URL.")
