In [12]:
# clear all the variables
%reset -f

import os
import IPython as ip
import logging

# notebook-level logger: let INFO messages and above through
logger = logging.getLogger(name='scrapy-tutorial')
logger.setLevel(level=logging.INFO)

In [8]:
# import the relevant libraries
import scrapy
import re
import json

from bs4 import BeautifulSoup
from scrapy.http import FormRequest, Request
from scrapy.crawler import CrawlerProcess 

# for debugging, uncomment the following line
#from IPython.core.debugger import Tracer as tr
# to call trace, tr()()

In [9]:
# spider that logs in to Kaggle and looks up the dataset download URL
class loginSpider(scrapy.Spider):
    """Log in to Kaggle, open a dataset page and request its archive.

    Callback chain: ``parse`` (submit the login form) -> ``after_login``
    (check the outcome, request the dataset page) -> ``connect_datasetPage``
    (extract the download URL, request it) -> ``savePdf`` (log the size of
    the downloaded payload).

    Class attributes:
        name:       spider name.
        website:    site root, with a trailing slash.
        start_urls: the login page, first URL fetched.
        login/pwd:  credentials posted to the login form. Read from the
                    KAGGLE_LOGIN / KAGGLE_PASSWORD environment variables
                    when set; otherwise the placeholder literals are used.
        path:       target directory (not used by any method in this class).
        result:     download URL found by ``connect_datasetPage``; stays
                    ``None`` until that callback succeeds. Stored on the
                    class so it can be read after the crawl has finished.

    ``debugPrint`` can be called from any callback to dump a response body
    to disk while debugging.
    """
    name = 'kaggle_login'
    website = 'https://www.kaggle.com/'
    start_urls = [website + 'account/login?']
    # Prefer credentials from the environment so real ones never have to be
    # written into the notebook; the defaults keep the previous values.
    login = os.environ.get('KAGGLE_LOGIN', 'login')
    pwd = os.environ.get('KAGGLE_PASSWORD', 'mypassword')
    path = 'C:\\Temp\\data\\'
    result = None

    def parse(self, response):
        """Submit the login form found in the start page.

        Yields a single ``FormRequest`` whose callback is ``after_login``.
        """
        token = response.css('input[name=__RequestVerificationToken]::attr(value)').extract_first()

        formdata = {
            'UserName': self.login,
            'Password': self.pwd,
        }
        # Only send the anti-forgery token when the page actually exposes
        # one; a None value must not be handed to the form encoder.
        if token is not None:
            formdata['__RequestVerificationToken'] = token

        yield FormRequest.from_response(response,
                                        formdata=formdata,
                                        callback=self.after_login)

    def after_login(self, response):
        """Check the login outcome and, on success, request the dataset page.

        Yields nothing when the failure message is present in the page.
        """
        # response.text is the decoded body, so this substring test works on
        # both Python 2 and Python 3 (response.body is bytes on Python 3).
        if 'The username or password provided is incorrect.' in response.text:
            logging.error('Login failed')
            return

        logging.info('Login succeed')

        # connect to the main dataset webpage
        yield Request(loginSpider.website + 'fivethirtyeight/uber-pickups-in-new-york-city',
                      callback=self.connect_datasetPage)

    def connect_datasetPage(self, response):
        """Extract the archive download URL from the dataset page.

        The URL is read from the JSON argument of a ``Kaggle.State.push(...)``
        call located in the element following the ``DatasetContainer`` div.
        On success the URL is stored in ``loginSpider.result`` and a request
        for it is yielded with ``savePdf`` as callback. When the expected
        markup is missing, an error is logged and nothing is yielded.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        containers = soup.find_all('div', {'data-component-name': 'DatasetContainer'})
        if not containers:
            logging.error('DatasetContainer element not found in %s', response.url)
            return

        # u'{0}'.format(...) yields text on Python 2 and 3 alike
        # (the `unicode` builtin only exists on Python 2).
        script_text = u'{0}'.format(containers[0].next.contents[0])

        # the state push is expected in the third ';'-separated statement
        statements = script_text.split(';')
        if len(statements) < 3:
            logging.error('Unexpected script layout in %s', response.url)
            return

        # dots escaped so that only the literal call name matches
        pattern = re.compile(r'Kaggle\.State\.push\((.*)\)')
        matches = pattern.findall(statements[2])
        if not matches:
            logging.error('Kaggle.State.push(...) not found in %s', response.url)
            return
        state = json.loads(matches[0])

        # website ends with '/', so drop it before appending the URL path
        url = loginSpider.website[:-1] + state['downloadUrl']
        loginSpider.result = url

        yield Request(url, callback=self.savePdf)

    def savePdf(self, response):
        """Log the size in bytes of the downloaded archive.

        The payload is only measured here; it is not written to disk.
        """
        logging.info('zip file retrieved (size: {0})'.format(len(response.body)))

    # for debugging
    def debugPrint(self, message, filename='debug', append=False):
        """Dump ``message.body`` to ``C:\\Temp\\<filename>.txt``.

        Args:
            message:  object exposing a bytes ``body`` attribute (a response).
            filename: file name without extension.
            append:   append to the file when True, overwrite it otherwise.
        """
        # body is bytes, so the file is opened in binary mode in both cases
        mode = 'ab' if append else 'wb'

        with open('C:\\Temp\\' + filename + '.txt', mode) as f:
            f.write(message.body)

In [None]:
# set the proxy ("ip:port" are placeholders to replace with the real proxy
# address before running)
os.environ["http_proxy"] = "http://ip:port"
os.environ["https_proxy"] = "https://ip:port"

# set the logging policy
logger = logging.getLogger('scrapy-loadZip')
logger.setLevel(logging.INFO)

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

# crawl() takes the spider *class*; the crawler builds the instance itself
process.crawl(loginSpider)
# blocks until the crawl is finished
process.start(stop_after_crawl=True)

# getattr: the attribute may be absent from the class, in which case None
# is reported instead of raising AttributeError
print('repo found: {0}'.format(getattr(loginSpider, 'result', None)))

In [12]:
# terminate the kernel
# stop the running kernel without restarting it
# NOTE(review): presumably needed because the crawl cannot be started twice
# in the same kernel — confirm
app = ip.Application.instance()
kernel = app.kernel
kernel.do_shutdown(restart=False)

{'restart': False, 'status': 'ok'}