In [31]:
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine


Base = declarative_base()


class Lerua(Base):
    __tablename__ = 'lerua'
    id = Column(Integer, primary_key=True)
    title = Column(String(200), nullable=False)
    link = Column(String(400), nullable=False)
    price = Column(Integer, nullable=True)
    photos = Column(String(400), nullable=False)
    feature = Column(Text, nullable=False)


if __name__ == '__main__':
    engine = create_engine('sqlite:///leruaDB.db', echo=True)
    Base.metadata.create_all(engine)

2020-11-15 17:44:19,680 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 17:44:19,682 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 17:44:19,684 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 17:44:19,685 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 17:44:19,689 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("lerua")
2020-11-15 17:44:19,691 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 17:44:19,694 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("lerua")
2020-11-15 17:44:19,696 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 17:44:19,698 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE lerua (
	id INTEGER NOT NULL, 
	title VARCHAR(150) NOT NULL, 
	link VARCHAR(500) NOT NULL, 
	price INTEGER, 
	photos VARCHAR(500) NOT NULL, 
	feature TEXT NOT NULL, 
	PRIMARY KEY (id)
)


2020-11-15 17:44:19,699 INFO sqlalchemy.engine.base.Engine ()
2020-11-1

In [32]:
BOT_NAME = 'lerua'

SPIDER_MODULES = ['lerua.spider']
NEWSPIDER_MODULE = 'lerua.spider'

IMAGES_STORE = 'photo'

LOG_ENABLED = True
LOG_LEVEL = 'DEBUG'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 YaBrowser/20.9.3.126 Yowser/2.5 Safari/537.36'


ROBOTSTXT_OBEY = False


CONCURRENT_REQUESTS = 8

DOWNLOAD_DELAY = 2

COOKIES_ENABLED = True


ITEM_PIPELINES = {
   'lerua.pipelines.DataBasePipeline': 300,
   'lerua.pipelines.LeruaPhotosPipeline': 200,
}

In [33]:
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst


def int_price(value):
    return int(value.replace(' ', ''))


def f_preproc(value):
    #print(value.xpath('./div'))
    f_dict = {}
    for f in value.xpath('./div'):
        key = f.xpath('.//dt/text()').extract_first()
        value = f.xpath('.//dd/text()').extract_first().replace('\n', '').replace(' ', '')

        f_dict[key] = value
    return f_dict


class LeruaItem(scrapy.Item):
    title = scrapy.Field(output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst(), input_processor=MapCompose(int_price))
    photo_links = scrapy.Field()
    feature = scrapy.Field(input_processor=MapCompose(f_preproc))



In [34]:

class LeruaSpiderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class LeruaDownloaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

In [35]:
class LeroymerlinSpider(scrapy.Spider):
    name = 'leroymerlin'
    allowed_domains = ['leroymerlin.ru']

    def __init__(self, search):
        self.start_urls = [f'https://leroymerlin.ru/search/?q={search}']

    def parse(self, response):
        next_page = response.xpath("//a[@class='paginator-button next-paginator-button']/@href").extract_first()
        goods_links = response.xpath('//a[@class="plp-item__info__title"]')
        for link in goods_links:
            yield response.follow(link, callback=self.parse_goods)
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_goods(self, response: HtmlResponse):
        loader = ItemLoader(item=LeruaItem(), response=response)
        loader.add_xpath('title', '//h1[@class="header-2"]/text()')
        loader.add_value('link', response.url)
        loader.add_xpath('price', '//span[@slot="price"]/text()')
        loader.add_xpath('photo_links', '//uc-pdp-media-carousel[@slot="media-content"]/picture/source[1]/@data-origin')
        loader.add_value('feature', response.xpath('//uc-pdp-section-vlimited/dl'))
        yield loader.load_item()

In [36]:


class DataBasePipeline:
    def __init__(self):
        engine = create_engine('sqlite:///leruaDB.db', echo=True)
        Base.metadata.bind = engine
        DBSession = sessionmaker(bind=engine)
        self.session = DBSession()
    def process_item(self, item, spider):
        new_vacancy = Lerua(
            title=item['title'],
            link=item['link'],
            price=item['price'],
            photos=r'/photo/' + item['title'],
            feature=str(item['feature'])
        )
        try:
            self.session.add(new_vacancy)
            self.session.commit()
        except IntegrityError:
            print('Проблема с загрузкой Данных!')
            self.session.rollback()
        return item

    def __del__(self):
        self.session.close()


class LeruaPhotosPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        print('_____', item)
        if item['photo_links']:
            for img_link in item['photo_links']:
                try:
                    yield scrapy.Request(img_link, meta=item)
                except Exception as e:
                    print(e)

    def file_path(self, request, response=None, info=None):
        item = request.meta
        name = request.url.split('/')[-1]
        return f"/{item['title']}/{name}"

    def item_completed(self, results, item, info):
        if results:
            item['photo_links'] = [itm[1] for itm in results if itm[0]]
        return item


In [37]:
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)

    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(LeroymerlinSpider, search='диван')

    process.start()

NameError: name 'settings' is not defined