In [1]:
import requests
from lxml import html

from datetime import datetime as dt, timedelta as td
import locale
# Switch the process locale to Russian; the Lenta parser below relies on
# strptime's locale-dependent %B directive to read month names.
# NOTE(review): presumably the scraped month names are Russian - confirm.
locale.setlocale(locale.LC_ALL, 'ru_RU')

from sqlalchemy import create_engine
from sqlalchemy import Column, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

In [2]:
class Parser:
    """Base scraper: download one page and keep its parsed HTML tree.

    Attributes:
        main_link: the URL passed to the constructor; subclasses prepend it
            to relative links.
        headers: HTTP headers sent with every request (a browser-like
            ``User-agent``).
        root: the lxml element tree built from the response body.
    """

    def __init__(self, url, timeout=10):
        """Fetch ``url`` and parse the response body.

        Args:
            url: address of the page to download.
            timeout: seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Raises:
            requests.RequestException: on connection problems, timeouts or
                an HTTP error status (4xx/5xx).
        """
        self.main_link = url
        self.headers = {'User-agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail loudly instead of parsing an error page as if it were news.
        response.raise_for_status()
        self.root = html.fromstring(response.text)

In [3]:
class Yandex(Parser):
    """Scraper for the story list of the Yandex news page."""

    def parser(self):
        """Build one record per story found on the page.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``source`` and
            ``date`` as produced by ``zipper``.
        """
        story_titles = self.root.xpath('//h2[@class="story__title"]/a/text()')
        story_hrefs = self.root.xpath('//h2[@class="story__title"]/a/@href')
        # Each stamp carries both the source name and the publication time.
        story_stamps = self.root.xpath('//div[@class="story__date"]/text()')

        def to_url(href):
            # Drop the query string; site-relative links get the host prefix.
            bare = href.split('?')[0]
            if href.startswith('https'):
                return bare
            return 'https://yandex.ru' + bare

        def to_source(stamp):
            # The last word is the time; a stamp containing 'вчера' loses
            # its last three words instead.
            trailing = 3 if 'вчера' in stamp else 1
            return ' '.join(stamp.split()[:-trailing])

        def to_date(stamp):
            moment = dt.now()
            if 'вчера' in stamp:
                moment = moment - td(days=1)
            # The final five characters of the stamp are used as the time.
            return str(moment.date()) + ' ' + stamp[-5:]

        rows = zip(story_titles,
                   map(to_url, story_hrefs),
                   map(to_date, story_stamps))
        # zipper pulls per-row sources with next(), so hand it an iterator.
        return zipper(rows, source=map(to_source, story_stamps))
    
    

In [4]:
class Lenta(Parser):
    """Scraper for the Lenta.ru front page.

    Three groups of links are collected: the single ``first-item`` block,
    the ``item`` blocks and the ``titles`` blocks. Every record is tagged
    with the source ``'Lenta.ru'``.

    Month names are parsed with strptime's locale-dependent ``%B``, so the
    process locale must match the language of the page.
    """

    @staticmethod
    def _stamp(raw):
        """Convert a ``time/@datetime`` value to ``'YYYY-MM-DD HH:MM'``."""
        # str(datetime) ends in ':SS'; the slice removes the seconds.
        return str(dt.strptime(raw, ' %H:%M,  %d %B %Y'))[:-3]

    @staticmethod
    def _footer_date(label, today):
        """Resolve a year-less footer date label to an ISO date string.

        Args:
            label: either ``'Сегодня'`` or a ``' <day> <month>'`` string.
            today: ``datetime.date`` used as the reference day.

        Returns:
            ``'YYYY-MM-DD'``.

        Raises:
            ValueError: if the label cannot be parsed.
        """
        if label == 'Сегодня':
            return str(today)

        # The label has no year. Parse it together with a candidate year
        # (rather than letting strptime default to 1900, which rejects
        # 29 February) and take the most recent year that does not put the
        # story in the future. One day of slack absorbs a time-zone
        # difference between the site and this machine.
        latest_allowed = today + td(days=1)
        last_error = None
        for year in (today.year, today.year - 1):
            try:
                parsed = dt.strptime(f'{label} {year}', ' %d %B %Y').date()
            except ValueError as error:
                last_error = error
                continue
            if parsed <= latest_allowed:
                return str(parsed)
        raise ValueError(f'cannot resolve footer date {label!r}') from last_error

    def get_head(self):
        """Return a one-element list with the ``first-item`` story.

        Raises:
            IndexError: if the page has no ``first-item`` block.
        """
        anchor = '//div[@class="first-item"]/h2/a'
        return [{
            'title': self.root.xpath(anchor + '/text()')[0].replace('\xa0', ' '),
            'url': self.main_link + self.root.xpath(anchor + '/@href')[0],
            'source': 'Lenta.ru',
            'date': self._stamp(self.root.xpath(anchor + '/time/@datetime')[0]),
        }]

    def get_body(self):
        """Return the records for every ``item`` block."""
        anchor = '//div[@class="item"]/a'
        body_zip = zip(
            (t.replace('\xa0', ' ') for t in self.root.xpath(anchor + '/text()')),
            (self.main_link + l for l in self.root.xpath(anchor + '/@href')),
            (self._stamp(t) for t in self.root.xpath(anchor + '/time/@datetime')),
        )
        return zipper(body_zip, source='Lenta.ru')

    def get_footer(self):
        """Return the records for the ``titles`` blocks.

        The date of each record is the resolved day label joined with the
        matching ``time`` text.
        """
        today = dt.now().date()
        titles = (t.replace('\xa0', ' ') for t
                  in self.root.xpath('//div[@class="titles"]/h3/a/span/text()'))
        # NOTE(review): links containing 'https' are dropped here but the
        # titles and dates are not filtered, so an external link would shift
        # every later title/url pair - confirm against the page markup.
        links = (self.main_link + t for t
                 in self.root.xpath('//div[@class="titles"]/h3/a/@href')
                 if 'https' not in t)
        days = (self._footer_date(d, today) for d
                in self.root.xpath('//span[@class="g-date item__date"]/text()'))
        times = self.root.xpath('//span[@class="time"]/text()')
        stamps = (' '.join(pair) for pair in zip(days, times))

        return zipper(zip(titles, links, stamps), source='Lenta.ru')

    def parser(self):
        """Return head, body and footer records as one list."""
        head = self.get_head()
        body = self.get_body()
        footer = self.get_footer()
        return head + body + footer

In [5]:
class Mail(Parser):
    """Scraper for the news.mail.ru front page.

    The front page supplies titles and links only; the publication date and
    the source name are read from each linked article page.
    """

    # Seconds to wait for each article page; without a timeout a single
    # stalled connection would block the whole scrape.
    REQUEST_TIMEOUT = 10

    def _absolute(self, link):
        """Prefix site-relative links with the front page URL."""
        return self.main_link + link if 'https' not in link else link

    def dive_into(self, links):
        """Download every article and expose its date and source.

        Args:
            links: absolute article URLs.

        Returns:
            A ``(dates, sources)`` pair of generators, one value per link in
            the same order. Dates look like ``'YYYY-MM-DD HH:MM'`` assuming
            the ``@datetime`` attribute is ISO formatted with an offset -
            TODO confirm.

        Raises:
            requests.RequestException: on connection problems, timeouts or
                an HTTP error status.
            IndexError: raised lazily, while the generators are consumed, if
                an article lacks the expected date or source element.
        """
        roots = []
        for link in links:
            response = requests.get(link, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            roots.append(html.fromstring(response.text))

        # Drop the last nine characters of the attribute and replace the
        # 'T' separator with a space.
        dates = (r.xpath('//span[@class="note"]/span/@datetime')[0][:-9].replace('T', ' ')
                 for r in roots)
        sources = (r.xpath('//span/span/a/span[@class="link__text"]/text()')[0]
                   for r in roots)
        return dates, sources

    def get_head(self):
        """Return records for the first five photo links."""
        titles = (t.replace('\xa0', ' ') for t
                  in self.root.xpath('//span[@class="photo__captions"]/span[1]/text()'))
        links = [self._absolute(l) for l
                 in self.root.xpath('//div[@class="photo__inner"]/../@href')[:5]]
        dates, sources = self.dive_into(links)
        return zipper(zip(titles, links, dates), source=sources)

    def get_body(self):
        """Return records for the two text lists below the photo block."""
        titles_upper = self.root.xpath('//div[@class="cols__inner"]/div/span/a/span/text()')
        titles_lower = self.root.xpath('//span[@class="list__text"]/a/span/text()')
        titles = [t.replace('\xa0', ' ') for t in titles_upper + titles_lower]

        links_upper = self.root.xpath('//a[@class="newsitem__title link-holder"]/@href')
        links_lower = self.root.xpath('//span[@class="list__text"]/a/@href')
        # Same rule as get_head: only relative links get the site prefix.
        links = [self._absolute(l) for l in links_upper + links_lower]

        dates, sources = self.dive_into(links)
        return zipper(zip(titles, links, dates), source=sources)

    def parser(self):
        """Return head and body records as one list."""
        head = self.get_head()
        body = self.get_body()
        return head + body

In [6]:
def zipper(columns, source):
    """Turn ``(title, url, date)`` rows into a list of news records.

    Args:
        columns: iterable of 3-item sequences ``(title, url, date)``.
        source: either one string shared by every record, or an iterable
            (list, generator, ...) yielding one source name per row.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``source`` and
        ``date``, in that order.

    Raises:
        ValueError: if ``source`` is an iterable that runs out before
            ``columns`` does.
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would wrongly treat as per-row iterables.
    shared = isinstance(source, str)
    # iter() lets callers pass a plain list as well as a generator.
    per_row = None if shared else iter(source)

    records = []
    for unit in columns:
        if shared:
            name = source
        else:
            try:
                name = next(per_row)
            except StopIteration:
                # A bare StopIteration is easy to misread as normal loop
                # termination; report the length mismatch explicitly.
                raise ValueError('source iterable is shorter than columns') from None
        records.append({
            'title': unit[0],
            'url': unit[1],
            'source': name,
            'date': unit[2]})
    return records

In [7]:
class Connector:
    """Small persistence layer storing news records in a SQL table.

    The mapped class and table are called ``Vacancy`` / ``vacancies_table``
    even though the rows are news items; the names are kept so that an
    already existing table still matches.

    Attributes:
        db: the SQLAlchemy engine.
        table: the mapped class (columns ``title`` (primary key), ``url``,
            ``source``, ``date``; all strings).
        session: the session used for every query.
    """

    def __init__(self, db_string="postgresql://localhost:5432/vacancies"):
        """Create the engine, the mapped class, a session and the table.

        Args:
            db_string: SQLAlchemy database URL. The default uses the
                ``postgresql://`` scheme because SQLAlchemy 1.4 removed the
                old ``postgres://`` alias.
        """
        self.db = create_engine(db_string)
        base = declarative_base()

        class Vacancy(base):
            __tablename__ = 'vacancies_table'
            title = Column(String, primary_key=True)
            url = Column(String)
            source = Column(String)
            date = Column(String)

        self.table = Vacancy
        session = sessionmaker(self.db)
        self.session = session()
        # Creates the table only when it does not exist yet.
        base.metadata.create_all(self.db)

    def insert(self, news):
        """Add records whose title is not stored yet and commit.

        Args:
            news: iterable of dicts with the keys ``title``, ``url``,
                ``source`` and ``date``.

        Prints one line per skipped duplicate, then the number of added rows
        and the table size. On any error the transaction is rolled back, so
        the session remains usable, and the error is re-raised.
        """
        added = 0
        try:
            for item in news:
                duple = self.session.query(self.table).get(item['title'])
                if not duple:
                    vacancy_item = self.table(
                        title=item['title'],
                        url=item['url'],
                        source=item['source'],
                        date=item['date'])

                    self.session.add(vacancy_item)
                    added += 1
                else:
                    print(f'"{duple.title}" already exists, pass\n')

            self.session.commit()
        except Exception:
            # A failed flush/commit leaves the session in an invalid
            # transaction; roll back so later cells can keep using it.
            self.session.rollback()
            raise

        print(f'{added} items added')
        print(f'{self.session.query(self.table).count()} items total')

    def head(self, num):
        """Print at most ``num`` stored records (no explicit ordering)."""
        vacancies = self.session.query(self.table).limit(num)
        for v in vacancies:
            print(f'Title: {v.title}')
            print(f'Date: {v.date}')
            print(f'Source: {v.source}')
            print(f'Url: {v.url}')
            print()

    def clear(self):
        """Drop the table, deleting every stored record."""
        self.table.__table__.drop(self.db)

In [8]:
# Connect to the database; Connector.__init__ also creates the table if needed.
connector = Connector()

In [9]:
# Download the Yandex news page, parse its stories and store the new ones.
yandex = Yandex(url='https://yandex.ru/news')
news = yandex.parser()

connector.insert(news)

65 items added
65 items total


In [10]:
# Download the Lenta.ru front page, parse its stories and store the new ones.
# Titles already present in the table are reported and skipped by insert().
lenta = Lenta(url='https://lenta.ru')
news = lenta.parser()

connector.insert(news)

"Российский «Лидер» оказался аутсайдером" already exists, pass

"Названы вакансии с зарплатой 800 тысяч рублей" already exists, pass

79 items added
144 items total


In [11]:
# Download the news.mail.ru front page plus every linked article (one extra
# request per story), then store the new stories.
mail = Mail(url='https://news.mail.ru')
news = mail.parser()

connector.insert(news)

"Пилот Коби Брайанта был в секундах от спасения вертолета" already exists, pass

26 items added
170 items total


In [12]:
# Print one stored record (title, date, source, url) as a sanity check.
connector.head(1)

Title: В Казахстане восемь человек погибли в массовой драке
Date: 2020-02-08 18:04
Source: РИА Новости
Url: https://yandex.ru/news/story/V_Kazakhstane_vosem_chelovek_pogibli_v_massovoj_drake--e2a63ed10fcce2c9c7dda20585b3ccc7

