In [3]:
import pandas as pd
import re
import requests
from lxml import html
import sys

In [4]:
class Scrapper:
    """Small crawler that collects article data from one site into a DataFrame.

    Starting from a seed URL, pages are fetched with ``requests``. Every URL
    found in a page's text that starts with ``source`` is queued for a later
    pass. Each fetched page whose URL matches ``link_pattern`` has three XPath
    queries evaluated against it, and the three results become one row of
    ``df``.

    Attributes:
        source: URL prefix that a link must start with to be followed.
        link_pattern: Regular expression; a page is scraped for data only if
            its URL matches (``re.search``).
        xpath: Sequence of three XPath expressions, evaluated in order and
            stored in the first, second and third column of ``df``.
        df: DataFrame holding the rows collected so far.
        max_while: Number of crawl passes that may end without discovering a
            new link before the crawl gives up.
        max_links: Row threshold; the crawl stops once ``df`` holds *more
            than* this many rows.
        link_storage: Every link discovered so far, in discovery order.
        link_history: Every link for which a download was attempted.
    """

    def __init__(self, source, link_pattern, xpath, df, max_while = 4, max_links = 20):
        """Store the crawl configuration and start with empty link lists."""
        self.source = source
        self.link_pattern = link_pattern
        self.xpath = xpath
        self.df = df
        self.max_while = max_while
        self.max_links = max_links
        self.link_storage = []
        self.link_history = []

    def _limit_reached(self):
        """Return True once ``df`` holds more than ``max_links`` rows."""
        return self.df.shape[0] > self.max_links

    def scrape(self, seed_link):
        """Crawl outwards from ``seed_link`` and return the collected DataFrame.

        Each pass walks a snapshot of ``link_storage`` and downloads every
        link not yet in ``link_history``. The crawl ends when the row limit
        is exceeded or when more than ``max_while`` passes have finished
        without ``link_storage`` growing.

        A link whose download raises ``requests.RequestException`` is reported
        and skipped, so one unreachable page does not discard the rows already
        collected.

        Args:
            seed_link: URL of the first page to download.

        Returns:
            ``self.df`` with one row per scraped page.
        """
        self.link_storage.append(seed_link)
        print('Put link into seed link storage')
        i = 0
        while True:
            links_list_length = len(self.link_storage)
            print('Entered while cycle')
            # Iterate over a snapshot: extract_links() appends to
            # link_storage while this loop is running.
            for link in self.link_storage.copy():
                print('Entered for cycle')
                if link not in self.link_history:
                    try:
                        seed_page = self.load(link)
                    except requests.RequestException as error:
                        seed_page = None
                        print('Failed to load: ' + link + ' (' + str(error) + ')')
                    print('Tried to load: ' + link)
                    # Recorded even on failure so the link is never retried.
                    self.link_history.append(link)
                    # A requests.Response is falsy for 4xx/5xx status codes.
                    if seed_page:
                        print('Loaded: ' + link)
                        if self.is_needed(link):
                            print('Link is needed: ' + link)
                            self.extract_info(seed_page)
                            print('Info extracted')
                        self.extract_links(seed_page)
                        print('Links extracted')
                if self._limit_reached():
                    break
            # Count passes that discovered nothing new.
            if links_list_length == len(self.link_storage):
                i += 1
            if self._limit_reached() or i > self.max_while:
                break
        return self.df

    def extract_links(self, page):
        """Queue every not-yet-known link in ``page.text`` that starts with ``source``.

        ``source`` is treated as literal text (it is regex-escaped), followed
        by one or more of the characters ``a-z 0-9 - _ . /``.

        Args:
            page: Object exposing the page body as a ``text`` string.
        """
        pattern = re.escape(self.source) + r'[a-z0-9\-_./]+'
        for link in re.findall(pattern, page.text):
            if link not in self.link_storage:
                self.link_storage.append(link)

    def load(self, link, timeout=10):
        """Download ``link`` and return the ``requests`` response.

        Args:
            link: URL to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        return requests.get(link, timeout=timeout)

    def extract_info(self, page):
        """Evaluate the three XPath queries on ``page`` and append them to ``df``.

        Args:
            page: Response-like object exposing the raw body as ``content``.

        Returns:
            Tuple ``(result, body_intro, image_url)`` with the raw results of
            ``xpath[0]``, ``xpath[1]`` and ``xpath[2]`` respectively.
        """
        # errors='replace' keeps one stray byte from aborting the whole page.
        tree = html.fromstring(page.content.decode('UTF-8', errors='replace'))
        result = tree.xpath(self.xpath[0])
        body_intro = tree.xpath(self.xpath[1])
        image_url = tree.xpath(self.xpath[2])
        row = pd.DataFrame([[result, body_intro, image_url]], columns=list(self.df.columns))
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        self.df = pd.concat([self.df, row], ignore_index=True)
        return result, body_intro, image_url

    def is_needed(self, link):
        """Return True if ``link`` matches ``link_pattern`` (``re.search``)."""
        return bool(re.search(self.link_pattern, link))

In [5]:
# Empty result table. Column order matters: extract_info() stores the result
# of xpath[0], xpath[1] and xpath[2] in the first, second and third column.
df = pd.DataFrame(columns=['title', 'body intro', 'image url'])

# NOTE: despite the variable name, this scraper is configured for BBC World news.
scrapper_kloop = Scrapper(
    source='https://www.bbc.co.uk/news/world',
    link_pattern='/news/world',
    xpath=[
        '//h1/text()',
        "//div[@class='story-body']/div[@class='story-body__inner']/p[@class='story-body__introduction']/text()",
        "//span[@class='image-and-copyright-container']/img[@class='js-image-replace']/@src",
    ],
    df=df,
)

In [6]:
# Run the crawl from a single seed article and display the resulting table.
# NOTE(review): the seed is on bbc.com while the scraper's `source` prefix is
# bbc.co.uk, so the seed article is fetched a second time under its .co.uk
# URL -- in the recorded output it appears as two identical-looking rows
# (0 and 1). Consider seeding with the .co.uk URL.
scrapper_kloop.scrape('https://www.bbc.com/news/world-middle-east-46087058')

Put link into seed link storage
Entered while cycle
Entered for cycle
Tried to load: https://www.bbc.com/news/world-middle-east-46087058
Loaded: https://www.bbc.com/news/world-middle-east-46087058
Link is needed: https://www.bbc.com/news/world-middle-east-46087058
Info extracted
Links extracted
Entered while cycle
Entered for cycle
Entered for cycle
Tried to load: https://www.bbc.co.uk/news/world-middle-east-46087058
Loaded: https://www.bbc.co.uk/news/world-middle-east-46087058
Link is needed: https://www.bbc.co.uk/news/world-middle-east-46087058
Info extracted
Links extracted
Entered for cycle
Tried to load: https://www.bbc.co.uk/news/world-middle-east-46077894
Loaded: https://www.bbc.co.uk/news/world-middle-east-46077894
Link is needed: https://www.bbc.co.uk/news/world-middle-east-46077894
Info extracted
Links extracted
Entered for cycle
Tried to load: https://www.bbc.co.uk/news/world-europe-45812399
Loaded: https://www.bbc.co.uk/news/world-europe-45812399
Link is needed: https://www

Unnamed: 0,title,body intro,image url
0,[Saudi Arabia frees Prince Khaled bin Talal af...,[A Saudi prince arrested for criticising a cra...,[https://ichef.bbci.co.uk/news/320/cpsprodpb/1...
1,[Saudi Arabia frees Prince Khaled bin Talal af...,[A Saudi prince arrested for criticising a cra...,[https://ichef.bbci.co.uk/news/320/cpsprodpb/1...
2,[Khashoggi murder: Turkish leader blames Saudi...,[Turkish President Recep Tayyip Erdogan has fo...,[]
3,[Jamal Khashoggi: All you need to know about S...,"[On 2 October, Jamal Khashoggi, a well-known j...",[]
4,[Khashoggi murder: Is Saudi Crown Prince Moham...,"[""He's toast"". ""He's toxic"". ""He's my hero"". ""...",[https://ichef.bbci.co.uk/news/320/cpsprodpb/1...
5,[Khashoggi murder: Body 'dissolved in acid'],"[A top Turkish official, presidential adviser ...",[]
6,"[Khashoggi: Saudi Arabia to try suspects, fore...",[Saudi Arabia's foreign minister has said the ...,[https://ichef.bbci.co.uk/news/320/cpsprodpb/9...
7,[Jamal Khashoggi: Who is murdered Saudi Journa...,[Jamal Khashoggi - a well-known Saudi journali...,[https://ichef.bbci.co.uk/news/320/cpsprodpb/6...
8,[Jamal Khashoggi: Saudi journalist in his own ...,[Saudi journalist Jamal Khashoggi disappeared ...,[]
9,"[Saudi Crown Prince Mohammed bin Salman, power...",[Few people outside Saudi Arabia had heard of ...,[https://ichef.bbci.co.uk/news/320/cpsprodpb/E...


In [None]:
# Fetch one article directly, bypassing the crawl loop in scrape().
page = scrapper_kloop.load('https://www.bbc.com/news/world-middle-east-46087058')

In [None]:
# Run the three XPath queries on the fetched page.
# Side effect: extract_info() also appends the result as a new row to
# scrapper_kloop.df, so every re-run of this cell adds another row.
info = scrapper_kloop.extract_info(page)

In [None]:
# Inspect the table currently held by the scraper.
scrapper_kloop.df

In [None]:
# Show the tuple returned by extract_info(): the results of xpath[0],
# xpath[1] and xpath[2], in that order.
print(info)