In [2]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search
import pdfplumber
from IPython.display import display, HTML, Javascript
import ipywidgets as widgets
from selenium import webdriver
import webbrowser
from IPython.display import clear_output
import os, pickle

In [3]:
class ScrapeData():
    """
    Python tool to scrap text data from different languages. 3 options available:
    
    - scrape from link: provided with a link, the tool scrapes through the text body from the webpage 
      corresponding to the link.
      
    - scrape from keyword: provided with a keyword, the tool performs a google search and retrieves text from 
      a priority domain webpage(eg: wikipedia) or top google search result.
      
    - scrape from document: provided with a pdf document, the tools accesses the text using pdfplumber 
      python package. Note that some portion of the text may not be returned properly.
    """
    
    def __init__(self):
        """
        Initialise parameters for google search and priority website 
        """
        
        self.parser = 'html.parser'
        self.tld = 'co.in'
        self.search_num = 10
        self.search_stop = 10
        self.pause = 2
        self.search_priority = 'wikipedia'
        self.auto_return_index = 0
    
    def read_from_link(self, link, replace_list=['\n']):
        """
        This function accesses the text content from a webpage link using beautiful soup. To clean the text, 
        provide the list of charecters to be removed in replace_list.
        """
        page = requests.get(link)
        soup = BeautifulSoup(page.content, self.parser)
        data = []
        for i in range(len(soup.find_all('p'))):
            text = soup.find_all('p')[i].get_text()
            for j in range(len(replace_list)):
                text = text.replace(replace_list[j],'')
            if len(text)>0:
                data.append(text)
        return data
    
    def google_search(self, search_keyword, priority=None):
        """
        This function performs google search on the input keyword. Priority can be provided to a particular 
        website (Eg: wikipedia) 
        """
        search_links = []
        for link in search(search_keyword, 
                           tld=self.tld, 
                           num=self.search_num, 
                           stop=self.search_stop, 
                           pause=self.pause):
            search_links.append(link)
            
            if priority is not None:
                if self.search_priority in link:
                    return link
        return search_links[self.auto_return_index]
                
    
    def read_from_doc(self, document):
        """
        This function extracts text from pdf using pdfplumber tool.
        """
        data = []
        with pdfplumber.open(document) as pdf:
            for i in range(len(pdf.pages)):
                page = pdf.pages[i]
                text = page.extract_text()
                if text is not None:
                    text = text.replace('\n', ' ')
                    data.append(text) 
        return data
    
    def read_page(self, search_keyword=None, link=None, document=None):
        if search_keyword == link == document == None:
            raise Exception('Provide link, keyword or document to scrape from')
            
        if search_keyword is not None:
            keyword_link = self.google_search(search_keyword, priority=self.search_priority)
            text = self.read_from_link(keyword_link)
            return(text)
            
        if link:
            text = self.read_from_link(link)
            return(text)
                
        if document:
            text = self.read_from_doc(document)
            print(f'{len(text)} pages found')
            return(text)
            
scrape_tool = ScrapeData()

<h4>Read from english website</h4>

In [None]:
scrape_tool.read_page(link='https://en.wikipedia.org/wiki/Agriculture')

<h4>Read from english keyword</h4>

In [None]:
scrape_tool.read_page(search_keyword='agriculture')

<h4>Read from english pdf document</h4>

In [None]:
scrape_tool.read_page(document='document_name.pdf')

---

<h3>Read from list of links with GUI</h3>

In [None]:
class ActiveScrappingGUI():
    def __init__(self, links):
        if not isinstance(links, list):
            raise Exception('link input shoudl be present as a python list')
        self.linkIdx = 0
        self.links = links
        self.approvedList = {}
        self.storedText = {}
        self.defaultColor  = '#EEEEEE'
        self.approvedColor = 'lightgreen'
        self.rejectedColor = '#FF4500'
        self.nextButton = widgets.Button(description = 'Next')
        self.approve = widgets.Button(description = 'Approve', disabled=True)
        self.reject = widgets.Button(description = 'Reject', disabled=True)
        self.nextButton.add_class("red_label")
        self.approve.add_class("red_label")
        self.reject.add_class("red_label")
        self.output = widgets.Output()
        display(widgets.HBox((self.nextButton, self.approve,self.reject )), self.output, 
        HTML("<style>.red_label { font-weight: bold}</style>"),
        HTML("<style>.red_label { font-family:calibri}</style>"),
        HTML("<style>.red_label { font-size:16px}</style>"))
        
    def on_button_clicked_approve(self, b):
        with self.output:
            self.nextButton.disabled = False
            self.approve.style.button_color = self.approvedColor
            self.reject.style.button_color = self.defaultColor
            self.approvedList[self.links[self.linkIdx-1]] = True
                
    def on_button_clicked_reject(self, b):
        with self.output:
            self.nextButton.disabled = False
            self.approve.style.button_color = self.defaultColor
            self.reject.style.button_color = self.rejectedColor
            self.approvedList[self.links[self.linkIdx-1]] = False
                
    def on_button_clicked(self, b):
        with self.output:
            if  self.linkIdx == len(self.links):
                clear_output()
                print('All links visited. Approved link can be accessed with "gui.approvedList"')
                self.reject.disabled = True
                self.approve.disabled = True
                self.nextButton.disabled = True
                self.reject.style.button_color = self.defaultColor
                self.approve.style.button_color = self.defaultColor
            else:
                clear_output()
                self.reject.disabled = True
                self.approve.disabled = True
                self.nextButton.disabled = True
                self.reject.style.button_color = self.defaultColor
                self.approve.style.button_color = self.defaultColor
                print('Extracting..')
                text = scrape_tool.read_page(self.links[self.linkIdx])
                clear_output()
                self.storedText[self.links[self.linkIdx]] = text
                for para in text:
                    print(para, '\n')
                webbrowser.open(self.links[self.linkIdx])
                self.reject.disabled = False
                self.approve.disabled = False
                self.linkIdx += 1
            
    def start(self):
        self.approve.on_click(self.on_button_clicked_approve)
        self.reject.on_click(self.on_button_clicked_reject)
        self.nextButton.on_click(self.on_button_clicked)
    
    def save(self, path):
        savedSet = set()
        for key, item in self.approvedList.items():
            if item == True:
                saveName = key.replace('/', '_')
                if saveName in savedSet:
                    raise Exception(f'{key} already saved, conflicitng links present. Report it to sathvikudupa66@gmail.com')
                else:
                    savedSet.add(saveName)
                try:
                    with open(os.path.join(path, saveName), 'w') as f:
                        f.writelines(self.storedText[key])
                except:
                    raise Exception(f'Unable to save extracted text from link {key}. Please verify if "{path}" exists')
    

In [None]:
links = ['https://en.wikipedia.org/wiki/Agriculture', 'https://www.bbc.com/hindi/india-56901831',
        'https://en.wikipedia.org/wiki/Main_Page']

#load links from csv into a list
gui = ActiveScrappingGUI(links)
gui.start()

In [None]:
gui.approvedList

In [None]:
#store this as .pickle if rejected links are processed at a later time.
linkStatusPath = 'status.pickle'
with open(linkStatusPath, 'wb') as handle:
    pickle.dump(gui.approvedList, handle, protocol=pickle.HIGHEST_PROTOCOL)

<b> Save approved text </b>

In [None]:
#provide your folder path
gui.save(path='saved')

---

<b>Example on how to proceed with rejected links (Work In Progress)<b>

In [4]:
class HandleRejects():
    def __init__(self, storedGUIStatusDict, openWebPage = True, ):
        self.approve = widgets.Button(description = 'Approve Changes')
        self.refresh = widgets.Button(description = 'Refresh Changes')
        self.discard = widgets.Button(description = 'Unable to process')
        self.output = widgets.Output()
        self.openWebPage = openWebPage
        self.defaultColor  = '#EEEEEE'
        self.approve.disabled = True
        self.discard.disabled = True
        self.approve.add_class("layout")
        self.refresh.add_class("layout")
        self.discard.add_class("layout")
        with open(linkStatusPath, 'rb') as handle:
            self.statusDct = pickle.load(handle)
        self.resolvedLinks = {}
        self.discardedLinks = {}
        self.initalVal = None
        self.initalVal = self.reportStatus()
        self.linkIdx = 0
        self.visitSet = set()
        self.linkFromStatus = [key for key, item in self.statusDct.items() if item == False]
        
    def reportStatus(self):
        if self.initalVal == None:
            toBeResolved = sum([1 if item == False else 0 for key, item in self.statusDct.items()])
            return toBeResolved
        else:
            self.remainingCount = self.initalVal - len(self.resolvedLinks) - len(self.discardedLinks)
            print(f'{self.remainingCount}/{self.initalVal} links left to be resolved')
    
    def on_button_clicked_approve(self, b):
        with self.output:
            self.approve.style.button_color = self.defaultColor
            self.resolvedLinks[self.currentLink] = self.currentText
            self.linkIdx += 1 
            clear_output()
            print(f'Text extracted from {self.currentLink} approved')
            self.reportStatus()
            self.discard.disabled = True
            self.approve.disabled = True
    
    def on_button_clicked_discard(self, b):
        with self.output:
            self.discard.style.button_color = self.defaultColor
            self.discardedLinks[self.currentLink] = self.currentText
            self.linkIdx += 1 
            clear_output()
            print(f'Text extracted from {self.currentLink} needs manual inspection. Unable to proceed with web scraping code')
            self.reportStatus()
            self.approve.disabled = True
            self.discard.disabled = True
            
    def on_button_clicked_refresh(self, b):
        with self.output:
            if len(self.linkFromStatus) == 0:
                raise Exception('No sentences dound in dict')
            self.approve.disabled = False
            self.discard.disabled = False
            self.reportStatus()
            print('Extracting..')
            if self.linkIdx >= len(self.linkFromStatus):
                clear_output()
                self.discard.disabled = True
                self.approve.disabled = True
                self.refresh.disabled = True
                raise Exception('All links visited. : )')
            self.currentLink = self.linkFromStatus[self.linkIdx]
            if self.currentLink not in self.resolvedLinks.keys():
                if self.currentLink not in self.visitSet:
                    self.visitSet.add(self.currentLink)
                    webbrowser.open(self.currentLink)
                self.currentText = self.readFn(self.currentLink)
                clear_output()
                for line in self.currentText:
                    print(line, '\n')
    
    def start(self, readFn):
        display(widgets.HBox((self.approve, self.refresh, self.discard)), self.output, 
        HTML("<style>.layout { font-weight: bold}</style>"),
        HTML("<style>.layout { font-family:calibri}</style>"),
        HTML("<style>.layout { font-size:16px}</style>"))
        self.readFn = readFn
        self.refresh.on_click(self.on_button_clicked_refresh)
        self.approve.on_click(self.on_button_clicked_approve)
        self.discard.on_click(self.on_button_clicked_discard)
    
    def save(self, path):
        savedSet = set()
        for key, item in self.resolvedLinks.items():
            saveName = key.replace('/', '_')
            if saveName in savedSet:
                raise Exception(f'{key} already saved, conflicitng links present. Report it to sathvikudupa66@gmail.com')
            else:
                savedSet.add(saveName)
            try:
                with open(os.path.join(path, saveName), 'w') as f:
                    f.writelines(item)
            except:
                raise Exception(f'Unable to save extracted text from link {key}. Please verify if "{path}" exists')
    def saveDiscarded(self, path):
        with open(path, 'w') as f:
            data = list(self.discardedLinks.keys())
            for line in data:
                f.write(line+'\n')
                
linkStatusPath = 'status.pickle'
handle = HandleRejects(storedGUIStatusDict=linkStatusPath);

In [7]:
def read_from_link_custom(link, replace_list=['\n']):  
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    data = []
    for i in range(len(soup.find_all('p'))):
        text = soup.find_all('p')[i].get_text()
        for j in range(len(replace_list)):
            text = text.replace(replace_list[j],'')
        if len(text)>0:
            data.append(text)
    return data  

handle.start(readFn=read_from_link_custom1)

HBox(children=(Button(description='Approve Changes', style=ButtonStyle(), _dom_classes=('layout',)), Button(de…

Output(outputs=({'output_type': 'stream', 'text': 'Agriculture \n\n\xa0China \n\n\xa0India \n\n\xa0European Un…

In [10]:
#resolved links with extracted text present here
handle.resolvedLinks.keys()
#links which need manual inspection present here
list(handle.discardedLinks.keys())

dict_keys(['https://en.wikipedia.org/wiki/Agriculture', 'https://en.wikipedia.org/wiki/Main_Page'])

In [11]:
handle.save('saveResolved')
handle.saveDiscarded('furtherInspection/day1links.txt')

---

<b>Read from hindi webpage</b>

In [None]:
# scrape_tool.read_page(link="https://hi.wikipedia.org/wiki/%E0%A4%95%E0%A5%83%E0%A4%B7%E0%A4%BF")
scrape_tool.read_page(link="https://www.bbc.com/hindi/india-56901831")

<b>Read from hindi document</b>

In [None]:
scrape_tool.read_page(document='RedRidingHood-H-2mb.pdf')

<b>Read from kannada webpage</b>

In [None]:
# scrape_tool.read_page(link="https://kn.wikipedia.org/wiki/%E0%B2%B8%E0%B2%BE%E0%B2%B5%E0%B2%AF%E0%B2%B5_%E0%B2%AC%E0%B3%87%E0%B2%B8%E0%B2%BE%E0%B2%AF")
scrape_tool.read_page(link="https://kannada.asianetnews.com/karnataka-districts/bjp-mla-g-somashekara-reddy-talks-lockdown-in-karnataka-grg-qs9n0r")