From da18b38751a33387a084821e9ce793e63f74ae7d Mon Sep 17 00:00:00 2001
From: Andrew Anker
Date: Sun, 27 Nov 2022 10:24:21 -0800
Subject: [PATCH] Refactor WebExtract to prepare for subclasses

Moving arguments into init and creating new pull_data() method that
can be replaced by subclasses to do specially formatted calls
---
 xwordlist/xwl.py       | 35 ++++++++++++++++++++++-------------
 xwordlist/xwordlist.py |  6 ++++--
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/xwordlist/xwl.py b/xwordlist/xwl.py
index c058d18..5db0b86 100644
--- a/xwordlist/xwl.py
+++ b/xwordlist/xwl.py
@@ -123,29 +123,38 @@ def convert(self, parseChars):
 
 
 class WebExtract:
+    PARSEDICT = {}
+    WEBEXTRACT = ''
+
+    def __init__(self, parseDict={}, webExtract=''):
         self.returnWords = []
         self.scrapeWords = []
+        self.parseDict = parseDict if parseDict != {} else self.PARSEDICT
+        self.webExtract = webExtract if webExtract != '' else self.WEBEXTRACT
+
+    def pull_data(self, getData):
+        return self._get_web_page(getData)
 
-    def get_web_page(self, webURL, parseDict, webExtract):
+    def _get_web_page(self, webURL):
         try:
             r = requests.get(webURL)
             if r.status_code == 200:
                 inputSoup = BeautifulSoup(r.text, 'html.parser')
-                if parseDict:
+                if self.parseDict:
                     # See if we have a class, in which case, have to do more screening (1 to N classes)
-                    if 'class' in parseDict:
+                    if 'class' in self.parseDict:
                         classDict = {}
-                        classDict['class'], whichNum = parseDict['class']
+                        classDict['class'], whichNum = self.parseDict['class']
                         fullSoup = inputSoup.find_all(attrs=classDict)
                         for counter, whichSoup in enumerate(fullSoup, start=1):
                             if whichNum == counter or whichNum == 0:
-                                self._extract_from_web(webExtract, whichSoup, webURL)
+                                self._extract_from_web(whichSoup, webURL)
                                 self.returnWords.extend(self.scrapeWords)
                     else:
-                        self._extract_from_web(webExtract, inputSoup.find(attrs=parseDict), webURL)
+                        self._extract_from_web(inputSoup.find(attrs=self.parseDict), webURL)
                         self.returnWords.extend(self.scrapeWords)
                 else:
-                    self._extract_from_web(webExtract, inputSoup, webURL)
+                    self._extract_from_web(inputSoup, webURL)
                     self.returnWords.extend(self.scrapeWords)
 
             elif r.status_code == 403:
@@ -185,11 +194,11 @@ def get_web_page(self, webURL, parseDict, webExtract):
             }
             raise XWLException(err_dict)
 
-    def _extract_from_web(self, extractWhat, soup, extractURL):
+    def _extract_from_web(self, soup, extractURL):
         # A few ways the default option can come in, try that first
-        if extractWhat == 'text' or extractWhat is None:
+        if self.webExtract == 'text':
             self.scrapeWords = soup.stripped_strings
-        elif extractWhat == 'links':
+        elif self.webExtract == 'links':
             for link in soup.find_all('a'):
                 getURL = link.get('href')
                 if getURL:
@@ -199,14 +208,14 @@ def get_web_page(self, webURL, parseDict, webExtract):
                     parseExtract = urllib.parse.urlsplit(extractURL)
                     getURL = urllib.parse.urljoin(f'{parseExtract.scheme}://{parseExtract.netloc}', getURL)
                     self.scrapeWords.append(getURL)
-        elif extractWhat[:5] == 'html-':
-            extractTags = extractWhat[5:].split('_')
+        elif self.webExtract[:5] == 'html-':
+            extractTags = self.webExtract[5:].split('_')
             for tag in extractTags:
                 for link in soup.find_all(tag):
                     text = link.get_text()
                     self.scrapeWords.append(text)
         else:
-            error = f'Exiting... incorrect option for webextract: {self.err_text(extractWhat)}'
+            error = f'Exiting... incorrect option for webextract: {self.err_text(self.webExtract)}'
             raise XWLException(error)
 
         self.scrapeWords = list(line for line in self.scrapeWords)
diff --git a/xwordlist/xwordlist.py b/xwordlist/xwordlist.py
index 51b5b1f..e1f1683 100755
--- a/xwordlist/xwordlist.py
+++ b/xwordlist/xwordlist.py
@@ -121,7 +121,8 @@ def setup_input(localArgs, otherArgs):
     if localArgs.webpage:
         if localArgs.container:
             parseDict = create_dict(localArgs.container)
-        webScrape.get_web_page(localArgs.webpage, parseDict, localArgs.webextract)
+        webScrape = xwl.WebExtract(parseDict, localArgs.webextract)
+        webScrape.pull_data(localArgs.webpage)
         returnWords.extend(webScrape.returnWords)
 
     if localArgs.urllist:
@@ -139,7 +140,8 @@ def setup_input(localArgs, otherArgs):
                 'urlLength': urlLength,
             }
             print_line(print_text, arg_dict, endText='')
-            webScrape.get_web_page(oneUrl, parseDict, localArgs.webextract)
+            webScrape = xwl.WebExtract(parseDict, localArgs.webextract)
+            webScrape.pull_data(oneUrl)
            returnWords.extend(webScrape.returnWords)
             if urlCount < urlLength:
                 delay = int(otherArgs['urllist_delay'])
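
A minimal sketch of the kind of subclass this refactor enables. Everything
below is illustrative only and not part of this patch: SitemapExtract, the
'links' default, the URL template, and the import path are all assumptions.
A subclass can pin defaults through the new PARSEDICT/WEBEXTRACT class
attributes (picked up by the inherited __init__()) and override pull_data()
to make a specially formatted call before delegating to the inherited
_get_web_page():

    from xwordlist import xwl   # import path assumed

    class SitemapExtract(xwl.WebExtract):
        # Hypothetical subclass: class-attribute default used when the
        # caller passes no webExtract argument
        WEBEXTRACT = 'links'

        def pull_data(self, getData):
            # Specially formatted call: turn a bare site name into the
            # URL this source expects, then fetch as usual
            return self._get_web_page(f'https://{getData}/sitemap.xml')

Callers would use it exactly like the base class in setup_input():

    webScrape = SitemapExtract()
    webScrape.pull_data('example.com')
    returnWords.extend(webScrape.returnWords)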