Refactor WebExtract to prepare for subclasses
Moving arguments into __init__ and creating a new pull_data() method that subclasses can replace to do specially formatted calls (a hypothetical sketch of such a subclass follows the xwl.py diff below).
aanker committed Nov 27, 2022
1 parent 438d4ef commit da18b38
Showing 2 changed files with 26 additions and 15 deletions.
35 changes: 22 additions & 13 deletions xwordlist/xwl.py
@@ -123,29 +123,38 @@ def convert(self, parseChars):
 
 
 class WebExtract:
+    PARSEDICT = {}
+    WEBEXTRACT = ''
 
+    def __init__(self, parseDict={}, webExtract=''):
+        self.returnWords = []
+        self.scrapeWords = []
+        self.parseDict = parseDict if parseDict != {} else self.PARSEDICT
+        self.webExtract = webExtract if webExtract != '' else self.WEBEXTRACT
 
+    def pull_data(self, getData):
+        return self._get_web_page(getData)
 
-    def get_web_page(self, webURL, parseDict, webExtract):
+    def _get_web_page(self, webURL):
         try:
             r = requests.get(webURL)
             if r.status_code == 200:
                 inputSoup = BeautifulSoup(r.text, 'html.parser')
-                if parseDict:
+                if self.parseDict:
                     # See if we have a class, in which case, have to do more screening (1 to N classes)
-                    if 'class' in parseDict:
+                    if 'class' in self.parseDict:
                         classDict = {}
-                        classDict['class'], whichNum = parseDict['class']
+                        classDict['class'], whichNum = self.parseDict['class']
                         fullSoup = inputSoup.find_all(attrs=classDict)
                         for counter, whichSoup in enumerate(fullSoup, start=1):
                             if whichNum == counter or whichNum == 0:
-                                self._extract_from_web(webExtract, whichSoup, webURL)
+                                self._extract_from_web(whichSoup, webURL)
                                 self.returnWords.extend(self.scrapeWords)
                     else:
-                        self._extract_from_web(webExtract, inputSoup.find(attrs=parseDict), webURL)
+                        self._extract_from_web(inputSoup.find(attrs=self.parseDict), webURL)
                         self.returnWords.extend(self.scrapeWords)
                 else:
-                    self._extract_from_web(webExtract, inputSoup, webURL)
+                    self._extract_from_web(inputSoup, webURL)
                     self.returnWords.extend(self.scrapeWords)
 
             elif r.status_code == 403:
@@ -185,11 +194,11 @@ def get_web_page(self, webURL, parseDict, webExtract):
                 }
                 raise XWLException(err_dict)
 
-    def _extract_from_web(self, extractWhat, soup, extractURL):
+    def _extract_from_web(self, soup, extractURL):
         # A few ways the default option can come in, try that first
-        if extractWhat == 'text' or extractWhat is None:
+        if self.webExtract == 'text':
             self.scrapeWords = soup.stripped_strings
-        elif extractWhat == 'links':
+        elif self.webExtract == 'links':
             for link in soup.find_all('a'):
                 getURL = link.get('href')
                 if getURL:
@@ -199,14 +208,14 @@ def _extract_from_web(self, extractWhat, soup, extractURL):
                         parseExtract = urllib.parse.urlsplit(extractURL)
                         getURL = urllib.parse.urljoin(f'{parseExtract.scheme}://{parseExtract.netloc}', getURL)
                     self.scrapeWords.append(getURL)
-        elif extractWhat[:5] == 'html-':
-            extractTags = extractWhat[5:].split('_')
+        elif self.webExtract[:5] == 'html-':
+            extractTags = self.webExtract[5:].split('_')
             for tag in extractTags:
                 for link in soup.find_all(tag):
                     text = link.get_text()
                     self.scrapeWords.append(text)
         else:
-            error = f'Exiting... incorrect option for webextract: {self.err_text(extractWhat)}'
+            error = f'Exiting... incorrect option for webextract: {self.err_text(self.webExtract)}'
             raise XWLException(error)
 
         self.scrapeWords = list(line for line in self.scrapeWords)
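After this refactor, a subclass only has to override pull_data() (and, if it wants, the PARSEDICT / WEBEXTRACT class defaults); the fetch and extraction logic in _get_web_page() and _extract_from_web() is inherited. A minimal sketch of the kind of subclass this enables — hypothetical, not part of this commit — might look like:

    # Hypothetical sketch only, not code from this commit: a subclass that
    # reshapes the incoming URL before delegating to the inherited fetch logic.
    from datetime import date

    class DatedPageExtract(WebExtract):
        # Class-level defaults, following the PARSEDICT / WEBEXTRACT pattern
        # above; the container class name here is an invented example
        PARSEDICT = {'class': (['puzzle-entry'], 0)}
        WEBEXTRACT = 'text'

        def pull_data(self, getData):
            # The "specially formatted call": append today's date to the base
            # URL, then hand off to the plain fetch from the parent class
            datedURL = f'{getData}/{date.today().isoformat()}'
            return self._get_web_page(datedURL)

DatedPageExtract().pull_data('https://example.com/puzzles') would then fetch the dated page while reusing all of the parsing above; callers never need to know which WebExtract subclass they hold.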
6 changes: 4 additions & 2 deletions xwordlist/xwordlist.py
@@ -121,7 +121,8 @@ def setup_input(localArgs, otherArgs):
     if localArgs.webpage:
         if localArgs.container:
             parseDict = create_dict(localArgs.container)
-        webScrape.get_web_page(localArgs.webpage, parseDict, localArgs.webextract)
+        webScrape = xwl.WebExtract(parseDict, localArgs.webextract)
+        webScrape.pull_data(localArgs.webpage)
         returnWords.extend(webScrape.returnWords)
 
     if localArgs.urllist:
@@ -139,7 +140,8 @@ def setup_input(localArgs, otherArgs):
                 'urlLength': urlLength,
             }
             print_line(print_text, arg_dict, endText='')
-            webScrape.get_web_page(oneUrl, parseDict, localArgs.webextract)
+            webScrape = xwl.WebExtract(parseDict, localArgs.webextract)
+            webScrape.pull_data(oneUrl)
             returnWords.extend(webScrape.returnWords)
             if urlCount < urlLength:
                 delay = int(otherArgs['urllist_delay'])
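Because both call sites in setup_input() now depend only on the constructor signature and pull_data(), swapping in a subclass becomes a one-line change per site — for instance, with the hypothetical DatedPageExtract sketched earlier:

    # Hypothetical: same call shape as above, different WebExtract subclass
    webScrape = xwl.DatedPageExtract(parseDict, localArgs.webextract)
    webScrape.pull_data(oneUrl)
    returnWords.extend(webScrape.returnWords)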
