Refactor WebExtract to prepare for subclasses
Moving arguments into __init__ and creating a new pull_data() method that subclasses can replace to do specially formatted calls (a hypothetical sketch of such a subclass follows the xwl.py diff below).
aanker committed Nov 27, 2022
1 parent 438d4ef commit da18b38
Showing 2 changed files with 26 additions and 15 deletions.
35 changes: 22 additions & 13 deletions xwordlist/xwl.py
@@ -123,29 +123,38 @@ def convert(self, parseChars):
 
 
 class WebExtract:
+    PARSEDICT = {}
+    WEBEXTRACT = ''
 
+    def __init__(self, parseDict={}, webExtract=''):
+        self.returnWords = []
+        self.scrapeWords = []
+        self.parseDict = parseDict if parseDict != {} else self.PARSEDICT
+        self.webExtract = webExtract if webExtract != '' else self.WEBEXTRACT
 
+    def pull_data(self, getData):
+        return self._get_web_page(getData)
 
-    def get_web_page(self, webURL, parseDict, webExtract):
+    def _get_web_page(self, webURL):
         try:
             r = requests.get(webURL)
             if r.status_code == 200:
                 inputSoup = BeautifulSoup(r.text, 'html.parser')
-                if parseDict:
+                if self.parseDict:
                     # See if we have a class, in which case, have to do more screening (1 to N classes)
-                    if 'class' in parseDict:
+                    if 'class' in self.parseDict:
                         classDict = {}
-                        classDict['class'], whichNum = parseDict['class']
+                        classDict['class'], whichNum = self.parseDict['class']
                         fullSoup = inputSoup.find_all(attrs=classDict)
                         for counter, whichSoup in enumerate(fullSoup, start=1):
                             if whichNum == counter or whichNum == 0:
-                                self._extract_from_web(webExtract, whichSoup, webURL)
+                                self._extract_from_web(whichSoup, webURL)
                                 self.returnWords.extend(self.scrapeWords)
                     else:
-                        self._extract_from_web(webExtract, inputSoup.find(attrs=parseDict), webURL)
+                        self._extract_from_web(inputSoup.find(attrs=self.parseDict), webURL)
                         self.returnWords.extend(self.scrapeWords)
                 else:
-                    self._extract_from_web(webExtract, inputSoup, webURL)
+                    self._extract_from_web(inputSoup, webURL)
                     self.returnWords.extend(self.scrapeWords)
 
             elif r.status_code == 403:
@@ -185,11 +194,11 @@ def get_web_page(self, webURL, parseDict, webExtract):
                 }
                 raise XWLException(err_dict)
 
-    def _extract_from_web(self, extractWhat, soup, extractURL):
+    def _extract_from_web(self, soup, extractURL):
         # A few ways the default option can come in, try that first
-        if extractWhat == 'text' or extractWhat is None:
+        if self.webExtract == 'text':
             self.scrapeWords = soup.stripped_strings
-        elif extractWhat == 'links':
+        elif self.webExtract == 'links':
             for link in soup.find_all('a'):
                 getURL = link.get('href')
                 if getURL:
@@ -199,14 +208,14 @@ def _extract_from_web(self, extractWhat, soup, extractURL):
                         parseExtract = urllib.parse.urlsplit(extractURL)
                         getURL = urllib.parse.urljoin(f'{parseExtract.scheme}://{parseExtract.netloc}', getURL)
                     self.scrapeWords.append(getURL)
-        elif extractWhat[:5] == 'html-':
-            extractTags = extractWhat[5:].split('_')
+        elif self.webExtract[:5] == 'html-':
+            extractTags = self.webExtract[5:].split('_')
             for tag in extractTags:
                 for link in soup.find_all(tag):
                     text = link.get_text()
                     self.scrapeWords.append(text)
         else:
-            error = f'Exiting... incorrect option for webextract: {self.err_text(extractWhat)}'
+            error = f'Exiting... incorrect option for webextract: {self.err_text(self.webExtract)}'
             raise XWLException(error)
 
         self.scrapeWords = list(line for line in self.scrapeWords)
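After this refactor, a subclass only has to override pull_data() (and, if it wants, the PARSEDICT / WEBEXTRACT class defaults); the fetch and extraction logic in _get_web_page() and _extract_from_web() is inherited. A minimal sketch of the kind of subclass this enables — hypothetical, not part of this commit — might look like:

    # Hypothetical sketch only, not code from this commit: a subclass that
    # reshapes the incoming URL before delegating to the inherited fetch logic.
    from datetime import date

    class DatedPageExtract(WebExtract):
        # Class-level defaults, following the PARSEDICT / WEBEXTRACT pattern
        # above; the container class name here is an invented example
        PARSEDICT = {'class': (['puzzle-entry'], 0)}
        WEBEXTRACT = 'text'

        def pull_data(self, getData):
            # The "specially formatted call": append today's date to the base
            # URL, then hand off to the plain fetch from the parent class
            datedURL = f'{getData}/{date.today().isoformat()}'
            return self._get_web_page(datedURL)

DatedPageExtract().pull_data('https://example.com/puzzles') would then fetch the dated page while reusing all of the parsing above; callers never need to know which WebExtract subclass they hold.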
6 changes: 4 additions & 2 deletions xwordlist/xwordlist.py
@@ -121,7 +121,8 @@ def setup_input(localArgs, otherArgs):
     if localArgs.webpage:
         if localArgs.container:
             parseDict = create_dict(localArgs.container)
-        webScrape.get_web_page(localArgs.webpage, parseDict, localArgs.webextract)
+        webScrape = xwl.WebExtract(parseDict, localArgs.webextract)
+        webScrape.pull_data(localArgs.webpage)
         returnWords.extend(webScrape.returnWords)
 
     if localArgs.urllist:
@@ -139,7 +140,8 @@ def setup_input(localArgs, otherArgs):
                 'urlLength': urlLength,
             }
             print_line(print_text, arg_dict, endText='')
-            webScrape.get_web_page(oneUrl, parseDict, localArgs.webextract)
+            webScrape = xwl.WebExtract(parseDict, localArgs.webextract)
+            webScrape.pull_data(oneUrl)
             returnWords.extend(webScrape.returnWords)
             if urlCount < urlLength:
                 delay = int(otherArgs['urllist_delay'])
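Because both call sites in setup_input() now depend only on the constructor signature and pull_data(), swapping in a subclass becomes a one-line change per site — for instance, with the hypothetical DatedPageExtract sketched earlier:

    # Hypothetical: same call shape as above, different WebExtract subclass
    webScrape = xwl.DatedPageExtract(parseDict, localArgs.webextract)
    webScrape.pull_data(oneUrl)
    returnWords.extend(webScrape.returnWords)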
