# Build Up Code for Queue Deduping

I need a way to keep duplicate URL download requests out of the download queue. Otherwise the queue will grow without bound, since each page is likely to contain more than one link.


In [1]:
%run ./setup-logging.ipynb
%run ./jupyter-test-support.ipynb
%run ./create_container.ipynb

Created [container]
  * app_root
  * blob_root_directory
  * blob_store
  * environment_name
  * event_publisher
  * mongodb_client
  * mongodb_credentials
  * queue_receiver
  * queue_receiver_manager
  * queue_sender
  * rabbitmq_connection
  * rabbitmq_url
  * url_download_handler
  * url_downloaded_handler
  * url_info_store
  * url_store


In [18]:
import datetime as dt
import pymongo
from pymongo.errors import DuplicateKeyError

In [101]:
class MongoDBURLDownloadRequests:
    """Set of pending URL download requests stored in a MongoDB collection.

    Every request is one document in ``wcrawl.download_requests`` whose
    ``_id`` is the URL itself, so a second insert of the same URL raises
    ``DuplicateKeyError``; that is what de-duplicates the requests.
    """

    def __init__(self, mongodb_client):
        """Bind to the ``download_requests`` collection of the ``wcrawl`` database.

        Args:
            mongodb_client: client object exposing the ``wcrawl`` database as
                an attribute (presumably a ``pymongo.MongoClient`` -- confirm
                against the container).
        """
        self.mongodb_client = mongodb_client
        self.db = self.mongodb_client.wcrawl
        self.download_requests = self.db.download_requests

    def add(self, url):
        """Record a download request for ``url``.

        Returns:
            True when the request was inserted, False when the insert raised
            ``DuplicateKeyError`` (a request for this URL is already stored).
        """
        try:
            self.download_requests.insert_one({'_id': url, 'request_datetime': dt.datetime.utcnow()})
        except DuplicateKeyError:
            return False
        return True

    def remove(self, url):
        """Delete the request for ``url``.

        Returns:
            True when exactly one document was deleted, False otherwise.
        """
        result = self.download_requests.delete_one({'_id': url})
        return result.deleted_count == 1

    def clear(self):
        """Delete every stored request and return how many were deleted."""
        result = self.download_requests.delete_many({})
        return result.deleted_count

    def pop_random(self):
        """Remove one randomly sampled request and return it.

        The sampled document is claimed with ``find_one_and_delete`` so that
        a request is only handed to the caller that actually deleted it. A
        separate sample-then-delete could hand the same request to two
        callers, or return a request somebody else had already removed.

        Returns:
            The removed document with its ``_id`` key renamed to ``url``
            (e.g. ``{'request_datetime': ..., 'url': ...}``), or None when
            the collection has no requests left.
        """
        sample_one = [{'$sample': {'size': 1}}]
        while True:
            sampled = list(self.download_requests.aggregate(sample_one))
            if not sampled:
                return None
            claimed = self.download_requests.find_one_and_delete({'_id': sampled[0]['_id']})
            if claimed is None:
                # Someone else removed this request between the sample and
                # the delete; draw another one instead of returning it twice.
                continue
            claimed['url'] = claimed.pop('_id')
            return claimed
    

In [102]:
# Build the request store on top of the MongoDB client provided by the container.
download_requests = MongoDBURLDownloadRequests(container.mongodb_client())

In [103]:
# Start from an empty collection; the cell output is the number of documents deleted.
download_requests.clear()

6

In [111]:
# Sample URLs for the demo. The towardsdatascience.com root appears twice so
# that one add() call hits the duplicate path and returns False.
urls = [
    'https://www.towardsdatascience.com/',
    'https://www.towardsdatascience.com/article1',
    'https://www.newyorktimes.com/',
    'https://www.towardsdatascience.com/',
    'https://www.google.com/',
    'https://www.bing.com/',
    'https://www.foobar.com/'
]

def add_urls():
    """Add every entry of ``urls`` to the request store and print each outcome."""
    for candidate in urls:
        was_added = download_requests.add(candidate)
        print(f"[{was_added}] -> {candidate}")

def list_requests():
    """Return all documents currently in the request collection as a list."""
    cursor = download_requests.download_requests.find({})
    return list(cursor)

def pop_requests():
    """Pop requests from the store until it reports none left; return them in pop order."""
    drained = []
    popped = download_requests.pop_random()
    while popped is not None:
        drained.append(popped)
        popped = download_requests.pop_random()
    return drained

In [112]:
# Queue the sample URLs, then drain the collection; requests come back in random order.
add_urls()
pop_requests()

[True] -> https://www.towardsdatascience.com/
[True] -> https://www.towardsdatascience.com/article1
[True] -> https://www.newyorktimes.com/
[False] -> https://www.towardsdatascience.com/
[True] -> https://www.google.com/
[True] -> https://www.bing.com/
[True] -> https://www.foobar.com/


[{'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 34, 225000),
  'url': 'https://www.towardsdatascience.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 34, 235000),
  'url': 'https://www.foobar.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 34, 233000),
  'url': 'https://www.google.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 34, 234000),
  'url': 'https://www.bing.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 34, 229000),
  'url': 'https://www.newyorktimes.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 34, 227000),
  'url': 'https://www.towardsdatascience.com/article1'}]

In [113]:
# Second round: the URLs are accepted again because the previous drain removed them.
add_urls()
pop_requests()

[True] -> https://www.towardsdatascience.com/
[True] -> https://www.towardsdatascience.com/article1
[True] -> https://www.newyorktimes.com/
[False] -> https://www.towardsdatascience.com/
[True] -> https://www.google.com/
[True] -> https://www.bing.com/
[True] -> https://www.foobar.com/


[{'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 38, 262000),
  'url': 'https://www.newyorktimes.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 38, 258000),
  'url': 'https://www.towardsdatascience.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 38, 269000),
  'url': 'https://www.foobar.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 38, 260000),
  'url': 'https://www.towardsdatascience.com/article1'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 38, 267000),
  'url': 'https://www.google.com/'},
 {'request_datetime': datetime.datetime(2020, 11, 25, 18, 37, 38, 268000),
  'url': 'https://www.bing.com/'}]