# URL filtering process

In [594]:
import pandas
from kryptone.utils.urls import URL, URLIgnoreTest

In [595]:
# Url whose scheme and domain are used to rebuild relative urls and to
# check that a candidate url belongs to the same site
START_URL_OBJECT = URL('https://www.undiz.com/fr/soutiens-gorge/')

# Urls selected for a future visit (filled at the end of the filtering)
URLS_TO_VISIT = set()

# Urls that were already visited
VISITED_URLS = {'https://www.undiz.com/'}

# Every url encountered so far, whether it was kept or not
LIST_OF_SEEN_URLS = set()

# When True, the initial filter rejects urls for which `has_queries()` is true
IGNORE_QUERIES = False

# When True, the initial filter rejects urls flagged as `is_image`
IGNORE_IMAGES = False

## Navigation to web page

This is when the robot actually moves to a web page and gathers all the urls found on the current page

In [596]:
raw_urls = pandas.read_csv('test_urls.csv')

In [597]:
raw_urls.initial_urls.count()

355

The structural check allows us to identify urls that start with "/" and put them in the correct format "https://domain.com/path". It also allows us to convert the url into a Python object.

In [599]:
from urllib.parse import urlunparse, unquote

def urljoin(path):
    """Build an absolute, percent-decoded url from a path.

    The scheme and the domain are taken from ``START_URL_OBJECT``. The
    path is converted to a string and stripped of surrounding whitespace
    before being assembled. No params, query or fragment are added.

    Returns the resulting url as a string, passed through ``unquote``.
    """
    relative_path = str(path).strip()
    components = (
        START_URL_OBJECT.scheme,
        START_URL_OBJECT.netloc,
        relative_path,
        None,  # params
        None,  # query
        None,  # fragment
    )
    return unquote(urlunparse(components))

def url_structural_check(url):
    """Return a ``URL`` object for ``url``, resolving relative paths first.

    A value that starts with "/" is a path relative to the start url: it is
    rebuilt into an absolute url with ``urljoin`` before being wrapped.
    Any other value is wrapped as is.
    """
    clean_url = str(url)
    if clean_url.startswith('/'):
        # Previously the joined value was computed from an unassigned
        # variable and then discarded; it now replaces the relative path.
        clean_url = urljoin(clean_url)
    return URL(clean_url)

# Run the structural check on every collected url and store the `raw_url`
# attribute of the resulting URL object in a new `urls` column
raw_urls['urls'] = raw_urls['initial_urls'].map(lambda x: url_structural_check(x).raw_url)

A page could have multiple duplicates, but before dropping them we need to get a clean, decoded version of each url

In [600]:
raw_urls = raw_urls.drop_duplicates(subset=['urls'])

In [601]:
raw_urls.urls.count()

192

### Initial url filters

This initial url filter checks that _the url is not empty, is not a fragment (e.g. /#, /path#fragment), is not the home page if the home page was already the start url and, optionally, is not a query or an image_

The initial premise is that every url that comes in is not valid unless said differently

In [602]:
raw_urls['is_valid'] = False
def initial_url_filter(df):
    """Flag the rows of ``df`` whose url passes the initial checks.

    Each value of the ``urls`` column is wrapped in a ``URL`` object and
    rejected (left untouched) when it:

    * is not on the same domain as ``START_URL_OBJECT``
    * is empty or has a fragment
    * has the path "/" while the start url also has the path "/"
    * has queries and ``IGNORE_QUERIES`` is set
    * is an image and ``IGNORE_IMAGES`` is set

    Rows that pass every check get ``is_valid`` set to True. The
    dataframe is modified in place and returned so that the function
    can be used with ``DataFrame.pipe``.
    """
    for row in df.itertuples(name='Url'):
        candidate = URL(row.urls)

        if not candidate.is_same_domain(START_URL_OBJECT):
            continue

        if candidate.is_empty or candidate.has_fragment:
            continue

        is_home_page = candidate.url_object.path == '/'
        if is_home_page and START_URL_OBJECT.url_object.path == '/':
            continue

        if IGNORE_QUERIES and candidate.has_queries():
            continue

        if IGNORE_IMAGES and candidate.is_image:
            continue

        df.loc[row.Index, 'is_valid'] = True
    return df
raw_urls.pipe(initial_url_filter).head()

Unnamed: 0,initial_urls,urls,is_valid
0,https://www.undiz.com/fr/saint-valentin/,https://www.undiz.com/fr/saint-valentin/,True
2,https://www.undiz.com/fr/soutiens-gorge/#,https://www.undiz.com/fr/soutiens-gorge/#,False
3,https://www.undiz.com/,https://www.undiz.com/,True
9,https://www.undiz.com/fr/collection-saint-vale...,https://www.undiz.com/fr/collection-saint-vale...,True
10,https://www.undiz.com/fr/collection-saint-vale...,https://www.undiz.com/fr/collection-saint-vale...,True


We check for a correspondence between the incoming urls and the urls that were already visited or already seen. This is the second level filter

In [603]:
# Flag the urls that are already present in VISITED_URLS
visited_urls = pandas.DataFrame({'urls': list(VISITED_URLS)})
raw_urls['already_visited'] = raw_urls['urls'].isin(visited_urls.urls)

In [604]:
# Flag the urls that are already present in LIST_OF_SEEN_URLS
list_of_seen_urls = pandas.DataFrame({'urls': list(LIST_OF_SEEN_URLS)})
raw_urls['already_seen'] = raw_urls['urls'].isin(list_of_seen_urls.urls)

This is our intermediate dataframe that we will be using to apply the remaining custom user filters

In [605]:
raw_urls[raw_urls['already_visited'] == True]

Unnamed: 0,initial_urls,urls,is_valid,already_visited,already_seen
3,https://www.undiz.com/,https://www.undiz.com/,True,True,False


In [606]:
raw_urls[raw_urls['already_seen'] == True]

Unnamed: 0,initial_urls,urls,is_valid,already_visited,already_seen


In [607]:
# Keep the urls that passed the initial filter and that were neither
# visited nor seen before
is_new_url = ~(raw_urls['already_visited'] | raw_urls['already_seen'])
valid_urls = raw_urls[raw_urls['is_valid'] & is_new_url]

In [608]:
valid_urls.urls.count()

183

In [609]:
# NOTE(review): `.count()` returns the number of non-null entries of the
# boolean mask, so this always equals the number of rows in valid_urls;
# `.sum()` would count the urls actually found in LIST_OF_SEEN_URLS.
# Confirm which one is intended.
valid_urls['urls'].isin(LIST_OF_SEEN_URLS).count()

183

In [610]:
valid_urls = valid_urls.sort_values('urls')

Restructure our dataframe for the rest of the filtering process

In [611]:
valid_urls = valid_urls[['urls', 'is_valid']]

In [612]:
valid_urls.head()

Unnamed: 0,urls,is_valid
106,https://www.undiz.com/fr/1-1-3/,True
343,https://www.undiz.com/fr/FR-landing-page-app.html,True
127,https://www.undiz.com/fr/achat/mon-panier-reca...,True
84,https://www.undiz.com/fr/collabs-pyjamas/,True
13,https://www.undiz.com/fr/collection-saint-vale...,True


### Url ignore tests

The user might indicate that certain urls need to be ignored if they match a certain path or regex pattern. This is used essentially when some of the urls are known in advance

In [613]:
from collections import defaultdict, OrderedDict

URL_IGNORE_TESTS = [
    URLIgnoreTest('bras', paths=['/soutiens-gorge/', 'soutien-gorge'])
]

# For every url, collect the outcome of each ignore test
results = defaultdict(list)
for row in valid_urls.itertuples():
    results[row.urls].extend(
        ignore_test(row.urls) for ignore_test in URL_IGNORE_TESTS
    )

# A url is ignored as soon as one of the tests is truthy for it; the audit
# keeps that verdict for every url, in the order they were tested
final_urls_filtering_audit = OrderedDict(
    (url, any(truth_array)) for url, truth_array in results.items()
)

urls_removed = {
    url for url, is_ignored in final_urls_filtering_audit.items() if is_ignored
}
urls_kept = {
    url for url, is_ignored in final_urls_filtering_audit.items() if not is_ignored
}

urls_kept = pandas.DataFrame({'urls': list(urls_kept)})
valid_urls = valid_urls[valid_urls.urls.isin(urls_kept.urls)]




In [614]:
valid_urls.urls.count()

89

### Url rule tests

The user might indicate that __only__ a specific type of url that matches a regex path needs to be kept

In [615]:
# Regex patterns: when at least one is defined, only the urls matching
# one of them are kept
URL_RULE_TESTS = [
    r'\/culottes\-'
]

urls_to_keep = set()
for item in valid_urls.itertuples(name='Url'):
    instance = URL(item.urls)
    # With no rule defined there is nothing to restrict on, so every url is
    # kept instead of being filtered against an empty selection. Otherwise
    # any() stops at the first matching rule, which is what the former
    # `continue` (a no-op on the inner loop) was meant to achieve.
    if not URL_RULE_TESTS or any(instance.test_url(regex) for regex in URL_RULE_TESTS):
        urls_to_keep.add(item.urls)

urls_to_keep = pandas.DataFrame({'urls': list(urls_to_keep)})
valid_urls = valid_urls[valid_urls.urls.isin(urls_to_keep.urls)]

In [616]:
valid_urls.urls.count()

9

From here, we finally have the valid urls that we want to visit but also a full image of the urls that we have already seen

In [617]:
# Queue the urls that survived every filter, and record every url of
# raw_urls (kept or rejected) as seen
URLS_TO_VISIT.update(valid_urls.urls.to_list())
LIST_OF_SEEN_URLS.update(raw_urls.urls.to_list())

In [618]:
URLS_TO_VISIT

{'https://www.undiz.com/fr/culottes-de-regle/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/culottes-accessoires/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/culottes-microfibres/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/culottes/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/lots-de-2-3-culottes/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/offre-culottes/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/shorty/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/string-tanga/',
 'https://www.undiz.com/fr/culottes-et-bas-de-lingerie/string/'}

We can also get a dataframe of the urls that we did not want if we wanted an image of what was done

In [619]:
# Every url of raw_urls that is absent from the final valid_urls selection
rejected_urls = raw_urls[~raw_urls.urls.isin(valid_urls.urls)][['urls']]

In [620]:
rejected_urls.urls.count()

183