In [12]:
from bs4 import BeautifulSoup
import os
import re
import requests

In [7]:
# Load the category URLs, one list entry per line of the file.
# Line terminators are kept on the entries at this stage.
with open('../data/category_urls.txt') as f:
    category_urls = list(f)

In [8]:
# Remove the line terminator from each URL.
# rstrip() is used instead of url[:-1] for two reasons: a final line without
# a trailing newline keeps its last character, and re-running this cell is a
# no-op instead of chopping one more character off every URL.
category_urls = [url.rstrip('\r\n') for url in category_urls]

In [10]:
# Keep the first category URL as a sample input for trying out the parsers.
single_category = category_urls[0]

In [43]:
def parse_category_for_pages(category, timeout=30):
    """Fetch a category page and read a page count from its pagination list.

    Parameters
    ----------
    category : str
        URL of the category listing page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely. The 30-second default is an arbitrary choice.

    Returns
    -------
    dict
        ``{category: pages_to_scrape}``. The count is 0 when the page has no
        ``<ul class="pagination">`` element, when that list has fewer than
        four ``<li>`` items, or when the fourth item's text is not an integer.
    """
    response = requests.get(category, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html')
    try:
        # The lookup lives inside the try block so that a page without any
        # pagination list falls back to 0 instead of raising IndexError.
        pagination = soup.find_all('ul', attrs={'class': 'pagination'})[0]
        # The fourth <li> of the pagination list is read as the page count.
        pages_to_scrape = int(pagination.find_all('li')[3].text)
    except (IndexError, ValueError):
        pages_to_scrape = 0
    return {category: pages_to_scrape}

In [89]:
def parse_category_for_pages_regex(category, timeout=30):
    """Fetch a category page and extract a page count with a regular expression.

    The count is taken from the first ``<li>`` element in the response body
    whose content starts with a run of digits followed by ``<``.

    Parameters
    ----------
    category : str
        URL of the category listing page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely. The 30-second default is an arbitrary choice.

    Returns
    -------
    tuple
        ``(category, pages_to_scrape)``; the count is 0 when nothing matches.
    """
    response = requests.get(category, timeout=timeout)
    # Search the raw bytes directly instead of the "b'...'" repr produced by
    # str(bytes); the raw literal keeps \d a regex escape rather than an
    # invalid string escape sequence.
    match = re.search(rb'<li>(\d+)<', response.content)
    pages_to_scrape = int(match.group(1)) if match else 0
    return (category, pages_to_scrape)

In [57]:
# Time the BeautifulSoup-based parser on the sample category; every run issues a live HTTP request.
%timeit parse_category_for_pages(single_category)

5.32 s ± 202 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
# Time the regex-based parser on the same sample category; every run issues a live HTTP request.
%timeit parse_category_for_pages_regex(single_category)

601 ms ± 173 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [59]:
# Spot-check the regex parser on the first five categories.
# NOTE(review): this cell's execution count (59) is lower than that of the
# current parse_category_for_pages_regex definition (89), so the recorded
# output was presumably produced by an earlier version of the function that
# returned a dict; the current version returns a tuple.
for cat in category_urls[0:5]:
    print(parse_category_for_pages_regex(cat))

{'https://www.cgtrader.com/3d-models/standard': 217}
{'https://www.cgtrader.com/3d-models/jet': 198}
{'https://www.cgtrader.com/3d-models/car': 981}
{'https://www.cgtrader.com/3d-models/ship': 183}
{'https://www.cgtrader.com/3d-models/luxury': 675}


In [90]:
# Collect the result of parse_category_for_pages_regex for every category URL.
# Each call issues one HTTP request, so this loop makes one request per entry
# in category_urls. Results are appended one at a time, so whatever was
# gathered before a failure remains in category_pages.
category_pages = []

for cat in category_urls:
    category_pages.append(parse_category_for_pages_regex(cat))

In [106]:
# Order the collected entries by their second element (the page count),
# largest first.
sorted_categories_by_page_count = sorted(
    category_pages,
    key=lambda entry: entry[1],
    reverse=True,
)

In [87]:
def format_url_pagination(url, page):
    """Return ``url`` with a ``?page=<page>`` query string appended."""
    return "{}?page={}".format(url, page)

In [120]:
def discover_true_page_count_per_category(url, timeout=30):
    """Request a paginated category URL and report which page is marked current.

    Parameters
    ----------
    url : str
        Category URL of the form ``<category>?page=<n>``, as produced by
        ``format_url_pagination``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely. The 30-second default is an arbitrary choice.

    Returns
    -------
    tuple
        ``(cat_url, original_max_page, true_max_page)`` where ``cat_url`` is
        the part of ``url`` before ``?``, ``original_max_page`` is the page
        number parsed from ``url``, and ``true_max_page`` is the number found
        in the response's ``<li class="is-current">`` element (0 when no such
        element is present).

    Raises
    ------
    IndexError
        If ``url`` contains no ``?``.
    ValueError
        If the text after the first five query characters is not an integer.
    """
    # Parse the URL before touching the network so malformed input fails
    # fast instead of after a wasted request.
    url_parts = url.split('?')
    cat_url = url_parts[0]
    original_max_page = int(url_parts[1][5:])  # skip the 5-character 'page=' prefix

    response = requests.get(url, timeout=timeout)
    # Raw literal so that \d is a regex escape, not an invalid string escape.
    match = re.search(r'<li class="is-current">(\d+)<', response.text)
    true_max_page = int(match.group(1)) if match else 0
    return (cat_url, original_max_page, true_max_page)

In [107]:
# Build, for every category, the URL of the last page it advertises.
max_page_per_category_urls = [
    format_url_pagination(entry[0], entry[1])
    for entry in sorted_categories_by_page_count
]

In [124]:
# Preview the first five last-page URLs.
max_page_per_category_urls[0:5]

['https://www.cgtrader.com/3d-models/furniture?page=3908',
 'https://www.cgtrader.com/3d-models/interior?page=2991',
 'https://www.cgtrader.com/3d-models/architectural?page=2412',
 'https://www.cgtrader.com/3d-models/character?page=1810',
 'https://www.cgtrader.com/3d-models/house?page=1769']

In [112]:
# Keep the first last-page URL as a sample input.
single_max_page = max_page_per_category_urls[0]

In [113]:
# Display the sample URL.
single_max_page

'https://www.cgtrader.com/3d-models/furniture?page=3908'

In [125]:
# Request every category's last advertised page and record the
# (cat_url, original_max_page, true_max_page) tuple returned for it.
# Each call issues one HTTP request. Results are appended one at a time, so
# whatever was gathered before a failure remains in true_category_pages.
true_category_pages = []

for cat in max_page_per_category_urls:
    true_category_pages.append(discover_true_page_count_per_category(cat))

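I strongly suspect that cgtrader restricts pages beyond 277 to users who are registered and logged in: requesting the last advertised page of the largest categories (e.g. `furniture?page=3908`) returns a page on which 277 is marked as the current page. If so, this is an effective way for them to reduce their exposure to scrapers.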

In [133]:
import pickle

In [134]:
# Persist (category_url, true_max_page) for every category whose
# true_max_page is exactly 277.
with open('../data/cats_with_277_pages', 'wb') as f:
    pickle.dump(
        [
            (record[0], record[2])
            for record in true_category_pages
            if record[2] == 277
        ],
        f,
    )

In [135]:
# Read the pickled list back from disk to confirm the round trip.
# pickle.load can execute arbitrary code, so only load files this notebook
# wrote itself or that come from a trusted source.
with open('../data/cats_with_277_pages', 'rb') as f:
    new_list = pickle.load(f)

In [137]:
# Preview the first ten reloaded entries.
new_list[0:10]

[('https://www.cgtrader.com/3d-models/furniture', 277),
 ('https://www.cgtrader.com/3d-models/interior', 277),
 ('https://www.cgtrader.com/3d-models/architectural', 277),
 ('https://www.cgtrader.com/3d-models/character', 277),
 ('https://www.cgtrader.com/3d-models/house', 277),
 ('https://www.cgtrader.com/3d-models/exterior', 277),
 ('https://www.cgtrader.com/3d-models/modern', 277),
 ('https://www.cgtrader.com/3d-models/decoration', 277),
 ('https://www.cgtrader.com/3d-models/chair', 277),
 ('https://www.cgtrader.com/3d-models/car', 277)]