In [39]:
import requests
import json
import os
import time
import numpy as np
import psycopg
from dotenv import load_dotenv


def pause_report(length, random_delay, file_count, disp=True):

	pause_time = length * np.random.normal(1, random_delay)
	if disp:
		print(
			f"Scraped {file_count} files. Pausing scraper for {round(pause_time, 2)} seconds."
		)
	time.sleep(pause_time)


def get_data(page_start, page_end, short_pause=3, long_pause=30, random_delay=0.25):

	if page_start < 1:
		print("Starting page number has to be greater than 0.")
		print("Process exiting with error.")
		return 0

	num_artworks = 0
	
	load_dotenv()
	connection = psycopg.connect(os.getenv("DATABASE_URL"), autocommit=True)
	cursor = connection.cursor()

	for page_number in range(page_start, page_end):

		url = f"https://www.artstation.com/api/v2/search/users.json?page={page_number}&per_page=30"

		headers = {
			"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
			"Accept": "application/json, text/plain, */*",
			"Content-Type": "application/json",
			"Origin": "https://www.artstation.com",
			"Referer": "https://www.artstation.com/search/artists?sort_by=followers&skill_ids_include=5",
			"Cookie": "PRIVATE-CSRF-TOKEN=%2Bbf9QnlXgfgdkriTL4yICQ6CUHN7lHIYVI49kSR3OE4%3D; __stripe_mid=64743a20-8848-4da9-b91a-34d1dce663c7efc8ab; visitor-uuid=b6e4f834-23ab-4054-bfc7-4206ef963fc4; referrer-host=www.google.com; G_ENABLED_IDPS=google; G_AUTHUSER_H=0; ArtStationSessionCookie=ImZlMGJkNTkyLTRjYWQtNDE0MC1iNTYxLWUwZDFmOTc4NWMwYSI%3D--cee01102500c9e04dc4ca44f0abbad52d408ef29fc871871b0741447be1ef189; __cf_bm=hdAB6qGkNEmgeDBU_uA4s3J6ij7w_8t3RhqrYY2gOvs-1746332609-1.0.1.1-_RHJ2aZlxpmYh_Dj3KKrMlsznNuQyDIE9_YQX6Vnh01rUN6Hs0slWoH9TAABioQ2MIpdS3eNZhJ4aM4_NakhPsN2oMtPYvTNFxmX1.lSOMtD3zGHbQpA3xOwofFvJvHu; _ArtStation_session=Y2NSWDVIN1BwWGJDOUJHMWE3Y0JZNXkyR1hzMjluenY2VStqeXBzVDVWd1JtMDk4N2VGckVGS1hJc09nb3VESmFyazgxWGcyOG9xRVJaV1B5L3JoVHc9PS0tajRIK21hTDBkRU5PVWJIRTQxYzJXdz09--f4aed996fe195b8b7d864783b541e99f96544cc7; __stripe_sid=3be8964d-ad03-4353-abc5-b500838f1494b16227",
			"X-CSRF-Token": "ocJRVYaiDU8+wXMmJVObAq4mpyjgK2Sf1KyAzQnsKsxm4WNn2RLHRew6gmVoGm74nIGBeCECEQrMxjnphQSPpQ==",
			"public-csrf-token": "4nMF4riR66I7GErhcqqB8pQue+CzKdZQcbtFVHRbzS0bxPigwcZqWiaK8nJdJgn7mqwrk8i9pEglNXjFUCz1Yw==",
		}

		payload = {
			"additional_fields": [],
			"filters": [{"field": "skill_ids", "method": "include",
								 "value": ["1", "2", "3", "5", "6", "7", "8", "9", "43", "52", "64", "67", "89", "100", "136", "149", "275", "367", "429"]}],
			"page": page_number,
			"per_page": 30,
			"pro_first": "0",
			"query": "",
			"sorting": "followers",
		}


		client = requests.session()
		r = client.post(url, headers=headers, json=payload)
		print(f"Scraping {url} {r.status_code}: {r.reason}")

		if r.status_code != 200:
			print(
				f"❌ Failed to fetch page {page_number}. Status: {r.status_code} - {r.reason}"
			)
			continue  # or return 0
		print(r.status_code)

		# check if rate-limit was exceeded.
		if r.status_code == 429:
			print(
				"Rate-limit exceeded. Wait for a while and try running scraper again.",
				f"Stopped at page {page_number}."
			)
			print("Process exiting with error.")
			return 0

		# if "data" in response text is empty, the max page has probably been reached.
		data = json.loads(r.text)
		if len(data["data"]) == 0:
			print(
				f"Max page for this query has probably been reached. Ending scraping process."
			)
			return 0

		# Save artwork in database
		for user in data["data"]:
			for page_number in range(1, 2):
				url2 = f"https://www.artstation.com/users/{user['username']}/projects.json?page={page_number}"
				r2 = client.get(url2, headers=headers)

				if "application/json" not in r2.headers.get("Content-Type", ""):
					print(f"Something went wrong at {user['username']} – not JSON")
					return 1
				
				artwork_data = json.loads(r2.text)
				for artwork in artwork_data["data"]:
					print(artwork)
					preview_url = artwork["cover"]["small_square_url"]
					permalink = artwork["permalink"]
					cursor.execute(
						"""
						INSERT INTO artworks (permalink, preview_url) 
						VALUES (%s, %s)
						ON CONFLICT (permalink) DO NOTHING
					""", 
					(permalink, preview_url)
					)
					num_artworks += 1

					# take a short pause per url scraped
					pause_report(short_pause, random_delay, num_artworks)

					# take a long pause if 10 urls have been scraped
					if num_artworks > 0 and num_artworks % 10 == 0:
						pause_report(long_pause, random_delay, num_artworks)

	# process end summary
	print(f"Number of artworks saved: num_artworks")
	print(f"Process finished running.")
	return 1

In [40]:
# download raw json data
get_data(2, 400, short_pause=5, long_pause=30)

Scraping https://www.artstation.com/api/v2/search/users.json?page=2&per_page=30 200: OK
200
{'id': 20321083, 'user_id': 12195, 'title': '359', 'description': '', 'created_at': '2025-04-17T07:04:40.213-05:00', 'updated_at': '2025-05-04T08:20:28.452-05:00', 'likes_count': 524, 'slug': '359-7', 'published_at': '2025-04-17T07:04:40.209-05:00', 'adult_content': False, 'cover_asset_id': 87101807, 'admin_adult_content': False, 'tag_list': None, 'hash_id': 'gR3W3e', 'permalink': 'https://www.artstation.com/artwork/gR3W3e', 'hide_as_adult': False, 'cover': {'id': 87101807, 'small_square_url': 'https://cdnb.artstation.com/p/assets/images/images/087/101/807/small_square/su-jian-asset.jpg?1744891470', 'micro_square_image_url': 'https://cdnb.artstation.com/p/assets/images/images/087/101/807/micro_square/su-jian-asset.jpg?1744891470', 'thumb_url': 'https://cdnb.artstation.com/p/assets/images/images/087/101/807/smaller_square/su-jian-asset.jpg?1744891470'}, 'icons': {'image': False, 'video': False, '

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))