In [1]:
### A crawler for DIVAR.IR using its POST API

In [13]:
import requests
import json
import math
from bs4 import BeautifulSoup as BS4
import csv

In [2]:
## Fetch City Lists and Categories
root_URL = 'https://divar.ir/s/tehran'
# A timeout keeps this cell from hanging forever if the site does not answer.
root_response = requests.get(root_URL, timeout=10)
# Fail here on an HTTP error instead of parsing an error page further down.
root_response.raise_for_status()

# Parse the landing page and collect every <script> tag; a later cell searches
# them for the one that carries window.__PRELOADED_STATE__.
parsed_root_reponse = BS4(root_response.text, 'html.parser')
preload_data = parsed_root_reponse.find_all('script')

In [3]:
## Locate the <script> tag that assigns window.__PRELOADED_STATE__
preload_indx = -1
for i , val in enumerate(preload_data):
    # val.string is None for tags without a single text child, so guard before using `in`.
    if len(val.contents) == 1 and val.string is not None and 'window.__PRELOADED_STATE__' in val.string:
        preload_indx = i

# Without this check an index of -1 would silently select the LAST script tag.
if preload_indx == -1:
    raise RuntimeError("No <script> tag containing window.__PRELOADED_STATE__ was found")

## find the first and last { }
preload_script = preload_data[preload_indx].string
start = preload_script.find('{')
end = preload_script.rfind('}')
# find/rfind return -1 when the brace is missing; slicing with -1 would yield garbage.
if start == -1 or end < start:
    raise RuntimeError("window.__PRELOADED_STATE__ script does not contain a JSON object")

# Everything between the outermost braces is parsed as the preloaded state JSON.
preload_js = json.loads(preload_script[start : end + 1])

In [4]:
def find_city(city_name):
    """Return the id of the city called `city_name`, or -1 when no city matches.

    Walks preload_js["multiCity"]["data"]["children"] (the states) and each
    state's "children" (the cities) in order; the first city whose "name"
    equals `city_name` wins.
    """
    states = preload_js["multiCity"]["data"]["children"]
    matching_ids = (
        city["id"]
        for state in states
        for city in state["children"]
        if city["name"] == city_name
    )
    # The generator is lazy, so the scan stops at the first match.
    return next(matching_ids, -1)

def find_category(category_name):
    """Return the slug of the category called `category_name`, or -1 if none matches.

    Only the entries of preload_js["search"]["rootCat"]["children"] and their
    direct "children" are inspected. Each top-level category is checked before
    its own children, and before the next top-level category.
    """
    def candidates():
        # Yield each top-level category followed by its direct sub-categories.
        for top_level in preload_js["search"]["rootCat"]["children"]:
            yield top_level
            yield from top_level["children"]

    matching_slugs = (
        node["slug"] for node in candidates() if node["name"] == category_name
    )
    return next(matching_slugs, -1)

In [5]:
## Search parameters: which cities, which category, what to search for, how many pages
list_of_city_names = ['تهران' , 'اصفهان', 'شیراز']
categorie = 'وسایل نقلیه'
query = "پژو ۲۰۶"
pages_to_load = 4

# Resolve every city name to its id and keep the ids as strings.
# find_city returns -1 for an unknown name, which shows up here as "-1".
city_ids = [str(find_city(city_name)) for city_name in list_of_city_names]

# find_category likewise returns -1 when the category name is unknown.
categorie_slug = find_category(categorie)

print(f'Search for: {query} in Cities: {list_of_city_names} and in the Category: {categorie}, retrive {pages_to_load} pages')

Search for: پژو ۲۰۶ in Cities: ['تهران', 'اصفهان', 'شیراز'] and in the Category: وسایل نقلیه, retrive 4 pages


In [6]:
## Create Payload
# Request body for the search endpoint: the selected cities, the pagination
# cursor (overwritten for every page below) and the search form (category,
# sort order and free-text query).
payload = {
    "city_ids": city_ids,
    "pagination_data":{"@type":"type.googleapis.com/post_list.PaginationData","page":0,"layer_page":0},
    "search_data":{"form_data":{"data":{"category":{"str":{"value":categorie_slug}},"sort":{"str":{"value":"sort_date"}}}}, "query" : query}
}

req_url = "https://api.divar.ir/v8/postlist/w/search"

# Collect the decoded JSON body of every successful page request.
responses = []
for i in range(1, pages_to_load + 1):

    payload["pagination_data"]["page"] = i
    payload["pagination_data"]["layer_page"] = i

    try:
        # A timeout keeps one stalled request from blocking the whole crawl.
        response = requests.post(req_url , data = json.dumps(payload), timeout = 10)
    except requests.RequestException as error:
        # Stay best-effort: report the failed page and move on to the next one.
        print(f'Page {i} could not be fetched: {error}')
        continue

    if response.ok:
        responses.append( response.json() )
    else:
        # Report the dropped page instead of skipping it silently.
        print(f'Page {i} failed with HTTP status {response.status_code}')

print(f'You got {len(responses)} OK responses')

You got 4 OK responses


In [7]:
## Clean Up the Data
# Flatten every POST_ROW widget of every response into one plain dict per ad.
collected_data = []
adver_url = "https://divar.ir/v/"

post_rows = (
    widget["data"]
    for resp in responses
    for widget in resp["list_widgets"]
    if widget["widget_type"] == "POST_ROW"
)

for post in post_rows:
    post_title = post["title"]
    action_payload = post["action"]["payload"]
    post_token = action_payload["token"]

    collected_data.append({
        "name"  : post_title,
        # Ad page URL: the title with spaces turned into dashes, then the token.
        "url"   : adver_url + post_title.replace(' ', '-') + "/" + post_token,
        "token" : post_token,
        # "-1" marks an ad that has no price / no top description line.
        "price" : post.get("middle_description_text", "-1"),
        "city"  : action_payload["web_info"]["city_persian"],
        "desc"  : post.get("top_description_text", "-1"),
    })

In [8]:
## For every collected ad, fetch its detail record and store the full description
url_head = "https://api.divar.ir/v8/posts-v2/web/"

for i , data in enumerate(collected_data):
    token = data["token"]

    # Start from the same "-1" placeholder the other optional fields use, so
    # every row ends up with a "full_desc" key even when the request fails.
    # Rows with differing key sets would break the CSV export.
    description = "-1"
    try:
        # A timeout keeps one stalled request from blocking the whole loop.
        response = requests.get(url_head + token, timeout = 10)
    except requests.RequestException as error:
        print(f'Details for {token} could not be fetched: {error}')
    else:
        if response.ok:
            colected = response.json()
            description = colected.get("seo", {}).get("description", "-1")
        else:
            print(f'Details for {token} failed with HTTP status {response.status_code}')

    collected_data[i]["full_desc"] = description

In [9]:
## convert prices to integers
def convert_str_to_int(string):
    """Convert a price text such as "400,000,000 تومان" into an integer.

    Parameters:
        string: the price text, the "-1" placeholder used for missing prices,
            or an integer that has already been converted.

    Returns:
        The leading number of the text as an int (thousands separators are
        ignored), the value itself when it is already an int, or -1 when the
        text does not start with a number.
    """
    # Already converted (e.g. the conversion cell was run twice): keep as is.
    if isinstance(string, int):
        return string
    if string == "-1":
        return -1

    # Drop ASCII and Arabic thousands separators, then read the leading run of
    # decimal digits. This does not depend on the currency suffix having a
    # fixed length.
    cleaned = string.strip().replace(',' , '').replace('٬' , '')
    digits = []
    for char in cleaned:
        if not char.isdecimal():
            break
        digits.append(char)

    # No leading number at all (non-numeric price text or an empty string).
    if not digits:
        return -1

    # int() accepts any Unicode decimal digits, so Persian digits work too.
    return int("".join(digits))

In [10]:
for data in collected_data:
    # Only convert prices that are still raw strings, so re-running this cell
    # does not feed already-converted integers back into convert_str_to_int.
    if isinstance(data["price"], str):
        data["price"] = convert_str_to_int(data["price"])

In [11]:
# Print each collected ad as "<field>: <tab> <value>" lines, followed by a separator line.
printed_fields = ("name", "url", "token", "price", "city", "desc")

for data in collected_data:
    for field in printed_fields:
        print(field + ":" , '\t' , data[field])
    print('___________________________________________________________________________________________\n')

name: 	 پژو 206 تیپ ۲، مدل ۱۳۹۵
url: 	 https://divar.ir/v/پژو-206-تیپ-۲،-مدل-۱۳۹۵/gZhykLpm
token: 	 gZhykLpm
price: 	 400000000
city: 	 اصفهان
desc: 	 ۸۰,۰۰۰ کیلومتر
___________________________________________________________________________________________

name: 	 پژو 206 تیپ ۲، مدل ۱۳۹۷
url: 	 https://divar.ir/v/پژو-206-تیپ-۲،-مدل-۱۳۹۷/gZhyEELx
token: 	 gZhyEELx
price: 	 455000000
city: 	 شیراز
desc: 	 ۱۰۳,۰۰۰ کیلومتر
___________________________________________________________________________________________

name: 	 پژو 206 تیپ ۵، مدل ۱۳۹۶
url: 	 https://divar.ir/v/پژو-206-تیپ-۵،-مدل-۱۳۹۶/gZhS0vZz
token: 	 gZhS0vZz
price: 	 485000000
city: 	 تهران
desc: 	 ۸۱,۰۰۰ کیلومتر
___________________________________________________________________________________________

name: 	 ۲۰۶تیپ ۲ اتاق تعویض۱۴۰۰
url: 	 https://divar.ir/v/۲۰۶تیپ-۲-اتاق-تعویض۱۴۰۰/gZXaXNAc
token: 	 gZXaXNAc
price: 	 339000000
city: 	 تهران
desc: 	 ۸۶,۰۰۰ کیلومتر
___________________________________________________________

In [16]:
## Write data to CSV file
if collected_data:
    # Use the union of all row keys (in first-seen order) as the header, so a
    # row with an extra or missing key cannot make DictWriter raise.
    fieldnames = list(dict.fromkeys(key for row in collected_data for key in row))

    # An explicit UTF-8 encoding keeps the Persian text writable no matter
    # what the platform's default encoding is.
    with open("divar_data.csv", "w", newline="", encoding="utf-8") as file:
        w = csv.DictWriter(file, fieldnames)
        w.writeheader()
        w.writerows(collected_data)
else:
    # Nothing was collected: leave any existing file untouched instead of
    # failing on collected_data[0].
    print("No data collected, divar_data.csv was not written")