In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Sentinel returned in place of a response when downloading a page fails.
GET_ERROR = -1

# CSS classes of the table rows that hold one listing each.
CLASS_NAMES = ["odTr", "evendTr"]

# Tag of a listing row, of the price element inside it, and of its other cells.
PARENT_TAG, PRICE_TAG, OTHERS_TAG = "tr", "span", "td"

# Positions of each field within a row's list of OTHERS_TAG cells.
PHONE, TIME, STATE, CITY = 0, 3, 4, 5

In [3]:
# Lookup configuration handed to Scraper: which rows to select, which tags
# carry the values, and where each field sits among a row's cells.
address = dict(
    class_names=CLASS_NAMES,
    parent_tag=PARENT_TAG,
    price_tag=PRICE_TAG,
    others_tag=OTHERS_TAG,
    index=dict(
        phone=PHONE,
        time=TIME,
        state=STATE,
        city=CITY,
    ),
)

In [4]:
class Scraper:
    """Scrape a paginated listing table into a DataFrame and save it as CSV.

    Parameters
    ----------
    url : str
        Template URL containing a ``page=<number>`` query parameter; the
        number is replaced for every page that is fetched.
    pages : int
        Number of pages to fetch, starting at page 1.
    data_address : dict
        Lookup configuration with the keys ``class_names``, ``parent_tag``,
        ``price_tag``, ``others_tag`` and ``index`` (itself a dict with the
        keys ``phone``, ``time``, ``state`` and ``city``).
    file_name : str
        Path of the CSV file written by ``save_data``.
    """

    # Seconds to wait for the server before a page is treated as failed.
    # Without a timeout a single stalled connection blocks the run forever.
    REQUEST_TIMEOUT = 30

    def __init__(self , url , pages , data_address , file_name):
        self.url = url
        self.pages = pages
        self.address = data_address
        self.df = pd.DataFrame(columns=["phone_number" , "price" , "status" , "city" , "time"])
        self.file_name = file_name

    @staticmethod
    def extract_single_record(desc, address=None):
        """Turn one listing row into a record dict.

        Parameters
        ----------
        desc : element exposing ``find`` and ``find_all``
            The row element to read.
        address : dict, optional
            Lookup configuration (same layout as ``data_address``). When
            omitted, the module-level constants are used, which keeps the
            original one-argument call working.

        Returns
        -------
        dict
            Keys ``phone_number``, ``price``, ``status``, ``city``, ``time``,
            each holding the stripped text of the matching element.

        Raises
        ------
        AttributeError
            If the row has no price element.
        IndexError
            If the row has fewer cells than the configured indices require.
        """
        if address is None:
            price_tag, others_tag = PRICE_TAG, OTHERS_TAG
            phone_at, time_at, state_at, city_at = PHONE, TIME, STATE, CITY
        else:
            price_tag = address["price_tag"]
            others_tag = address["others_tag"]
            positions = address["index"]
            phone_at = positions["phone"]
            time_at = positions["time"]
            state_at = positions["state"]
            city_at = positions["city"]

        price = desc.find(price_tag).text.strip()
        cells = desc.find_all(others_tag)
        return {
            "phone_number": cells[phone_at].text.strip(),
            "price": price,
            "status": cells[state_at].text.strip(),
            "city": cells[city_at].text.strip(),
            "time": cells[time_at].text.strip(),
        }

    @staticmethod
    def get_page_contents(url):
        """Download ``url`` and return the response, or ``GET_ERROR`` on failure.

        Only request-level failures (connection errors, timeouts, HTTP error
        statuses) are converted into ``GET_ERROR``. Anything else, including
        ``KeyboardInterrupt``, propagates so the run can actually be stopped.
        """
        try:
            page = requests.get(url, timeout=Scraper.REQUEST_TIMEOUT)
            # An error page contains no listing rows; report it as a failure
            # instead of letting it pass as a successfully scraped page.
            page.raise_for_status()
            return page
        except requests.RequestException:
            return GET_ERROR

    def scrape_page(self , page):
        """Parse one downloaded page and append its records to ``self.df``.

        Rows are taken in document order. Rows that lack the price element or
        have too few cells are skipped so one malformed row cannot abort a
        long run.
        """
        soup = BeautifulSoup(page.text, "html.parser")
        wanted_classes = set(self.address["class_names"])

        rows = []
        for element in soup.find_all(self.address["parent_tag"]):
            # Keep only rows carrying one of the configured CSS classes.
            if not wanted_classes.intersection(element.get("class", [])):
                continue
            try:
                rows.append(Scraper.extract_single_record(element, self.address))
            except (AttributeError, IndexError):
                # Malformed row: missing price element or not enough cells.
                continue

        if not rows:
            return

        page_df = pd.DataFrame(rows, columns=self.df.columns)
        if self.df.empty:
            # Avoid concatenating onto an empty frame.
            self.df = page_df
        else:
            self.df = pd.concat([self.df , page_df] , axis=0 , ignore_index=True)

    def get_next_page_url(self , page_num):
        """Return ``self.url`` with its ``page=`` value replaced by ``page_num``.

        Raises
        ------
        ValueError
            If ``self.url`` contains no ``page=`` parameter.
        """
        marker = "page="
        marker_at = self.url.find(marker)
        if marker_at == -1:
            raise ValueError(f"URL has no '{marker}' parameter: {self.url!r}")

        start = marker_at + len(marker)
        # Skip every digit of the existing page number, however many there are.
        end = start
        while end < len(self.url) and self.url[end].isdigit():
            end += 1
        return self.url[:start] + str(page_num) + self.url[end:]

    def save_data(self):
        """Write the collected records to ``self.file_name`` as CSV (no index)."""
        self.df.to_csv(self.file_name , index=False)

    def run(self):
        """Fetch and scrape pages 1..``self.pages``, then save the results.

        Pages that fail to download are reported and skipped. The data
        gathered so far is saved even if the loop is interrupted or raises.
        """
        try:
            for page_number in range(1,self.pages+1):
                url = self.get_next_page_url(page_number)
                page = Scraper.get_page_contents(url)
                if page == GET_ERROR:
                    print(f'page {page_number} failed, skipped')
                    continue
                self.scrape_page(page)
                print(f'page {page_number} scraped')
        finally:
            self.save_data()
    

In [5]:
# Listing URL used as the template; the Scraper substitutes the value after "page=".
url = (
    "https://www.rond.ir/SearchSim"
    "?page=1&StateId=0&CityId=0&SimOrderBy=Update"
)

In [None]:
# Scrape 15800 listing pages and write the collected rows to a CSV file.
test = Scraper(
    url=url,
    pages=15800,
    data_address=address,
    file_name='rond.com_full.csv',
)
test.run()

page 1 scraped
page 2 scraped
page 3 scraped
page 4 scraped
page 5 scraped
page 6 scraped
page 7 scraped
page 8 scraped
page 9 scraped
page 10 scraped
page 11 scraped
page 12 scraped
page 13 scraped
page 14 scraped
page 15 scraped
page 16 scraped
page 17 scraped
page 18 scraped
page 19 scraped
page 20 scraped
page 21 scraped
page 22 scraped
page 23 scraped
page 24 scraped
page 25 scraped
page 26 scraped
page 27 scraped
page 28 scraped
page 29 scraped
page 30 scraped
page 31 scraped
page 32 scraped
page 33 scraped
page 34 scraped
page 35 scraped
page 36 scraped
page 37 scraped
page 38 scraped
page 39 scraped
page 40 scraped
page 41 scraped
page 42 scraped
page 43 scraped
page 44 scraped
page 45 scraped
page 46 scraped
page 47 scraped
page 48 scraped
page 49 scraped
page 50 scraped
page 51 scraped
page 52 scraped
page 53 scraped
page 54 scraped
page 55 scraped
page 56 scraped
page 57 scraped
page 58 scraped
page 59 scraped
page 60 scraped
page 61 scraped
page 62 scraped
page 63 scraped
p