# Cars Data Collection through Web Scraping

Website: www.kolesa.kz

Data:
- Link to the ad page
- Car brand /e.g. Hyundai, Toyota, BMW/
- Car model /e.g. Sonata, Camry, 528/
- Year of production
- Fuel type /e.g. petrol, diesel, hybrid/
- Engine volume in litres /float/
- Mileage: total number of km driven
- *Price*

In [1]:
!pip install requests --upgrade --quiet

In [2]:
!pip install beautifulsoup4 --upgrade --quiet

In [72]:
import requests
from bs4 import BeautifulSoup
import html
import time
from collections import defaultdict

import numpy as np
import pandas as pd


In [16]:
def get_response(url):
    """Download *url* and return it parsed with BeautifulSoup.

    Returns the parsed page on success and ``None`` when the request fails,
    so callers must check for ``None`` before using the result.

    Any ``requests`` failure is handled, not only refused connections: a read
    timeout (possible because of ``timeout=30``) or an HTTP error status would
    otherwise either abort the whole crawl or hand an error page to the
    parsers as if it were a real listing/ad page.
    """
    try:
        response = requests.get(url, timeout=30)
        # Treat 4xx/5xx answers as failures instead of parsing the error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        if isinstance(err, requests.exceptions.ConnectionError):
            print("Connection refused by the server..")
        else:
            print("Request failed:", url, err)
        # Short pause before the caller issues its next request.
        time.sleep(5)
        return None
    return BeautifulSoup(response.text, 'html.parser')

In [5]:
def scrape_cars(page):
    """Collect ad titles and absolute ad links from one listing page.

    Every ``<h5 class="a-card__title">`` element contributes its stripped text
    to the first list and the ``href`` of its ``<a>`` child, prefixed with the
    site root, to the second list.

    Returns a ``(titles, links)`` tuple of two equally long lists.
    """
    site_root = 'https://kolesa.kz'
    title_tags = page.find_all('h5', {'class': 'a-card__title'})
    titles = [title.text.strip() for title in title_tags]
    ad_links = [site_root + str(title.find('a')['href']) for title in title_tags]
    return titles, ad_links

In [6]:
def get_next_page(page):
    """Return the absolute URL behind the page's "next page" arrow.

    Looks for the ``<a class="right-arrow next_page">`` element and prefixes
    its ``href`` with the site root. Returns ``None`` when no such element is
    found.
    """
    arrow = page.find('a', {'class': 'right-arrow next_page'})
    if not arrow:
        return None
    return 'https://kolesa.kz' + str(arrow['href'])

In [15]:
def scrape_cars_models(start_url="https://kolesa.kz/kz/cars/avtomobili-s-probegom",
                       max_pages=240):
    """Walk the paginated listing and collect ad titles and links per page.

    Parameters:
        start_url: URL of the first listing page.
        max_pages: how many pages to follow *after* the first one, so at most
            ``max_pages + 1`` pages are scraped.

    Returns:
        ``(cars_list, cars_links)``: two lists with one inner list per scraped
        page (titles and ad links respectively).

    The crawl stops early when a page has no "next" link or when a page cannot
    be fetched (``get_response`` returned ``None``). In the latter case
    everything collected so far is returned instead of raising.
    """
    cars_list = []
    cars_links = []
    next_url = start_url

    for _ in range(max_pages + 1):
        page = get_response(next_url)
        if page is None:
            # Without the page there is no "next" link to follow either.
            break

        cars, links = scrape_cars(page)
        cars_list.append(cars)
        cars_links.append(links)

        next_url = get_next_page(page)
        if not next_url:
            break

    return cars_list, cars_links

In [10]:
def scrape_car_params(car_page):
    """Extract the car attributes from one parsed ad page.

    Parameters:
        car_page: the parsed ad page, or ``None`` when the download failed.

    Returns:
        ``[brand, model, year, engine_vol, fuel_type, mileage, price]``.
        Values found on the page are strings. Missing values use the
        placeholders ``""`` (brand, model, year, fuel type) and ``-1`` (engine
        volume, mileage, price). For ``car_page is None`` the result is
        ``["", "", -1, -1, "", -1, -1]``.

    Every section of the page is optional: a missing title, parameter list or
    price block yields placeholders instead of an exception. Engine data and
    mileage are read independently, so one being absent no longer discards
    the other.
    """
    if car_page is None:
        return ["", "", -1, -1, "", -1, -1]

    # Labels as they appear in the Kazakh-language version of the site.
    engine_label = "Қозғалтқыш көлемі, л"
    mileage_label = "Жүрісі"

    # --- title: brand / model / year (all three or none) ---
    brand, model_name, year = "", "", ""
    title = car_page.find('h1', {'class': 'offer__title'})
    if title is not None:
        brand_tag = title.find('span', {'itemprop': 'brand'})
        model_tag = title.find('span', {'itemprop': 'name'})
        year_tag = title.find('span', {'class': 'year'})
        if brand_tag is not None and model_tag is not None and year_tag is not None:
            brand = brand_tag.text.strip()
            model_name = model_tag.text.strip()
            year = year_tag.text.strip()

    # --- parameter list: engine volume + fuel type, mileage ---
    engine_vol, fuel_type, mileage = -1, "", -1
    params = car_page.find('div', {'class': 'offer__parameters'})
    dt_tags = params.find_all('dt') if params is not None else []
    for dt in dt_tags:
        label = dt.text.strip()
        if label != engine_label and label != mileage_label:
            continue
        # The value lives in the <dd> that follows the label's parent element.
        dd = dt.parent.find_next('dd')
        if dd is None:
            continue
        if label == engine_label:
            # Text looks like "<volume> (<fuel>)"; [1:-1] drops the brackets.
            parts = dd.text.split()
            if parts:
                engine_vol = parts[0]
            if len(parts) > 1:
                fuel_type = parts[1][1:-1]
        else:
            # Drop the two-character unit suffix, then the digit-group spaces.
            mileage = "".join(dd.text.strip()[:-2].split())

    # --- price ---
    price = -1
    price_tag = car_page.find('div', {'class': 'offer__price'})
    if price_tag is not None:
        # Unescape the string first and split afterwards: html.unescape
        # expects a str, not the list produced by split().
        price = "".join(html.unescape(price_tag.text[:-2]).split())

    return [brand, model_name, year, engine_vol, fuel_type, mileage, price]

In [68]:
def scrape_cars_data(cars_links):
    """Visit every ad link and gather the car attributes column by column.

    Each link is downloaded with ``get_response`` and parsed with
    ``scrape_car_params``; the seven values it returns are distributed, in
    order, over the columns listed below.

    Returns a dict of equally long lists keyed by column name. The
    ``'Car Links'`` entry is the ``cars_links`` argument itself.
    """
    column_names = (
        'Brand',
        'Model',
        'Year',
        'Engine Volume',
        'Fuel Type',
        'Mileage',
        'Price',
    )
    columns = {name: [] for name in column_names}

    for link in cars_links:
        record = scrape_car_params(get_response(link))
        for position, name in enumerate(column_names):
            columns[name].append(record[position])

    cars_dict = {'Car Links': cars_links}
    cars_dict.update(columns)
    return cars_dict


In [12]:
def flatten(l):
    """Return a new list with the items of every sub-iterable of *l*, in order.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [28]:
# Crawl the listing pages; each element of the two results holds the ad titles
# (car_models_list) or ad links (cars_links) of one listing page.
car_models_list, cars_links = scrape_cars_models()

In [73]:
# Collapse the per-page link lists into one flat list and preview the first ten.
links = flatten(cars_links)
links[:10]

['https://kolesa.kz/kz/a/show/141995582',
 'https://kolesa.kz/kz/a/show/143313703',
 'https://kolesa.kz/kz/a/show/143274807',
 'https://kolesa.kz/kz/a/show/143235663',
 'https://kolesa.kz/kz/a/show/143236865',
 'https://kolesa.kz/kz/a/show/143236192',
 'https://kolesa.kz/kz/a/show/143189112',
 'https://kolesa.kz/kz/a/show/142921624',
 'https://kolesa.kz/kz/a/show/142874788',
 'https://kolesa.kz/kz/a/show/143771837']

In [37]:
# Scrape the ad details for one slice of 100 links.
# NOTE: despite its name, df_cars is the plain dict of column lists returned by
# scrape_cars_data, not a pandas DataFrame.
df_cars = scrape_cars_data(links[600:700])
# 4820 -- presumably the total number of collected links (TODO confirm).
# Earlier batching attempt, left disabled for reference:
# i = 100   (the original note read "= 100"; presumably the loop start)
# while i < 300:
#     df_cars.append(scrape_cars_data(links[i:(i+100)]))
#     time.sleep(120)
#     i = i + 100

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [80]:
# Accumulator for the per-batch result dicts; start it with the batch scraped
# into df_cars.
dict_lists = []
dict_lists.append(df_cars)

In [82]:
# Scrape links[700:2000] in batches of 100, appending one result dict per batch
# and pausing 180 seconds after each batch.
# NOTE(review): the pause is presumably there to avoid the server refusing
# connections -- confirm.
i = 700
while i < 2000:
    print("i: ", i)  # progress marker: start index of the current batch
    dict_lists.append(scrape_cars_data(links[i:(i+100)]))
    time.sleep(180)
    i = i + 100

i:  100
i:  200
i:  300
i:  400
i:  500
i:  600
Connection refused by the server..
Connection refused by the server..
Connection refused by the server..
Connection refused by the server..


KeyboardInterrupt: 

In [90]:
# Number of batch dicts collected so far.
len(dict_lists)

6

In [93]:
def merge_dicts(dict_lists):
    """Group the values of several dicts by key.

    For every key, the values found in the input dicts are appended, in input
    order, to one list. The values themselves are not flattened.

    Returns a ``defaultdict(list)`` mapping each key to that list.
    """
    merged = defaultdict(list)
    for mapping in dict_lists:
        for key in mapping:
            merged[key].append(mapping[key])
    return merged

In [111]:
# Merge the batch dicts column-wise, flatten each column's list of per-batch
# lists into one list, and build the final table.
dd = merge_dicts(dict_lists)
for column in dd:
    dd[column] = flatten(dd[column])

df = pd.DataFrame(dd)
df.head()

Unnamed: 0,Car Links,Brand,Model,Year,Engine Volume,Fuel Type,Mileage,Price
0,https://kolesa.kz/kz/a/show/141995582,Volkswagen,Polo,2015,1.6,бензин,130000,5500000
1,https://kolesa.kz/kz/a/show/143313703,Mercedes-Benz,Sprinter,1999,-1.0,,-1,3300000
2,https://kolesa.kz/kz/a/show/143274807,Toyota,Camry,2013,2.5,бензин,145000,9500000
3,https://kolesa.kz/kz/a/show/143235663,Mercedes-Benz,S 500,2006,-1.0,,-1,4990000
4,https://kolesa.kz/kz/a/show/143236865,Volkswagen,Caravelle,2005,2.5,дизель,220000,5800000


In [113]:
# Write the table as CSV to a file named 'Cars' (no extension). With pandas'
# default index=True the row index is written as an extra, unnamed first column.
df.to_csv('Cars')