# WEB SCRAPING FOR EMLAKJET.COM
In this project, it's aimed to scrape the property pricing data from emlakjet.com for Antalya/Turkey and to make EDA on data

In [1]:
# importing packages
import pandas as pd
from bs4 import BeautifulSoup
import requests
import cloudscraper
import scrapethat

In [2]:
# creating helper functionss to read the links and extract the text
def read_html(link):
    response = requests.get(link)
    return(BeautifulSoup(response.text, 'html'))

def get_texts(link_nodes):
    return([x.text for x in link_nodes])

In [3]:
# creating function for filtering the informations we need according the their HTML tag 
# using try and except for error handling of the function
def get_one_ship(link):
    try:
        t = read_html(link)

        # extracting the ad key number 
        elements_ad_number = t.select('._1bVOdb:nth-child(1)') # SelectorGadget was used for finding HTML tag
        key_ad_number = get_texts(elements_ad_number)[0] if elements_ad_number else None

        # extracting the ad value number according to HTML tag
        ad_number_value_elements = t.select('._1bVOdb:nth-child(1) + ._1bVOdb') # SelectorGadget was used for finding HTML tag
        ad_number_value = get_texts(ad_number_value_elements)[0] if ad_number_value_elements else None

        # extracting the price according the HTML tag(since the price is not in the ad key and ad value elements)
        price_elements = t.select('._2TxNQv')
        price_info = ''.join(price_elements[0].stripped_strings) if price_elements else None
        # 'join' concatenates strings into a single string.
        # 'stripped_strings' retrieves all the strings according to the specified HTML tag

        # extracting the location(since the location is not in the ad key and ad value elements)
        ad_location_elements = t.select('#harita .UD9vJq + div ._3VQ1JB p') # SelectorGadget was used for finding HTML tag
        ad_location = get_texts(ad_location_elements)[0] if ad_location_elements else None

        # extracting the other key-value pairs
        keys = get_texts(t.select('._1bVOdb:nth-child(1)'))# SelectorGadget was used for finding HTML tag
        values = get_texts(t.select('._1bVOdb:nth-child(1) + ._1bVOdb')) # SelectorGadget was used for finding HTML tag

        # constructing the dictionary for key-value pairs
        data = {
            key_ad_number: ad_number_value,
            'price': price_info,
            'location': ad_location
        }

        # adding the other key-value pairs to the dictionary
        data.update(zip(keys, values))

        return data

    except Exception as e:
        print(f"Error processing link {link}: {e}")
        return None


In [4]:
# extracting the all pages we want:15 pages
all_page_links = ['https://www.emlakjet.com/satilik-konut/antalya/']

links2 = [f"https://www.emlakjet.com/satilik-konut/antalya/{k}/" for k in range(2, 15)]

all_page_links.extend(links2)

all_page_links

['https://www.emlakjet.com/satilik-konut/antalya/',
 'https://www.emlakjet.com/satilik-konut/antalya/2/',
 'https://www.emlakjet.com/satilik-konut/antalya/3/',
 'https://www.emlakjet.com/satilik-konut/antalya/4/',
 'https://www.emlakjet.com/satilik-konut/antalya/5/',
 'https://www.emlakjet.com/satilik-konut/antalya/6/',
 'https://www.emlakjet.com/satilik-konut/antalya/7/',
 'https://www.emlakjet.com/satilik-konut/antalya/8/',
 'https://www.emlakjet.com/satilik-konut/antalya/9/',
 'https://www.emlakjet.com/satilik-konut/antalya/10/',
 'https://www.emlakjet.com/satilik-konut/antalya/11/',
 'https://www.emlakjet.com/satilik-konut/antalya/12/',
 'https://www.emlakjet.com/satilik-konut/antalya/13/',
 'https://www.emlakjet.com/satilik-konut/antalya/14/']

In [5]:
# creating empy list to put the all property links later
all_property_links = []

# iterating through each page 
for page_link in all_page_links:
    # retrieving HTML content 
    response = requests.get(page_link)
    soup = BeautifulSoup(response.text, 'html.parser')

    # finding the all property links within the specified class 
    property_links = [f'https://www.emlakjet.com{x.find("a")["href"]}' for x in soup.select('._3qUI9q') if x.find("a")]

    # adding property links to the empty list(all property links)
    all_property_links.extend(property_links)

# printing all property links 
print(all_property_links)

['https://www.emlakjet.com/ilan/yeniemek-mah-de-21-genis-ferah-daire-14426602/', 'https://www.emlakjet.com/ilan/fiyat-dustu-kuzeyyaka-mah-de-satilik-arakat-daire-14426397/', 'https://www.emlakjet.com/ilan/belek-yolu-file-market-uzerinde-satilik-31-daire-14426356/', 'https://www.emlakjet.com/ilan/sirinyali-emsalsiz-fiyatta-deniz-manzarali-31-mimar-yapimi-14418580/', 'https://www.emlakjet.com/ilan/teomanpasa-da-satilik-sifir-yuksek-giris-11-daire-14416439/', 'https://www.emlakjet.com/ilan/avsallarin-merkezinde-21-satilik-daire-14411871/', 'https://www.emlakjet.com/ilan/sarilar-camlikta-satilik-uygun-11-14411683/', 'https://www.emlakjet.com/ilan/kizilarik-ta-satilik-muhtesem-proje-de-21-daireler-14411185/', 'https://www.emlakjet.com/ilan/hurma-mahallesinde-luks-site-icerisinde-31-dubleks-14419289/', 'https://www.emlakjet.com/ilan/harika-tasarim-proje-den-satilik-11-ve-21-daireler-14408460/', 'https://www.emlakjet.com/ilan/serik-te-kacirilmayacak-firsata-0-daireler-14408218/', 'https://www

In [6]:
# creating a dataframe to get the information of the all properties
# using map function to use get_one_ship function for all_property_links
data_list = list(map(get_one_ship, all_property_links))
data_list = [item for item in data_list if item is not None]  # filterig out None values
df = pd.DataFrame(data_list)

# printing the DataFrame
df

Unnamed: 0,İlan Numarası,price,location,İlan Güncelleme Tarihi,Kategorisi,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,Krediye Uygunluk,...,Görüntülü Gezilebilir mi?,Aidat,Banyo Metrekare,Balkon Metrekare,WC Metrekare,Salon Metrekare,Ada,Parsel,Pafta,Balkon Tipi Fransız Balkon
0,14426602,"2,200,000TL",Antalya - Kepez - Yeni Emek Mahallesi,20 Aralık 2023,Satılık,100 M2,2+1,3.Kat,Klimalı,Krediye Uygun,...,,,,,,,,,,
1,14426397,"2,200,000TL",Antalya - Kepez - Kuzeyyaka Mahallesi,20 Aralık 2023,Satılık,95 M2,2+1,2.Kat,Klimalı,Krediye Uygun,...,,,,,,,,,,
2,14426356,"4,250,000TL",Antalya - Serik - Orta Mahallesi,20 Aralık 2023,Satılık,120 M2,3+1,1.Kat,Kombi Doğalgaz,Krediye Uygun,...,,,,,,,,,,
3,14418580,"8,750,000TL",Antalya - Muratpaşa - Şirinyalı Mahallesi,20 Aralık 2023,Satılık,150 M2,3+1,7.Kat,Kombi Doğalgaz,Krediye Uygun,...,Evet,,,,,,,,,
4,14416439,"1,600,000TL",Antalya - Kepez - Teomanpaşa Mahallesi,18 Aralık 2023,Satılık,60 M2,1+1,Yüksek Giriş,Klimalı,Krediye Uygun Değil,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,14423560,"1,999,999TL",Antalya - Kepez - Karşıyaka Mahallesi,20 Aralık 2023,Satılık,85 M2,2+1,3.Kat,Klimalı,Krediye Uygun,...,,,,,,,,,,
416,14423559,"1,725,000TL",Antalya - Muratpaşa - Yüksekalan Mahallesi,20 Aralık 2023,Satılık,100 M2,2+1,Yüksek Giriş,Isıtma Yok,Krediye Uygun,...,,,,,,,,,,
417,14423554,"2,000,000TL",Antalya - Kepez - Güneş Mahallesi,20 Aralık 2023,Satılık,95 M2,2+1,2.Kat,Klimalı,Krediye Uygun,...,,,,,,,,,,
418,14423552,"1,840,000TL",Antalya - Kepez - Gazi Mahallesi,20 Aralık 2023,Satılık,90 M2,2+1,Yüksek Giriş,Isıtma Yok,Krediye Uygun,...,,,,,,,,,,


In [7]:
# creating a csv file to save our results
df.to_csv('raw_data.csv', index=False)