In [None]:
# Web Scraping Hurriyet Emlak

# coding: utf-8

""" 
This code was prepared as a capstone project for the first level of the data science course given 
in Roermond, NL, between 20/10/2018 and 10/11/2018.

The goal of the project is to find the most profitable villas in İstanbul. 
For the purpose of the training, only the ads listed on hurriyetemlak.com on a specific date are used.
"""

# In this project the scraped results are not converted to string objects.
# If you would rather extract the features from raw text, convert the results with the str function.

""" 
First we import the libraries necessary for scraping data.
"""
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup

""" 
Second, create the global variables.
"""
# One independent accumulator list per scraped attribute of an ad.
list_id, title = [], []
price, date, area = [], [], []
owner, room, seller, adres = [], [], [], []


""" 
Third, define a for loop to scrape the data from the web. To limit the data size, 
the range is set to 80 pages only. This gives us about 4K villa ads in total. 
(One page contains 50 ads.)
"""

# NOTE: set the number of pages you want to scrape before running this cell.
N_PAGES = 80          # result pages to request (pages 1..N_PAGES)
PAGE_SIZE = 50        # ads requested per page
REQUEST_TIMEOUT = 30  # seconds; requests has no default timeout and could hang forever
LISTING_URL = "https://www.hurriyetemlak.com/konut-satilik/villa/listeleme"

# (HTML attribute read from each listing anchor, list that collects its value)
FIELDS = (
    ('data-price', price),
    ('data-date', date),
    ('data-meter', area),
    ('data-owner', owner),
    ('data-room', room),
    ('data-seller-type', seller),
    ('href', adres),
    ('title', title),
    ('data-listing-id', list_id),
)

for j in range(1, N_PAGES + 1):

    # The query string is passed through `params` instead of being written into
    # the URL literal: a backslash line continuation inside a string literal keeps
    # the next line's indentation, which put spaces in front of "pageSize".
    try:
        r = requests.get(
            LISTING_URL,
            params={"pageSize": PAGE_SIZE, "view": "catalog", "page": j},
            timeout=REQUEST_TIMEOUT,
        )
        r.raise_for_status()
    except requests.RequestException as err:
        # Best effort: one failed page must not throw away the pages already scraped.
        print("page {} skipped: {}".format(j, err))
        continue

    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all("a", attrs={'class': 'overlay-link'})

    for tag in results:
        # tag.get() returns None for a missing attribute, so every list grows by
        # exactly one entry per anchor and the columns stay aligned.
        for attribute, column in FIELDS:
            column.append(tag.get(attribute))

records = {
    "list_id": list_id,
    "title": title,
    "price": price,
    "date": date,
    "area-m2": area,
    "owner": owner,
    "room": room,
    "seller": seller,
    "adres": adres,
}

df = pd.DataFrame(records)

df.to_csv("hurriyet_raw_data_0.txt")


In [10]:

""" 
Now our data is ready for inspection. We need to get a general overview of our data first...

This part is done separately and is not part of the project code.
"""
import pandas as pd
df = pd.read_csv("hurriyet_raw_data_0.txt", index_col=0)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()  # price can be made numeric with pd.to_numeric once the '.' thousands
           # separators are removed; date can be converted with pd.to_datetime
print(df.keys())
print(df.head())  # adres should be corrected
print(df.tail())  # list_id should be discussed
# missing values should be discussed

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3076 entries, 0 to 3075
Data columns (total 9 columns):
list_id    1556 non-null object
title      3076 non-null object
price      1556 non-null object
date       1556 non-null object
area-m2    1556 non-null float64
owner      1556 non-null object
room       1556 non-null object
seller     3076 non-null object
adres      3076 non-null object
dtypes: float64(1), object(8)
memory usage: 240.3+ KB
None
Index(['list_id', 'title', 'price', 'date', 'area-m2', 'owner', 'room',
       'seller', 'adres'],
      dtype='object')
       list_id                                              title      price  \
0   0-32399967         Hadımköy Büyükçekmecede Satılık Lüks Villa  2.400.000   
1  49157-16438  142H ELFİ DEN OVAAKÇA CAMLIK VİLLALARINDA SATI...    550.000   
2      93505-0                             Doğam Gökova Villaları    495.000   
3    91638-334  Bodrum Gümüşlük\u0027te Doğa İçinde Satılık Sı...  1.200.000   
4    2152-6616  EGE EMLAK

In [None]:

""" 
Let's clean some data...
"""

import pandas as pd

# Fragments of a listing href that carry no location information.
# Each fragment is a single-line literal: a backslash line continuation inside a
# string literal would embed the next line's indentation and never match.
ADRES_NOISE = ("/konut-satilik/", "-emlakcidan-villa/detay", "-sahibinden-villa/detay")
LOCATION_COLUMNS = ["sehir", "ilce", "mahalle"]


def _location_parts(href):
    """Split a listing href into its first three dash-separated parts.

    The parts are stored as (sehir, ilce, mahalle). A tuple of three None values
    is returned when `href` is not a string, starts with "https" (the case the
    notebook labels "New project without adres info"), or yields fewer than
    three parts; such rows are removed by the dropna() below.
    """
    if not isinstance(href, str) or href[0:5] == "https":
        return (None, None, None)
    for noise in ADRES_NOISE:
        href = href.replace(noise, "")
    parts = href.replace("/", "-").split("-")
    if len(parts) < 3:
        return (None, None, None)
    return tuple(parts[:3])


df = pd.read_csv("hurriyet_raw_data_0.txt", index_col=0)
print(df.keys())

# Every row is processed; looping over range(0, df.index[-1]) skipped the last one.
locations = pd.DataFrame(
    [_location_parts(href) for href in df["adres"]],
    index=df.index,
    columns=LOCATION_COLUMNS,
)

df = (
    df.join(locations)
    .drop(["list_id", "adres"], axis=1)  # list_id is not needed; adres is now split into three columns
    .dropna()                            # remove every row with a missing value
    .reset_index(drop=True)              # dropna leaves gaps; save a contiguous 0..n-1 index
)
df.to_csv("hurriyet_raw_data_inorder_0.csv", index=True)


In [None]:

""" 
Let's clean some more... date and price values are prepared for exploitation...
"""
import pandas as pd
from datetime import datetime
def _parse_listing(raw_price, raw_date):
    """Convert one ad's raw price and date text.

    `raw_price` uses '.' as thousands separator (e.g. "2.400.000") and
    `raw_date` is formatted as day.month.year. Returns (int price, datetime).
    If either value cannot be converted, both are replaced by the placeholders
    (0, "None") so a row is never half-converted.
    """
    try:
        return (
            int(str(raw_price).replace(".", "")),
            datetime.strptime(raw_date, '%d.%m.%Y'),
        )
    except (TypeError, ValueError):
        # TypeError: value is not text (e.g. NaN); ValueError: text has the wrong format.
        return 0, "None"


# Read both columns as text: if every price had a single separator ("550.000"),
# type inference would load the column as float and stripping dots would corrupt it.
df = pd.read_csv(
    "hurriyet_raw_data_inorder_0.csv",
    index_col=0,
    dtype={"price": str, "date": str},
)

# Every row is converted; looping over range(0, df.index[-1]) skipped the last one.
parsed = [_parse_listing(p, d) for p, d in zip(df["price"], df["date"])]
df["price"] = pd.Series([p for p, _ in parsed], index=df.index, dtype="int64")
# object dtype because the column mixes datetime values with the "None" placeholder
df["date"] = pd.Series([d for _, d in parsed], index=df.index, dtype=object)

print(df.head())
df.to_csv("hurriyet_cleaned_data.csv", index=True)