In [34]:
""" Search flat results on wg-gesucht.de """
from __future__ import unicode_literals, absolute_import, generators, \
    print_function

from lxml import html
from urllib import urlencode
import requests
from datetime import datetime, timedelta
import time
import random
import locale
import pandas as pd

locale.setlocale(locale.LC_TIME, str('de_DE.UTF-8')) # You can view a list of available locales on your machine with the locale -a command.

base_url = 'http://www.wg-gesucht.de/wohnungen-in-Berlin.8.2.0.{}.html?'

def search(category, rent_type, minSize, maxPrice, minRooms, maxRooms, exc, balcony, pets, furnished):
        """ Search using a get request including flat details.
        :param category: type of flat, limited to Wohnung for now
        :param rent_type: type of contract
        :param minSize: minimum size in quare meters
        :param maxPrice: max rent price
        :param minRooms: minimum number of rooms
        :param maxRooms: max number of rooms
        :param exc: exchange flat y/n (optional)
        :param balcony: balcony y/n (optional)
        :param pets: pets allowed (optional)
        :param furnished: furnished y/n (optional)
        """
        params = {'offer_filter': 1, 'city_id': 8,
                  'category': category, 'rent_type': rent_type, 
                  'sMin': minSize, 'rMax': maxPrice,
                  'rmMin': minRooms, 'rmMax': maxRooms,
                  'exc': exc, 'bal': balcony, 
                 'pet': pets, 'fur': furnished}
        
        session = requests.Session()
        response = session.get('{}{}'.format(base_url.format('0'), urlencode(params)))        
        print(response.url)
        results = parse_response(response)[0]

        max_page = has_more_pages(response)
        if max_page:
            stop = False
            for i in range(1,max_page):
                time.sleep(random.randint(1,3))
                if stop:
                    break
                r = session.get(base_url.format(i))
                print(r.url)
                next_pages, stop = parse_response(r)
                results.extend(next_pages)
        return results
    
def has_more_pages(response):
    page = html.fromstring(response.content)
    try:
        max_page = int(page.xpath('//a[@class="a-pagination"][last()]')[-1].text.replace('\n', '').strip())
    except Exception as IndexError:
        print('Only one page results')
        max_page=False
    return max_page

def grab_xpath_text(element, xpath):
    """ Given an element and xpath pattern, return text content.
    :param element: lxml element
    :param xpath: string
    returns string
    """
    data = element.xpath(xpath)
    if len(data) == 1:
        return data[0].text
    elif len(data) > 1:
        return [x.text for x in data]
    return ''
    
def parse_response(response):
    """ Given a requests response object, return a list of dictionaries
    containing the pertinent flat info.
    :params response: response obj
    returns list of dictionaries
    """
    stop = False
    page = html.fromstring(response.content)
    results = page.xpath('//table/tbody/tr')
    active_inactive = [grab_xpath_text(res, 'td[contains(@class, "datum")]/a/span').replace('\n', '').strip() not in 'inaktiv' for res in results]
    final_results = []
    for is_active, res in zip(active_inactive[2:], results[2:]):
        if not is_active:
            stop = True
            break 
        item_dict = {}
        item_dict['rooms'] = grab_xpath_text(
            res, 'td[contains(@class, "zimmer")]/a/span').replace('\n', '').strip()
        if not item_dict['rooms']:
            item_dict['rooms'] = '1'
        item_dict['Free from'] = grab_xpath_text(
            res, 'td[contains(@class, "freiab")]/a/span').replace('\n', '').strip()
        item_dict['Free until'] = grab_xpath_text(
            res, 'td[contains(@class, "freibis")]/a/span').replace('\n', '').strip()
        item_dict['Rent price'] = grab_xpath_text(
            res, 'td[contains(@class, "miete")]/a/span/b').replace('\n', '').strip()
        item_dict['Size'] = grab_xpath_text(
            res, 'td[contains(@class, "groesse")]/a/span').replace('\n', '').strip()
        item_dict['District'] = grab_xpath_text(
            res, 'td[contains(@class, "stadt")]/a/span').replace('\n', '').strip()
        item_dict['Link'] = 'http://www.wg-gesucht.de/'+res.get('adid')
        final_results.append(item_dict)
    return final_results, stop

def prepare_wg_data(results):
    """ Prepare wg-gesucht results in a dataframe so they can be easily compared. """
    wg_df = pd.DataFrame(results)
    wg_df['search_engine'] = 'wg-gesucht.de'
    return wg_df

def main(category, rent_type, minSize, maxPrice, minRooms, maxRooms, exc, balcony, pets, furnished):
    """ Call search for each of the flat search engines."""
    wg_results = search(category, rent_type, minSize, maxPrice, minRooms, maxRooms, exc, balcony, pets, furnished)
    final_df = prepare_wg_data(wg_results)
    final_df.to_csv('Flat_search_results.csv', sep=str('\t'), encoding='utf-8')
    return final_df

In [35]:
if __name__ == '__main__':
    category = raw_input('what type of flat are you searching for? (enter 2 for Wohnung) ') or '2'
    rent_type = raw_input('where type of contract? (0 for Egal, 1 for limited, 2 for unlimited, 3 for daily rent) ') or "0"
    minSize = raw_input('min square meters? ')
    maxPrice = raw_input('max rent in Euro? ')
    minRooms = raw_input('min number of rooms? Optional(>=2)') or "2"
    maxRooms = raw_input('max number of rooms? Optional') or "0"
    exc = raw_input('swap flat? Optional (Press Enter for Egal, 1 for Yes, 2 for No)') or "2"
    balcony = raw_input('with balcony? Optional(0 for no, 1 for yes)') or "0"
    pets = raw_input('pets allowed? Optional(Press Enter for Egal, 1 for Yes)') or "0"
    furnished = raw_input('furnished? Optional(Press Enter for Egal, 1 for Yes, 2 for No)') or "0"
    final = main(category, rent_type, minSize, maxPrice, minRooms, maxRooms, exc, balcony, pets, furnished)

what type of flat are you searching for? (enter 2 for Wohnung) 2
where type of contract? (0 for Egal, 1 for limited, 2 for unlimited, 3 for daily rent) 2
min square meters? 45
max rent in Euro? 500
min number of rooms? Optional(>=2)
max number of rooms? Optional
swap flat? Optional (Press Enter for Egal, 1 for Yes, 2 for No)2
with balcony? Optional(0 for no, 1 for yes)
pets allowed? Optional(Press Enter for Egal, 1 for Yes)
furnished? Optional(Press Enter for Egal, 1 for Yes, 2 for No)
http://www.wg-gesucht.de/wohnungen-in-Berlin.8.2.0.0.html?rmMax=0&city_id=8&rent_type=2&sMin=45&offer_filter=1&category=2&fur=0&exc=2&pet=0&rMax=500&bal=0&rmMin=2
http://www.wg-gesucht.de/wohnungen-in-Berlin.8.2.0.1.html


In [37]:
final = pd.read_csv('Flat_search_results.csv', sep='\t', index_col=0)
final

Unnamed: 0,District,Free from,Free until,Link,Rent price,Size,rooms,search_engine
0,Neukölln,20.03.17,28.03.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-N...,210€,65m²,2,wg-gesucht.de
1,Neukölln,28.03.17,04.04.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-N...,250€,46m²,2,wg-gesucht.de
2,Neukölln,01.04.17,11.06.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-N...,600€,55m²,2,wg-gesucht.de
3,Lichtenberg,16.04.17,09.06.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-L...,30€,65m²,2,wg-gesucht.de
4,Neukölln,06.04.17,20.04.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-N...,350€,64m²,2,wg-gesucht.de
5,Mitte,25.03.17,01.04.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-M...,400€,65m²,2,wg-gesucht.de
6,Neukölln,17.03.17,20.03.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-N...,90€,60m²,2,wg-gesucht.de
7,Prenzlauer Berg,16.04.17,05.05.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-P...,350€,60m²,2,wg-gesucht.de
8,Hellersdorf,01.05.17,31.10.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-H...,493€,62m²,2,wg-gesucht.de
9,Friedrichshain,04.04.17,16.04.17,http://www.wg-gesucht.de/wohnungen-in-Berlin-F...,400€,80m²,2,wg-gesucht.de
