# Data Mining. Leboncoin apartments.

###### Python 3

## Importing Libraries

##### Basic packages

In [2]:
import csv
import re
import math
import numpy as np
import pandas as pd
#import unidecode

##### Packages for web crawling

In [3]:
from lxml import html
from bs4 import BeautifulSoup
import urllib.request

##### Packages for browser automation (Selenium) and geocoding requests (Google Maps)

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

import googlemaps
from datetime import datetime

# Auxiliary functions

#### Crawling

In [5]:
def MakeLinksOfPages(soup, city, postal_code, filters = '&mrs=100&ret=1&ret=2'):
    """
    Build the URLs of every leboncoin result page for a city.

    The number of pages is read from the pagination anchors
    (``<a class="element page static">``) found in ``soup``: the value that
    follows the first ``=`` of each ``href`` is taken as a page number and the
    largest one is used as the last page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed first page of the search results.
    city : str
        City name inserted into the ``location`` query parameter.
    postal_code : str
        Postal code appended to the city in the ``location`` query parameter.
    filters : str, optional
        Extra query string appended to every URL, e.g.
        mrs : minimal price
        ret=1 : house
        ret=2 : apartment

    Returns
    -------
    pages : list of str
        One URL per result page, from page 1 to the last page found.
        The list is empty when ``soup`` holds no usable pagination anchor.
    """
    last_page = 0
    for anchor in soup.find_all("a", {"class": "element page static"}):
        href = anchor.get('href')
        if href is None:
            # anchor without a target: nothing to parse
            continue
        parts = re.split('=|&', href)
        try:
            page_number = int(parts[1])
        except (IndexError, ValueError):
            # the href does not start with a numeric parameter: not a page link
            continue
        last_page = max(last_page, page_number)

    # NOTE(review): a search without pagination anchors yields an empty list;
    # confirm whether a single-page search should still return page 1.
    base = 'https://www.leboncoin.fr/locations/offres/'
    location = '&location=' + city + '%20' + postal_code
    return [base + '?o=' + str(number) + location + filters
            for number in range(1, last_page + 1)]

In [6]:
def TakeAllLinksFromPage(page):
    """
    Collect the links to individual offers found on one leboncoin result page.

    An anchor is kept when its ``href`` contains ``www.leboncoin.fr/locations/``
    and the fifth ``/``-separated component of the ``href`` contains a dot.

    Parameters
    ----------
    page : str
        URL of the result page.

    Returns
    -------
    links : list of str
        Offer URLs. Hrefs that do not already start with ``http://`` or
        ``https://`` are prefixed with ``https:``.
    """
    # the context manager closes the HTTP response once the page is parsed
    with urllib.request.urlopen(page) as response:
        soup = BeautifulSoup(response, 'html.parser')

    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None or "www.leboncoin.fr/locations/" not in href:
            continue
        parts = href.split("/")
        # a short href has no fifth component; it cannot be an offer link
        if len(parts) <= 4 or "." not in parts[4]:
            continue
        if href.startswith(("http://", "https://")):
            # already absolute: adding a scheme would corrupt the URL
            links.append(href)
        else:
            links.append("https:" + href)
    return links

In [7]:
def CrawlThePage():
    """
    Store the fields of the current offer in the module-level ``result_dict``.

    The function takes no arguments; it reads these module-level names, which
    are set by the main loop:

    lsoup : parsed page of the current offer
    result_dict : dict mapping a field name to the list of collected values
    l : position of the current offer; once an offer is stored, every list of
        ``result_dict`` holds ``l + 1`` values

    Steps:
    1. Read the text of ``<p itemprop="description">``. If ``PaidMonthly``
       rejects it, nothing is stored and the function returns.
    2. In every ``<script type="text/javascript">`` containing
       ``var utag_data``, read the ``key: value`` lines and store the values
       whose key is a key of ``result_dict`` (spaces and double quotes are
       removed from the value).
    3. Append NaN to every list that received no value for this offer.

    Raises
    ------
    AttributeError
        If the page has no ``<p itemprop="description">`` element.
    """
    descrip = lsoup.find('p', {'itemprop': 'description'}).get_text(separator="\n")
    # NOTE(review): descrip is a single str while PaidMonthly documents a list
    # of lines as its input - confirm that the filter behaves as intended.
    if not PaidMonthly(descrip):
        return
    result_dict['description'].append(descrip)

    for script in lsoup.find_all('script', type="text/javascript"):
        script_text = str(script)
        if "var utag_data" not in script_text:
            continue
        for raw_line in script_text.split("\n"):
            # split on the first colon only, so that values containing ':' stay whole
            key, separator, value = raw_line.replace(" ", "").partition(":")
            if not separator or key not in result_dict:
                continue
            # store at most one value per field and offer to keep the lists aligned
            if len(result_dict[key]) <= l:
                result_dict[key].append(value.replace('"', ""))
                #.encode("utf-8")

    # pad the fields that were not found, also when no utag_data script exists
    for values in result_dict.values():
        if len(values) < l + 1:
            values.append(float('nan'))

#### Conversion of coordinates

In [8]:
def Geo2Mercator(latitude, longitude, zoomLevel=16):
    """Convert latitude and longitude to tile indices of a Mercator tile map.

    Both axes of the Mercator projection are normalised to [0, 1] and then
    scaled to the index range ``0 .. 2**zoomLevel - 1`` and rounded.

    Parameters
    ----------
    latitude : float
        Latitude in degrees.
    longitude : float
        Longitude in degrees.
    zoomLevel : int, optional
        A scale level. By default, it is equal to 16 that is a default zoom for opening the here.com map.

    Returns
    -------
    row : int
        Row's number given a zoomLevel in the map of tiles.
    column : int
        Column's number given a zoomLevel in the map of tiles.

    Notes
    -----
    The result is ordered (row, column), whereas ``Mercator2Geo`` takes
    (column, row).
    """
    pi = math.pi

    # projected coordinates: x is the longitude in radians, y the Mercator ordinate
    x_projected = math.radians(longitude)
    y_projected = math.log(math.tan(math.pi/4 + math.radians(latitude)/2))

    # normalise to [0, 1]; rows grow southwards, hence the minus sign
    x_unit = (1 + (x_projected / pi)) / 2
    y_unit = (1 - (y_projected / pi)) / 2

    last_index = 2 ** zoomLevel - 1
    column_number = round(x_unit * last_index)
    row_number = round(y_unit * last_index)
    return (row_number, column_number)

In [9]:
def Mercator2Geo(column, row, zoomLevel=16, degree = True):
    """Convert tile indices of a Mercator tile map to latitude and longitude.

    The indices are divided by ``2**zoomLevel - 1`` to obtain normalised map
    coordinates, which are then mapped back through the inverse Mercator
    projection.

    Parameters
    ----------
    column : int
        Column's number given a zoomLevel in the map of tiles.
    row : int
        Row's number given a zoomLevel in the map of tiles.
    zoomLevel : int, optional
        A scale level. By default, it is equal to 16 that is a default zoom for opening the here.com map.
    degree: bool, optional
        It indicates whether output should be returned in degrees.  By default, it is True.

    Returns
    -------
    phi : float
        Latitude. If degree == True, then in degrees. Otherwise, in radians.
    lmbda : float
        Longitude. If degree == True, then in degrees. Otherwise, in radians.

    Notes
    -----
    The arguments are ordered (column, row), whereas ``Geo2Mercator`` returns
    (row, column).
    """
    pi = math.pi
    last_index = 2**zoomLevel-1

    # inverse Mercator: latitude from the vertical index, longitude from the horizontal one
    latitude_rad = 2*np.arctan( np.exp(pi - (row)/last_index*2*pi ) ) - pi/2
    longitude_rad = (column/last_index)*2*pi-pi

    if not degree:
        return (latitude_rad, longitude_rad)
    return (180*latitude_rad/pi, 180*longitude_rad/pi)

#### Retrieve coordinates

In [10]:
def GetCoordinates(link, path):
    """Open an offer in Firefox, open its map and derive the offer's coordinates.

    The function starts a Firefox WebDriver, loads ``link``, clicks on the
    map's button, reads the Mercator tile indices from the URLs of the map
    images, converts them into latitude/longitude and averages them.
    The browser is always shut down before the function returns.

    Parameters
    ----------
    link : str
        A link of a place on Leboncoin.fr
    path : str
        Location of the geckodriver executable handed to ``webdriver.Firefox``.
        Note from the original author: it works well with Geckodriver; with
        chromedriver the button may not be clickable in its center point.

    Returns
    -------
    out : numpy.ndarray
        Latitude and Longitude in degrees. ``[0, 0]`` when the page cannot be
        loaded, the map cannot be opened, no map tile is found or the result
        lies outside the area accepted by ``InTheCity``.
    """
    driver = webdriver.Firefox(executable_path = path)
    try:
        #set timeout of page load
        driver.set_page_load_timeout(5)
        try:
            driver.get(link)
        except TimeoutException:
            # stop the pending load and retry once; the page is not reloaded
            # when the first attempt succeeded
            driver.execute_script("window.stop();")
            try:
                driver.get(link)
            except TimeoutException:
                return np.array([0,0])

        #search in the html code for a button that opens a map with the location
        city_lines = driver.find_elements(By.XPATH, "//div[contains(@class, 'line_city')]")
        if not city_lines:
            return np.array([0,0])
        descendants = city_lines[0].find_elements(By.XPATH, ".//*")
        buttons = [elem for elem in descendants if elem.tag_name == 'button']
        #there are two objects of a class button
        #basically, they have same properties, but only second one is clickable
        if len(buttons) < 2:
            return np.array([0,0])
        buttons[1].click()

        #after the map is open, new code in the html is appeared
        containers = driver.find_elements(By.XPATH, "//div[@class='leaflet-tile-container']")
        if not containers:
            return np.array([0,0])
        # NOTE(review): "//img" selects the images of the whole document, not
        # only those below the container; the URL filter below narrows them down
        images = containers[0].find_elements(By.XPATH, "//img")

        #we need only images that have a structure http://1.base.maps.api.here.com/...
        pattern = 'http://1.base.maps.api.here.com'
        refs = []
        for image in images:
            src = image.get_attribute('src')
            if src is not None and pattern in src:
                refs.append(src)
        if not refs:
            # no map tile found: averaging an empty list would give NaN
            return np.array([0,0])

        #all links are like:
        #http://1.base.maps.api.here.com/maptile/2.1/maptile/newest/normal.day/16/X/Y/256/png8...
        #so, we retrieve X and Y
        mercators = [[int(part) for part in ref.split("/")[-4:-2]] for ref in refs]
        #convert to latitude and longitude
        geos = [Mercator2Geo(tile[0], tile[1]) for tile in mercators]
        coordinates = np.array(geos).mean(axis=0)
        # if not in the city, we set to 0
        if (not InTheCity(coordinates[0], coordinates[1])):
            coordinates[0] = 0
            coordinates[1] = 0
        return coordinates
    finally:
        # quit() ends the whole WebDriver session, also when an error occurred
        driver.quit()

#### Different checks

In [11]:
def InTheCity(latitude, longitude, bottom = 45.1221, top = 45.2678, left = 5.6724, right = 5.9254):
    """Tell whether a point lies strictly inside a rectangular city frame.

    By default, the frame is the one chosen for Grenoble, i.e. a territory between:
    Sarcenas(North)
    Le Pont-de-Claix(South)
    Engins(West)
    Revel(East)
    Points located exactly on the border of the frame are outside.

    Parameters
    ----------
    latitude : float
        Latitude of a place in degrees.
    longitude : float
        Longitude of a place in degrees.
    bottom : float
        A lower bound for the latitude of the place.
    top : float
        An upper bound for the latitude of the place.
    left : float
        A lower bound for the longitude of the place.
    right : float
        An upper bound for the longitude of the place.

    Returns
    -------
    out : bool
        True when the point is inside the frame, False otherwise.
    """
    # latitude band first: a point outside it is rejected straight away
    if not ((latitude > bottom) and (latitude < top)):
        return False
    return bool((longitude > left) and (longitude < right))

In [12]:
def PaidMonthly(Description):
    """Tell whether a description announces a monthly price.

    The price is considered weekly/nightly as soon as one element of
    ``Description`` contains one of the markers "tarif semaine", "la nuitée",
    "la nuit" or "la semaine". The comparison is case-sensitive.

    Parameters
    ----------
    Description : list
        A list of strings. Each element is a line from the Description.
        The argument is iterated element by element, so a bare string is
        examined one character at a time and never contains a marker.

    Returns
    -------
    out : bool
        True, when it is paid monthly,
        False, otherwise.
    """
    not_monthly_paid = {"tarif semaine", "la nuitée", "la nuit", "la semaine"}
    # every (line, marker) pair is evaluated, as a plain double loop would do
    hits = [marker in line for line in Description for marker in not_monthly_paid]
    return not any(hits)

In [13]:
def OneSearch(regex, string):
    """
    Return the text matched by regex in string, or '' when there is no match.

    A failed search yields None, whose missing ``group`` attribute raises the
    AttributeError that is turned into the empty string here.
    """
    try:
        found = regex.search(string)
        matched_text = found.group()
    except AttributeError:
        matched_text = ''
    return matched_text

In [14]:
def ContainsGrenoble(x):
    """
    Make sure that the query x mentions grenoble.

    When 'grenoble' does not occur in x, 'grenoble ' is put in front of it;
    otherwise x is returned unchanged. The check is case-sensitive.
    """
    return x if 'grenoble' in x else 'grenoble ' + x

In [15]:
def SearchInDescription(description):
    """
    Look for a street address in an offer's description and geocode it.

    Using some primitive regular expressions, it collects the parts of the
    description that can be the address of the offer, completes them with
    "grenoble" and sends them one by one to the Google geocoding API until
    one query returns a result.
    Attention: to use it, the google maps client ``gmaps`` should be
    initialized (as it is done in the main part of the code).

    Parameters
    ----------
    description : str
        Text of the offer; lines are separated by a newline character.

    Returns
    -------
    out : numpy.ndarray
        Latitude and longitude of the first geocoded address, or ``[0, 0]``
        when no address is spotted or no query returns a result.
    """
    # lower-case lines with runs of whitespace collapsed to one space
    lines = [' '.join(line.split()) for line in description.lower().split('\n')]
    dictionary = [' rue ', ' avenue ', ' place ', ' cours ']

    #from 1 to 4 digits, then a word from dictionary, after any symbols, finally, pattern is ended by "grenoble"
    myregex = [re.compile('[0-9]{1,4}'+ dic +'(.*)grenoble') for dic in dictionary]
    #some other options
    myregex += [re.compile('[0-9]{1,4}'+ dic +'(.*),') for dic in dictionary]
    myregex += [re.compile('[0-9]{1,4}'+ dic +'(.*)') for dic in dictionary]
    myregex += [re.compile(dic +'(.*)grenoble') for dic in dictionary]

    #look for substrings that correspond to one of the patterns
    candidates = []
    for line in lines:
        for pattern in myregex:
            found = pattern.search(line)
            if found is not None and found.group():
                candidates.append(found.group())

    # several patterns usually match the same text: keep each query once (in
    # order of appearance) so that the geocoding API is not asked twice
    queries = []
    for candidate in candidates:
        query = ContainsGrenoble(candidate)
        if query not in queries:
            queries.append(query)

    #if we spotted possible addresses => send queries to the google api
    #if google returns some coordinates => return them
    #if it fails, return zeros
    for query in queries:
        geocode_result = gmaps.geocode(query)
        if geocode_result:
            location = geocode_result[0]['geometry']['location']
            return np.array([location['lat'], location['lng']])
    return np.array([0,0])

## Main code

#### Find links

In [28]:
# Input: parallel lists; cities[k] goes together with postal_codes[k]

cities = ["Grenoble", "Grenoble"]

postal_codes = ["38000", "38100"]

filters = '&mrs=100&ret=1&ret=2'
# mrs is a minimal cost
# ret = 1: house
# ret = 2: apartment

# because of selenium, all together takes lots of time
# so, data was mined city by city, page by page
# i selects the (city, postal code) pair processed by this run
i = 0

name = 'https://www.leboncoin.fr/locations/offres/?th=1&location=' + cities[i] + '%20' + postal_codes[i] + filters
# the context manager closes the HTTP response once its body is read
with urllib.request.urlopen(name) as response:
    url = response.read()
soup = BeautifulSoup(url, 'html.parser')
# hand over the same filters as in the first request, so that the result pages
# stay consistent with it when filters is edited above
pages = MakeLinksOfPages(soup, cities[i], postal_codes[i], filters)

In [60]:
# Alternative kept for reference: collect the offers of all result pages at once
#links = []
#for page in pages:
#    links += TakeAllLinksFromPage(page)
# Only one result page is processed per run; num is its index in pages
# (pages[0] is the first result page) and is also written into the name of
# the output file.
num = 6
links = TakeAllLinksFromPage(pages[num])

#### Mining

In [61]:
# field name -> list of values; every list holds one entry per stored offer
result_dict={'region':[],'oas_region':[],'departement':[], 'oas_departement':[], 'cp':[],'city':[],\
             'titre':[], 'publish_date':[],'last_update_date':[],'loyer':[],'surface':[],'pieces':[],\
             'type':[],'meuble':[],'description':[], 'latitude':[], 'longitude':[]}

#insert below your webdriver's location
path = "D:/Drivers/geckodriver/geckodriver.exe"
#insert below your key for use of Google API Geocoding
gmaps = googlemaps.Client(key='???')

# CrawlThePage reads the module-level names lsoup, l and result_dict.
# l is the position of the offer being processed in the lists of result_dict,
# i.e. the number of offers stored so far.
l = 0
for link in links:
    # the context manager closes the HTTP response once the page is parsed
    with urllib.request.urlopen(link) as content:
        lsoup=BeautifulSoup(content, "html.parser")
    CrawlThePage()
    if len(result_dict['description']) <= l:
        # the offer was rejected by CrawlThePage: nothing to complete
        continue
    # make sure every field has an entry for this offer before indexing with l
    for values in result_dict.values():
        if len(values) <= l:
            values.append(float('nan'))

    coordinates = SearchInDescription(result_dict['description'][l])
    result_dict['description'][l] = result_dict['description'][l].replace('\n','. ')
    # zero values of lon. and lat. means that they were not found
    # if both coordinates are 0, their sum is zero
    if sum(coordinates) == 0:
        # trying to find by Selenium
        coordinates = GetCoordinates(link, path)
    # finally, we check again, whether coordinates were found
    if sum(coordinates) != 0:
        result_dict['latitude'][l] = coordinates[0]
        result_dict['longitude'][l] = coordinates[1]
    l += 1

df = pd.DataFrame.from_dict(result_dict)

In [64]:
import os

# one file per city, postal code and result page
name = "data/grenoble_additional_data/Data_from_leboncoin_"+cities[i]+"_"+postal_codes[i]+ "_page_"+ str(num) +".csv"
# create the target folder first, so that the mined data is not lost because
# of a missing directory
os.makedirs(os.path.dirname(name), exist_ok=True)
df.to_csv(name, sep=';', encoding="utf-8")