In [2]:
# Standard library imports
import collections
import os
import random
import re
import string
import time
import urllib.request

# request import
import requests as rq

# BSoup imports
from bs4 import BeautifulSoup as BSoup

# Selenium imports
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

# Pandas imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Path of a station-information CSV file (not referenced by the cells visible here).
DATA_FILE = './DataMeteo/StationInformation.csv'
# Page that contains the station drop-down; fetched with requests and opened in Selenium.
FILTER_URL = 'http://www.infoclimat.fr/observations-meteo/temps-reel/lausanne/06710.html'
# Keys of the scraped result, which become the DataFrame / CSV column names.
# NOTE: the spelling of these values is part of the written CSV header, so
# changing a value here changes the output file.
LATITUDE = 'lattitude'
LONGITUDE = 'longitude'
ALTITUDE = 'altitude (m)'
CURRENT_ADRESS = 'current_adress'
STATION = 'stationCity'

Retrieve the names of all available stations from the station selection drop-down.

In [4]:
# Download the page that hosts the station drop-down. A timeout keeps the cell
# from hanging forever, and raise_for_status() turns an HTTP error page into an
# explicit failure instead of silently parsing it.
filters_page = rq.get(FILTER_URL, timeout=30) # Retrieving content of online form
filters_page.raise_for_status()
filters = BSoup(filters_page.text, 'html.parser')

selects = filters.find('select',{'id': 'select_station'}) # Filter to get all stations
if selects is None:
    raise ValueError("no <select id='select_station'> element found at " + FILTER_URL)

# Keep every option label except the placeholder entry of the drop-down.
# NOTE: labels are kept as-is, so stations listed several times on the page
# appear several times here as well.
availableStation = [
    option.text
    for option in selects.find_all('option')
    if option.text != 'Changer de station...'
]

print(availableStation)

['Adelboden', 'Aigle', 'Alpnach', 'Altdorf', 'Altenrhein', 'Ascona', 'Bâle-Binningen', 'Bern/Belp (Berne)', 'Berne Liebefeld', 'Buchs-Suhr', 'Buochs', 'Buochs', 'BUOCHS', 'Changins', 'Chasseral', 'Chur-Ems', 'Cimetta', 'Comprovasco', 'Corvatsch', 'Disentis', 'Dübendorf', 'Emmen', 'Ernen', 'Evolene-Villaz', 'Fahy-Boncourt', 'Fionnay', 'Gemmi', 'Genève-Cointrin', 'Giswil', 'Goesgen', 'Grand Saint-Bernhard', 'Grenchen', 'Guetsch', 'Hoernli', 'Interlaken', 'Jungfraujoch', 'La Cure', 'La Dole', 'La Fouly', 'La Fretaz', 'Laegern', 'Laufenburg', 'Lausanne', 'Lausanne - Pully', 'Leibstadt', 'Les Eplatures', 'Leysin', 'Locarno Monti', 'Locarno-Magadino', 'Lugano', 'Lugano/Agno', 'Meiringen', 'Mettlen', 'Moleson', 'Montana', 'Mühleberg', 'Napf', 'Neuchatel', 'Payerne', 'Payerne/Metar', 'Piotta', 'Plaffeien', 'Robbia', 'Ruenenberg', 'Saentis', 'Saint Gallen', 'Saint Gallen-Altenrhein', 'Saint-Triphon', 'Samedan', 'San Bernardino', 'Savatan', 'Sion', 'SION (MIL)', 'St. Moritz', 'Stabio', 'Ulrichen

In [5]:
def getLatitude(valueListElement):
    """Extract the latitude from the text of a station "coordinates" item.

    The text is expected to contain a number (decimal comma or point) that is
    followed by a degree sign and the letter ``N``. The degree sign may be
    preceded by a stray U+00C2 character, which is how a UTF-8 degree sign
    appears after being decoded with a single-byte encoding.

    Args:
        valueListElement (str): raw text containing the coordinates.

    Returns:
        str: the latitude with ``.`` as decimal separator, e.g. ``'46.30'``.

    Raises:
        ValueError: if no latitude can be found in the text.
    """
    # Match the value itself rather than relying on the exact number of tabs
    # around it, so changes in surrounding whitespace or in how the degree
    # sign was decoded do not break the extraction.
    match = re.search(r'(-?\d+(?:[.,]\d+)?)\s*\u00c2?\u00b0\s*N', valueListElement)
    if match is None:
        raise ValueError('no latitude found in: ' + repr(valueListElement))
    temp = match.group(1).replace(",", ".")
    print ('lattitude is: '+temp )
    return temp

In [6]:
def getAltitude(valueListElement):
    """Return the altitude token found in the text of a station item.

    The text is split from the right on single spaces (at most three splits)
    and the second piece of that split is printed and returned unchanged.

    Args:
        valueListElement (str): raw text of the altitude list item.

    Returns:
        str: the selected piece of the text.
    """
    pieces = valueListElement.rsplit(' ', 3)
    altitude = pieces[1]
    print('altitude is: ' + altitude)
    return altitude

In [7]:
def getLongitude(valueListElement):
    """Extract the longitude from the text of a station "coordinates" item.

    The text is expected to contain a number (decimal comma or point) that is
    followed by a degree sign and the letter ``E``. The degree sign may be
    preceded by a stray U+00C2 character, which is how a UTF-8 degree sign
    appears after being decoded with a single-byte encoding.

    Args:
        valueListElement (str): raw text containing the coordinates.

    Returns:
        str: the longitude with ``.`` as decimal separator, e.g. ``'7.34'``.

    Raises:
        ValueError: if no longitude can be found in the text.
    """
    # Match the value itself rather than relying on the exact number of tabs
    # around it, so changes in surrounding whitespace or in how the degree
    # sign was decoded do not break the extraction.
    match = re.search(r'(-?\d+(?:[.,]\d+)?)\s*\u00c2?\u00b0\s*E', valueListElement)
    if match is None:
        raise ValueError('no longitude found in: ' + repr(valueListElement))
    temp = match.group(1).replace(",", ".")
    print ('longitude is: '+ temp )
    return temp

In [8]:
# Manual check: run getLongitude on a sample of the coordinates text; the cell
# result is the parsed longitude string.
getLongitude('\nCoordonnÃ©es\n\t\t    46,30Â°N | 7,34Â°E \t\t')

longitude is: 7.34


'7.34'

In [9]:
def multiselect_set_selections(driver, element_id, labels, pause_seconds=5):
    """Select each station in the drop-down and collect its information.

    For every label, the option with that visible text is selected in the
    ``<select>`` identified by ``element_id``; the resulting page is parsed and
    the URL, latitude, longitude and altitude of the station are recorded.

    Args:
        driver: Selenium WebDriver already showing the page with the drop-down.
        element_id (str): ``id`` attribute of the ``<select>`` element.
        labels: iterable of visible option texts to select, in order.
        pause_seconds (float): pause after each station (default 5, the value
            previously hard-coded) to avoid triggering the site's captcha.

    Returns:
        pandas.DataFrame: one row per label with the columns CURRENT_ADRESS,
        LATITUDE, LONGITUDE, ALTITUDE and STATION.

    Raises:
        ValueError: if the station information list is missing or incomplete
            on the page obtained after a selection.
    """
    # Materialise once so the station column always matches the labels that
    # were actually scraped (and so generators can be passed in).
    labels = list(labels)
    result = collections.defaultdict(list)

    # Select with the id and loop in order to get all information about the station.
    # NOTE(review): labels are matched by visible text, so labels occurring
    # several times presumably resolve to the same option each time - confirm.
    for labelText in labels:

        # Selenium request.
        print ('request for : ' + labelText)
        # find_element('id', ...) is the locator form available in both old and
        # current Selenium releases (find_element_by_id was removed in v4).
        select = WebDriverWait(driver, 10).until(
            lambda drv: Select(drv.find_element('id', element_id)))
        select.select_by_visible_text(labelText)

        # BeautifulSoup request.
        # NOTE(review): the page source is handed over as UTF-8 bytes; the
        # coordinate parsers of this notebook handle the text produced this
        # way - confirm before passing the str directly.
        page = BSoup(driver.page_source.encode('utf-8'), 'html.parser')
        info_list = page.find('ul',{'id': 'station-informations'})
        if info_list is None:
            raise ValueError("no <ul id='station-informations'> found for station "
                             + repr(labelText) + ' at ' + driver.current_url)
        items = info_list.find_all('li')
        if len(items) < 2:
            raise ValueError('incomplete station information for station '
                             + repr(labelText) + ' at ' + driver.current_url)

        # Get the result: item 0 holds the altitude, item 1 the coordinates.
        coordinates_text = items[1].get_text()
        result[CURRENT_ADRESS].append(driver.current_url)
        result[LATITUDE].append(getLatitude(coordinates_text))
        result[LONGITUDE].append(getLongitude(coordinates_text))
        result[ALTITUDE].append(getAltitude(items[0].get_text()))

        # We have to wait to avoid the captcha.
        time.sleep(pause_seconds)

    # Use the labels that were scraped, not the global station list, so the
    # column has the same length and order as the other columns.
    result[STATION] = labels
    return pd.DataFrame(data=result)
        

In [10]:
def getInformationStation():
    """Scrape the information of every available station and save it as CSV.

    Starts Chrome through the chromedriver binary at ``./Utils/chromedriver``,
    opens FILTER_URL, runs multiselect_set_selections over ``availableStation``
    and writes the resulting table to ``example.csv``. The browser session is
    always shut down, including when scraping raises.
    """
    chromedriver = "./Utils/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    # NOTE(review): passing the driver path positionally matches older Selenium
    # releases; newer ones expect a Service object - confirm the installed version.
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(FILTER_URL)

        # get the result.
        result_screen_scraping = multiselect_set_selections(driver,'select_station',availableStation)
        result_screen_scraping.to_csv('example.csv')
    finally:
        # quit() ends the whole WebDriver session (browser and driver process);
        # close() would only close the current window.
        driver.quit()

In [11]:
# Run the full scrape: opens Chrome and pauses 5 seconds after every station,
# so this cell takes a while to complete.
getInformationStation()

request for : Adelboden
lattitude is: 46.30
longitude is: 7.34
altitude is: 1325
request for : Aigle
lattitude is: 46.33
longitude is: 6.92
altitude is: 383
request for : Alpnach
lattitude is: 45.93
longitude is: 8.28
altitude is: 445
request for : Altdorf
lattitude is: 46.87
longitude is: 8.63
altitude is: 449
request for : Altenrhein
lattitude is: 47.48
longitude is: 9.57
altitude is: 0
request for : Ascona
lattitude is: 46.15
longitude is: 8.76
altitude is: 196
request for : Bâle-Binningen
lattitude is: 47.55
longitude is: 7.58
altitude is: 316
request for : Bern/Belp (Berne)
lattitude is: 46.92
longitude is: 7.50
altitude is: 505
request for : Berne Liebefeld
lattitude is: 46.93
longitude is: 7.42
altitude is: 565
request for : Buchs-Suhr
lattitude is: 47.23
longitude is: 8.05
altitude is: 389
request for : Buochs
lattitude is: 46.98
longitude is: 8.38
altitude is: 0
request for : Buochs
lattitude is: 46.98
longitude is: 8.38
altitude is: 0
request for : BUOCHS
lattitude is: 46.98
