# Setting up environment

First, we import libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import requests as rq
from bs4 import BeautifulSoup as bfs
from datetime import datetime
import re
import locale

We set the locale to fr_CH so that dates (in particular month names) are formatted in French in the later processing steps.

In [None]:
# We set the fr_CH locale so that locale-aware formatting (e.g. the month
# names produced by strftime('%B')) follows Swiss-French conventions.
# The locale is spelled differently depending on the platform, so every known
# spelling is tried in turn until one is accepted.
for locale_name, platform_label in (('fr_CH.utf8', 'linux'), ('fr_CH.UTF-8', 'Mac')):
    try:
        locale.setlocale(locale.LC_ALL, locale_name)
    except locale.Error:
        # Only "unsupported locale" failures are expected here; any other
        # exception (KeyboardInterrupt, typos, ...) must not be swallowed.
        continue
    print('locale for ' + platform_label)
    break
else:
    # None of the candidates was accepted: the process keeps its current locale.
    print('Unable to set fr_CH.utf8 or fr_CH.UTF-8 locale. Currency will not be correct.')

We define columns of weather data.

In [None]:
# Names (with units) of the columns of a cleaned weather reading, in the order
# in which they are assigned to the scraped table.
READING_COLUMNS = [
    'Hour',
    'Temperature (°C)',
    'Rain (mm/1h)',
    'Humidity (%)',
    'Wind (average) (km/h)',
    'Pressure (hPa)',
    'Visibility (km)',
]

The following functions are used to retrieve weather data for a given location and date.

In [None]:
def get_distance_between_locations(x_lat, x_long, y_lat, y_long):
    '''
    This function returns the great-circle distance, in kilometres, between
    two locations A and B, computed with the haversine formula on a sphere
    of radius 6371 km.

    More information:
    https://en.wikipedia.org/wiki/As_the_crow_flies
    https://en.wikipedia.org/wiki/Geodesic

    Parameters:
    x_lat: latitude of location A (degrees)
    x_long: longitude of location A (degrees)
    y_lat: latitude of location B (degrees)
    y_long: longitude of location B (degrees)

    Returns:
    The distance between A and B in kilometres (float).
    '''

    earth_radius_km = 6371
    half_d_lat = math.radians(y_lat - x_lat) / 2
    half_d_long = math.radians(y_long - x_long) / 2
    a = (
        math.sin(half_d_lat) * math.sin(half_d_lat) +
        math.cos(math.radians(x_lat)) * math.cos(math.radians(y_lat)) *
        math.sin(half_d_long) * math.sin(half_d_long)
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] (typically
    # for near-antipodal points), which would make sqrt(1 - a) raise a
    # ValueError. Clamping keeps the result well defined.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return earth_radius_km * c

In [None]:
def retrieve_nearest_weather_sensor(latitude, longitude):
    '''
    This function looks up, in the global `stations` DataFrame, the sensor
    that is closest to a given location.

    Parameters:
    latitude: latitude of the location
    longitude: longitude of the location

    Returns:
    A dict with keys 'station' (index label of the station), 'url' (its 'URL'
    value) and 'distance_from_location' (distance as computed by
    get_distance_between_locations). When no station is closer than infinity
    (e.g. `stations` is empty), the placeholders 'N/A' / inf are returned.
    '''

    nearest = {'station': 'N/A', 'url': 'N/A', 'distance_from_location': float("inf")}
    for station_name, station_row in stations.iterrows():
        candidate_distance = get_distance_between_locations(
            latitude, longitude, station_row['Latitude'], station_row['Longitude']
        )
        # Keep the current best unless this station is strictly closer
        # (ties therefore go to the station encountered first).
        if not candidate_distance < nearest['distance_from_location']:
            continue
        nearest.update(
            station=station_name,
            url=station_row['URL'],
            distance_from_location=candidate_distance,
        )
    return nearest

In [None]:
def retrieve_weather_of_location_by_date(latitude, longitude, date, timeout=30):
    '''
    This function downloads the archived weather page of the sensor nearest
    to a location, for a given date.

    Parameters:
    latitude: latitude of the location
    longitude: longitude of the location
    date: date to be considered for weather (object providing strftime)
    timeout: maximum number of seconds to wait for the server (passed to
             requests); prevents the call from hanging indefinitely

    Returns:
    The HTTP response returned by requests for the archive URL.
    '''

    station = retrieve_nearest_weather_sensor(latitude, longitude)
    # The month name produced by %B depends on the current locale.
    formatted_date = date.strftime('%d/%B/%Y')
    # The archive URL is derived from the station's real-time URL.
    archive_url = station['url'].replace('temps-reel', 'archives/' + formatted_date)
    # NOTE(review): the archive site presumably writes the first day of the
    # month as '1er' -- confirm against the site.
    archive_url = archive_url.replace('/01/', '/1er/')
    # Remaining single-digit days lose their leading zero.
    archive_url = remove_zero(archive_url)
    return rq.get(archive_url, timeout=timeout)

In [None]:
def format_complete_reading(page):
    '''
    This function builds a DataFrame from the HTML table with id
    'tableau-releves' found in a page.

    Parameter:
    page: response object whose `text` attribute holds the HTML of the page

    Returns:
    A DataFrame with one row per <tr> of the table (cells are the text of the
    <td> elements; a <tr> without <td> yields an empty row) and whose column
    names are the text of the table's <th> elements.
    '''

    soup = bfs(page.text, 'html.parser')
    readings_table = soup.find(id='tableau-releves')

    headers = [header.text for header in readings_table.find_all('th')]
    rows = [
        [cell.text for cell in table_row.find_all('td')]
        for table_row in readings_table.find_all('tr')
    ]

    frame = pd.DataFrame(rows)
    frame.columns = headers
    return frame

In [None]:
def format_visibility(visibility):
    '''
    This function converts a visibility reading into a number of kilometres.

    The leading number of the reading is extracted. When the text does not
    contain 'km', the number is divided by 1000 (the value is presumably
    expressed in metres -- confirm against the data source).

    Parameter:
    visibility: visibility for a given date and a given location; any value
                is accepted and converted to its string form first

    Returns:
    The visibility as a float, or None when the reading does not start with
    a number.
    '''

    # Coerce once so that non-string values (e.g. NaN) do not raise.
    text = str(visibility)
    divisor = 1 if 'km' in text else 10**3
    number = re.match(r'(\d*\.?\d+)', text)
    if number is None:
        return None
    return float(number.group(1)) / divisor

def format_values(row):
    '''
    This function removes text and units and keeps only float numbers.

    Every column of READING_COLUMNS except 'Visibility (km)' is replaced by
    the (optionally negative) number found at the start of its text, or by
    None when there is none. A truthy 'Visibility (km)' value is converted
    with format_visibility.

    Parameter:
    row: row to be formatted (represents an entry created by a sensor for a given location and hour)

    Returns:
    The same row object, modified in place.
    '''

    for column in READING_COLUMNS:
        if column == 'Visibility (km)':
            continue
        # The optional '-' keeps negative readings (e.g. sub-zero
        # temperatures) instead of turning them into None.
        number = re.match(r'(-?\d*\.?\d+)', str(row[column]))
        row[column] = float(number.group(1)) if number else None
    if row['Visibility (km)']:
        row['Visibility (km)'] = format_visibility(row['Visibility (km)'])
    return row

In [None]:
def clean_reading_dataframe(df):
    '''
    This function cleans a DataFrame containing weather data.

    The columns 'Temps', 'Biométéo' and 'Pt. de rosée' are dropped when
    present, the remaining columns are renamed to READING_COLUMNS (a
    'Rain (mm/1h)' column filled with NaN is added when the source has no
    'Pluie' column), rows that are entirely empty are removed and every
    value is converted to a number with format_values.

    Parameter:
    df: DataFrame to be cleaned (left unmodified)

    Returns:
    A new, cleaned DataFrame.
    '''

    # drop() returns a new frame, so the caller's DataFrame is never mutated
    # and the function can safely be called again on the same input.
    df_clean = df.drop(columns=['Temps', 'Biométéo', 'Pt. de rosée'], errors='ignore')

    if 'Pluie' in df_clean.columns:
        df_clean.columns = READING_COLUMNS
    else:
        # No rain column in the source: name the others and add an empty one.
        df_clean.columns = [column for column in READING_COLUMNS if column != 'Rain (mm/1h)']
        df_clean['Rain (mm/1h)'] = np.nan

    df_clean = df_clean.dropna(how='all')
    return df_clean.apply(format_values, axis=1)

In [None]:
def get_weather_info(reading):
    '''
    This function summarises a DataFrame of weather readings.

    Parameter:
    reading: DataFrame containing weather data (cleaned column names)

    Returns:
    A dict where 'temperature', 'wind' and 'pression' each map to a dict with
    'min', 'max' and 'avg', 'humidity' and 'visibility' hold the column mean,
    and 'rain' holds the column sum.
    '''

    summary = {}

    # Metrics reported with their minimum, maximum and average.
    for label, column in (('temperature', 'Temperature (°C)'),
                          ('wind', 'Wind (average) (km/h)'),
                          ('pression', 'Pressure (hPa)')):
        values = reading[column]
        summary[label] = {'min': values.min(), 'max': values.max(), 'avg': values.mean()}

    # Metrics reported as a single average.
    summary['humidity'] = reading['Humidity (%)'].mean()
    summary['visibility'] = reading['Visibility (km)'].mean()

    # Metrics reported as a total.
    summary['rain'] = reading['Rain (mm/1h)'].sum()

    return summary

In [None]:
def remove_zero(url_archived):
    '''
    Strip the leading zero from single-digit path segments of a URL, i.e.
    turn every '/0N/' (N from 1 to 9) into '/N/'.

    Parameter:
    url_archived: String

    Returns:
    The string with the replacements applied, digit by digit from 1 to 9.
    '''

    cleaned = url_archived
    for digit in '123456789':
        cleaned = cleaned.replace('/0' + digit + '/', '/' + digit + '/')
    return cleaned

# Example

We load data relative to stations.

In [None]:
# Columns read from the stations file; 'Station' becomes the index and the
# numeric columns are parsed as floats.
stations_data_columns = ['Station', 'Altitude (m)', 'Latitude', 'Longitude', 'URL']
stations = pd.read_csv(
    './Data/InformationStation.csv',
    index_col=['Station'],
    usecols=stations_data_columns,
    dtype={'Altitude (m)': 'float', 'Latitude': 'float', 'Longitude': 'float'},
)
stations.head()

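We retrieve the complete weather data for the location at latitude 46.915183, longitude 7.107277 on October 3rd, 2014. (Note: these are the coordinates and date used in the code below; they do not correspond to the <a href="https://en.wikipedia.org/wiki/Palace_of_Nations">Palace of Nations</a> in Geneva, nor to November 24th, 2016, as this text previously stated.)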

In [None]:
# Download the archive page, turn its table into a DataFrame, then clean it.
page = retrieve_weather_of_location_by_date(
    latitude=46.915183,
    longitude=7.107277,
    date=datetime(2014, 10, 3),
)
df = format_complete_reading(page)
clean_df = clean_reading_dataframe(df)
clean_df

We also display a summary of the weather data for the same location and date.

In [None]:
# Summarise the cleaned readings (min/max/avg, means and rain total).
weather_info = get_weather_info(reading=clean_df)
weather_info