In [None]:
from pathlib import Path
import pandas as pd
import plotly.express as px
from sklearn.metrics.pairwise import haversine_distances
import numpy as np
from datetime import datetime

# 1. Loading the data

In [None]:
# Root folder for the input files, relative to the working directory.
DATA_PATH = Path("data")
# Sub-folder containing the weather extract read by the loading cell.
WEATHER_DATA_PATH = DATA_PATH.joinpath("RR59")

In [None]:
# Read the semicolon-separated extract; the AAAAMMJJ column is parsed into
# datetimes at load time so it can be used for date arithmetic and indexing.
weather_data = pd.read_csv(
    WEATHER_DATA_PATH.joinpath("Q_59_previous-1950-2022_RR-T-Vent.csv"),
    delimiter=";",
    parse_dates=["AAAAMMJJ"],
)

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts).
weather_data.info()

In [None]:
# Preview the first five rows of the raw data.
weather_data.head(5)

# 2. Exploratory data analysis

## 2.a Compute weather data distribution

In [None]:
# Descriptive statistics for the columns that `describe()` covers by default.
weather_data.describe()

## 2.b Identify extreme values for each station

In [None]:
# One row per station name: first and last observation date, RR extremes,
# mean coordinates, and the number of days between first and last observation.
station_info = (
    weather_data.groupby("NOM_USUEL", as_index=False)
    .agg(
        START_DATE=("AAAAMMJJ", "min"),
        END_DATE=("AAAAMMJJ", "max"),
        MAXRR=("RR", "max"),
        MINRR=("RR", "min"),
        LAT=("LAT", "mean"),
        LON=("LON", "mean"),
    )
    .assign(
        DURATION=lambda stations: (
            stations["END_DATE"] - stations["START_DATE"]
        ).dt.days
    )
)

In [None]:
# Map of the stations: colour encodes the maximum RR recorded at the station,
# marker size the number of days between its first and last observation.
# The figure is bound to `station_map` (not `map`) so that the builtin `map()`
# remains usable in every later cell of the notebook.
station_map = px.scatter_mapbox(
    station_info,
    lat="LAT",
    lon="LON",
    hover_data=["MAXRR", "MINRR", "NOM_USUEL"],
    color="MAXRR",
    color_continuous_scale="Jet",
    size="DURATION",
)
# Last expression of the cell: returns the figure so the notebook renders it.
station_map.update_layout(mapbox_style="carto-positron", height=1200)

## 2.c Display RR distribution for each station

In [None]:
# Descriptive statistics of RR computed separately for each station name.
weather_data.groupby("NOM_USUEL")["RR"].describe()

In [None]:
# Overlaid per-station histograms of the strictly positive RR values,
# normalised to probabilities and drawn on a logarithmic x axis.
weather_data.loc[weather_data["RR"].gt(0)].plot(
    backend="plotly",
    kind="hist",
    x="RR",
    color="NOM_USUEL",
    histnorm="probability",
    barmode="overlay",
    log_x=True,
)

# 3. Explore correlation between stations

## 3.a Align data on a common date index

In [None]:
# Wide table: one row per date, one column per station name, cells hold RR.
# Repeated (date, station) pairs are averaged, which is pivot_table's default.
weather_ts = weather_data.pivot_table(
    values="RR",
    index="AAAAMMJJ",
    columns="NOM_USUEL",
    aggfunc="mean",
)

In [None]:
# Zoom on a short window (1 to 30 January 2000, both bounds included) to
# compare the station series visually.
weather_ts.loc[
    pd.Timestamp(year=2000, month=1, day=1) : pd.Timestamp(year=2000, month=1, day=30)
].plot(backend="plotly")

## 3.b Compute pairwise correlation between time series

In [None]:
# Pearson correlation between every pair of station columns; missing values
# are excluded pair by pair.
correlation_matrix = weather_ts.corr(method="pearson")

In [None]:
# Heatmap of the station-to-station correlation matrix.
px.imshow(
    correlation_matrix,
    color_continuous_scale="Jet",
    width=1800,
    height=1200,
)

In [None]:
# Rank-based (Spearman) correlation between every pair of station columns.
spearman_correlation_matrix = weather_ts.corr(method="spearman")

In [None]:
# Heatmap of the Spearman correlation matrix.
px.imshow(
    spearman_correlation_matrix,
    color_continuous_scale="Jet",
    width=1800,
    height=1200,
)

## 3.c Compare correlation to distance between stations

In [None]:
# Mean Earth radius, used to convert great-circle angles (radians) to kilometres.
EARTH_RADIUS_KM = 6_371

# haversine_distances expects [latitude, longitude] pairs expressed in radians.
coords_in_radian = np.radians(station_info[["LAT", "LON"]].to_numpy())

# Square matrix of great-circle angles between every pair of stations.
distance_in_radians = haversine_distances(coords_in_radian)

# Label rows and columns with the station names so the matrix can be aligned
# with other station-by-station frames.
distance_in_km = pd.DataFrame(
    data=distance_in_radians * EARTH_RADIUS_KM,
    index=pd.Index(station_info["NOM_USUEL"]),
    columns=pd.Index(station_info["NOM_USUEL"]),
)

In [None]:
# Heatmap of the station-to-station distances in kilometres.
px.imshow(
    distance_in_km,
    color_continuous_scale="Jet",
    width=1800,
    height=1200,
)

In [None]:
def _to_station_pairs(matrix: pd.DataFrame, value_name: str) -> pd.DataFrame:
    """Reshape a square station-by-station frame into one row per station pair.

    The result has a single column named ``value_name`` and a two-level index:
    the frame's original index plus a ``NOM_USUEL_2`` level built from its
    column labels.
    """
    long_form = matrix.melt(
        ignore_index=False, var_name="NOM_USUEL_2", value_name=value_name
    )
    return long_form.set_index("NOM_USUEL_2", append=True)


correlation_serie = _to_station_pairs(correlation_matrix, "Correlation")

distance_serie = _to_station_pairs(distance_in_km, "Distance")

In [None]:
# Put correlation and distance side by side for every station pair; the two
# frames are aligned on their shared (station, station) index.
correlation_vs_distance = pd.concat(
    [correlation_serie, distance_serie],
    axis="columns",
)

# Scatter of correlation against distance; the index levels are turned back
# into columns so both station names are available on hover.
correlation_vs_distance.reset_index().plot(
    kind="scatter",
    x="Distance",
    y="Correlation",
    hover_data=["NOM_USUEL", "NOM_USUEL_2"],
    backend="plotly",
)

In [None]:
station_of_interest = "DUNKERQUE"

# Attach each station's correlation with the station of interest by station
# NAME. Plotly Express pairs array-like arguments with the rows of the data
# frame by position and requires equal lengths, so passing the raw correlation
# Series would silently depend on both objects listing the same stations in
# the same order (and would fail as soon as a station is missing from the
# correlation matrix). Stations without a correlation value get NaN.
station_correlation = station_info.assign(
    CORRELATION=station_info["NOM_USUEL"].map(
        correlation_matrix[station_of_interest]
    )
)

# Map of the stations: colour encodes the correlation with the station of
# interest, marker size the length of the station's observation period.
correlation_map = px.scatter_mapbox(
    station_correlation,
    lat="LAT",
    lon="LON",
    color="CORRELATION",
    hover_data=["NOM_USUEL"],
    color_continuous_scale="Jet",
    size="DURATION",
)

# Last expression of the cell: returns the figure so the notebook renders it.
correlation_map.update_layout(mapbox_style="carto-positron", height=1200)