In [67]:
import matplotlib.pyplot as plt
import numpy as np
import ptitprince as pt
import numpy as np
import seaborn as sns
import pandas as pd
import os

# Global figure styling: base font size for every figure, then the seaborn "ticks" theme.
plt.rcParams["font.size"] = 20
sns.set_style(style="ticks")


In [68]:
def read_water_gage_station(filename):
    """Load one station CSV into a DataFrame indexed by its ``time`` column.

    Parameters
    ----------
    filename
        Passed straight through as the first argument of ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with the ``time`` column as index. No date parsing is
        requested here, so the index holds whatever ``read_csv`` infers.
    """
    return pd.read_csv(filename, index_col="time")


# Load the Meteostat export of each station.
# NOTE(review): LNFT5 used to be read from "Meteostat_KBYY0.csv" (copy-paste slip),
# which made every LNFT5 plot a duplicate of KBYY0. The file name below follows the
# naming pattern of the other two stations - confirm it matches the file on disk.
MeteostatStationLNFT5 = read_water_gage_station("Meteostat_LNFT5.csv")
MeteostatStationKBYY0 = read_water_gage_station("Meteostat_KBYY0.csv")
MeteostatStationKLBX0 = read_water_gage_station("Meteostat_KLBX0.csv")


In [69]:
# Display label -> station DataFrame. The label is later used as the plot title
# and as the name of the per-station output folder.
datasets = {
    "Station LNFT5": MeteostatStationLNFT5,
    "Station KBYY0": MeteostatStationKBYY0,
    "Station KLBX0": MeteostatStationKLBX0,
}


In [70]:
# Columns to plot, identified by their display names *after* preprocessing
# (wind speed appears here in m/s, i.e. the name it carries once the
# preprocessing cell has converted and renamed the km/h column).
parameters = [
    "Air Temperature (°C)",
    "Dew Point (°C)",
    "Relative Humidity (%)",
    "Total Precipitation (mm)",
    "Wind (From) Direction (Degrees)",
    "Average Wind Speed (m/s)",
    "Sea-Level Air Pressure (hPa)",
]


In [71]:
# Month number (1-12) -> English month name; used to label the "Month" column.
month_map = dict(
    enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
)

# Month names in calendar order; defines the order of the "Month" categorical.
month_order = [month_map[number] for number in sorted(month_map)]

# Raw CSV column name -> human-readable display name (with unit) used in plots.
# Every display name listed here is also coerced to numeric by the
# preprocessing cell, so each raw column is expected to exist in the CSVs.
# "Average Wind Speed (km/h)" is only an intermediate name: the preprocessing
# cell converts that column and renames it to "Average Wind Speed (m/s)".
rename_columns = {
    "air_temperature": "Air Temperature (°C)",
    "dewpoint": "Dew Point (°C)",
    "relative_humidity": "Relative Humidity (%)",
    "total_precipitation": "Total Precipitation (mm)",
    "snow_depth": "Snow Depth (m)",
    "wind_direction": "Wind (From) Direction (Degrees)",
    "average_wind_speed": "Average Wind Speed (km/h)",
    "wind_peak_gust": "Wind Peak Gust (km/h)",
    "sea_level_air_pressure": "Sea-Level Air Pressure (hPa)",
    "total_sunshine_duration": "Total Sunshine Duration (Minutes)",
    "weather_condition_code": "Weather Condition Code",
}


In [72]:
# Preprocess every station frame: readable column names, numeric dtypes,
# datetime index, categorical Year/Month columns and wind speed in m/s.
for dataset_name, df in datasets.items():
    # The frames are modified in place on purpose: later cells still read the
    # original station variables (e.g. MeteostatStationKBYY0.describe()).
    # Because of that, running this cell a second time used to crash (the km/h
    # column was already renamed and "Year"/"Month" already inserted), so a
    # frame that already carries the converted wind-speed column is skipped.
    if "Average Wind Speed (m/s)" in df.columns:
        continue

    df.rename(columns=rename_columns, inplace=True)
    # Non-numeric entries become NaN instead of raising.
    for col in rename_columns.values():
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df.index = pd.to_datetime(df.index)
    df.insert(0, "Year", df.index.year)
    df.insert(1, "Month", df.index.month)
    df["Month"] = df["Month"].map(month_map)
    df["Month"] = pd.Categorical(df["Month"], categories=month_order, ordered=True)
    df["Year"] = pd.Categorical(df["Year"])

    # km/h -> m/s, then rename the column so its label matches the new unit.
    df["Average Wind Speed (km/h)"] = df["Average Wind Speed (km/h)"] * 1000 / 3600
    df.rename(
        columns={"Average Wind Speed (km/h)": "Average Wind Speed (m/s)"}, inplace=True
    )


In [73]:
def remove_outliers(df, column_name, factor=1.5):
    """Keep only the rows whose ``column_name`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    column_name : str
        Numeric column the fences are computed on.
    factor : float, default 1.5
        Multiple of the IQR used for the fence distance.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences. Rows with a missing value in
        ``column_name`` are dropped as well.

    Notes
    -----
    When Q1 == Q3 (IQR of zero) both fences collapse onto that single value
    and every row with a different value is removed.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # between() is inclusive on both ends and evaluates to False for NaN, so
    # missing values are filtered out together with the outliers.
    return df[values.between(lower_bound, upper_bound)]


In [74]:
def create_directory_for_plots(dataset_name):
    """Ensure the ``monthly`` and ``yearly`` plot folders of a dataset exist.

    The folders live under ``Results/Plots/<dataset_name>/`` relative to the
    current working directory; existing folders are left as they are.

    Returns
    -------
    tuple of str
        ``(monthly_directory, yearly_directory)``.
    """
    dataset_root = os.path.join("Results/Plots", dataset_name)
    directories = tuple(
        os.path.join(dataset_root, period) for period in ("monthly", "yearly")
    )
    for directory in directories:
        os.makedirs(directory, exist_ok=True)
    return directories


def _filename_safe_parameter(parameter):
    """Rewrite unit suffixes containing ``/`` so the name is usable as a file name."""
    return parameter.replace("(m/s)", "m_s").replace("(km/h)", "kmh")


def generate_filenames(dataset_name, parameter, monthly_directory, yearly_directory):
    """Build the output paths of the monthly and yearly raincloud plots.

    Parameters
    ----------
    dataset_name : str
        Not used when building the names; kept so existing callers keep working.
    parameter : str
        Display name of the plotted column. ``(m/s)`` becomes ``m_s`` and
        ``(km/h)`` becomes ``kmh`` so the unit's ``/`` cannot be read as a
        path separator.
    monthly_directory, yearly_directory : str
        Target folders; they are used as given (no replacement is applied to them).

    Returns
    -------
    tuple of str
        ``(monthly_filename, yearly_filename)``.
    """
    # The earlier version replaced "(m/s)" with "m" + U+0640 (Arabic tatweel) + "s",
    # a look-alike of the intended underscore, which also made the later
    # "(m/s)" -> "m_s" replacement unreachable.
    safe_parameter = _filename_safe_parameter(parameter)
    monthly_filename = os.path.join(
        monthly_directory, f"{safe_parameter}_monthly_raincloud_plot.png"
    )
    yearly_filename = os.path.join(
        yearly_directory, f"{safe_parameter}_yearly_raincloud_plot.png"
    )
    return monthly_filename, yearly_filename


In [75]:
def clean_data(data, numerical_variable):
    """Drop rows with a missing ``numerical_variable`` and then remove its outliers.

    Returns whatever ``remove_outliers`` returns for the NaN-free frame; the
    input ``data`` itself is not modified by the ``dropna`` step.
    """
    without_missing = data.dropna(subset=[numerical_variable])
    cleaned = remove_outliers(without_missing, numerical_variable)
    return cleaned


def categorize_data(data, categorical_variable, categories=None, ordered=False):
    """Return a copy of ``data`` whose categorical column only lists categories in use.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is copied, never modified.
    categorical_variable : str
        Column to (re)categorise.
    categories : list-like, optional
        When given, the column is first rebuilt as a categorical with exactly
        these categories (values outside them become NaN).
    ordered : bool, default False
        Ordering flag used together with ``categories``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``categorical_variable`` as a categorical column
        from which unused categories have been removed.
    """
    data_copy = data.copy()

    # Plain column assignment replaces the column together with its dtype.
    # ``.loc[:, col] = ...`` first tries to write into the existing array, which
    # cannot hold a categorical with a different category set; recent pandas
    # versions warn about that fallback instead of silently replacing the column.
    if categories is not None:
        data_copy[categorical_variable] = pd.Categorical(
            data_copy[categorical_variable], categories=categories, ordered=ordered
        )
    elif not isinstance(data_copy[categorical_variable].dtype, pd.CategoricalDtype):
        # Without explicit categories a non-categorical column has no ``.cat``
        # accessor, so convert it using its observed values.
        data_copy[categorical_variable] = data_copy[categorical_variable].astype(
            "category"
        )

    data_copy[categorical_variable] = data_copy[
        categorical_variable
    ].cat.remove_unused_categories()
    return data_copy


In [76]:
def add_line_plot(data, categorical_variable, numerical_variable, ax, categories=None):
    """Overlay a red line joining the per-month means; do nothing for other groupings.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns.
    categorical_variable : str
        Grouping column. Only the value ``"Month"`` triggers any drawing.
    numerical_variable : str
        Column whose group means are drawn along the x axis.
    ax : matplotlib Axes
        Axes handed to ``sns.lineplot``.
    categories : list-like, optional
        Passed to ``reindex`` on the grouped means before plotting.
    """
    if categorical_variable != "Month":
        return

    grouped_means = data.groupby(categorical_variable)[numerical_variable].mean()
    means = grouped_means.reindex(categories).reset_index()

    line_style = dict(
        color="red",
        zorder=20,  # drawn above the violins and boxes
        sort=False,  # keep the row order of ``means``
        marker="o",
        markersize=8,
        linestyle="-",
        linewidth=2,
    )
    sns.lineplot(
        data=means,
        x=numerical_variable,
        y=categorical_variable,
        ax=ax,
        **line_style,
    )


In [77]:
def create_plot(
    data_cleaned,
    numerical_variable,
    categorical_variable,
    dataset_name,
    orientation,
    filename,
    month_order,
    showmeans=False,
):
    """Draw a half-violin + box plot of one variable per category and save it.

    Parameters
    ----------
    data_cleaned : pandas.DataFrame
        Table containing both plotted columns.
    numerical_variable : str
        Column drawn along the x axis.
    categorical_variable : str
        Column drawn along the y axis. For ``"Month"`` a line through the
        per-month means is added on top.
    dataset_name : str
        Used as the axes title.
    orientation : str
        Forwarded as ``orient`` to both plotting calls.
    filename : str
        Path the figure is written to.
    month_order : list-like or None
        Forwarded to ``add_line_plot`` as ``categories`` for monthly plots.
    showmeans : bool, default False
        Whether the box plot marks the mean with a red dot.
    """
    palette = "Set2"
    fig, ax = plt.subplots(figsize=(7, 15))
    try:
        # Every layer is drawn on ``ax`` explicitly rather than relying on
        # pyplot's notion of the "current" axes.
        pt.half_violinplot(
            x=numerical_variable,
            y=categorical_variable,
            data=data_cleaned,
            palette=palette,
            bw=0.2,
            cut=0.0,
            scale="area",
            width=0.6,
            inner=None,
            orient=orientation,
            ax=ax,
        )

        sns.boxplot(
            x=numerical_variable,
            y=categorical_variable,
            data=data_cleaned,
            color="black",
            width=0.15,
            zorder=10,
            showcaps=True,
            showmeans=showmeans,
            meanprops={"marker": "o", "markerfacecolor": "red", "markersize": 6},
            boxprops={"facecolor": "none", "zorder": 10},
            showfliers=False,
            whiskerprops={"linewidth": 2, "zorder": 10},
            saturation=1,
            orient=orientation,
            ax=ax,
        )

        # Only monthly plots pass a category order on to the mean line.
        categories = month_order if categorical_variable == "Month" else None
        add_line_plot(
            data_cleaned, categorical_variable, numerical_variable, ax, categories
        )

        ax.set_title(dataset_name)
        ax.set_ylabel("")
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Close this figure even when a plotting call fails, so figures do not
        # pile up in memory while many plots are generated in a loop.
        plt.close(fig)


In [78]:
def plot_raincloud(
    categorical_variable,
    numerical_variable,
    data,
    dataset_name,
    sigma=0.1,
    orientation="h",
    palette="Set2",
    filename="raincloud_plot.png",
    month_order=None,
):
    """Clean ``data``, tidy its grouping column and save one raincloud plot.

    Steps: ``clean_data`` on ``numerical_variable``, ``categorize_data`` on
    ``categorical_variable`` (with ``month_order`` as ordered categories when
    the grouping is ``"Month"``), then ``create_plot``.

    ``sigma`` and ``palette`` are accepted but not used by this function.
    Mean markers are enabled when grouping by ``"Month"`` or ``"Year"``.
    """
    cleaned = clean_data(data, numerical_variable)

    is_monthly = categorical_variable == "Month"
    category_options = (
        {"categories": month_order, "ordered": True} if is_monthly else {}
    )
    prepared = categorize_data(cleaned, categorical_variable, **category_options)

    create_plot(
        data_cleaned=prepared,
        numerical_variable=numerical_variable,
        categorical_variable=categorical_variable,
        dataset_name=dataset_name,
        orientation=orientation,
        filename=filename,
        month_order=month_order,
        showmeans=categorical_variable in ("Month", "Year"),
    )


In [79]:
def plot_data(df, dataset_name, parameter):
    """Save the monthly and the yearly raincloud plot of one parameter.

    Creates the output folders of ``dataset_name``, derives both file names and
    calls ``plot_raincloud`` once grouped by ``"Month"`` and once by ``"Year"``.
    No ``month_order`` is forwarded, so ``plot_raincloud`` uses its default.
    """
    monthly_directory, yearly_directory = create_directory_for_plots(dataset_name)
    monthly_filename, yearly_filename = generate_filenames(
        dataset_name, parameter, monthly_directory, yearly_directory
    )

    targets = (("Month", monthly_filename), ("Year", yearly_filename))
    for grouping, target in targets:
        plot_raincloud(
            categorical_variable=grouping,
            numerical_variable=parameter,
            data=df,
            dataset_name=dataset_name,
            filename=target,
        )


In [80]:
def process_datasets_and_plot(datasets, parameters):
    """Call ``plot_data`` for every (dataset, parameter) combination.

    ``datasets`` maps a label to its DataFrame; ``parameters`` is iterated once
    per dataset, in order.
    """
    jobs = (
        (frame, label, parameter)
        for label, frame in datasets.items()
        for parameter in parameters
    )
    for frame, label, parameter in jobs:
        plot_data(frame, label, parameter)


In [81]:
# Generate and save the monthly and yearly plots for every station/parameter pair.
process_datasets_and_plot(datasets, parameters)


In [83]:
# Summary statistics of the KBYY0 frame; it shows the renamed/converted columns
# because the preprocessing cell modified this frame in place.
MeteostatStationKBYY0.describe()

Unnamed: 0,Air Temperature (°C),Dew Point (°C),Relative Humidity (%),Total Precipitation (mm),Snow Depth (m),Wind (From) Direction (Degrees),Average Wind Speed (m/s),Wind Peak Gust (km/h),Sea-Level Air Pressure (hPa),Total Sunshine Duration (Minutes),Weather Condition Code
count,112878.0,112857.0,112857.0,29553.0,0.0,96938.0,113326.0,0.0,26259.0,0.0,16820.0
mean,21.644778,17.457209,79.791409,0.433733,,146.819864,3.098156,,1016.65905,,3.692866
std,7.654064,7.730572,18.773595,2.123935,,98.144534,2.333168,,5.344493,,2.96257
min,-9.6,-17.5,11.0,0.0,,0.0,0.0,,996.0,,1.0
25%,17.0,12.8,67.0,0.0,,70.0,1.5,,1013.0,,2.0
50%,23.1,20.3,85.0,0.0,,150.0,2.611111,,1016.0,,3.0
75%,26.8,23.5,96.0,0.0,,180.0,4.611111,,1019.8,,5.0
max,43.4,29.9,100.0,77.2,,360.0,17.5,,1040.0,,18.0
