In [None]:
import matplotlib.pyplot as plt
import numpy as np
import ptitprince as pt
import numpy as np
import seaborn as sns
import pandas as pd
import os

# Global plotting defaults for every figure created below:
# a base matplotlib font size of 20 and seaborn's "ticks" style.
plt.rcParams.update({"font.size": 20})
sns.set_style("ticks")


In [None]:
def read_water_gage_station(filename):
    """Load one station CSV into a DataFrame indexed by its ``t`` column.

    The columns ``s``, ``f_x``, ``g``, ``f_y`` and ``direction`` are skipped
    while parsing; every other column in the file is kept.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``t`` as the index.
    """
    skipped_columns = ["s", "f_x", "g", "f_y", "direction"]

    def keep_column(column):
        # pandas calls this once per header name; True means "load it".
        return column not in skipped_columns

    return pd.read_csv(filename, index_col="t", usecols=keep_column)


# Load each station's CSV into its own DataFrame. The file names are relative
# paths, so they are resolved against the notebook's working directory.
WaterGageStation8771972 = read_water_gage_station("WaterGageStation8771972.csv")
WaterGageStation8772440 = read_water_gage_station("WaterGageStation8772440.csv")
WaterGageStation8772447 = read_water_gage_station("WaterGageStation8772447.csv")
WaterGageStation8772471 = read_water_gage_station("WaterGageStation8772471.csv")
WaterGageStation8773767 = read_water_gage_station("WaterGageStation8773767.csv")


In [None]:
# Station label -> DataFrame. The label doubles as the plot title and as the
# name of the per-station output sub-directory when plots are written.
datasets = {
    "Station 8771972": WaterGageStation8771972,
    "Station 8772440": WaterGageStation8772440,
    "Station 8772447": WaterGageStation8772447,
    "Station 8772471": WaterGageStation8772471,
    "Station 8773767": WaterGageStation8773767,
}


In [None]:
# Columns to plot, named as they appear after the `rename_columns` renaming.
parameters = ["Water Level (m)", "Wind Speed (m/s)", "Wind Direction (degrees)"]

In [None]:
# Month names in calendar order; used as the category order of the "Month" column.
month_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Month number (1-12) -> month name, derived from the ordered list above.
month_map = dict(enumerate(month_order, start=1))

# Raw CSV column name -> display name (including units) used in the plots.
rename_columns = dict(
    water_level="Water Level (m)",
    wind_speed="Wind Speed (m/s)",
    wind_direction="Wind Direction (degrees)",
)

In [None]:
# Normalise every station frame in place: display-ready column names, numeric
# measurements, a datetime index, and categorical "Year" / "Month" columns.
# The frames are mutated directly, so `datasets` keeps referring to the same
# (now prepared) objects and no reassignment into the dict is needed.
for dataset_name, df in datasets.items():
    df.rename(columns=rename_columns, inplace=True)
    for col in rename_columns.values():
        # Entries that cannot be parsed as numbers become NaN instead of raising.
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df.index = pd.to_datetime(df.index)

    # Discard "Year" / "Month" left over from an earlier execution of this
    # cell; without this, df.insert raises ValueError when the cell is re-run.
    df.drop(columns=["Year", "Month"], errors="ignore", inplace=True)
    df.insert(0, "Year", df.index.year)
    df.insert(1, "Month", df.index.month)
    df["Month"] = df["Month"].map(month_map)
    # Ordered categorical so months sort chronologically rather than alphabetically.
    df["Month"] = pd.Categorical(df["Month"], categories=month_order, ordered=True)
    df["Year"] = pd.Categorical(df["Year"])


In [None]:
def remove_outliers(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` lies within 1.5 * IQR.

    The lower and upper bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``,
    where Q1 / Q3 are the 25th / 75th percentiles of the column. Both bounds
    are inclusive. Rows where the column is NaN never satisfy the comparison,
    so they are dropped as well. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    column_name : str
        Name of the numeric column to filter on.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` inside the bounds.
    """
    values = df[column_name]

    # Series.quantile skips NaN, so no separate dropna pass is required.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    within_bounds = (values >= lower_bound) & (values <= upper_bound)
    return df[within_bounds]


In [None]:
def create_directory_for_plots(dataset_name):
    """Ensure the monthly and yearly plot folders for a dataset exist.

    The folders are ``Results/Plots/<dataset_name>/monthly`` and
    ``Results/Plots/<dataset_name>/yearly``, relative to the current working
    directory. Existing folders are left as they are.

    Returns
    -------
    tuple of str
        ``(monthly_directory, yearly_directory)``.
    """
    plots_root = "Results/Plots"
    plot_directories = tuple(
        os.path.join(plots_root, dataset_name, period)
        for period in ("monthly", "yearly")
    )
    for directory in plot_directories:
        os.makedirs(directory, exist_ok=True)
    return plot_directories


def generate_filenames(dataset_name, parameter, monthly_directory, yearly_directory):
    """Build the output PNG paths for the monthly and yearly raincloud plots.

    Each path is ``<directory>/<parameter>_<period>_raincloud_plot.png``. Any
    ``"(m/s)"`` in the resulting path is replaced, because the slash would
    otherwise be read as a path separator. ``dataset_name`` is accepted but
    not used when building the names.

    Returns
    -------
    tuple of str
        ``(monthly_filename, yearly_filename)``.
    """

    def without_unit_slash(path):
        return path.replace("(m/s)", "mـs")

    monthly_path = f"{monthly_directory}/{parameter}_monthly_raincloud_plot.png"
    yearly_path = f"{yearly_directory}/{parameter}_yearly_raincloud_plot.png"
    return without_unit_slash(monthly_path), without_unit_slash(yearly_path)


In [None]:
def clean_data(data, numerical_variable):
    """Drop rows with a missing ``numerical_variable``, then remove its outliers.

    Returns whatever ``remove_outliers`` returns for the rows that have a
    value in ``numerical_variable``.
    """
    rows_with_values = data.dropna(subset=[numerical_variable])
    cleaned = remove_outliers(rows_with_values, numerical_variable)
    return cleaned


def categorize_data(data, categorical_variable, categories=None, ordered=False):
    """Return a copy of ``data`` whose ``categorical_variable`` is a tidy categorical.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    categorical_variable : str
        Column to convert.
    categories : list, optional
        When given, the column is rebuilt with exactly these categories, in
        this order.
    ordered : bool, default False
        Whether a rebuilt categorical is ordered. Ignored when the column is
        already categorical and ``categories`` is None; the existing dtype is
        kept in that case.

    Returns
    -------
    pandas.DataFrame
        The copy, with categories that have no rows removed from the column.
    """
    data_copy = data.copy()

    already_categorical = isinstance(
        data_copy[categorical_variable].dtype, pd.CategoricalDtype
    )
    if categories is not None or not already_categorical:
        # Plain column assignment replaces the column (and therefore its
        # dtype). A whole-column `.loc[:, col] = ...` first tries to write
        # into the existing array, which cannot change the categorical dtype.
        data_copy[categorical_variable] = pd.Categorical(
            data_copy[categorical_variable], categories=categories, ordered=ordered
        )

    data_copy[categorical_variable] = data_copy[
        categorical_variable
    ].cat.remove_unused_categories()
    return data_copy


In [None]:


def add_line_plot(data, categorical_variable, numerical_variable, ax, categories=None):
    """Overlay a red line through the per-month means on ``ax``.

    Nothing is drawn unless ``categorical_variable`` is ``"Month"``. For
    monthly plots the mean of ``numerical_variable`` is computed per group,
    reindexed by ``categories``, and drawn with the numeric value on the x
    axis and the category on the y axis.
    """
    # Only monthly plots get a trend line; every other grouping is a no-op.
    if categorical_variable != "Month":
        return

    grouped_values = data.groupby(categorical_variable)[numerical_variable]
    monthly_means = grouped_values.mean().reindex(categories).reset_index()

    line_style = dict(
        color="red",
        zorder=20,
        sort=False,
        marker="o",
        markersize=8,
        linestyle="-",
        linewidth=2,
    )
    sns.lineplot(
        data=monthly_means,
        x=numerical_variable,
        y=categorical_variable,
        ax=ax,
        **line_style,
    )


In [None]:
def create_plot(
    data_cleaned,
    numerical_variable,
    categorical_variable,
    dataset_name,
    orientation,
    filename,
    month_order,
    showmeans=False,
):
    """Draw one raincloud-style figure (half violin + box plot) and save it.

    Parameters
    ----------
    data_cleaned : pandas.DataFrame
        Data to plot.
    numerical_variable : str
        Column drawn on the x axis.
    categorical_variable : str
        Column drawn on the y axis (one row of the plot per category).
    dataset_name : str
        Used as the figure title.
    orientation : str
        Passed as ``orient`` to both plotting calls.
    filename : str
        Path the figure is written to.
    month_order : list or None
        Forwarded to ``add_line_plot`` as its categories, but only when
        ``categorical_variable`` is ``"Month"``.
    showmeans : bool, default False
        Whether the box plot marks the mean of each category.
    """
    palette = "Set2"
    fig, ax = plt.subplots(figsize=(7, 15))
    # The figure is closed even if plotting or saving fails, so a failed plot
    # inside a long loop does not leave open figures behind.
    try:
        # Every layer is drawn on `ax` explicitly rather than relying on
        # matplotlib's "current axes" state.
        pt.half_violinplot(
            x=numerical_variable,
            y=categorical_variable,
            data=data_cleaned,
            palette=palette,
            bw=0.2,
            cut=0.0,
            scale="area",
            width=0.6,
            inner=None,
            orient=orientation,
            ax=ax,
        )

        sns.boxplot(
            x=numerical_variable,
            y=categorical_variable,
            data=data_cleaned,
            color="black",
            width=0.15,
            zorder=10,
            showcaps=True,
            showmeans=showmeans,
            meanprops={"marker": "o", "markerfacecolor": "red", "markersize": 6},
            boxprops={"facecolor": "none", "zorder": 10},
            showfliers=False,
            whiskerprops={"linewidth": 2, "zorder": 10},
            saturation=1,
            orient=orientation,
            ax=ax,
        )

        line_categories = month_order if categorical_variable == "Month" else None
        add_line_plot(
            data_cleaned, categorical_variable, numerical_variable, ax, line_categories
        )

        ax.set_title(dataset_name)
        ax.set_ylabel("")
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        plt.close(fig)



In [None]:
def plot_raincloud(
    categorical_variable,
    numerical_variable,
    data,
    dataset_name,
    sigma=0.1,
    orientation="h",
    palette="Set2",
    filename="raincloud_plot.png",
    month_order=None,
):
    """Clean ``data``, tidy its grouping column, and write one raincloud plot.

    Rows with a missing or outlying ``numerical_variable`` are removed first.
    For ``categorical_variable == "Month"`` the grouping column is rebuilt as
    an ordered categorical using ``month_order``; for any other grouping only
    unused categories are dropped. Means are marked on the box plot for the
    ``"Month"`` and ``"Year"`` groupings.

    ``sigma`` and ``palette`` are accepted but are not used by this function.
    """
    is_monthly = categorical_variable == "Month"
    category_options = (
        {"categories": month_order, "ordered": True} if is_monthly else {}
    )

    prepared = categorize_data(
        clean_data(data, numerical_variable),
        categorical_variable,
        **category_options,
    )

    mark_means = categorical_variable in ("Month", "Year")
    create_plot(
        prepared,
        numerical_variable,
        categorical_variable,
        dataset_name,
        orientation,
        filename,
        month_order,
        mark_means,
    )


In [None]:
def plot_data(df, dataset_name, parameter):
    """Write the monthly and the yearly raincloud plot of one parameter.

    The output folders for ``dataset_name`` are created if needed, then one
    plot grouped by ``"Month"`` and one grouped by ``"Year"`` are saved, in
    that order.
    """
    monthly_directory, yearly_directory = create_directory_for_plots(dataset_name)
    monthly_filename, yearly_filename = generate_filenames(
        dataset_name, parameter, monthly_directory, yearly_directory
    )

    plot_targets = (("Month", monthly_filename), ("Year", yearly_filename))
    for grouping, output_path in plot_targets:
        plot_raincloud(
            categorical_variable=grouping,
            numerical_variable=parameter,
            data=df,
            dataset_name=dataset_name,
            filename=output_path,
        )


In [None]:
def process_datasets_and_plot(datasets, parameters):
    """Run ``plot_data`` for every (dataset, parameter) combination.

    ``datasets`` maps a dataset name to its DataFrame; ``parameters`` is the
    collection of column names to plot. Datasets are the outer loop.
    """
    for station_label, station_frame in datasets.items():
        for column in parameters:
            plot_data(station_frame, station_label, column)


In [None]:
# Generate every plot: all stations x all parameters, monthly and yearly.
process_datasets_and_plot(datasets, parameters)
