In [1]:
import matplotlib.pyplot as plt
import numpy as np
import ptitprince as pt
import numpy as np
import seaborn as sns
import pandas as pd
import os

# Global plot styling: a larger default font size and seaborn's "ticks" axes style.
plt.rcParams.update({"font.size": 20})
sns.set_style("ticks")


In [2]:
def read_water_gage_station(filename):
    """Load one station CSV into a DataFrame indexed by its ``Datetime`` column.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file. It must contain a ``Datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``Datetime`` used as the index. No date
        parsing is requested here, so the index keeps whatever type
        ``read_csv`` infers.
    """
    return pd.read_csv(filename, index_col="Datetime")


# Read the three station files, in order, into one DataFrame each.
(
    WaveDataStation42002,
    WaveDataStation42019,
    WaveDataStation42020,
) = map(
    read_water_gage_station,
    (
        "WaveDataStation42002.csv",
        "WaveDataStation42019.csv",
        "WaveDataStation42020.csv",
    ),
)


In [3]:
# Display label -> station DataFrame. The label is used as the plot title and
# as the per-station output directory name.
datasets = {
    "Station 42002": WaveDataStation42002,
    "Station 42019": WaveDataStation42019,
    "Station 42020": WaveDataStation42020,
}

In [4]:
# Numeric columns to plot for every station. The names are the descriptive
# (post-rename) column names, so they must match values in rename_columns.
parameters = [
    "Wind Speed (m/s)",
    "Peak Gust Speed (m/s)",
    "Significant Wave Height (meters)",
    "Dominant Wave Period (seconds)",
    "Average Wave Period (seconds)",
    "Wave Direction (degrees)",
    "Air Temperature (Celsius)",
    "Sea Surface Temperature (Celsius)",
    "Dewpoint Temperature (Celsius)",
]


In [5]:
# Calendar month names in chronological order; also used as the category
# order of the "Month" column.
month_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Month number (1-12) -> month name, derived from month_order so the two
# cannot drift apart.
month_map = dict(enumerate(month_order, start=1))

# Raw CSV column header -> descriptive column name used in the rest of the
# notebook.
rename_columns = {
    "YYYY": "Year",
    "MM": "Month",
    "DD": "Day",
    "hh": "Hour",
    "mn": "Minute",
    "WDIR": "Wind Direction (degrees)",
    "WSPD": "Wind Speed (m/s)",
    "GST": "Peak Gust Speed (m/s)",
    "WVHT": "Significant Wave Height (meters)",
    "DPD": "Dominant Wave Period (seconds)",
    "APD": "Average Wave Period (seconds)",
    "MWD": "Wave Direction (degrees)",
    # NOTE(review): unlike the other entries this maps one short code to
    # another; presumably it unifies two spellings of the pressure column - confirm.
    "BAR": "PRES",
    "ATMP": "Air Temperature (Celsius)",
    "WTMP": "Sea Surface Temperature (Celsius)",
    "DEWP": "Dewpoint Temperature (Celsius)",
}


In [6]:
# Normalise every station frame in place: descriptive column names, month
# names as an ordered categorical, and a categorical year.
for dataset_name, df in datasets.items():
    # Renaming is a no-op once the columns already carry the new names.
    df.rename(columns=rename_columns, inplace=True)
    # Map month numbers to names only while "Month" is not yet categorical.
    # After the first run the column already holds names; mapping those
    # through the number-keyed month_map would turn every value into NaN,
    # so re-running this cell used to wipe the column silently.
    if not isinstance(df["Month"].dtype, pd.CategoricalDtype):
        df["Month"] = df["Month"].map(month_map)
    df["Month"] = pd.Categorical(df["Month"], categories=month_order, ordered=True)
    df["Year"] = pd.Categorical(df["Year"])
    # df is modified in place, so datasets[dataset_name] already refers to it.


In [7]:
def remove_outliers(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where ``Q1``/``Q3`` are the 25th/75th percentiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the numeric column used to compute the fences and to filter.

    Returns
    -------
    pandas.DataFrame
        The subset of rows within the fences. Rows whose value is NaN are
        excluded because NaN never satisfies the range test.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # between() is inclusive on both ends, matching ">= lower and <= upper".
    return df[values.between(lower_bound, upper_bound)]


In [8]:
def create_directory_for_plots(dataset_name):
    """Create the monthly and yearly output folders for one dataset.

    The folders are ``Results/Plots/<dataset_name>/monthly`` and
    ``Results/Plots/<dataset_name>/yearly``, relative to the current working
    directory. Folders that already exist are left as they are.

    Returns
    -------
    tuple of str
        ``(monthly_directory, yearly_directory)``.
    """
    dataset_root = os.path.join("Results/Plots", dataset_name)
    plot_directories = tuple(
        os.path.join(dataset_root, period) for period in ("monthly", "yearly")
    )
    for plot_directory in plot_directories:
        os.makedirs(plot_directory, exist_ok=True)
    return plot_directories


def generate_filenames(dataset_name, parameter, monthly_directory, yearly_directory):
    """Build the output PNG paths for the monthly and yearly raincloud plots.

    Each path is ``<directory>/<parameter>_<period>_raincloud_plot.png`` with
    the unit suffixes ``(m/s)`` and ``(km/h)`` rewritten, so the ``/`` inside
    them does not act as a path separator. The rewrite is applied to the
    whole path string, directory part included.

    ``dataset_name`` is accepted but not used here.

    Returns
    -------
    tuple of str
        ``(monthly_filename, yearly_filename)``.
    """

    def _plot_path(directory, period):
        raw_path = f"{directory}/{parameter}_{period}_raincloud_plot.png"
        # NOTE(review): the "(m/s)" replacement contains a non-ASCII character
        # (it appears to be U+0640, not "_"). Kept byte-for-byte so existing
        # file names do not change - confirm it is intentional.
        return raw_path.replace("(m/s)", "mـs").replace("(km/h)", "kmh")

    return (
        _plot_path(monthly_directory, "monthly"),
        _plot_path(yearly_directory, "yearly"),
    )


In [9]:
def clean_data(data, numerical_variable):
    """Drop rows missing ``numerical_variable``, then filter the rest with ``remove_outliers``.

    Returns the DataFrame produced by ``remove_outliers``; ``data`` itself is
    not modified.
    """
    non_missing = data.dropna(subset=[numerical_variable])
    filtered = remove_outliers(non_missing, numerical_variable)
    return filtered


def categorize_data(data, categorical_variable, categories=None, ordered=False):
    """Return a copy of ``data`` whose ``categorical_variable`` column is a
    categorical with unused categories removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    categorical_variable : str
        Name of the column to convert.
    categories : sequence, optional
        Explicit category list. When given, the column is rebuilt with these
        categories and values outside the list become NaN. When omitted, the
        column's existing categories are kept (a non-categorical column is
        converted with ``astype("category")``).
    ordered : bool, default False
        Whether the rebuilt categorical is ordered. Only used when
        ``categories`` is given.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the converted column.
    """
    data_copy = data.copy()
    column = data_copy[categorical_variable]

    if categories is not None:
        column = pd.Series(
            pd.Categorical(column, categories=categories, ordered=ordered),
            index=column.index,
        )
    elif not isinstance(column.dtype, pd.CategoricalDtype):
        # Without this the .cat accessor below raises on a plain column.
        column = column.astype("category")

    # Assign with data_copy[col] = ... rather than data_copy.loc[:, col] = ...:
    # the .loc form tries to write into the existing column and keep its
    # dtype, so the categorical dtype / trimmed categories may not stick.
    data_copy[categorical_variable] = column.cat.remove_unused_categories()
    return data_copy


In [10]:
def add_line_plot(data, categorical_variable, numerical_variable, ax, categories=None):
    """Overlay a red line joining the per-month means on ``ax``.

    Nothing is drawn unless ``categorical_variable`` is ``"Month"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    categorical_variable : str
        Grouping column; plotted on the y axis.
    numerical_variable : str
        Column whose group means are plotted on the x axis.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    categories : sequence, optional
        Passed to ``reindex`` on the grouped means to set their row order.
    """
    if categorical_variable != "Month":
        return

    group_means = data.groupby(categorical_variable)[numerical_variable].mean()
    means = group_means.reindex(categories).reset_index()

    line_style = {
        "color": "red",
        "zorder": 20,
        "sort": False,
        "marker": "o",
        "markersize": 8,
        "linestyle": "-",
        "linewidth": 2,
    }
    sns.lineplot(
        data=means,
        x=numerical_variable,
        y=categorical_variable,
        ax=ax,
        **line_style,
    )


In [11]:
def create_plot(
    data_cleaned,
    numerical_variable,
    categorical_variable,
    dataset_name,
    orientation,
    filename,
    month_order,
    showmeans=False,
):
    """Draw one raincloud-style figure (half violin + box plot) and save it.

    Parameters
    ----------
    data_cleaned : pandas.DataFrame
        Frame holding both plotted columns.
    numerical_variable : str
        Column plotted on the x axis.
    categorical_variable : str
        Grouping column plotted on the y axis.
    dataset_name : str
        Used as the axes title.
    orientation : str
        Passed as ``orient`` to both plotting calls.
    filename : str
        Path the figure is written to.
    month_order : sequence or None
        Category order forwarded to ``add_line_plot`` when grouping by
        ``"Month"``; ignored otherwise.
    showmeans : bool, default False
        Whether the box plot marks the mean of each group.
    """
    palette = "Set2"
    f, ax = plt.subplots(figsize=(7, 15))
    try:
        # Every artist is drawn on `ax` explicitly instead of relying on
        # pyplot's notion of the current axes.
        pt.half_violinplot(
            x=numerical_variable,
            y=categorical_variable,
            data=data_cleaned,
            palette=palette,
            bw=0.2,
            cut=0.0,
            scale="area",
            width=0.6,
            inner=None,
            orient=orientation,
            ax=ax,
        )

        sns.boxplot(
            x=numerical_variable,
            y=categorical_variable,
            data=data_cleaned,
            color="black",
            width=0.15,
            zorder=10,
            showcaps=True,
            showmeans=showmeans,
            meanprops={"marker": "o", "markerfacecolor": "red", "markersize": 6},
            boxprops={"facecolor": "none", "zorder": 10},
            showfliers=False,
            whiskerprops={"linewidth": 2, "zorder": 10},
            saturation=1,
            orient=orientation,
            ax=ax,
        )

        # The month order only applies when the grouping column is "Month".
        line_categories = month_order if categorical_variable == "Month" else None
        add_line_plot(
            data_cleaned, categorical_variable, numerical_variable, ax, line_categories
        )

        ax.set_title(dataset_name)
        ax.set_ylabel("")
        f.tight_layout()
        f.savefig(filename)
    finally:
        # Always release the figure, even if plotting or saving failed, so a
        # long batch of plots does not accumulate open figures.
        plt.close(f)


In [12]:
def plot_raincloud(
    categorical_variable,
    numerical_variable,
    data,
    dataset_name,
    sigma=0.1,
    orientation="h",
    palette="Set2",
    filename="raincloud_plot.png",
    month_order=None,
):
    """Clean ``data``, prepare the grouping column and render one raincloud plot.

    Steps: ``clean_data`` on ``numerical_variable``, ``categorize_data`` on
    ``categorical_variable`` (with ``month_order`` as ordered categories when
    grouping by ``"Month"``), then ``create_plot``. Group means are shown when
    grouping by ``"Month"`` or ``"Year"``.

    ``sigma`` and ``palette`` are accepted but not used by this function.
    """
    if categorical_variable == "Month":
        category_options = {"categories": month_order, "ordered": True}
    else:
        category_options = {}

    data_cleaned = categorize_data(
        clean_data(data, numerical_variable),
        categorical_variable,
        **category_options,
    )

    create_plot(
        data_cleaned,
        numerical_variable,
        categorical_variable,
        dataset_name,
        orientation,
        filename,
        month_order,
        categorical_variable in ["Month", "Year"],
    )


In [13]:
def plot_data(df, dataset_name, parameter):
    """Save the monthly and the yearly raincloud plot of ``parameter`` for one dataset.

    Output folders come from ``create_directory_for_plots`` and file names
    from ``generate_filenames``. The monthly plot is drawn first, then the
    yearly one.
    """
    monthly_directory, yearly_directory = create_directory_for_plots(dataset_name)
    monthly_filename, yearly_filename = generate_filenames(
        dataset_name, parameter, monthly_directory, yearly_directory
    )

    plot_targets = (("Month", monthly_filename), ("Year", yearly_filename))
    for grouping_column, target_file in plot_targets:
        plot_raincloud(
            categorical_variable=grouping_column,
            numerical_variable=parameter,
            data=df,
            dataset_name=dataset_name,
            filename=target_file,
        )


In [14]:
def process_datasets_and_plot(datasets, parameters):
    """Call ``plot_data`` for every (dataset, parameter) pair.

    ``datasets`` maps a dataset name to its DataFrame. All parameters are
    processed for one dataset before moving on to the next.
    """
    jobs = (
        (frame, label, column)
        for label, frame in datasets.items()
        for column in parameters
    )
    for frame, label, column in jobs:
        plot_data(frame, label, column)


In [15]:
# Generate and save every monthly/yearly plot for all stations and parameters.
process_datasets_and_plot(datasets, parameters)
