In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pytz
import os
import plotly.express as px
import plotly.graph_objects as go

1. How do meteorological conditions, such as wind speed and precipitation, impact noise levels in different locations of Leuven?

2. How do noise levels vary across different times of day and days of the week in Leuven? On which days do we expect higher noise levels?

3. How does the noise level vary across different times of the year? Are there any particular months or seasons when the noise level is higher or lower?

4. Are there any spatial patterns in noise levels within the area? Which location is consistently noisier than others?
5. How does the density and location of restaurants, bars and other destinations in the neighborhoods affect noise levels? 
6. Are there any temporal patterns in noise levels? For example, are there any specific hours of the night when the noise level tends to be higher or lower?
7. Can we identify any patterns in noise levels across different times of the day (not just nighttime)? For example, do noise levels tend to be higher during rush hour?

## Loading files

In [None]:
def merge_csv_files(directory_path,file_list,delim=";"):
    """
    Read the listed CSV files from one directory and stack them into a single DataFrame.

    Args:
    directory_path (str): Directory (local path or URL prefix such as an S3 URI)
        that holds the files. A trailing path separator is optional.
    file_list (list of str): Names of the individual files to read. Entries that
        do not end in ".csv" are skipped.
    delim (str): character used for delimiter in CSV files

    Returns:
    merged_df (pandas.DataFrame): Row-wise concatenation of every file that was
        read, re-indexed from 0.

    Raises:
    ValueError: If file_list contains no ".csv" entry.
    """
    # Accept the directory with or without a trailing separator; an empty prefix
    # is left alone so bare file names keep resolving relative to the cwd.
    base = directory_path
    if base and not base.endswith(("/", "\\")):
        base += "/"

    csv_names = [name for name in file_list if name.endswith(".csv")]
    if not csv_names:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No CSV files to merge for directory {directory_path!r}: {list(file_list)!r}"
        )

    dfs = [pd.read_csv(base + name, delimiter=delim) for name in csv_names]

    # concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)

    return merged_df

In [None]:
# Noise data

folder_path = "s3://teamchadmda"

# Each export (40, 41, 42) contains one file per measurement point; the file
# names differ only in the export number, so the lists are generated from one
# shared tuple of per-sensor suffixes.
_sensor_file_suffixes = (
    "255439_mp-01-naamsestraat-35-maxim.csv",
    "255440_mp-02-naamsestraat-57-xior.csv",
    "255441_mp-03-naamsestraat-62-taste.csv",
    "255442_mp-05-calvariekapel-ku-leuven.csv",
    "255443_mp-06-parkstraat-2-la-filosovia.csv",
    "255444_mp-07-naamsestraat-81.csv",
    "255445_mp-08-kiosk-stadspark.csv",
    "280324_mp08bis---vrijthof.csv",
    "303910_mp-04-his-hears.csv",
)

file_list_40, file_list_41, file_list_42 = (
    [f"csv_results_{export_id}_{suffix}" for suffix in _sensor_file_suffixes]
    for export_id in (40, 41, 42)
)

In [None]:
# lots of files, takes a while
# Each call reads every file of one export folder and stacks them into one frame.
file40 = merge_csv_files(folder_path + "/export_40/",file_list_40)
file41 = merge_csv_files(folder_path + "/export_41/",file_list_41)
file42 = merge_csv_files(folder_path + "/export_42/",file_list_42) #Uses the incomplete, reduced data set

In [None]:
# Meteo data

# one file per quarter of 2022
file_list_meteo = [f"LC_2022Q{quarter}.csv" for quarter in range(1, 5)]

# lots of files, takes a while
meteo = merge_csv_files(f"{folder_path}/meteodata/", file_list_meteo, delim=",")

In [None]:
belgium_tz = pytz.timezone("Europe/Brussels")

# parse the 'DATEUTC' column into timezone-aware (UTC) timestamps
meteo["DATEUTC"] = pd.to_datetime(meteo["DATEUTC"], utc=True)

# convert the UTC timestamps to Belgian local time; the vectorised .dt accessor
# replaces a row-by-row apply (and its redundant UTC -> UTC conversion)
meteo["datetime"] = meteo["DATEUTC"].dt.tz_convert(belgium_tz)

meteo.head(3)

In [None]:
# remove the timezone information from the datetime objects
# (tz_localize(None) keeps the local wall-clock values and only drops the tz marker)
meteo["datetime"] = meteo["datetime"].dt.tz_localize(None)
# save a snapshot of the frame to the working directory (row index included)
meteo.to_csv("meteo.csv")

## File 41 - Noise event 

In [None]:
# number of rows per (object id, description) pair
file41.groupby(["#object_id", "description"]).size()

In [None]:
# check missing values: share of NaN per column.
# A notebook cell only renders its last expression, so the summary has to be the
# final statement; previously it was computed and discarded while the whole
# frame was dumped instead.
file41.isna().mean()

In [None]:
# drop unnecessary cols (model id and the *_unit columns)
cols_to_drop = [
    f"noise_event_laeq_{suffix}"
    for suffix in (
        "model_id_unit",
        "model_id",
        "primary_detected_certainty_unit",
        "primary_detected_class_unit",
    )
]

file41.drop(columns=cols_to_drop, inplace=True)

In [None]:
# rename cols
# NOTE(review): names are assigned by position, so this relies on the frame
# having exactly these five columns in this order at this point — confirm
# against the raw CSV header before reusing this on another export.
file41.columns = [
    "object_id",
    "location",
    "result_timestamp",
    "noise_event_certainty",
    "noise_event",
]
file41.tail(5)

In [None]:
# convert the timestamp col to datetime format
# (no explicit format is passed, so pandas infers it from the strings)
file41["result_timestamp"] = pd.to_datetime(file41["result_timestamp"])

In [None]:
# inspect the column dtypes of the event frame
file41.dtypes

In [None]:
# derive calendar features (time, date, hour, abbreviated weekday) from the timestamp
for feature in ("time", "date", "hour"):
    file41[feature] = getattr(file41["result_timestamp"].dt, feature)

# weekday as an ordered categorical so that Mon..Sun sorts chronologically
weekday_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
file41["weekday"] = pd.Categorical(
    file41["result_timestamp"].dt.strftime("%a"),
    categories=weekday_order,
    ordered=True,
)
file41.tail(5)

In [None]:
# save a snapshot of the prepared event frame (to_csv writes the row index by default)
file41.to_csv("file41.csv")

In [None]:
# Number of events per (hour, weekday, event type, location)
aggregated_df = (
    file41.groupby(["hour", "weekday", "noise_event", "location"])
    .size()
    .rename("count")
    .reset_index()
)

# Keep only the first location returned by unique() and drop the now-constant column
locations = aggregated_df["location"].unique().tolist()
mp01 = aggregated_df.loc[aggregated_df["location"] == locations[0]].drop(
    columns="location"
)

# weekday as an ordered categorical (Mon..Sun)
weekday_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
mp01["weekday"] = pd.Categorical(
    mp01["weekday"], categories=weekday_order, ordered=True
)
mp01

In [None]:
# Heatmap for only Transport sound
is_passenger_car = mp01["noise_event"] == "Transport road - Passenger car"
mp01_car = mp01.loc[is_passenger_car].drop(columns="noise_event")

# hours as rows, weekdays as columns, missing combinations filled with 0
heatmap_data = mp01_car.pivot_table(
    values="count", index="hour", columns="weekday", fill_value=0
)

# Plot the heatmap
plt.figure(figsize=(8, 6))
ax = sns.heatmap(heatmap_data, cmap="YlGnBu", fmt="", cbar=False)
ax.set_title("Frequency of Transporting sound at MP01")
ax.set_xlabel("Weekday")
ax.set_ylabel("Hour")
plt.show()

In [None]:
# Re-read the raw event export of a single measurement point
df_file41 = pd.read_csv(
    "s3://teamchadmda/export_41/csv_results_41_255439_mp-01-naamsestraat-35-maxim.csv",
    sep=";",
)

# parse the timestamp and derive date / hour / abbreviated weekday from it
df_file41["result_timestamp"] = pd.to_datetime(df_file41["result_timestamp"])
df_file41["date"] = df_file41["result_timestamp"].dt.date
df_file41["hour"] = df_file41["result_timestamp"].dt.hour
df_file41["weekday"] = df_file41["result_timestamp"].dt.strftime("%a")

# keep only the timestamp, the derived columns and the detected event class
cols_to_drop = [
    "noise_event_laeq_model_id_unit",
    "noise_event_laeq_model_id",
    "noise_event_laeq_primary_detected_certainty_unit",
    "noise_event_laeq_primary_detected_class_unit",
    "description",
    "#object_id",
    "noise_event_laeq_primary_detected_certainty",
]
df_file41.drop(columns=cols_to_drop, inplace=True)
df_file41.rename(
    columns={"noise_event_laeq_primary_detected_class": "noise_event"},
    inplace=True,
)
df_file41.head(5)

In [None]:
# number of events per (hour, weekday, event type) for the single-sensor frame
aggregated_file41 = (
    df_file41.groupby(["hour", "weekday", "noise_event"])
    .size()
    .reset_index(name="count")
)
# weekday as an ordered categorical so that Mon..Sun sorts chronologically
weekday_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
aggregated_file41["weekday"] = pd.Categorical(
    aggregated_file41["weekday"], categories=weekday_order, ordered=True
)
aggregated_file41

- `Transport road - Passenger car` is by far the most common noise event.
- We also notice more noise in `MP_01` location than in other locations, followed by `MP 07`
- Only 7/9 locations appear in the plot?

In [None]:
sns.set_theme(context="notebook", style="darkgrid")

# number of events per (location, event type); "date" only serves as a
# column to count on
grouped = (
    file41.groupby(["location", "noise_event"])["date"]
    .count()
    .reset_index(name="count")
)

g = sns.catplot(
    data=grouped,
    x="noise_event",
    y="count",
    hue="location",
    kind="bar",
    height=6,
    aspect=1.5,
)

g.set(
    xlabel="Noise Event",
    ylabel="Frequency",
    title="Frequency by Location and Noise Event",
)
plt.xticks(size=8)
# tight_layout must be *called*; the bare attribute reference was a no-op
plt.tight_layout()
plt.show()

- `Transport road - Passenger car` occurs most frequently during 7am - 10am (when people start going to work). Interestingly, we don't see another peak during rush hour.
- We see a peak of `Human voice - Shouting` around midnight

In [None]:
# number of events per (hour, event type); counts non-null certainty values
grouped = (
    file41.groupby(["hour", "noise_event"])["noise_event_certainty"]
    .count()
    .reset_index(name="count")
)

sns.set_style("darkgrid")
plt.figure(figsize=(10, 6))
ax = sns.lineplot(data=grouped, x="hour", y="count", hue="noise_event")
ax.set(xlabel="Hour", ylabel="Frequency")
ax.set_title("Frequency of Noise event by Hour", size=15)
plt.show()

- `March - April` seems like the noisiest period
- We see wide variation of noise event frequency during Spring
- We see an unusual gap in the end of `January`

In [None]:
from matplotlib.dates import MonthLocator, DateFormatter

# number of events per (date, event type); counts non-null certainty values
grouped = (
    file41.groupby(["date", "noise_event"])["noise_event_certainty"]
    .count()
    .reset_index(name="count")
)

sns.set_style("darkgrid")
plt.figure(figsize=(10, 6))
ax = sns.lineplot(data=grouped, x="date", y="count", hue="noise_event")
ax.set(xlabel="Date", ylabel="Frequency")
ax.set_title("Frequency of Noise event by Date", size=15)

# one major tick per month, labelled with the abbreviated month name
months = MonthLocator()
date_format = DateFormatter("%b")
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(date_format)

plt.show()

 - Thursday records the most noise events followed by Monday. But why?
 - Saturday and Sunday are the most quiet days, probably because students go home

In [None]:
# number of events per (weekday, event type); counts non-null certainty values
grouped = (
    file41.groupby(["weekday", "noise_event"])["noise_event_certainty"]
    .count()
    .reset_index(name="count")
)

sns.set_style("darkgrid")
plt.figure(figsize=(10, 6))
ax = sns.lineplot(data=grouped, x="weekday", y="count", hue="noise_event")
ax.set(xlabel="Weekday", ylabel="Frequency")
ax.set_title("Frequency of Noise event by Weekday", size=15)
plt.show()

## File 40 - Noise level

In [None]:
# Save a snapshot of the raw merged noise-level data (row index included) ...
file40.to_csv("file40.csv")
# ... then preview it. The preview has to be the last expression of the cell,
# otherwise its result is silently discarded.
file40.tail(5)

In [None]:
# drop all _unit columns (every column whose name ends in "unit")
cols_to_drop = file40.columns[file40.columns.str.endswith("unit")].tolist()
file40.drop(columns=cols_to_drop, inplace=True)

# rename columns: the sensor description is used as the location label
file40.rename(columns={"description": "location"}, inplace=True)

In [None]:
# Parse 'result_timestamp' and derive date, hour and abbreviated weekday from it
file40["result_timestamp"] = pd.to_datetime(file40["result_timestamp"])
for feature in ("date", "hour"):
    file40[feature] = getattr(file40["result_timestamp"].dt, feature)
file40["weekday"] = file40["result_timestamp"].dt.strftime("%a")
file40.head(4)

### By hour

- The noise level usually peaks around 8-9 AM which coincides with the time people start going to work
- The highest peaks are found during this hour in `MP 01`, this area also has highest frequency of Transportation sound. 
- If night-time sound should stay below 40 dB(A) for a good night's sleep (according to the WHO), then the areas around `MP 03` and `MP 04` need to be regulated

In [None]:
# every column whose name starts with "laf"
laf_cols = [col for col in file40.columns if col.startswith("laf")]

In [None]:
locations = file40["location"].unique()

# Derive the grid shape from the number of locations instead of hard-coding
# 2x4: a ninth location would otherwise index past the last row of axes.
n_cols = 4
n_rows = max(1, -(-len(locations) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(20, 6 * n_rows), sharey=True, squeeze=False
)

# loop over all location values and plot them in the grid
for i, loc in enumerate(locations):

    row, col = divmod(i, n_cols)
    ax = axs[row, col]

    # filter the data for the current location and group by hour
    loc_data = file40[file40["location"] == loc]
    loc_hour = loc_data.groupby("hour")[laf_cols].mean()

    # create the line plot for each LAF column
    for var in laf_cols:
        sns.lineplot(data=loc_hour[var], label=None, ax=ax)

    ax.set_title(f"{loc}")
    ax.set_xlabel("Hour")
    ax.set_ylabel("dB(A)")
    ax.set_xticks(loc_hour.index)

# hide grid cells that did not receive a location
for unused_ax in axs.flat[len(locations):]:
    unused_ax.set_visible(False)

legend = fig.legend(
    laf_cols, title="LAF values", loc="lower right", bbox_to_anchor=(1.1, 0.5)
)

# add a title to the whole plot
fig.suptitle("Mean LAF by hour and location")

plt.tight_layout()
plt.show()

The entries with a maximum `laf005_per_hour` > 100 occur primarily on `July 6` and `July 25`. What kind of events happened on these days?

In [None]:
# rows where laf005_per_hour exceeds 100
file40[file40.laf005_per_hour > 100]

### By date

In [None]:
locations = file40["location"].unique()

# Derive the grid shape from the number of locations instead of hard-coding
# 2x4: a ninth location would otherwise index past the last row of axes.
n_cols = 4
n_rows = max(1, -(-len(locations) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(20, 6 * n_rows), sharey=True, squeeze=False
)

# loop over all location values and plot them in the grid
for i, loc in enumerate(locations):

    row, col = divmod(i, n_cols)
    ax = axs[row, col]

    # filter the data for the current location and group by date
    loc_data = file40[file40["location"] == loc]
    loc_date = loc_data.groupby("date")[laf_cols].mean()

    # create the line plot for each LAF column
    for var in laf_cols:
        sns.lineplot(data=loc_date[var], label=None, ax=ax)

    ax.set_title(f"{loc}")
    ax.set_xlabel("Date")
    ax.set_ylabel("dB(A)")

# hide grid cells that did not receive a location
for unused_ax in axs.flat[len(locations):]:
    unused_ax.set_visible(False)

legend = fig.legend(
    laf_cols, title="LAF values", loc="lower right", bbox_to_anchor=(1.1, 0.5)
)

# add a title to the whole plot
fig.suptitle("Mean LAF by date and location")

plt.tight_layout()
plt.show()

 You usually see a peak during `July - September` in terms of max LAF values

In [None]:
locations = file40["location"].unique()
percentile_cols = ["laf05_per_hour", "laf50_per_hour", "laf95_per_hour"]

# Derive the grid shape from the number of locations instead of hard-coding
# 2x4: a ninth location would otherwise index past the last row of axes.
n_cols = 4
n_rows = max(1, -(-len(locations) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(20, 6 * n_rows), sharey=True, squeeze=False
)

# loop over all location values and plot them in the grid
for i, loc in enumerate(locations):

    row, col = divmod(i, n_cols)
    ax = axs[row, col]

    # filter the data for the current location and group by date.
    # The columns are selected with a *list*: selecting with a bare tuple
    # (groupby(...)["a", "b"]) raises on pandas >= 2.0.
    loc_data = file40[file40["location"] == loc]
    loc_date = loc_data.groupby("date")[percentile_cols].max()

    # create the line plot for each LAF column
    for var in percentile_cols:
        sns.lineplot(data=loc_date[var], label=None, ax=ax)

    ax.set_title(f"{loc}")
    ax.set_xlabel("Date")
    ax.set_ylabel("dB(A)")

# hide grid cells that did not receive a location
for unused_ax in axs.flat[len(locations):]:
    unused_ax.set_visible(False)

legend = fig.legend(
    percentile_cols,
    title="LAF values",
    loc="lower right",
    bbox_to_anchor=(1.1, 0.5),
)

# add a title to the whole plot
fig.suptitle("Max Laf values by date and location")

plt.tight_layout()
plt.show()

### By weekday

The mean LAF values usually peak on `Thu` and `Fri`. This coincides with our finding from File 41, where transportation sound occurred most frequently on `Thu`

In [None]:
locations = file40["location"].unique()
percentile_cols = ["laf05_per_hour", "laf50_per_hour", "laf95_per_hour"]
weekday_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# Derive the grid shape from the number of locations instead of hard-coding
# 2x4: a ninth location would otherwise index past the last row of axes.
n_cols = 4
n_rows = max(1, -(-len(locations) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(20, 6 * n_rows), sharey=True, squeeze=False
)

# loop over all location values and plot them in the grid
for i, loc in enumerate(locations):

    row, col = divmod(i, n_cols)
    ax = axs[row, col]

    # filter the data for the current location and group by weekday.
    # - columns are selected with a *list* (a bare tuple raises on pandas >= 2.0)
    # - the weekday labels are plain strings here, so the groupby result comes
    #   back alphabetically (Fri, Mon, Sat, ...); reindex restores Mon..Sun
    loc_data = file40[file40["location"] == loc]
    loc_date = (
        loc_data.groupby("weekday")[percentile_cols].mean().reindex(weekday_order)
    )

    # create the line plot for each LAF column
    for var in percentile_cols:
        sns.lineplot(data=loc_date[var], label=None, ax=ax)

    ax.set_title(f"{loc}")
    ax.set_xlabel("Weekday")
    ax.set_ylabel("dB(A)")

# hide grid cells that did not receive a location
for unused_ax in axs.flat[len(locations):]:
    unused_ax.set_visible(False)

legend = fig.legend(
    percentile_cols,
    title="LAF values",
    loc="lower right",
    bbox_to_anchor=(1.1, 0.5),
)

# add a title to the whole plot
fig.suptitle("Mean Laf values by weekday and location")

plt.tight_layout()
plt.show()

In [None]:
# Daily mean of every LAF column, pooled over all locations
laf_cols = [col for col in file40.columns if col.startswith("laf")]
noise_level_daily_mean = (
    file40.rename(columns={"result_timestamp": "datetime"})
    .set_index("datetime")[laf_cols]
    .resample("D")
    .mean()
)
noise_level_daily_mean.head(3)

### All locations

In [None]:
# one line per LAF column, legend placed outside the axes on the right
ax = sns.lineplot(data=noise_level_daily_mean)
ax.legend(
    title="LAF values", loc="center left", bbox_to_anchor=(1.0, 0.5), frameon=False
)
ax.set_title("Mean LAF by date in all locations")
plt.show()

## Meteo data

In [None]:
# show max columns: None removes the column limit when frames are displayed
pd.set_option("display.max_columns", None)

In [None]:
# preview the first rows of the meteo frame
meteo.head(5)

In [None]:
# column dtypes and non-null counts
meteo.info()

In [None]:
# set time as index
# NOTE: this is in-place, so re-running the cell without reloading `meteo`
# raises a KeyError because "datetime" is no longer a column.
meteo.set_index("datetime", inplace=True)
# resample the data by day (and by hour) and take the mean of every "LC*" column
lc_cols = [col for col in meteo.columns if col.startswith("LC")]
meteo_daily_mean = meteo[lc_cols].resample("D").mean()
meteo_hourly_mean = meteo[lc_cols].resample("H").mean()
# move "datetime" back from the index to a regular column
meteo_daily_mean.reset_index(inplace=True)
meteo_hourly_mean.reset_index(inplace=True)
# save the hourly means (the integer row index is written as well)
meteo_hourly_mean.to_csv("meteo_hourly.csv")

In [None]:
# One figure per meteorological variable, showing its daily mean over time.
for var in lc_cols:
    plt.figure()
    sns.lineplot(data=meteo_daily_mean, x="datetime", y=var)
    # Title with the variable being plotted; this previously used `col`, a
    # stale loop variable left over from earlier cells.
    plt.title(var)
    plt.xlabel("Date")
    plt.ylabel(var)

### Correlation between daily meteorological data and LAF

In [None]:
# display the daily mean noise levels
noise_level_daily_mean

In [None]:
# Merge meteo daily data with mean daily noise level data


# move "datetime" from the index back to a column so it can serve as merge key
noise_level_daily_mean.reset_index(inplace=True)
# default (inner) merge: only dates present in both frames are kept
meteo_noise_daily_mean = meteo_daily_mean.merge(
    noise_level_daily_mean[["datetime", "laf50_per_hour"]], on=["datetime"]
)
meteo_noise_daily_mean.head(3)

`laf50_per_hour` by date displays a weak correlation with weather conditions

In [None]:
plt.figure(figsize=(15, 12))
palette = sns.diverging_palette(20, 220, n=256)
# Correlate the numeric columns only: the merged frame still carries the
# "datetime" merge key, which is not a meaningful correlation input and is no
# longer dropped implicitly by DataFrame.corr on pandas >= 2.0.
corr = meteo_noise_daily_mean.corr(method="pearson", numeric_only=True)
sns.heatmap(corr, annot=True, fmt=".2f", cmap=palette, center=0, annot_kws={"size": 8})
plt.title(
    "Correlation Matrix between daily meteorological data and LAF value",
    size=15,
    weight="bold",
)
plt.show()

### Correlation between hourly meteorological data and LAF

In [None]:
# resample the data by hour and take the mean
# (requires `meteo` to be indexed by "datetime" already; rebuilds the hourly frame)
lc_cols = [col for col in meteo.columns if col.startswith("LC")]
meteo_hourly_mean = meteo[lc_cols].resample("H").mean()
# move "datetime" back from the index to a regular column
meteo_hourly_mean.reset_index(inplace=True)
meteo_hourly_mean.head(3)

In [None]:
# Resample noise level by hour in all locations
laf_cols = [col for col in file40.columns if col.startswith("laf")]
noise_level_hourly_mean = file40.copy()
noise_level_hourly_mean.rename(columns={"result_timestamp": "datetime"}, inplace=True)
noise_level_hourly_mean.set_index("datetime", inplace=True)
noise_level_hourly_mean = noise_level_hourly_mean[laf_cols].resample("H").mean()
# Save first, preview last: only the final expression of a cell is rendered, so
# the preview was previously computed and thrown away.
noise_level_hourly_mean.to_csv("file40_hourly.csv")
noise_level_hourly_mean.head(3)

In [None]:
# Merge meteo hourly data with mean hourly noise level data
# ("datetime" is first moved from the index back to a column to act as merge key)
noise_level_hourly_mean.reset_index(inplace=True)
meteo_noise_hourly_mean = meteo_hourly_mean.merge(
    noise_level_hourly_mean[["datetime", "laf50_per_hour"]], on=["datetime"]
)
meteo_noise_hourly_mean.head(3)

In [None]:
plt.figure(figsize=(15, 12))
palette = sns.diverging_palette(20, 220, n=256)
# Correlate the numeric columns only: the merged frame still carries the
# "datetime" merge key, which is not a meaningful correlation input and is no
# longer dropped implicitly by DataFrame.corr on pandas >= 2.0.
corr = meteo_noise_hourly_mean.corr(method="pearson", numeric_only=True)
sns.heatmap(corr, annot=True, fmt=".2f", cmap=palette, center=0, annot_kws={"size": 8})
plt.title(
    "Correlation Matrix between hourly meteorological data and LAF value",
    size=15,
    weight="bold",
)
plt.show()

In [None]:
# overwrite the file41.csv snapshot with the current frame (row index included)
file41.to_csv("file41.csv")

# File 41

In [None]:
# preview the first rows of the event frame
file41.head()

In [None]:
# Reload the snapshot written by `file41.to_csv("file41.csv")`. That call also
# writes the row index as the first column, so it is read back as the index
# rather than as a stray "Unnamed: 0" data column.
file41 = pd.read_csv("file41.csv", index_col=0)

In [None]:
# the CSV round trip turned the date columns into text; parse them again
for date_col in ("result_timestamp", "date"):
    file41[date_col] = pd.to_datetime(file41[date_col])
file41.head(5)

In [None]:
# number of events per (hour, weekday, date, event type, location)
aggregated_file41 = (
    file41.groupby(["hour", "weekday", "date", "noise_event", "location"])
    .size()
    .rename("count")
    .reset_index()
)

# weekday as an ordered categorical so that Mon..Sun sorts chronologically
weekday_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
aggregated_file41["weekday"] = pd.Categorical(
    aggregated_file41["weekday"], categories=weekday_order, ordered=True
)
aggregated_file41

In [None]:
# Total number of events per (location, weekday).
# pivot_table aggregates with the *mean* by default, which would show the
# average size of an (hour, date, event) group instead of the event frequency
# the heatmap is labelled with, so the counts are summed explicitly.
heatmap_data = aggregated_file41.pivot_table(
    index="location", columns="weekday", values="count", aggfunc="sum", fill_value=0
)
heatmap_data

In [None]:
# Create the heatmap using Plotly (weekdays on x, locations on y)
fig = px.imshow(
    heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    labels=dict(x="Weekday", y="Location", color="Frequency"),
    color_continuous_scale="YlOrRd",
)

# Set the title; update_layout returns the figure, so as the last expression
# of the cell it also renders the plot
fig.update_layout(title="Frequency of Noise Events")

In [None]:
# Group the data: number of events per (location, event type)
grouped = file41.groupby(["location", "noise_event"]).size().reset_index(name="count")

# Create the bar plot using Plotly (one colour per location)
fig = px.bar(
    grouped,
    x="noise_event",
    y="count",
    color="location",
    title="Frequency by Location and Noise Event",
    labels={"noise_event": "Noise Event", "count": "Frequency"},
    height=600,
    width=900,
)

# Order the x-axis categories by their total count, largest first
fig.update_layout(xaxis={"categoryorder": "total descending"})

# Show the plot
fig.show()

# File 42

Uses the incomplete, reduced version of the dataset

In [None]:
# parse the timestamp and derive date, month, hour and abbreviated weekday from it
file42["result_timestamp"] = pd.to_datetime(file42["result_timestamp"])
for feature in ("date", "month", "hour"):
    file42[feature] = getattr(file42["result_timestamp"].dt, feature)
file42["weekday"] = file42["result_timestamp"].dt.strftime("%a")

In [None]:
# preview the last rows of the frame
file42.tail(5)

In [None]:
# Make sure weekday and lamax columns are of correct type
file42["weekday"] = pd.Categorical(
    file42["weekday"],
    categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
    ordered=True,
)
file42["lamax"] = file42["lamax"].astype(float)

# One colour per day of the week ('husl' palette with 7 colours)
colors = sns.color_palette("husl", 7)

# One horizontal violin per weekday; seaborn's 0-1 float triples are converted
# to the "rgb(r, g, b)" strings Plotly expects
traces = [
    go.Violin(
        x=file42["lamax"][file42["weekday"] == day],
        line_color="rgb" + str(tuple(int(c * 255) for c in rgb)),
        name=day,
    )
    for day, rgb in zip(file42["weekday"].cat.categories, colors)
]

# Define the layout: violins are overlaid without gaps
layout = go.Layout(
    title="Distribution of Lamax by Weekday",
    xaxis_title="Lamax",
    yaxis_title="Weekday",
    violingap=0,
    violingroupgap=0,
    violinmode="overlay",
)

# Create the figure and show it
fig = go.Figure(data=traces, layout=layout)
fig.show()

In [None]:
def create_violin_plot(df, groupby_col, title, value_col="lamax"):
    """
    Show one horizontal Plotly violin per distinct value of a grouping column.

    Args:
    df (pandas.DataFrame): Frame containing `groupby_col` and `value_col`.
    groupby_col (str): Column whose distinct (non-missing) values define the
        violins; also used, capitalised, as the y-axis title.
    title (str): Figure title.
    value_col (str): Column whose distribution is drawn. Defaults to "lamax".

    Returns:
    None. The figure is displayed with `fig.show()`.
    """
    # Missing group labels are dropped up front: `nunique()` ignores NaN while
    # `unique()` keeps it, which left the palette one colour short of the trace
    # loop, and NaN cannot be ordered reliably by `sorted`.
    group_values = sorted(df[groupby_col].dropna().unique())
    colors = sns.color_palette("husl", len(group_values))

    # Create one trace per group value
    traces = []
    for val, rgb in zip(group_values, colors):
        # Plotly expects "rgb(r, g, b)" with 0-255 integers
        color = "rgb" + str(tuple(int(c * 255) for c in rgb))
        traces.append(
            go.Violin(
                x=df[value_col][df[groupby_col] == val],
                line_color=color,
                name=str(val),
            )
        )

    # Define the layout: violins are overlaid without gaps
    layout = go.Layout(
        title=title,
        xaxis_title=value_col.capitalize(),
        yaxis_title=groupby_col.capitalize(),
        violingap=0,
        violingroupgap=0,
        violinmode="overlay",
    )

    # Create the figure and add traces
    fig = go.Figure(data=traces, layout=layout)

    # Show the plot
    fig.show()

In [None]:
# Make sure 'hour' and 'lamax' columns are of correct type
file42["hour"] = file42["hour"].astype(int)
file42["lamax"] = file42["lamax"].astype(float)

# one violin per hour of the day
create_violin_plot(file42, "hour", "Distribution of Lamax by Hour")

# Make sure 'month' and 'lamax' columns are of correct type
file42["month"] = file42["month"].astype(int)

# one violin per month
create_violin_plot(file42, "month", "Distribution of Lamax by Month")

In [None]:
# open the pre-processed files (the first CSV column is the saved row index)
weather_data = pd.read_csv("../data/processed_weather_data_leuven.csv", index_col=0)
air_quality = pd.read_csv("../data/processed_air_quality_data.csv", index_col=0)
file42 = pd.read_csv("../data/processed_file42_data.csv", index_col=0)

# drop rows without a lamax measurement
file42.dropna(subset="lamax", inplace=True)

# give every frame the same name for its timestamp column
file42.rename(columns={"result_timestamp": "time"}, inplace=True)
air_quality.rename(columns={"dt": "time"}, inplace=True)

# weather and air quality are inner-joined first; the right join then keeps
# every noise row even when no weather/air-quality record matches
merged_df = weather_data.merge(
    air_quality, how="inner", on=["time", "hour", "month"]
)
merged_df = merged_df.merge(file42, how="right", on=["time", "hour", "month"])

In [None]:
plt.figure(figsize=(15, 12))
palette = sns.diverging_palette(20, 220, n=256)
# Correlate the numeric columns only: the merged frame also holds text columns
# (e.g. the "time" merge key read from CSV), which make DataFrame.corr raise on
# pandas >= 2.0 where non-numeric columns are no longer dropped implicitly.
corr = merged_df.corr(method="pearson", numeric_only=True)
sns.heatmap(corr, annot=False, fmt=".2f", cmap=palette, center=0, annot_kws={"size": 8})
plt.title(
    "Correlation Matrix between daily meteorological data and noise measurements",
    size=15,
    weight="bold",
)
plt.show()

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import numpy as np

# Calculate correlation matrix on the numeric columns only: the merged frame
# also holds text columns (e.g. the "time" merge key read from CSV), which make
# DataFrame.corr raise on pandas >= 2.0.
corr = merged_df.corr(method="pearson", numeric_only=True)

# Create heatmap; the colour scale is pinned to the full [-1, 1] correlation range
fig = go.Figure(
    data=go.Heatmap(
        z=corr.values,
        x=corr.columns,
        y=corr.index,
        colorscale="RdBu",
        zmin=-1,
        zmax=1,
        colorbar=dict(title="Correlation"),
    )
)

# Customize layout
fig.update_layout(
    title="Correlation Matrix between daily meteorological data and noise measurements",
    width=900,
    height=900,
    xaxis=dict(title="Columns"),
    yaxis=dict(title="Rows"),
)

# Display the plot
pio.show(fig)