In [None]:
import numpy as np
import pandas as pd
import datetime
#import missingno as msno
from pandas import DataFrame
from pandas import concat

from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Extra Libs
import seaborn as sns
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import plotly.express as px
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.models import HoverTool
from IPython.display import HTML, display

# Default visual settings: serif fonts and a higher figure resolution
plt.rcParams.update({
    "font.family": "serif",
    'figure.dpi': 150,
})

# Silence every warning for the rest of the session.
# Be aware that this also hides deprecation warnings raised by the plotting/data libraries.

import warnings
warnings.filterwarnings("ignore")



In [None]:

# Read the Excel exports of the two stations (RH and ZI).
# The folder can be overridden with the AIT_MELLOUL_DATA_DIR environment variable so the
# notebook is not tied to one machine; the default is the original location.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get("AIT_MELLOUL_DATA_DIR", "C:/Users/abdbe/Documents/Article ait melloul"))

df_RH = pd.read_excel(DATA_DIR / "RH101W.xlsx")
df_ZI = pd.read_excel(DATA_DIR / "ZI101W.xlsx")

In [None]:
# Display the ZI station frame (rich notebook rendering of the last expression)
df_ZI

In [None]:
# Display the RH station frame (rich notebook rendering of the last expression)
df_RH

In [None]:
# Share of missing entries per column, expressed in percent
def calculate_missing_data(df):
    """Return, for every column of ``df``, the percentage of rows that are NaN/None.

    The result is a Series indexed by column name.
    """
    n_rows = len(df)
    missing_per_column = df.isna().sum()
    return missing_per_column * 100 / n_rows

# Calculate the percentage of missing data for each dataset
missing_data_df_RH = calculate_missing_data(df_RH)
missing_data_df_ZI = calculate_missing_data(df_ZI)

# Columns that have no missing value in either station.
# NOTE(review): this mask and the positional `.values` below assume df_RH and df_ZI
# have the same columns in the same order — confirm against the source workbooks.
complete_columns = df_RH.columns[~df_RH.isna().any() & ~df_ZI.isna().any()]

# One row per column: missing percentage per station, plus a 0/100 flag that is 0 only
# for columns that are complete in both stations
missing_data_combined = pd.DataFrame({
    'Column': df_RH.columns,
    'df_RH': missing_data_df_RH.values,
    'df_ZI': missing_data_df_ZI.values,
    'Complete Dataset': [0 if col in complete_columns else 100 for col in df_RH.columns]
})

# Long format (one row per column/dataset pair) for seaborn's hue grouping
missing_data_melted = missing_data_combined.melt(id_vars='Column', var_name='Dataset', value_name='MissingPercentage')

# Plot the missing data percentages
fig = plt.figure(figsize=(20, 10))
sns.barplot(x="Column", y="MissingPercentage", hue="Dataset", data=missing_data_melted, palette='viridis')
plt.xticks(rotation=90)
plt.xlabel('Columns')
plt.ylabel('Percentage of Missing Data')
plt.title('Percentage of Missing Data by Column in df_RH, df_ZI, and Complete Dataset')
plt.tight_layout()

# Save BEFORE showing: once plt.show() has displayed the figure, a pyplot-level savefig
# no longer targets it and writes a blank image instead.
fig.savefig('missing_data_percentage_combined.png')

# Display the plot
plt.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Sensor columns compared in the "imputed vs. actual" figure, taken from each station frame
imputation_columns = ["PM2.5","PM10","PM1","CO","CO2","RH","RT"]
df_impRH = df_RH[imputation_columns]
df_impZI = df_ZI[imputation_columns]

def plot_imputed_vs_actual(ax, df_imp, title):
    """Draw a horizontal grouped bar chart of null vs. non-null counts per column.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes the bars are drawn on.
    df_imp : pandas.DataFrame
        Frame whose columns are counted.
    title : str
        Title of the subplot.

    Null cells are plotted under the label 'Imputed' and non-null cells under
    'Actual'. No legend is attached to ``ax``; the caller can build one from the
    axes' legend handles.
    """
    # Null / non-null counts per column, rows ordered alphabetically by column name
    counts = pd.DataFrame({
        'Imputed': df_imp.isnull().sum(),
        'Actual': df_imp.notnull().sum(),
    }).sort_index()

    bar_colors = ['#4C72B0', '#C44E52']
    counts.plot(kind='barh', ax=ax, color=bar_colors, width=0.8, legend=False)

    # Vertical guide lines only
    ax.grid(axis='x', linestyle='--', linewidth=0.5)

    # Title and axis labels
    ax.set_title(title, fontsize=18, weight='bold', pad=20)
    ax.set_xlabel('Count', fontsize=16, weight='bold', labelpad=15)
    ax.set_ylabel('Variables', fontsize=16, weight='bold', labelpad=15)

    # Same tick styling on both axes
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=14, width=2)

    # Open frame: hide top/right spines, thicken the remaining two
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        ax.spines[side].set_linewidth(1.5)

# Set the style.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use the new name when it is available and fall back to the legacy one.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)

# One panel per station, sharing the variable axis
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10), sharey=True)

# Plot for df_impRH
plot_imputed_vs_actual(axes[0], df_impRH, "Imputed vs. Actual Data by Variable (S_RH)")

# Plot for df_impZI
plot_imputed_vs_actual(axes[1], df_impZI, "Imputed vs. Actual Data by Variable (S_ZI)")

# A single figure-level legend, built from the first panel's bar handles
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, 0.0), fontsize=14, title_fontsize='16', ncol=2)

# Leave the bottom 10% of the figure free for the legend
fig.tight_layout(rect=[0, 0.1, 1, 1])

# Save the figure with high resolution
plt.savefig("imputed_data_comparison.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:

# Assuming df_RH and df_ZI are already defined

# Number of missing entries per column
def calculate_missing_data_counts(df):
    """Return a Series giving, for each column of ``df``, how many cells are NaN/None."""
    is_missing = df.isna()
    return is_missing.sum()

# Calculate the count of missing data for each dataset
missing_data_counts_df_RH = calculate_missing_data_counts(df_RH)
missing_data_counts_df_ZI = calculate_missing_data_counts(df_ZI)

# Keep only the columns that actually have missing data
missing_data_counts_df_RH = missing_data_counts_df_RH[missing_data_counts_df_RH > 0]
missing_data_counts_df_ZI = missing_data_counts_df_ZI[missing_data_counts_df_ZI > 0]

# Identify complete columns (columns without any missing values in either station)
complete_columns = df_RH.columns[~df_RH.isna().any() & ~df_ZI.isna().any()]

# Columns with a gap in at least one station. Computed once and sorted so that every list
# below is in the same order and the bar order does not depend on set/hash ordering.
columns_with_gaps = sorted(set(missing_data_counts_df_RH.index) | set(missing_data_counts_df_ZI.index))

# Create a DataFrame to hold missing data counts
missing_data_combined = pd.DataFrame({
    'Column': columns_with_gaps,
    'df_RH': [missing_data_counts_df_RH.get(col, 0) for col in columns_with_gaps],
    'df_ZI': [missing_data_counts_df_ZI.get(col, 0) for col in columns_with_gaps],
    'Complete Dataset': [0 if col in complete_columns else max(len(df_RH), len(df_ZI)) for col in columns_with_gaps]
})

# Melt the DataFrame for plotting
missing_data_melted = missing_data_combined.melt(id_vars='Column', var_name='Dataset', value_name='MissingCount')

# Set the style.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use the new name when it is available and fall back to the legacy one.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)

# Create the figure and axes
fig, ax = plt.subplots(figsize=(20, 10))

# Create the plot
sns.barplot(x='MissingCount', y='Column', hue='Dataset', data=missing_data_melted, palette='viridis', ax=ax)

# Add grid lines for better readability
ax.grid(axis='x', linestyle='--', linewidth=0.5)

# Add title and axis labels
ax.set_title('Count of Missing Data by Column in df_RH, df_ZI, and Complete Dataset', fontsize=18, weight='bold', pad=20)
ax.set_xlabel('Count of Missing Data', fontsize=16, weight='bold', labelpad=15)
ax.set_ylabel('Variables', fontsize=16, weight='bold', labelpad=15)

# Customize ticks
ax.tick_params(axis='x', labelsize=14, width=2)
ax.tick_params(axis='y', labelsize=14, width=2)

# Remove top and right spines for a cleaner look
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(1.5)
ax.spines['bottom'].set_linewidth(1.5)

# Remove the axes-level legend created by seaborn; a figure-level one replaces it below
ax.legend_.remove()

# Set the legend at the bottom
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.01), fontsize=14, title_fontsize='16', ncol=3)

# Leave the bottom 10% of the figure free for the legend
fig.tight_layout(rect=[0, 0.1, 1, 1])

# Save the figure with high resolution
plt.savefig('missing_data_count_combined.png', dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:
# Index both station frames by their 'Time' column (the column itself is kept)
df_RH, df_ZI = (
    frame.set_index(pd.DatetimeIndex(frame['Time']))
    for frame in (df_RH, df_ZI)
)

In [None]:
# Pollutant and meteorological columns used by the time-series and box-plot figures
graph_columns = ["PM2.5","PM10","CO","CO2","humidity","temp","dew","windgust","windspeed","winddir"]
df_graphZI = df_ZI[graph_columns]
df_graphRH = df_RH[graph_columns]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Set seaborn style
sns.set(style="whitegrid")

# Only the first rows of the ZI frame are plotted; the cut-off is named so it is easy to tune
N_ROWS_PLOTTED = 320
df_graphZI_subset = df_graphZI[:N_ROWS_PLOTTED]

# One stacked panel per variable, all sharing the time axis
fig, axes = plt.subplots(nrows=len(df_graphZI_subset.columns), ncols=1, figsize=(15, 10), sharex=True)

# Plot each column in its own panel; `label=column` gives every panel its own legend entry
for i, column in enumerate(df_graphZI_subset.columns):
    sns.lineplot(data=df_graphZI_subset, x=df_graphZI_subset.index, y=column, ax=axes[i], linewidth=2.5, label=column)
    axes[i].set_ylabel('', fontsize=14, weight='bold')  # Remove y-axis label
    axes[i].tick_params(axis='both', which='major', labelsize=12)
    axes[i].grid(True, linestyle='--', linewidth=0.5)

# Common labels
fig.text(0.5, 0.04, 'Time', ha='center', fontsize=18, weight='bold')
fig.text(0.04, 0.5, 'Values', va='center', rotation='vertical', fontsize=18, weight='bold')

# Keep a margin on the left/bottom for the shared labels written with fig.text above
fig.tight_layout(rect=[0.05, 0.05, 1, 0.95])

# Save the figure with high resolution
plt.savefig("pollutionMeteorologicalChanges.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:

# Set the seaborn style
sns.set(style="whitegrid")

# Two side-by-side panels: ZI on the left, RH on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 10))

boxplot_panels = (
    (axes[0], df_graphZI, "Boxplot of Hourly Mean Levels\n(Station ZI)"),
    (axes[1], df_graphRH, "Boxplot of Hourly Mean Levels\n(Station RH)"),
)

# Both panels get the same horizontal box plot (first 320 rows) and identical styling
for panel_ax, station_frame, panel_title in boxplot_panels:
    sns.boxplot(data=station_frame[:320], orient='h', ax=panel_ax, linewidth=2.5, palette="Set2")
    panel_ax.set_title(panel_title, fontsize=24, weight='bold', pad=20)
    panel_ax.set_xlabel('Values', fontsize=20, weight='bold', labelpad=15)
    panel_ax.set_ylabel('Parameters', fontsize=20, weight='bold', labelpad=15)
    for axis_name in ('x', 'y'):
        panel_ax.tick_params(axis=axis_name, labelsize=18, width=2)
    panel_ax.grid(True, which='both', linestyle='--', linewidth=0.6)
    for side in ('top', 'right'):
        panel_ax.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        panel_ax.spines[side].set_linewidth(1.5)

# Keep a 5% margin on every side of the figure
fig.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

# Save the figure with high resolution
plt.savefig("boxplot.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:
# Derive calendar features from the 'Time' column of each station frame.
# The vectorised `.dt` accessor replaces the per-row `.apply(lambda ...)` calls, and both
# frames now receive the new columns in the same order.
for frame in (df_RH, df_ZI):
    timestamps = pd.to_datetime(frame['Time'])  # no-op when the column is already datetime64
    frame['Year'] = timestamps.dt.year
    frame['Month'] = timestamps.dt.month
    frame['Day'] = timestamps.dt.day
    frame['Hour'] = timestamps.dt.hour
    frame['Day of Week'] = timestamps.dt.dayofweek

In [None]:
# The timestamps are already held by the index, so the 'Time' column can be removed
df_RH = df_RH.drop(columns=['Time'])
df_ZI = df_ZI.drop(columns=['Time'])

In [None]:
# Aggregate each station frame to hourly means along its datetime index.
# NOTE(review): df_RH/df_ZI are rebound here, so re-running this cell resamples data
# that is already hourly; hours without samples presumably become NaN rows — confirm.
df_RH=df_RH.resample('H').mean()
df_ZI=df_ZI.resample('H').mean()

In [None]:
# Summary statistics of the ZI station frame
df_ZI.describe()

In [None]:
# Work on independent copies so later edits do not touch df_RH / df_ZI
dataRH, dataZI = df_RH.copy(), df_ZI.copy()

In [None]:

# The frames are indexed by the hourly timestamps produced by the resample, so reuse that
# index as the 'date' column. Re-assembling the date from the averaged
# Year/Month/Day/Hour columns gives the same value for hours that have samples, but hours
# without samples have NaN calendar parts and cannot be turned back into a valid timestamp.
dataRH["date"] = dataRH.index
dataZI["date"] = dataZI.index


In [None]:
# Use the 'date' column as the datetime index of both working frames
dataRH, dataZI = (
    frame.set_index(pd.DatetimeIndex(frame['date']))
    for frame in (dataRH, dataZI)
)


In [None]:
# 'date' is now the index, so the column copy is no longer needed
dataRH = dataRH.drop(columns=['date'])
dataZI = dataZI.drop(columns=['date'])

In [None]:
# Display the ZI working frame (rich notebook rendering of the last expression)
dataZI

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# dataRH and dataZI are the datetime-indexed working frames built in the cells above.

# Hourly PM2.5 series of both stations; hours without data stay NaN so gaps remain visible
dataRH_hourly = dataRH['PM2.5'].resample('H').mean()
dataZI_hourly = dataZI['PM2.5'].resample('H').mean()

# Combine the two series on their timestamps rather than by position: pairing the raw
# `.values` arrays raises when the stations cover different periods and would otherwise
# plot the ZI values against RH timestamps.
combined_data = (
    pd.concat({'S_RH': dataRH_hourly, 'S_ZI': dataZI_hourly}, axis=1)
      .rename_axis('Datetime')
      .reset_index()
)

# Set the style.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use the new name when it is available and fall back to the legacy one.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)

# Create the plot with reduced height
fig, ax = plt.subplots(figsize=(15, 6))

# Plot the hourly PM2.5 concentration for both sites, including gaps
ax.plot(combined_data['Datetime'], combined_data['S_RH'], label='S_RH', color='blue', linewidth=2.5)
ax.plot(combined_data['Datetime'], combined_data['S_ZI'], label='S_ZI', color='red', linewidth=2.5)

# Add title and axis labels
ax.set_title("Hourly PM2.5 Concentration ", fontsize=20, weight='bold', pad=20)
ax.set_xlabel('Hours', fontsize=16, weight='bold', labelpad=15)
ax.set_ylabel('Concentration PM2.5 (µg/m³)', fontsize=16, weight='bold', labelpad=15)

# Customize ticks
ax.tick_params(axis='x', labelsize=14, width=2)
ax.tick_params(axis='y', labelsize=14, width=2)

# Set legend
ax.legend(loc="upper right", fontsize=14, title="Monitoring Site", title_fontsize='14')

# Remove top and right spines for a cleaner look
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(1.5)
ax.spines['bottom'].set_linewidth(1.5)

# Add grid lines for better readability
ax.grid(axis='both', linestyle='--', linewidth=0.5)

# Save the figure with high resolution
plt.savefig("hourly_PM25.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# dataRH and dataZI are the datetime-indexed working frames built in the cells above.

# Resample data to get average hourly values
dataRH_hourly = dataRH.resample('H').mean()
dataZI_hourly = dataZI.resample('H').mean()

# Group by hour of the day to get the average value for each hour across all days
dataRH_hourly_avg = dataRH_hourly.groupby(dataRH_hourly.index.hour).mean()
dataZI_hourly_avg = dataZI_hourly.groupby(dataZI_hourly.index.hour).mean()

# Combine the two PM2.5 profiles on the hour-of-day key rather than by position, so a
# station that lacks some hours cannot shift the other station's values.
combined_data = (
    pd.concat({'S_RH': dataRH_hourly_avg['PM2.5'], 'S_ZI': dataZI_hourly_avg['PM2.5']}, axis=1)
      .rename_axis('Hour')
      .reset_index()
)

# Set the style.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use the new name when it is available and fall back to the legacy one.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)

# Create the plot with reduced height
fig, ax = plt.subplots(figsize=(15, 6))

# Plot the average hourly PM2.5 concentration for both sites (drawn explicitly on `ax`)
sns.lineplot(x='Hour', y='S_RH', data=combined_data, label='S_RH', color='blue', linewidth=2.5, ax=ax)
sns.lineplot(x='Hour', y='S_ZI', data=combined_data, label='S_ZI', color='red', linewidth=2.5, ax=ax)

# Add title and axis labels
ax.set_title("Average Hourly PM2.5 Concentration Over 24 Hours", fontsize=20, weight='bold', pad=20)
ax.set_xlabel('Hour of Day', fontsize=16, weight='bold', labelpad=15)
ax.set_ylabel('Concentration PM2.5 (µg/m³)', fontsize=16, weight='bold', labelpad=15)

# Customize ticks
ax.tick_params(axis='x', labelsize=14, width=2)
ax.tick_params(axis='y', labelsize=14, width=2)

# Set legend
ax.legend(loc="upper right", fontsize=14, title="Monitoring Site", title_fontsize='14')

# Remove top and right spines for a cleaner look
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(1.5)
ax.spines['bottom'].set_linewidth(1.5)

# Add grid lines for better readability
ax.grid(axis='both', linestyle='--', linewidth=0.5)

# Save the figure with high resolution
plt.savefig("average_hourly_PM25.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:
# Remove the calendar helper columns and PM1 before the correlation analysis.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError for columns that are already gone.
columns_to_drop = ['Year','Month','Day','Hour','Day of Week','PM1']
dataRH = dataRH.drop(columns=columns_to_drop, errors='ignore')
dataZI = dataZI.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Display the RH working frame (rich notebook rendering of the last expression)
dataRH

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# dataRH and dataZI are the working frames prepared in the cells above.

# Compute the correlation matrices
corr_matrix_RH = dataRH.corr()
corr_matrix_ZI = dataZI.corr()

# Set the style.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use the new name when it is available and fall back to the legacy one.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')
plt.style.use(whitegrid_style)

# Boolean masks hiding the upper triangle (diagonal included) of each matrix
mask_RH = np.triu(np.ones_like(corr_matrix_RH, dtype=bool))
mask_ZI = np.triu(np.ones_like(corr_matrix_ZI, dtype=bool))

# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 20), sharey=True)

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

heatmap_panels = (
    (axes[0], corr_matrix_RH, mask_RH, "Correlation Matrix in Station RH"),
    (axes[1], corr_matrix_ZI, mask_ZI, "Correlation Matrix in Station ZI"),
)

# Draw both heatmaps with identical settings.
# NOTE(review): vmax=.3 saturates the colour scale for any correlation above 0.3 — confirm
# this is intended.
for ax, corr_matrix, mask, panel_title in heatmap_panels:
    sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, fmt=".2f", ax=ax)
    ax.set_title(panel_title, fontweight="bold", fontsize=24, pad=20)
    ax.tick_params(axis='x', labelsize=14, rotation=90)
    ax.tick_params(axis='y', labelsize=14)

# Customize the spines and grid
for ax in axes:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1.5)
    ax.spines['bottom'].set_linewidth(1.5)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

# Adjust layout to make room for titles and labels
fig.tight_layout()

# Save the figure with high resolution
plt.savefig("correlation_figure.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:
# Number of rows overall, and number of rows whose wind gust is exactly zero ("calm")
total_count = len(dataRH)
calm_rows = dataRH.query("windgust == 0")
calm_count = len(calm_rows)

print('Of {} total observations, {} have calm winds.'.format(total_count, calm_count))

In [None]:
def speed_labels(bins, units):
    """Build one text label per consecutive pair of bin edges.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges; ``len(bins) - 1`` labels are produced.
    units : str
        Unit suffix appended to every label except 'calm'.

    Returns
    -------
    list of str
        'calm' for the bin that starts at ``bins[0]``, '>{left} {units}' for a bin
        whose right edge is infinite, and '{left} - {right} {units}' otherwise.
    """
    labels = []
    for left, right in zip(bins[:-1], bins[1:]):
        if left == bins[0]:
            labels.append('calm')
        elif np.isinf(right):  # numpy is imported as `np` in this notebook
            labels.append('>{} {}'.format(left, units))
        else:
            labels.append('{} - {} {}'.format(left, right, units))

    return list(labels)

def _convert_dir(directions, N=None):
    if N is None:
        N = directions.shape[0]
    barDir = directions * numpy.pi/180. - numpy.pi/N
    barWidth = 2 * numpy.pi / N
    return barDir, barWidth

In [None]:
#import metar

seaborn.set_style('ticks')
%matplotlib inline

spd_bins = [-1, 0, 5, 10, 15, 20, 25, 30,35,40, numpy.inf]
spd_labels = speed_labels(spd_bins, units='KM/h')

dir_bins = numpy.arange(-7.5, 370, 15)
dir_labels = (dir_bins[:-1] + dir_bins[1:]) / 2

In [None]:
# Percentage of observations per (direction sector, speed class).
# pandas is imported as `pd` in this notebook.
rose = (
    dataRH.assign(WindSpd_bins=lambda df:
            pd.cut(df['windgust'], bins=spd_bins, labels=spd_labels, right=True)
         )
        .assign(WindDir_bins=lambda df:
            pd.cut(df['winddir'], bins=dir_bins, labels=dir_labels, right=False)
         )
        # the 360-degree bin is the same sector as 0 degrees
        .replace({'WindDir_bins': {360: 0}})
        .groupby(by=['WindSpd_bins', 'WindDir_bins'])
        .size()
        .unstack(level='WindSpd_bins')
        .fillna(0)
        # spread the calm observations evenly over all direction sectors
        .assign(calm=lambda df: calm_count / df.shape[0])
        .sort_index(axis=1)
        # counts -> percent of all observations (vectorised; same arithmetic as x / total * 100)
        .div(total_count)
        .mul(100)
)

In [None]:
# Set seaborn style
sns.set_style('ticks')
%matplotlib inline



# Define bins and labels for wind speed and direction
spd_bins = [-1, 0, 5, 10, 15, 20, 25, 30, 35, 40, np.inf]
spd_labels = [f"{spd_bins[i]}-{spd_bins[i+1]}" for i in range(len(spd_bins)-1)]

dir_bins = np.arange(-7.5, 370, 15)
dir_labels = (dir_bins[:-1] + dir_bins[1:]) / 2

# Create the rose data
total_count = len(dataRH)
calm_count = len(dataRH[dataRH['windgust'] <= 0.5])

rose = (
    dataRH.assign(WindSpd_bins=lambda df:
            pd.cut(df['windgust'], bins=spd_bins, labels=spd_labels, right=True)
         )
        .assign(WindDir_bins=lambda df:
            pd.cut(df['winddir'], bins=dir_bins, labels=dir_labels, right=False)
         )
        .replace({'WindDir_bins': {360: 0}})
        .groupby(by=['WindSpd_bins', 'WindDir_bins'])
        .size()
        .unstack(level='WindSpd_bins')
        .fillna(0)
        .assign(calm=lambda df: calm_count / df.shape[0])
        .sort_index(axis=1)
        .applymap(lambda x: x / total_count * 100)
)

# Function to create the wind rose plot
def wind_rose(rosedata, wind_dirs, palette=None):
    """Draw a stacked polar bar chart (wind rose) and return the figure.

    Parameters
    ----------
    rosedata : pandas.DataFrame
        One row per direction sector and one column per speed class; the columns
        are stacked outwards in column order.
    wind_dirs : array-like
        Sector directions in degrees, one per row of ``rosedata``; converted with
        the module-level ``_convert_dir``.
    palette : sequence of colours, optional
        One colour per column. Defaults to seaborn's 'magma' palette with as many
        colours as ``rosedata`` has columns.

    Returns
    -------
    matplotlib Figure containing the polar axes.
    """
    if palette is None:
        # Custom color palette
        palette = sns.color_palette('magma', n_colors=rosedata.shape[1])

    bar_dir, bar_width = _convert_dir(wind_dirs)

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
    ax.set_theta_direction('clockwise')
    ax.set_theta_zero_location('N')

    # Running totals across speed classes, computed once instead of once per bar layer
    stacked_tops = rosedata.cumsum(axis=1)

    # Every column is drawn on top of the running total of the columns before it; the
    # first column starts at zero. Iterating over the columns themselves (rather than
    # over consecutive pairs) also draws the bars when there is only a single column.
    previous = None
    for n, column in enumerate(rosedata.columns):
        bottom = None if previous is None else stacked_tops[previous].values
        ax.bar(bar_dir, rosedata[column].values,
               width=bar_width,
               bottom=bottom,
               color=palette[n],
               edgecolor='none',
               label=column,
               linewidth=0)
        previous = column

    ax.legend(loc=(0.75, 0.95), ncol=2)
    # Pin the eight tick positions (every 45 degrees) before naming them, so the compass
    # labels cannot drift if the default tick locations change
    ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
    ax.set_xticklabels(['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW'])
    
    return fig

# Convert wind directions
def _convert_dir(wind_dirs):
    bar_dir = np.deg2rad(wind_dirs)
    bar_width = np.deg2rad(15)  # Each bar covers 15 degrees
    return bar_dir, bar_width

# Generate the plot: one bar position every 15 degrees (0, 15, ..., 345)
# NOTE(review): `rose` must have exactly one row per entry of `directions` — confirm.
directions = np.arange(0, 360, 15)
fig = wind_rose(rose, directions)

# Save the figure with high resolution and transparent background
fig.savefig("Wind_Rose_Nature.png", dpi=300, bbox_inches='tight', transparent=True)

# Show the plot
plt.show()


In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

# Plot the measured hourly PM2.5 of station RH per weekday.
# The frame is built from `df_RH` (hourly, datetime-indexed) and stored under its own
# name: assigning random placeholder values to `dataRH` would make the saved figure show
# noise and would destroy the real working frame for every cell run afterwards.
DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pm25_by_day_RH = pd.DataFrame({
    'Day': df_RH.index.day_name(),
    'PM2.5': df_RH['PM2.5'].to_numpy(),
}).dropna(subset=['PM2.5'])

fig = px.box(
    pm25_by_day_RH, 
    x="Day", 
    y="PM2.5", 
    points="all", 
    color="Day",
    category_orders={"Day": DAY_ORDER},  # Monday..Sunday instead of order of appearance
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="PM2.5 Distribution by Day"
)

fig.update_layout(
    template="plotly_dark", 
    font=dict(family="PT Sans", size=20),
    title=dict(text="PM2.5 Distribution by Day", x=0.5, xanchor='center'),  # Centering the title
    xaxis_title="Day of the Week",
    yaxis_title="PM2.5 Levels",
    legend_title="Day",
    margin=dict(l=40, r=40, t=40, b=40),  # Adjusting margins
    xaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='Gray'),
    yaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='Gray')
)

fig.update_traces(marker=dict(size=3))  # Scaling down the markers for better visibility

# Save the figure with high resolution
fig.write_image("pm25_distribution_by_day.png", scale=3)

# Show the plot
fig.show()


In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

# Plot the measured hourly PM2.5 of station ZI against the hour of day.
# The frame is built from `df_ZI` (hourly, datetime-indexed) and stored under its own
# name: assigning random placeholder values to `dataZI` would make the saved figure show
# noise and would destroy the real working frame for every cell run afterwards.
pm25_by_hour_ZI = pd.DataFrame({
    'Hour': df_ZI.index.hour,
    'PM2.5': df_ZI['PM2.5'].to_numpy(),
}).dropna(subset=['PM2.5'])

fig = px.density_contour(pm25_by_hour_ZI, x="Hour", y="PM2.5", title="Density Contour of PM2.5 Levels by Hour")

# Updating layout and traces for better readability
fig.update_layout(
    yaxis_range=[0, 40],
    font=dict(family="PT Sans", size=15),
    title=dict(text="Density Contour of PM2.5 Levels by Hour", x=0.5, xanchor='center'),
    xaxis_title="Hour of the Day",
    yaxis_title="PM2.5 Levels",
    margin=dict(l=40, r=40, t=40, b=40),  # Adjusting margins
    template="plotly_dark",
    xaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='Gray'),
    yaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='Gray')
)

fig.update_traces(
    contours_coloring="fill", 
    contours_showlabels=True,
    line_smoothing=1.3  # Smoothing the contour lines for a cleaner look
)

# Save the figure with high resolution
fig.write_image("density_contour_pm25_by_hour.png", scale=3)

# Show the plot
fig.show()


In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

# Plot the measured hourly PM2.5 of station RH by weekday and hour of day.
# The frame is built from `df_RH` (hourly, datetime-indexed) and stored under its own
# name: assigning random placeholder values to `dataRH` would make the saved figure show
# noise and would destroy the real working frame for every cell run afterwards.
DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pm25_by_day_hour_RH = pd.DataFrame({
    'Day': df_RH.index.day_name(),
    'Hour': df_RH.index.hour,
    'PM2.5': df_RH['PM2.5'].to_numpy(),
}).dropna(subset=['PM2.5'])

fig = px.scatter_3d(
    pm25_by_day_hour_RH, 
    x="Day", 
    y="Hour", 
    z="PM2.5",
    color="PM2.5", 
    category_orders={"Day": DAY_ORDER},  # Monday..Sunday instead of order of appearance
    color_continuous_scale=["#00FF00", "#FFC800", "#FF0000", "#B803BF"], 
    range_color=(-45, 225),  # range of color bar
    title="3D Scatter Plot of PM2.5 Levels by Day and Hour"
)

# Updating layout and traces for better readability
fig.update_traces(marker=dict(size=3))  # Scaling down the markers

fig.update_layout(
    font=dict(family="PT Sans", size=12),
    title=dict(text="3D Scatter Plot of PM2.5 Levels by Day and Hour", x=0.5, xanchor='center'),
    scene=dict(
        xaxis=dict(title='Day of the Week', showgrid=True, gridcolor='Gray', gridwidth=0.5),
        yaxis=dict(title='Hour of the Day', showgrid=True, gridcolor='Gray', gridwidth=0.5),
        zaxis=dict(title='PM2.5 Levels', showgrid=True, gridcolor='Gray', gridwidth=0.5)
    ),
    margin=dict(l=40, r=40, t=40, b=40)  # Adjusting margins
)

# Save the figure with high resolution
fig.write_image("3d_scatter_pm25_by_day_hour.png", scale=3)

# Show the plot
fig.show()
