In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import zscore

In [None]:
# Read the raw measurement file of each site into its own DataFrame
# (bm: benin-malanville, sb: sierraleone-bumbuna, td: togo-dapaong_qc).
bm, sb, td = map(
    pd.read_csv,
    (
        "../data/benin-malanville.csv",
        "../data/sierraleone-bumbuna.csv",
        "../data/togo-dapaong_qc.csv",
    ),
)

In [None]:
# Report the (rows, columns) shape of each site's frame.
for shape_label, site_frame in (
    ("shape of benin-malanville:", bm),
    ("shape of sierraleone-bumbuna:", sb),
    ("shape of togo-dapaong_qc:", td),
):
    print(shape_label, site_frame.shape)

In [None]:

# Preview the first rows of the benin-malanville frame.
bm.head()

In [None]:

# Preview the first rows of the sierraleone-bumbuna frame.
sb.head()

In [None]:

# Preview the first rows of the togo-dapaong_qc frame.
td.head()

In [None]:

# Column dtypes and non-null counts for benin-malanville.
bm.info()

In [None]:
# Column dtypes and non-null counts for sierraleone-bumbuna.
sb.info()

In [None]:
# Column dtypes and non-null counts for togo-dapaong_qc.
td.info()

In [None]:

bm['Timestamp'] = pd.to_datetime(bm['Timestamp'])

In [None]:

sb['Timestamp'] = pd.to_datetime(sb['Timestamp'])

In [None]:

td['Timestamp'] = pd.to_datetime(td['Timestamp'])

In [None]:

bm.set_index('Timestamp', inplace=True)
sb.set_index('Timestamp', inplace=True)
td.set_index('Timestamp', inplace=True)

In [None]:
# Descriptive statistics of the benin-malanville columns; the bare name on the
# last line makes the notebook render the table.
summary_stats_bm = bm.describe()
summary_stats_bm

In [None]:

# Descriptive statistics of the sierraleone-bumbuna columns.
summary_stats_sb = sb.describe()
summary_stats_sb

In [None]:
# Descriptive statistics of the togo-dapaong_qc columns.
summary_stats_td = td.describe()
summary_stats_td

In [None]:

missing_values_bm = bm.isnull().sum()
print(missing_values_bm)

In [None]:

missing_values_sb = sb.isnull().sum()
print(missing_values_sb)

In [None]:

missing_values_td = td.isnull().sum()
print(missing_values_td)

In [None]:

# Drop the Comments column from every frame; no later cell shown in this notebook reads it.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second run
# is a no-op instead of a KeyError.
bm = bm.drop(columns=['Comments'], errors='ignore')
sb = sb.drop(columns=['Comments'], errors='ignore')
td = td.drop(columns=['Comments'], errors='ignore')

In [None]:
bm[['GHI', 'DNI', 'DHI', 'Tamb']].plot(subplots=True, figsize=(12, 8))
plt.show()

In [None]:
sb[['GHI', 'DNI', 'DHI', 'Tamb']].plot(subplots=True, figsize=(12, 8))
plt.show()

In [None]:

td[['GHI', 'DNI', 'DHI', 'Tamb']].plot(subplots=True, figsize=(12, 8))
plt.show()

In [None]:
# Plot sensor readings over time
plt.figure(figsize=(6, 3))
sns.lineplot(data=bm, x=bm.index, y='ModA', hue='Cleaning')
plt.title('ModA Sensor Readings Over Time')
plt.show()

In [None]:
# Plot sensor readings over time
plt.figure(figsize=(6, 3))
sns.lineplot(data=sb, x=sb.index, y='ModA', hue='Cleaning')
plt.title('ModA Sensor Readings Over Time')
plt.show()

In [None]:

# Plot sensor readings over time
plt.figure(figsize=(6, 3))
sns.lineplot(data=td, x=td.index, y='ModA', hue='Cleaning')
plt.title('ModA Sensor Readings Over Time')
plt.show()

In [None]:
# Histograms
plt.figure(figsize=(15, 10))
columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Precipitation', 'TModA', 'TModB']
for i, col in enumerate(columns, 1):
    plt.subplot(4, 4, i)
    sns.histplot(bm[col], bins=50, kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Boxplots
plt.figure(figsize=(15, 10))
for i, col in enumerate(columns, 1):
    plt.subplot(4, 4, i)
    sns.boxplot(x=bm[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
plt.tight_layout()
plt.show()


In [None]:
# Histograms
plt.figure(figsize=(15, 10))
columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Precipitation', 'TModA', 'TModB']
for i, col in enumerate(columns, 1):
    plt.subplot(4, 4, i)
    sns.histplot(sb[col], bins=50, kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Boxplots
plt.figure(figsize=(15, 10))
for i, col in enumerate(columns, 1):
    plt.subplot(4, 4, i)
    sns.boxplot(x=sb[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
plt.tight_layout()
plt.show()

In [None]:
# Histograms
plt.figure(figsize=(15, 10))
columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Precipitation', 'TModA', 'TModB']
for i, col in enumerate(columns, 1):
    plt.subplot(4, 4, i)
    sns.histplot(td[col], bins=50, kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Boxplots
plt.figure(figsize=(15, 10))
for i, col in enumerate(columns, 1):
    plt.subplot(4, 4, i)
    sns.boxplot(x=td[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)
plt.tight_layout()
plt.show()

In [None]:
# Check for negative values in columns where they are not expected
negative_values_bm = bm[bm[['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'BP', 'Precipitation', 'TModA', 'TModB']] < 0]
print(negative_values_bm)

In [None]:
# Check for negative values in columns where they are not expected
negative_values_sb = sb[sb[['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'BP', 'Precipitation', 'TModA', 'TModB']] < 0]
print(negative_values_sb)

In [None]:
# Check for negative values in columns where they are not expected
negative_values_td = td[td[['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'BP', 'Precipitation', 'TModA', 'TModB']] < 0]
print(negative_values_td)

In [None]:
def count_negative_values(df, columns):
    """
    Count the negative entries of each requested column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    columns (list): Column names to inspect.

    Returns:
    dict: Column name -> number of values below zero, or None when the
    column is not present in `df`. Keys follow the order of `columns`.
    """
    return {
        name: (df[name] < 0).sum() if name in df.columns else None
        for name in columns
    }

In [None]:
# Columns in which negative readings are counted.
cols_to_check = [
    'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH',
    'WS', 'WSgust', 'WSstdev', 'BP', 'Precipitation', 'TModA', 'TModB',
]
negative_counts_bm, negative_counts_sb, negative_counts_td = (
    count_negative_values(site_frame, cols_to_check) for site_frame in (bm, sb, td)
)

# Print results: heading line, then the per-column counts
print("Negative counts in bm:", negative_counts_bm, sep="\n")
print("\nNegative counts in sb:", negative_counts_sb, sep="\n")
print("\nNegative counts in td:", negative_counts_td, sep="\n")

In [None]:
# Columns in which negative readings are replaced and outliers are capped.
cols_to_clean = [
    'GHI', 'DNI', 'DHI',
    'ModA', 'ModB',
    'Tamb', 'RH',
    'WS', 'WSgust', 'WSstdev',
    'BP', 'Precipitation',
    'TModA', 'TModB',
]

In [None]:
def replace_negative_values(df, columns):
    """
    Replace negative values with 0 in the given columns.

    The frame is modified in place and the same object is returned, so pass a
    copy when the original must be kept. Missing values (NaN) are left as NaN.

    Parameters:
    df (pd.DataFrame): The DataFrame to clean (mutated).
    columns (list): Column names to clean; names not present in `df` are skipped.

    Returns:
    pd.DataFrame: `df` itself, with the listed columns floored at 0.
    """
    for col in columns:
        if col in df.columns:
            # Vectorised floor at zero; avoids calling a Python lambda once per row
            df[col] = df[col].clip(lower=0)
    return df

In [None]:
# Clean copies of each site's frame; the copies keep bm, sb and td themselves unchanged.
bm_cleaned, sb_cleaned, td_cleaned = (
    replace_negative_values(raw_frame.copy(), cols_to_clean)
    for raw_frame in (bm, sb, td)
)

In [None]:
def check_negative_values(df, columns):
    """
    Count the negative entries of each requested column.

    This was a line-for-line copy of count_negative_values; it now delegates to
    that function so the two cannot drift apart.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    columns (list): Column names to inspect.

    Returns:
    dict: Column name -> number of values below zero, or None when the
    column is not present in `df`.
    """
    return count_negative_values(df, columns)

In [None]:
# Verify the changes: every count should now be 0 (or None for an absent column)
print("Negative counts in cleaned bm:", check_negative_values(bm_cleaned, cols_to_clean), sep="\n")
print("\nNegative counts in cleaned sb:", check_negative_values(sb_cleaned, cols_to_clean), sep="\n")
print("\nNegative counts in cleaned td:", check_negative_values(td_cleaned, cols_to_clean), sep="\n")

In [None]:
# Replace negative values with zero (vectorised; NaN stays NaN)
# NOTE(review): this changes the raw `bm` frame in place, only for the irradiance
# columns and only for this site. bm_cleaned was built from a copy of bm with the
# same columns already floored at 0, and no later cell shown here reads `bm` —
# confirm whether this cell is still needed.
bm[['GHI', 'DNI', 'DHI']] = bm[['GHI', 'DNI', 'DHI']].clip(lower=0)

In [None]:
def detect_outliers_iqr(df, columns):
    """
    Count, per column, the values lying outside the 1.5 * IQR fences.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    columns (list): Column names to inspect; names absent from `df` are skipped.

    Returns:
    dict: Column name -> number of rows below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    """
    counts = {}
    for name in columns:
        if name not in df.columns:
            continue

        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile

        # Fences beyond which a value is treated as an outlier
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        outside = (series < low_fence) | (series > high_fence)
        counts[name] = int(outside.sum())
    return counts

In [None]:
# Count IQR outliers per column in each cleaned frame
outliers_bm, outliers_sb, outliers_td = (
    detect_outliers_iqr(cleaned_frame, cols_to_clean)
    for cleaned_frame in (bm_cleaned, sb_cleaned, td_cleaned)
)

# Print outliers information: heading line, then the per-column counts
print("Outliers in cleaned bm:", outliers_bm, sep="\n")
print("\nOutliers in cleaned sb:", outliers_sb, sep="\n")
print("\nOutliers in cleaned td:", outliers_td, sep="\n")

In [None]:
def cap_outliers_iqr(df, columns):
    """
    Clamp each listed column to its 1.5 * IQR fences.

    The frame is modified in place and the same object is returned.

    Parameters:
    df (pd.DataFrame): The DataFrame to cap (mutated).
    columns (list): Column names to cap; names absent from `df` are skipped.

    Returns:
    pd.DataFrame: `df` itself, with values below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR replaced by the corresponding fence.
    """
    for name in columns:
        if name not in df.columns:
            continue

        first_quartile = df[name].quantile(0.25)
        third_quartile = df[name].quantile(0.75)
        spread = third_quartile - first_quartile

        # Values beyond these fences are pulled back onto them
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        df[name] = df[name].clip(low_fence, high_fence)
    return df

In [None]:
# Cap outliers
# cap_outliers_iqr modifies the frame it receives and returns that same object.
# Passing copies keeps *_cleaned as the pre-capping stage; otherwise *_final and
# *_cleaned are one object and re-running the outlier-count cell above gives
# different numbers.
bm_final = cap_outliers_iqr(bm_cleaned.copy(), cols_to_clean)
sb_final = cap_outliers_iqr(sb_cleaned.copy(), cols_to_clean)
td_final = cap_outliers_iqr(td_cleaned.copy(), cols_to_clean)

In [None]:
# Summary statistics after handling outliers
print("Summary statistics for final bm:")
print(bm_final.describe())

In [None]:
print("\nSummary statistics for final sb:")
print(sb_final.describe())

In [None]:
print("\nSummary statistics for final td:")
print(td_final.describe())

In [None]:
def plot_histograms(df, columns, figsize=(15, 10)):
    """
    Draw one 30-bin histogram per column, three panels to a row.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    columns (list): List of column names to plot histograms for.
    figsize (tuple): Size of the figure.
    """
    panels_per_row = 3
    row_count = -(-len(columns) // panels_per_row)  # ceiling division: enough rows for every column

    fig = plt.figure(figsize=figsize)

    for position, name in enumerate(columns, start=1):
        ax = fig.add_subplot(row_count, panels_per_row, position)
        ax.hist(df[name].dropna(), bins=30, edgecolor='black')  # NaNs are removed before binning
        ax.set_title(f'Histogram of {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

In [None]:
# Histogram grid of the cleaned columns, one figure per final dataset
for final_frame in (bm_final, sb_final, td_final):
    plot_histograms(final_frame, cols_to_clean)

In [None]:
def plot_correlation_matrix(df, columns, figsize=(12, 10), title='Correlation Matrix'):
    """
    Plots a heatmap of the correlation matrix for specified columns in the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    columns (list): List of column names to include in the correlation matrix.
    figsize (tuple): Size of the figure.
    title (str): Title of the plot. Defaults to the previously hard-coded
        'Correlation Matrix'; pass a site-specific title to tell figures apart.
    """
    plt.figure(figsize=figsize)
    correlation_matrix = df[columns].corr()  # Pairwise correlation of the selected columns
    # Colour scale pinned to [-1, 1] so colours are comparable between figures
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, vmin=-1, vmax=1)
    plt.title(title)
    plt.show()

In [None]:
# Correlation heatmap of the cleaned columns, one figure per final dataset
for final_frame in (bm_final, sb_final, td_final):
    plot_correlation_matrix(final_frame, cols_to_clean)

In [None]:
def plot_time_series(df, columns, title='Time Series Plot', figsize=(15, 10), legend_loc='upper right', sample_step=100):
    """
    Plots time series data for specified columns in the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data; its index is used as the x-axis.
    columns (list): List of column names to plot as time series.
    title (str): Title of the plot.
    figsize (tuple): Size of the figure.
    legend_loc (str): Location of the legend.
    sample_step (int): Plot every `sample_step`-th row. Defaults to 100, the
        previously hard-coded value; use 1 to plot every row.
    """
    # Down-sample once instead of re-slicing the frame for every column
    df_sampled = df[::sample_step]

    plt.figure(figsize=figsize)
    for col in columns:
        plt.plot(df_sampled.index, df_sampled[col], label=col)

    plt.title(title)
    plt.xlabel('Timestamp')
    plt.ylabel('Value')
    plt.legend(loc=legend_loc)  # Fixed location rather than matplotlib's automatic placement
    plt.tight_layout()
    plt.show()

In [None]:
# Time series of the cleaned columns, one titled figure per final dataset
for final_frame, plot_title in (
    (bm_final, 'Final bm'),
    (sb_final, 'Final sb'),
    (td_final, 'Final td'),
):
    plot_time_series(final_frame, cols_to_clean, plot_title)

In [None]:
def plot_wind_polar(df, wind_speed_col='WS', wind_direction_col='WD', title='Wind Speed and Direction Polar Plot', figsize=(8, 8)):
    """
    Plots a polar scatter plot of wind speed (radius) against wind direction (angle).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    wind_speed_col (str): Column name for wind speed.
    wind_direction_col (str): Column name for wind direction, in degrees.
    title (str): Title of the plot.
    figsize (tuple): Size of the figure.
    """
    # Drop rows missing either value together. Dropping NaNs from each column
    # separately can leave arrays of different length, which scatter() rejects.
    paired = df.dropna(subset=[wind_speed_col, wind_direction_col])

    # Convert wind direction from degrees to radians
    wind_direction_rad = np.deg2rad(paired[wind_direction_col])
    wind_speed = paired[wind_speed_col]

    plt.figure(figsize=figsize)
    ax = plt.subplot(111, projection='polar')
    ax.scatter(wind_direction_rad, wind_speed, c=wind_speed, cmap='coolwarm', alpha=0.75, edgecolors='w', s=30)
    ax.set_title(title, va='bottom')
    plt.show()

In [None]:
# Call the function to plot the polar plot for wind speed and direction
plot_wind_polar(bm_final, wind_speed_col='WS', wind_direction_col='WD', title='Wind Speed and Direction Polar Plot')

In [None]:
# Call the function to plot the polar plot for wind speed and direction
plot_wind_polar(bm_final, wind_speed_col='WS', wind_direction_col='WD', title='Wind Speed and Direction Polar Plot')

In [None]:
# Call the function to plot the polar plot for wind speed and direction
plot_wind_polar(sb_final, wind_speed_col='WS', wind_direction_col='WD', title='Wind Speed and Direction Polar Plot')

In [None]:
# Call the function to plot the polar plot for wind speed and direction
plot_wind_polar(td_final, wind_speed_col='WS', wind_direction_col='WD', title='Wind Speed and Direction Polar Plot')

In [None]:
def plot_wind_histograms(df, wind_speed_col='WS', wind_direction_col='WD', bins=30, figsize=(14, 6)):
    """
    Draw side-by-side histograms of wind speed (left) and wind direction (right).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    wind_speed_col (str): Column name for wind speed.
    wind_direction_col (str): Column name for wind direction.
    bins (int): Number of bins for the histograms.
    figsize (tuple): Size of the figure.
    """
    # (column, bar colour, panel title, x-axis label) for each panel, left to right
    panels = (
        (wind_speed_col, 'skyblue', 'Wind Speed Histogram', 'Wind Speed'),
        (wind_direction_col, 'lightgreen', 'Wind Direction Histogram', 'Wind Direction (Degrees)'),
    )

    fig = plt.figure(figsize=figsize)
    for position, (column, colour, heading, x_label) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, 2, position)
        ax.hist(df[column].dropna(), bins=bins, color=colour, edgecolor='black')
        ax.set_title(heading)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

In [None]:
# Call the function to plot histograms for wind speed and direction
plot_wind_histograms(bm_final, wind_speed_col='WS', wind_direction_col='WD', bins=30)

In [None]:

# Call the function to plot histograms for wind speed and direction
plot_wind_histograms(sb_final, wind_speed_col='WS', wind_direction_col='WD', bins=30)

In [None]:
# Call the function to plot histograms for wind speed and direction
plot_wind_histograms(td_final, wind_speed_col='WS', wind_direction_col='WD', bins=30)

In [None]:
def wind_rose(df, wind_speed_col='WS', wind_direction_col='WD', bins=8, title='Wind Rose Chart', figsize=(10, 10)):
    """
    Plots a wind rose chart: stacked polar bars, one sector per direction bin,
    each sector split into wind-speed classes.

    Bar heights are fractions of all binned observations, so the total length of
    a sector shows how often the wind was recorded in that direction bin and the
    coloured segments show how that share splits across speed classes.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    wind_speed_col (str): Column name for wind speed.
    wind_direction_col (str): Column name for wind direction, in degrees.
    bins (int): Number of direction sectors; the same number of equal-width
        speed classes between 0 and the maximum speed is used.
    title (str): Title of the plot.
    figsize (tuple): Size of the figure.

    Raises:
    ValueError: If no row has both a wind speed and a wind direction.
    """
    # Prepare data: keep only rows where both values are present
    valid = df.dropna(subset=[wind_speed_col, wind_direction_col])
    if valid.empty:
        raise ValueError('wind_rose: no rows with both wind speed and wind direction')

    # Wrap directions into [0, 360) so a reading of exactly 360 falls in the first sector
    directions = np.mod(valid[wind_direction_col].to_numpy(dtype=float), 360.0)
    speeds = valid[wind_speed_col].to_numpy(dtype=float)

    # Bin edges; fall back to a unit-wide speed axis when every speed is 0
    direction_edges = np.linspace(0.0, 360.0, bins + 1)
    top_speed = speeds.max()
    speed_edges = np.linspace(0.0, top_speed if top_speed > 0 else 1.0, bins + 1)

    # histogram2d returns a full (direction x speed) grid, so an empty sector
    # stays in place as zeros rather than shifting the other bars to a wrong
    # angle. Its last bin is closed on the right, so the maximum speed is counted.
    counts, _, _ = np.histogram2d(directions, speeds, bins=[direction_edges, speed_edges])
    total = counts.sum()
    frequencies = counts / total if total else counts

    sector_width = 2 * np.pi / bins
    # ax.bar centres each bar on theta, so use sector centres, not left edges
    theta = np.deg2rad(direction_edges[:-1]) + sector_width / 2

    plt.figure(figsize=figsize)
    ax = plt.subplot(111, polar=True)

    # Stack the speed classes outward so no class hides another
    base = np.zeros(bins)
    for i in range(bins):
        radii = frequencies[:, i]
        ax.bar(
            theta,
            radii,
            width=sector_width,
            bottom=base,
            color=plt.cm.viridis(i / bins),
            edgecolor='w',
            alpha=0.7,
            label=f'{speed_edges[i]:.1f} - {speed_edges[i + 1]:.1f}',
        )
        base = base + radii

    ax.set_title(title, va='bottom')
    ax.legend(title=wind_speed_col, loc='upper left', bbox_to_anchor=(1.05, 1.0))
    plt.show()


In [None]:
# Call the function to plot the wind rose chart
wind_rose(bm_final, wind_speed_col='WS', wind_direction_col='WD', bins=8)

In [None]:
# Call the function to plot the wind rose chart
wind_rose(sb_final, wind_speed_col='WS', wind_direction_col='WD', bins=8)

In [None]:
# Call the function to plot the wind rose chart
wind_rose(td_final, wind_speed_col='WS', wind_direction_col='WD', bins=8)

In [None]:
def plot_temperature_scatter(df, temp_col='Tamb', other_cols=None, figsize=(12, 6)):
    """
    Overlay scatter plots of several variables against temperature on one axes.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    temp_col (str): Column name for temperature (x-axis).
    other_cols (list): Column names to plot against temperature;
        None selects ['GHI', 'DNI', 'DHI', 'RH'].
    figsize (tuple): Size of the figure.
    """
    compared = ['GHI', 'DNI', 'DHI', 'RH'] if other_cols is None else other_cols

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)

    temperature = df[temp_col]
    for name in compared:
        # One colour per variable; label feeds the legend
        ax.scatter(temperature, df[name], alpha=0.5, label=name)

    ax.set_title('Temperature vs Other Variables')
    ax.set_xlabel('Temperature')
    ax.set_ylabel('Other Variables')
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
# Call the function to plot scatter plots for temperature vs other variables
plot_temperature_scatter(bm_final, temp_col='Tamb', other_cols=['GHI', 'DNI', 'DHI', 'RH'])

In [None]:
# Call the function to plot scatter plots for temperature vs other variables
plot_temperature_scatter(sb_final, temp_col='Tamb', other_cols=['GHI', 'DNI', 'DHI', 'RH'])

In [None]:
# Call the function to plot scatter plots for temperature vs other variables
plot_temperature_scatter(td_final, temp_col='Tamb', other_cols=['GHI', 'DNI', 'DHI', 'RH'])

In [None]:
def plot_temperature_histogram(df, temp_col='Tamb', bins=30, figsize=(10, 6)):
    """
    Draw a histogram of the temperature readings.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    temp_col (str): Column name for temperature.
    bins (int): Number of bins for the histogram.
    figsize (tuple): Size of the figure.
    """
    readings = df[temp_col].dropna()  # NaNs are removed before binning

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)
    ax.hist(readings, bins=bins, color='skyblue', edgecolor='black')
    ax.set_title('Temperature Distribution Histogram')
    ax.set_xlabel('Temperature')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

In [None]:
# Temperature distribution histogram, one figure per final dataset
for final_frame in (bm_final, sb_final, td_final):
    plot_temperature_histogram(final_frame, temp_col='Tamb', bins=30)

In [None]:
def plot_bubble_chart(df, x_col, y_col, size_col, title='', xlabel='', ylabel='', size_scale=100, figsize=(10, 6)):
    """
    Plot a bubble chart with the given DataFrame.

    Parameters:
    - df: DataFrame containing the data
    - x_col: column name for x-axis
    - y_col: column name for y-axis
    - size_col: column name for bubble size
    - title: title of the plot
    - xlabel: label for x-axis
    - ylabel: label for y-axis
    - size_scale: factor the size column is multiplied by to get the marker
      area passed to scatter()
    - figsize: size of the figure; defaults to the previously hard-coded
      (10, 6), exposed to match the other plotting helpers in this notebook
    """
    plt.figure(figsize=figsize)
    plt.scatter(df[x_col], df[y_col], s=df[size_col] * size_scale, alpha=0.5, edgecolors="w", linewidth=0.5)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()

In [None]:
# GHI against Tamb with RH as the bubble size, one figure per final dataset
for final_frame in (bm_final, sb_final, td_final):
    plot_bubble_chart(
        final_frame,
        x_col='GHI',
        y_col='Tamb',
        size_col='RH',
        title='GHI vs. Tamb with RH as Bubble Size',
        xlabel='GHI',
        ylabel='Tamb',
    )

In [None]:
def calculate_z_scores(df, cols):
    """
    Calculate Z-scores for specified columns in the DataFrame.

    Each column is standardised on its own. Missing values are ignored when the
    column mean and standard deviation are computed and stay NaN in the result;
    with the default NaN handling a single NaN would turn the whole column's
    Z-scores into NaN.

    Parameters:
    - df: DataFrame containing the data
    - cols: list of column names for which to calculate Z-scores

    Returns:
    - DataFrame with Z-scores for the specified columns
    """
    # Extra keyword arguments of DataFrame.apply are forwarded to zscore
    z_scores = df[cols].apply(zscore, nan_policy='omit')
    return z_scores

In [None]:
def identify_outliers_z_score(df, cols, threshold=3):
    """
    Select the rows that are outliers according to their Z-scores.

    Parameters:
    - df: DataFrame containing the data
    - cols: list of column names to check for outliers
    - threshold: a row is selected when the absolute Z-score of any checked
      column is greater than this value

    Returns:
    - The subset of rows of `df` (all columns) that were selected
    """
    magnitudes = calculate_z_scores(df, cols).abs()
    flagged_rows = magnitudes.gt(threshold).any(axis=1)
    return df[flagged_rows]

In [None]:
# Z-scores of the irradiance columns of the final benin-malanville frame
irradiance_cols = ['GHI', 'DNI', 'DHI']
z_scores = calculate_z_scores(bm_final, irradiance_cols)
print(z_scores.head())

# Rows where any irradiance Z-score exceeds 3 in magnitude
outliers = identify_outliers_z_score(bm_final, irradiance_cols, threshold=3)
print(outliers)