In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime

In [None]:
# Parse the multichannel sensor export into `sensor_data`.
path = 'processed_data_multichannel.json'
with open(path, mode='r') as file:
    sensor_data = json.load(file)

In [None]:
# Tag every reading with the sensor it belongs to, then gather all readings
# into one flat list. The reading dicts inside `sensor_data` are updated in
# place (each gains a 'sensor_id' key).
flattened_data = []
for sensor_id, readings in sensor_data.items():
    for reading in readings:
        reading['sensor_id'] = sensor_id
    flattened_data.extend(readings)

# One row per reading, one column per dictionary key.
df = pd.DataFrame(flattened_data)

df.head()

In [None]:
# Report the first and last reading dates (UTC) formatted as dd/mm/yy.
# `datetime.utcfromtimestamp` is deprecated since Python 3.12; `fromtimestamp`
# with an explicit UTC tzinfo gives the same calendar values, as
# timezone-aware datetimes.
min_time = datetime.datetime.fromtimestamp(df.time_stamp.min(), tz=datetime.timezone.utc)
max_time = datetime.datetime.fromtimestamp(df.time_stamp.max(), tz=datetime.timezone.utc)

print(min_time.strftime("%d/%m/%y"), max_time.strftime("%d/%m/%y"))

In [None]:
# (rows, columns) of the flattened readings table.
df.shape

In [None]:
def filter_on_channels(dataframe, channel_a, channel_b, max_diff=5, max_pct_diff=0.61) -> pd.DataFrame:
    '''
    Keep only the rows in which two sensor channels agree with each other.

    A row is kept when BOTH conditions hold:
      * |channel_a - channel_b| <= max_diff
      * |(channel_a - channel_b) / channel_b| <= max_pct_diff

    Rows where channel_b is 0 never pass: the zero denominator is replaced by
    NaN, and a NaN comparison is False. The same applies to rows where either
    channel value is missing.

    Args:
        dataframe (pd.DataFrame): Table containing both channel columns.
        channel_a (str): Name of the first channel column (e.g., 'pm1.0_atm_a').
        channel_b (str): Name of the second channel column (e.g., 'pm1.0_atm_b');
                         also the denominator of the relative difference.
        max_diff (float, optional): Largest accepted absolute difference, in the
                                    units of the channel columns. Default is 5.
        max_pct_diff (float, optional): Largest accepted relative difference,
                                        as a fraction (0.61 means 61%). Default is 0.61.

    Returns:
        pd.DataFrame: An independent copy of the rows that satisfy both limits,
        with the original index labels and columns.
    '''
    delta = dataframe[channel_a] - dataframe[channel_b]
    # Zero denominators become NaN so the division never divides by zero.
    safe_denominator = dataframe[channel_b].replace(0, float('nan'))

    within_abs_limit = abs(delta) <= max_diff
    within_pct_limit = abs(delta / safe_denominator) <= max_pct_diff

    return dataframe[within_abs_limit & within_pct_limit].copy()

In [None]:
# Row-wise helper: absolute and relative gap between two channel readings.
def calculate_differences(row, channel_a, channel_b, max_diff, max_pct_diff):
    """
    Compute the absolute and relative difference between two channels of one row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
        channel_a: Name of the first channel column.
        channel_b: Name of the second channel column; denominator of the
            relative difference.
        max_diff: Not used by this function.
        max_pct_diff: Not used by this function.

    Returns:
        pd.Series with entries 'abs_diff' (|a - b|) and 'pct_diff'
        (|(a - b) / b|, or NaN when b equals 0).
    """
    reading_a = row[channel_a]
    reading_b = row[channel_b]
    gap = reading_a - reading_b

    if reading_b != 0:
        relative_gap = abs(gap / reading_b)
    else:
        # A zero reference reading has no defined relative difference.
        relative_gap = float('nan')

    return pd.Series([abs(gap), relative_gap], index=['abs_diff', 'pct_diff'])

In [None]:
# Preview the first rows of the flattened readings again.
df.head()

In [None]:
# Size of the table before the channel-agreement filter below is applied.
df.shape

In [None]:
# Keep only rows whose PM2.5 A/B channels agree: absolute gap <= 5 and
# relative gap (against channel B) <= 0.61. The vectorised
# `filter_on_channels` applies the same rule as a row-by-row computation
# without adding temporary columns to `df`, and returns an independent copy.
filtered_df = filter_on_channels(
    df,
    'pm2.5_atm_a',
    'pm2.5_atm_b',
    max_diff=5,
    max_pct_diff=0.61,
)


In [None]:
# Rename the epoch column, then derive a timestamp and calendar fields from it.
# `assign` returns a new frame, so `filtered_df` itself gains no extra columns.
filtered_df = filtered_df.rename(columns={"time_stamp":"unixtime"})
data = filtered_df.assign(
    # Epoch seconds -> pandas timestamps.
    datetime=lambda frame: pd.to_datetime(frame['unixtime'], unit='s'),
    day=lambda frame: frame['datetime'].dt.day,
    month=lambda frame: frame['datetime'].dt.month,
    year=lambda frame: frame['datetime'].dt.year,
    time=lambda frame: frame['datetime'].dt.time,
    day_of_week=lambda frame: frame['datetime'].dt.dayofweek,
)

In [None]:
# Preview the filtered readings with the derived datetime columns.
data.head()

In [None]:
# Range check: (min, max) of the derived day, month and year columns.
data.day.min(), data.day.max(), data.month.min(), data.month.max(), data.year.min(), data.year.max()

In [None]:
# Distinct month numbers present in the filtered data.
data.month.unique()

In [None]:
# Number of readings per day of week (pandas dayofweek: Monday=0 ... Sunday=6).
data.day_of_week.value_counts()

In [None]:
# Collapse each A/B channel pair into a single column holding their row-wise mean.
# DataFrame.mean skips NaN by default, so a row with one channel missing
# takes the value of the other channel.
channel_pairs = {
    'pm1.0_atm': ['pm1.0_atm_a', 'pm1.0_atm_b'],
    'pm2.5_atm': ['pm2.5_atm_a', 'pm2.5_atm_b'],
    'pm10.0_atm': ['pm10.0_atm_a', 'pm10.0_atm_b'],
}
for averaged_column, pair in channel_pairs.items():
    data[averaged_column] = data[pair].mean(axis=1)

In [None]:
# List the available columns before selecting the subset to keep.
data.columns

In [None]:
# Keep the sensor id, environmental readings, calendar fields and the
# channel-averaged PM columns; the raw per-channel columns are left behind.
selected_columns = [
    'sensor_id',
    'humidity', 'temperature', 'pressure',
    'datetime', 'day', 'month', 'year', 'time', 'day_of_week',
    'pm1.0_atm', 'pm2.5_atm', 'pm10.0_atm',
]
data_select = data[selected_columns]

In [None]:
# Measurement columns to stack into long format.
value_vars = ['humidity', 'temperature', 'pressure', 'pm1.0_atm', 'pm2.5_atm', 'pm10.0_atm']

# Long format: one row per (sensor, timestamp, measurement), with the
# measurement name in `field` and its value in `reading`.
melted_data = pd.melt(
    data_select,
    id_vars=['sensor_id', 'datetime', 'day', 'month', 'year', 'time', 'day_of_week'],
    value_vars=value_vars,
    var_name='field',
    value_name='reading',
)

In [None]:
# Row counts per day of week after melting (one row per measurement field).
melted_data.day_of_week.value_counts()

In [None]:
# Average readings into hourly bins, separately for each sensor and field.
# Grouping must NOT include the raw `time` / `day_of_week` columns: doing so
# splits every hour into one group per exact time of day, so the resample
# produces sparse NaN bins, labels bins with the wrong weekday, and yields
# several rows for the same (sensor, field, hour). The calendar fields are
# instead derived from the hourly bin timestamp afterwards.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated in
# pandas 2.2+.
data_hourly_readings = (
    melted_data
    .groupby(['sensor_id', 'field'])
    .resample('h', on='datetime')['reading']
    .mean()
    .reset_index()
    .assign(
        day_of_week=lambda hourly: hourly['datetime'].dt.dayofweek,
        time=lambda hourly: hourly['datetime'].dt.time,
    )
    # Same column layout as before: keys, calendar fields, bin start, value.
    .loc[:, ['sensor_id', 'field', 'day_of_week', 'time', 'datetime', 'reading']]
)

In [None]:
# Preview the hourly aggregated readings.
data_hourly_readings.head()

In [None]:
# Number of rows in the hourly table.
len(data_hourly_readings)

In [None]:
# Earliest and latest hourly bin in the aggregated table.
data_hourly_readings.datetime.min(), data_hourly_readings.datetime.max()

In [None]:
# Distinct day-of-week values present in the hourly table.
data_hourly_readings.day_of_week.unique()

In [None]:
# Write the hourly long-format readings (field name in `field`, value in `reading`) to CSV without the index.
# NOTE(review): the file name ends in "_wide" and says "calibrated", but this is the long-format
# frame and no calibration has been applied at this point in the notebook — confirm the intended name.
data_hourly_readings.to_csv('processed_data_can_sensors_hourly_calibrated_wide.csv', index=False)

In [None]:
# Wide format: one row per (sensor, hour) and one column per measurement field.
# pivot raises ValueError if any (sensor_id, datetime, field) combination
# occurs more than once.
data_hourly_readings_wide = pd.pivot(
    data_hourly_readings,
    index=['sensor_id', 'datetime'],
    columns='field',
    values='reading',
).reset_index()
data_hourly_readings_wide.head()

In [None]:
# Write the pivoted (one column per field) hourly table to CSV without the index.
# NOTE(review): a later cell in this notebook writes a different frame to this same file name,
# which would overwrite this output — confirm that is intended.
data_hourly_readings_wide.to_csv('processed_data_can_sensors_hourly_calibrated.csv', index=False)

In [None]:
def calibrate_reading_generic(s1: float, s2: float, s3: float, pm25: float, rh: float, i: float) -> float:
    """
    Calibrate a PM2.5 reading with a humidity term and a humidity interaction.

    With ``h = rh**2 / (1 - rh)`` the result is::

        s1 * pm25 + s2 * h * pm25 + s3 * h + i

    Args:
        s1: coefficient of the raw PM2.5 reading.
        s2: coefficient of the humidity-by-PM2.5 interaction term.
        s3: coefficient of the humidity term.
        pm25: the PM2.5 reading in raw units.
        rh: the relative humidity.
            NOTE(review): previously documented as "in percent", but
            ``rh**2 / (1 - rh)`` is only finite and non-negative for
            0 <= rh < 1, so a fraction is presumably expected — confirm
            against the source of the calibration equation.
        i: an intercept.

    Returns:
        The calibrated PM2.5 reading in ug/m^3.

    Raises:
        ZeroDivisionError: when ``rh`` is a plain Python number equal to 1.
    """
    humidity_factor = rh**2 / (1 - rh)

    pm_term = s1 * pm25
    interaction_term = s2 * humidity_factor * pm25
    humidity_term = s3 * humidity_factor

    return pm_term + interaction_term + humidity_term + i

In [None]:
def calibrate_reading(s1: float, s2: float, pm25: float, rh: float, i: float) -> float:
    """
    Calibrate a PM2.5 reading with a linear humidity correction.

    The result is ``s1 * pm25 - s2 * rh + i``.

    Args:
        s1: coefficient multiplying the raw PM2.5 reading.
        s2: coefficient multiplying the relative humidity (its term is subtracted).
        pm25: the PM2.5 reading in raw units.
        rh: the relative humidity in percent.
        i: an intercept.

    Returns:
        The calibrated PM2.5 reading in ug/m^3.
    """
    pm_term = s1 * pm25
    humidity_term = s2 * rh
    return pm_term - humidity_term + i

In [None]:
# Reshape `data` to long format: `sensor_index` and `time` stay as identifiers, every other
# column becomes a (`fields`, `reading`) pair.
# NOTE(review): the `data` frame built by the earlier cells of this notebook is tagged with
# `sensor_id` (not `sensor_index`) and its `time` column holds time-of-day objects rather than
# epoch seconds, so this cell presumably expects `data` from a different source — confirm where
# that frame comes from before running the notebook top to bottom.
data = pd.melt(data, id_vars=['sensor_index', 'time'], var_name='fields', value_name='reading')

In [None]:
# Rename the epoch-seconds column to `unixtime`, convert it to timestamps and
# derive calendar fields. `time` is re-created as the time of day.
data = (
    data
    .rename(columns={"time":"unixtime"})
    .assign(
        datetime=lambda frame: pd.to_datetime(frame['unixtime'], unit='s'),
        day=lambda frame: frame['datetime'].dt.day,
        month=lambda frame: frame['datetime'].dt.month,
        year=lambda frame: frame['datetime'].dt.year,
        time=lambda frame: frame['datetime'].dt.time,
        day_of_week=lambda frame: frame['datetime'].dt.dayofweek,
    )
)

In [None]:
# Persist the long-format readings with their calendar columns, without the index.
data.to_csv('processed_data_can_sensors.csv', index=False)

In [None]:
# Preview the long-format readings.
data.head()

In [None]:
# Earliest and latest timestamp in the long-format data.
print(data['datetime'].min())
print(data['datetime'].max())

In [None]:
# Distinct sensor identifiers present.
data.sensor_index.unique()

In [None]:
# Distinct measurement field names present.
data.fields.unique()

In [None]:
# Summary statistics of the humidity readings (useful for checking their scale and range).
data[data.fields == 'humidity'].reading.describe()

In [None]:
# Which measurement fields each sensor reports.
data.groupby('sensor_index').fields.unique()

In [None]:
# Average readings into hourly bins, separately for each sensor and field.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated in
# pandas 2.2+ (it emits a FutureWarning there).
data_hourly_readings = (
    data
    .groupby(['sensor_index', 'fields'])
    .resample('h', on='datetime')['reading']
    .mean()
    .reset_index()
)

In [None]:
# Display the hourly table (the notebook shows a truncated view for large frames).
data_hourly_readings

In [None]:
# Wide format: one row per (sensor, hour) and one column per measurement field.
data_hourly_readings_wide = pd.pivot(
    data_hourly_readings,
    index=['sensor_index', 'datetime'],
    columns='fields',
    values='reading',
).reset_index()
data_hourly_readings_wide.head()

In [None]:
# Calibrated PM2.5 = 0.524 * pm2.5_atm - 0.0862 * humidity + 5.75.
# `calibrate_reading` is plain arithmetic, so it is applied to whole columns
# at once instead of row by row: same values, far fewer Python-level calls,
# and it also works when the frame is empty.
# NOTE(review): the coefficients look like a published PurpleAir correction —
# confirm their source and that it matches the `pm2.5_atm` channel and the
# humidity units used here.
data_hourly_readings_wide['pm2.5_calibrated'] = calibrate_reading(
    0.524,
    0.0862,
    data_hourly_readings_wide['pm2.5_atm'],
    data_hourly_readings_wide['humidity'],
    5.75,
)

In [None]:
# Back to long format so the calibrated column becomes one more `fields` value.
data_hourly_readings_long = pd.melt(
    data_hourly_readings_wide,
    id_vars=['sensor_index', 'datetime'],
    var_name='fields',
    value_name='reading',
)

In [None]:
# Derive calendar fields from the hourly bin timestamp.
hourly_dt = data_hourly_readings_long['datetime'].dt
data_hourly_readings_long['day'] = hourly_dt.day
data_hourly_readings_long['month'] = hourly_dt.month
data_hourly_readings_long['year'] = hourly_dt.year
data_hourly_readings_long['time'] = hourly_dt.time
data_hourly_readings_long['day_of_week'] = hourly_dt.dayofweek

In [None]:
# Write the hourly long-format table, including the calibrated PM2.5 field, to CSV without the index.
# NOTE(review): an earlier cell in this notebook writes a wide-format frame to this same file
# name; this call replaces that file — confirm that is intended.
data_hourly_readings_long.to_csv('processed_data_can_sensors_hourly_calibrated.csv', index=False)

In [None]:
# Work on an independent copy for plotting; note this rebinds `data`, replacing the frame it held before.
data = data_hourly_readings_long.copy()

In [None]:
# Hourly PM1.0 readings over time, one line per sensor, on a single set of axes.
data_select = data[data.fields == 'pm1.0_atm']
grouped_data = data_select.groupby('sensor_index')

fig, ax = plt.subplots()
for name, group in grouped_data:
    ax.plot(group['datetime'], group['reading'], label=name)

# Axis labels, title and a legend keyed by sensor.
ax.set_xlabel('Date')
ax.set_ylabel('Reading')
ax.set_title('Readings over Time')
ax.legend()

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Fields to plot and the time groupings to average them over.
variables = ["pm1.0_atm", "pm2.5_calibrated", "pm10.0_atm"]
groupbys = ['datetime', 'day_of_week', 'month']

# One figure per (grouping, field): mean reading of that field for each value
# of the grouping column.
for grouping in groupbys:
    for var in variables:
        # Restrict to the field being plotted; without this every figure
        # would show the same average over all fields mixed together.
        field_rows = data[data['fields'] == var]
        # Average only the numeric `reading` column: the frame also holds
        # string and time-of-day columns, which `.mean()` cannot aggregate
        # (TypeError in pandas 2.x).
        grouped_data = field_rows.groupby(by=grouping)[['reading']].mean()

        fig, ax = plt.subplots()
        sns.lineplot(data=grouped_data, x=grouped_data.index, y="reading", ax=ax)
        # Label the figure so the nine plots can be told apart.
        ax.set(title=f'Mean {var} by {grouping}', xlabel=grouping, ylabel='reading')
        plt.show()
        # Release the figure so the loop does not accumulate open figures.
        plt.close(fig)
