In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model

def forecast_aqi(file_path, seq_length=24, future_steps=8):
    # Load and preprocess the dataset
    df = pd.read_csv(file_path)
    df.set_index('Date_Start Time', inplace=True)
    df.drop(['Sr.NO', 'End Time'], axis=1, inplace=True)

    # Ensure AQI columns are float
    aqi_columns = ['AQI_PM2.5', 'AQI_PM10', 'AQI_NO2', 'AQI_CO', 'AQI_SO2']
    for col in aqi_columns:
        df[col] = df[col].astype(float)

    # Normalize the data using MinMaxScaler
    scalers = {}
    scaled_data = {}
    for col in aqi_columns:
        scaler = MinMaxScaler()
        scaled_data[col] = scaler.fit_transform(df[[col]])
        scalers[col] = scaler

    # Function to forecast future AQI values
    def forecast_future(model, data, seq_length, future_steps):
        future_predictions = []
        current_seq = data[-seq_length:]  # Start from the last available sequence
        
        for _ in range(future_steps):
            pred = model.predict(current_seq[np.newaxis, :, :])  # Predict next step
            future_predictions.append(pred[0])
            current_seq = np.append(current_seq[1:], pred, axis=0)  # Update the sequence
        
        return np.array(future_predictions)

    # Load pre-trained models and forecast the next 'n' hours
    future_predictions = {}
    for col in aqi_columns:
        model = load_model(f'{col}_LSTM_model.h5')  # Load pre-trained LSTM model
        future_scaled_predictions = forecast_future(model, scaled_data[col], seq_length, future_steps)
        # Inverse transform to get the actual AQI values
        future_predictions[col] = scalers[col].inverse_transform(future_scaled_predictions).flatten()

    # Convert future predictions to a DataFrame with a proper timestamp index
    future_df = pd.DataFrame(future_predictions, index=pd.date_range(start=df.index[-1], periods=future_steps+1, freq='h')[1:])
    return future_df

# File path of the dataset
file_path = '../ARIF/AQI_Weather_Data.csv'

# Call the forecast function to get future AQI predictions
future_df = forecast_aqi(file_path)

# Display the predicted values
print(future_df)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
                     AQI_PM2.5    AQI_PM10    AQI_NO2     AQI_CO    AQI_SO2
2023-04-01 00:00:00  42.472965  102.229370  49.327312  31.932623  28.024145
2023-04-01 01:00:00  43.372379  102.931725  49.198601  33.566483  28.203175
2023-04-01 02:00:00  44.211086  103.279778  49.051525  34.603947  28.438957
2023-04-01 03:00:00  44.921944  103.427887  48.899712  35.322994  28.714672
2023-04-01 04:00:00  45.542187  103.380775  4

In [10]:
for row in future_df.iterrows():
    print(row[1])
    print(type(row[1]))


AQI_PM2.5     42.472965
AQI_PM10     102.229370
AQI_NO2       49.327312
AQI_CO        31.932623
AQI_SO2       28.024145
Name: 2023-04-01 00:00:00, dtype: float32
<class 'pandas.core.series.Series'>
AQI_PM2.5     43.372379
AQI_PM10     102.931725
AQI_NO2       49.198601
AQI_CO        33.566483
AQI_SO2       28.203175
Name: 2023-04-01 01:00:00, dtype: float32
<class 'pandas.core.series.Series'>
AQI_PM2.5     44.211086
AQI_PM10     103.279778
AQI_NO2       49.051525
AQI_CO        34.603947
AQI_SO2       28.438957
Name: 2023-04-01 02:00:00, dtype: float32
<class 'pandas.core.series.Series'>
AQI_PM2.5     44.921944
AQI_PM10     103.427887
AQI_NO2       48.899712
AQI_CO        35.322994
AQI_SO2       28.714672
Name: 2023-04-01 03:00:00, dtype: float32
<class 'pandas.core.series.Series'>
AQI_PM2.5     45.542187
AQI_PM10     103.380775
AQI_NO2       48.747456
AQI_CO        36.013939
AQI_SO2       29.024403
Name: 2023-04-01 04:00:00, dtype: float32
<class 'pandas.core.series.Series'>
AQI_PM2.5 

In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
import pandas as pd

def forecast_aqi_from_current(current_aqi, seq_length=24, future_steps=8):
    """
    This function forecasts future AQI values based on the current AQI data passed as input.

    :param current_aqi: A dictionary with current AQI values, e.g.:
                        {'AQI_PM2.5': 45.0, 'AQI_PM10': 60.0, 'AQI_NO2': 20.0, 'AQI_CO': 1.5, 'AQI_SO2': 10.0}
    :param seq_length: Number of previous steps to use for the LSTM model (default: 24 hours)
    :param future_steps: Number of future hours to predict (default: 8 hours)
    :return: A DataFrame with the predicted future AQI values for the next 'future_steps' hours.
    """
    
    aqi_columns = ['AQI_PM2.5', 'AQI_PM10', 'AQI_NO2', 'AQI_CO', 'AQI_SO2']

    # Normalize current AQI values
    scalers = {}
    scaled_current_aqi = {}
    
    for col in aqi_columns:
        scaler = MinMaxScaler()
        # Fit the scaler on a range, assuming typical AQI value ranges (e.g., 0 to 500 for PM2.5 and PM10, etc.)
        scaler.fit(np.array([0, 500]).reshape(-1, 1))
        scaled_current_aqi[col] = scaler.transform(np.array(current_aqi[col]).reshape(-1, 1))
        scalers[col] = scaler

    # Function to forecast future AQI values
    def forecast_future(model, data, seq_length, future_steps):
        future_predictions = []
        current_seq = np.array(data[-seq_length:])  # Start from the last available sequence
        
        for _ in range(future_steps):
            pred = model.predict(current_seq[np.newaxis, :, :])  # Predict next step
            future_predictions.append(pred[0])
            current_seq = np.append(current_seq[1:], pred, axis=0)  # Update the sequence

        return np.array(future_predictions)

    # Load pre-trained models and forecast the next 'n' hours
    future_predictions = {}
    
    for col in aqi_columns:
        model = load_model(f'{col}_LSTM_model.h5')  # Load pre-trained LSTM model
        
        # Assuming current AQI values serve as the last known sequence
        future_scaled_predictions = forecast_future(model, scaled_current_aqi[col], seq_length, future_steps)
        
        # Inverse transform to get the actual AQI values
        future_predictions[col] = scalers[col].inverse_transform(future_scaled_predictions).flatten()

    # Convert future predictions to a DataFrame with a proper timestamp index
    future_df = pd.DataFrame(future_predictions, index=pd.date_range(start=pd.Timestamp.now(), periods=future_steps, freq='H'))
    return future_df

# Example Usage:
current_aqi = {
    'AQI_PM2.5': 45.0,
    'AQI_PM10': 60.0,
    'AQI_NO2': 20.0,
    'AQI_CO': 1.5,
    'AQI_SO2': 10.0
}

# Call the function to get future AQI predictions
future_df = forecast_aqi_from_current(current_aqi)

# Display the predicted future AQI values
print(future_df)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
                             AQI_PM2.5    AQI_PM10    AQI_NO2      AQI_CO  \
2024-08-19 21:52:35.250874  -18.761978   33.683098  23.624792  -45.963631   
2024-08-19 22:52:35.250874 -110.532150   -1.685523  26.746155 -103.369652   
2024-08-19 23:52:35.250874 -215.434692  -46.177261  29.455200 -166.950134   
2024-08-20 00:52:35.250874 -302.007690  -97.323051  31.822323 -230.052765   
2024-08-20 01:52:35.250874 -352.337891 -149.77

  future_df = pd.DataFrame(future_predictions, index=pd.date_range(start=pd.Timestamp.now(), periods=future_steps, freq='H'))


In [3]:
from aqi_cal import calculate_aqi, get_aqi_cat

breakpoints = {
    'PM2.5': [(0, 30, 0, 50), (31, 60, 51, 100), (61, 90, 101, 200), (91, 120, 201, 300), (121, 250, 301, 400), (251, 500, 401, 500)],
    'PM10': [(0, 50, 0, 50), (51, 100, 51, 100), (101, 250, 101, 200), (251, 350, 201, 300), (351, 430, 301, 400), (431, 500, 401, 500)],
    'NO2': [(0, 40, 0, 50), (41, 80, 51, 100), (81, 180, 101, 200), (181, 280, 201, 300), (281, 400, 301, 400), (401, 1000, 401, 500)],
    'SO2': [(0, 40, 0, 50), (41, 80, 51, 100), (81, 380, 101, 200), (381, 800, 201, 300), (801, 1600, 301, 400), (1601, 2100, 401, 500)],
    'CO': [(0, 1, 0, 50), (1.1, 2, 51, 100), (2.1, 10, 101, 200), (10.1, 17, 201, 300), (17.1, 34, 301, 400), (34.1, 50, 401, 500)],
}

def ret(pm2_5, pm10, no2, co, so2):
    try:
        # Dictionary to store provided pollutant values
        data = {
            "PM2.5": pm2_5,
            "PM10": pm10,
            "NO2": no2,
            "CO": co,
            "SO2": so2
        }

        # Calculate AQI for each relevant pollutant based on its concentration and breakpoints
        aqi_val = {}
        for pollutant, concentration in data.items():
            if pollutant in ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2']:
                aqi_val[f'{pollutant}'] = calculate_aqi(concentration, breakpoints[pollutant])

        # Calculate overall AQI by selecting the maximum value from calculated AQI values
        overall_aqi = max(aqi_val.values())

        # Get the AQI category and health impact based on the overall AQI
        remark, health_impact = get_aqi_cat(overall_aqi)

        # Identify the pollutant that contributed the most to the AQI
        pollutant_res = list(filter(lambda x: aqi_val[x] == overall_aqi, aqi_val))[0]

        # Return the final result
        return {
            "aqi": overall_aqi,
            "remark": remark,
            "impact": health_impact,
            "pollutant_res": pollutant_res
        }

    except Exception as e:
        print(f"Exception in ret method: {e}")
        return str(e)


In [4]:
future_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8 entries, 2023-04-01 00:00:00 to 2023-04-01 07:00:00
Freq: h
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AQI_PM2.5  8 non-null      float32
 1   AQI_PM10   8 non-null      float32
 2   AQI_NO2    8 non-null      float32
 3   AQI_CO     8 non-null      float32
 4   AQI_SO2    8 non-null      float32
dtypes: float32(5)
memory usage: 224.0 bytes


In [8]:
future_df["AQI_CO"] /= 1000
for row in future_df.iterrows():
    print(type(row[1][1]))
    print(ret(row[1]["AQI_PM2.5"], row[1]["AQI_PM10"], row[1]["AQI_NO2"], row[1]["AQI_CO"], row[1]["AQI_SO2"]))

<class 'numpy.float32'>
{'aqi': 101.817, 'remark': 'Moderate', 'impact': 'Breathing discomfort to the people with lungs, asthma, and heart diseases', 'pollutant_res': 'PM10'}
<class 'numpy.float32'>
{'aqi': 102.283, 'remark': 'Moderate', 'impact': 'Breathing discomfort to the people with lungs, asthma, and heart diseases', 'pollutant_res': 'PM10'}
<class 'numpy.float32'>
{'aqi': 102.515, 'remark': 'Moderate', 'impact': 'Breathing discomfort to the people with lungs, asthma, and heart diseases', 'pollutant_res': 'PM10'}
<class 'numpy.float32'>
{'aqi': 102.613, 'remark': 'Moderate', 'impact': 'Breathing discomfort to the people with lungs, asthma, and heart diseases', 'pollutant_res': 'PM10'}
<class 'numpy.float32'>
{'aqi': 102.582, 'remark': 'Moderate', 'impact': 'Breathing discomfort to the people with lungs, asthma, and heart diseases', 'pollutant_res': 'PM10'}
<class 'numpy.float32'>
{'aqi': 102.443, 'remark': 'Moderate', 'impact': 'Breathing discomfort to the people with lungs, asth

  print(type(row[1][1]))


In [25]:
import sys
print(sys.executable)

/usr/local/bin/python3


# <b>For Trend Analysis</b>

In [1]:
import pandas as pd

In [2]:
data_path = "./data/data_college.csv"

In [19]:
data = pd.read_csv(data_path)
df = pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15898 entries, 0 to 15897
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Date                  15898 non-null  object
 1    Time                 15898 non-null  object
 2   Date & Time           15898 non-null  object
 3   AQI                   15898 non-null  int64 
 4   High AQI              15898 non-null  int64 
 5   PM 1 - ug/m?          15898 non-null  int64 
 6   High PM 1 - ug/m?     15898 non-null  int64 
 7   PM 2.5 - ug/m?        15898 non-null  int64 
 8   High PM 2.5 - ug/m?   15898 non-null  int64 
 9   PM 10 - ug/m?         15898 non-null  int64 
 10  High PM 10 - ug/m?    15898 non-null  int64 
 11  Temp - ?C             15898 non-null  int64 
 12  High Temp - ?C        15898 non-null  int64 
 13  Low Temp - ?C         15898 non-null  int64 
 14  Hum - %               15898 non-null  int64 
 15  High Hum - %          15898 non-null

In [20]:
df.head()

Unnamed: 0,Date,Time,Date & Time,AQI,High AQI,PM 1 - ug/m?,High PM 1 - ug/m?,PM 2.5 - ug/m?,High PM 2.5 - ug/m?,PM 10 - ug/m?,...,High Hum - %,Low Hum - %,Dew Point - ?C,High Dew Point - ?C,Low Dew Point - ?C,Wet Bulb - ?C,High Wet Bulb - ?C,Low Wet Bulb - ?C,Heat Index - ?C,High Heat Index - ?C
0,1-Sep-2023,12:00:00 AM,9/1/2023,21,30,9,13,12,18,16,...,75,74,25,25,25,26,26,26,36,36
1,1-Sep-2023,12:15:00 AM,9/1/2023,22,30,10,12,13,18,17,...,76,75,25,25,25,26,26,26,36,36
2,1-Sep-2023,12:30:00 AM,9/1/2023,23,32,10,15,14,19,17,...,76,76,25,25,25,26,26,26,36,36
3,1-Sep-2023,12:45:00 AM,9/1/2023,23,35,11,14,14,21,18,...,76,75,25,25,25,26,26,26,36,36
4,1-Sep-2023,1:00:00 AM,9/1/2023,20,32,9,13,12,19,16,...,77,76,25,25,25,26,26,26,36,36


In [65]:
df[" Time"].unique()

array(['12:00:00 AM', '12:15:00 AM', '12:30:00 AM', '12:45:00 AM',
       '1:00:00 AM', '1:15:00 AM', '1:30:00 AM', '1:45:00 AM',
       '2:00:00 AM', '2:15:00 AM', '2:30:00 AM', '2:45:00 AM',
       '3:00:00 AM', '3:15:00 AM', '3:30:00 AM', '3:45:00 AM',
       '4:00:00 AM', '4:15:00 AM', '4:30:00 AM', '4:45:00 AM',
       '5:00:00 AM', '5:15:00 AM', '5:30:00 AM', '5:45:00 AM',
       '6:00:00 AM', '6:15:00 AM', '6:30:00 AM', '6:45:00 AM',
       '7:00:00 AM', '7:15:00 AM', '7:30:00 AM', '7:45:00 AM',
       '8:00:00 AM', '8:15:00 AM', '8:30:00 AM', '8:45:00 AM',
       '9:00:00 AM', '9:15:00 AM', '9:30:00 AM', '9:45:00 AM',
       '10:00:00 AM', '10:15:00 AM', '10:30:00 AM', '10:45:00 AM',
       '11:00:00 AM', '11:15:00 AM', '11:30:00 AM', '11:45:00 AM',
       '12:00:00 PM', '12:15:00 PM', '12:30:00 PM', '12:45:00 PM',
       '1:00:00 PM', '1:15:00 PM', '1:30:00 PM', '1:45:00 PM',
       '2:00:00 PM', '2:15:00 PM', '2:30:00 PM', '2:45:00 PM',
       '3:00:00 PM', '3:15:00 PM', '3:3

In [22]:
df["Date"] = df["Date"].apply(lambda x: x[-8:-5])
df["Date"].unique()

array(['Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar'], dtype=object)

In [9]:
df.drop("Date", axis=1, inplace=True)
df

Unnamed: 0,Time,Date & Time,AQI,High AQI,PM 1 - ug/m?,High PM 1 - ug/m?,PM 2.5 - ug/m?,High PM 2.5 - ug/m?,PM 10 - ug/m?,High PM 10 - ug/m?,...,High Hum - %,Low Hum - %,Dew Point - ?C,High Dew Point - ?C,Low Dew Point - ?C,Wet Bulb - ?C,High Wet Bulb - ?C,Low Wet Bulb - ?C,Heat Index - ?C,High Heat Index - ?C
0,12:00:00 AM,9/1/2023,21,30,9,13,12,18,16,26,...,75,74,25,25,25,26,26,26,36,36
1,12:15:00 AM,9/1/2023,22,30,10,12,13,18,17,32,...,76,75,25,25,25,26,26,26,36,36
2,12:30:00 AM,9/1/2023,23,32,10,15,14,19,17,27,...,76,76,25,25,25,26,26,26,36,36
3,12:45:00 AM,9/1/2023,23,35,11,14,14,21,18,30,...,76,75,25,25,25,26,26,26,36,36
4,1:00:00 AM,9/1/2023,20,32,9,13,12,19,16,26,...,77,76,25,25,25,26,26,26,36,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15893,11:00:00 PM,2/29/2024,203,242,62,68,92,103,102,114,...,58,56,22,22,21,24,24,23,34,34
15894,11:15:00 PM,2/29/2024,214,259,64,71,95,108,105,119,...,59,57,22,22,21,24,24,23,34,34
15895,11:30:00 PM,2/29/2024,210,256,63,68,94,107,104,114,...,59,58,22,22,22,24,24,24,34,34
15896,11:45:00 PM,2/29/2024,219,249,65,70,96,105,107,118,...,61,59,22,22,22,24,24,24,34,34


In [27]:
df[["AQI", "High AQI", "PM 10 - ug/m?", "High PM 10 - ug/m?"]].head()

Unnamed: 0,AQI,High AQI,PM 10 - ug/m?,High PM 10 - ug/m?
0,21,30,16,26
1,22,30,17,32
2,23,32,17,27
3,23,35,18,30
4,20,32,16,26


In [34]:
li = ["Date", " Time", "Date & Time", "AQI"]
new_df = df[li]
new_df

Unnamed: 0,Date,Time,Date & Time,AQI
0,Sep,12:00:00 AM,9/1/2023,21
1,Sep,12:15:00 AM,9/1/2023,22
2,Sep,12:30:00 AM,9/1/2023,23
3,Sep,12:45:00 AM,9/1/2023,23
4,Sep,1:00:00 AM,9/1/2023,20
...,...,...,...,...
15893,Feb,11:00:00 PM,2/29/2024,203
15894,Feb,11:15:00 PM,2/29/2024,214
15895,Feb,11:30:00 PM,2/29/2024,210
15896,Feb,11:45:00 PM,2/29/2024,219


In [37]:
new_df["Date & Time"].unique()

array(['9/1/2023', '9/2/2023', '9/3/2023', '9/4/2023', '9/5/2023',
       '9/6/2023', '9/7/2023', '9/21/2023', '9/22/2023', '9/23/2023',
       '9/24/2023', '9/25/2023', '9/26/2023', '9/27/2023', '9/28/2023',
       '9/29/2023', '9/30/2023', '10/1/2023', '10/2/2023', '10/3/2023',
       '10/4/2023', '10/5/2023', '10/6/2023', '10/7/2023', '10/8/2023',
       '10/9/2023', '10/10/2023', '10/11/2023', '10/12/2023',
       '10/13/2023', '10/14/2023', '10/15/2023', '10/16/2023',
       '10/17/2023', '10/18/2023', '10/19/2023', '10/20/2023',
       '10/21/2023', '10/22/2023', '10/23/2023', '10/24/2023',
       '10/25/2023', '10/26/2023', '10/27/2023', '10/28/2023',
       '10/29/2023', '10/30/2023', '10/31/2023', '11/1/2023', '11/2/2023',
       '11/3/2023', '11/4/2023', '11/5/2023', '11/6/2023', '11/7/2023',
       '11/8/2023', '11/9/2023', '11/10/2023', '11/11/2023', '11/12/2023',
       '11/13/2023', '11/14/2023', '11/15/2023', '11/16/2023',
       '11/17/2023', '11/18/2023', '11/19/2023',

In [42]:
new_df = new_df.loc[new_df.groupby("Date & Time")["AQI"].idxmax()]
new_df = new_df.sort_index()

In [44]:
new_df

Unnamed: 0,Date,Time,Date & Time,AQI
68,Sep,5:00:00 PM,9/1/2023,171
132,Sep,9:00:00 AM,9/2/2023,126
193,Sep,12:15:00 AM,9/3/2023,79
323,Sep,8:45:00 AM,9/4/2023,171
412,Sep,7:00:00 AM,9/5/2023,88
...,...,...,...,...
15555,Feb,10:30:00 AM,2/26/2024,382
15645,Feb,9:00:00 AM,2/27/2024,408
15736,Feb,7:45:00 AM,2/28/2024,334
15832,Feb,7:45:00 AM,2/29/2024,365


In [45]:
new_df["Ratio"] = (new_df["AQI"] / new_df["AQI"].max()) * 100

In [59]:
new_df.rename(columns={" Time":"Time"}, inplace=True)

In [63]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169 entries, 68 to 15897
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         169 non-null    object 
 1   Time         169 non-null    object 
 2   Date & Time  169 non-null    object 
 3   AQI          169 non-null    int64  
 4   Ratio        169 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 12.0+ KB


In [64]:
ret = new_df.to_json(orient='records')
print(ret)

[{"Date":"Sep","Time":"5:00:00 PM","Date & Time":"9\/1\/2023","AQI":171,"Ratio":30.1056338028},{"Date":"Sep","Time":"9:00:00 AM","Date & Time":"9\/2\/2023","AQI":126,"Ratio":22.1830985915},{"Date":"Sep","Time":"12:15:00 AM","Date & Time":"9\/3\/2023","AQI":79,"Ratio":13.9084507042},{"Date":"Sep","Time":"8:45:00 AM","Date & Time":"9\/4\/2023","AQI":171,"Ratio":30.1056338028},{"Date":"Sep","Time":"7:00:00 AM","Date & Time":"9\/5\/2023","AQI":88,"Ratio":15.4929577465},{"Date":"Sep","Time":"12:45:00 AM","Date & Time":"9\/6\/2023","AQI":67,"Ratio":11.7957746479},{"Date":"Sep","Time":"11:45:00 AM","Date & Time":"9\/7\/2023","AQI":50,"Ratio":8.8028169014},{"Date":"Sep","Time":"10:00:00 PM","Date & Time":"9\/21\/2023","AQI":25,"Ratio":4.4014084507},{"Date":"Sep","Time":"10:45:00 AM","Date & Time":"9\/22\/2023","AQI":48,"Ratio":8.4507042254},{"Date":"Sep","Time":"2:15:00 PM","Date & Time":"9\/23\/2023","AQI":99,"Ratio":17.4295774648},{"Date":"Sep","Time":"10:30:00 AM","Date & Time":"9\/24\/2023

In [68]:
import pandas as pd

def fetch_trend_data(file_path):
    try:
        # Load data from CSV
        df = pd.read_csv(file_path)

        # Clean up column names (strip leading/trailing spaces)
        df.columns = df.columns.str.strip()

        # Convert 'Date & Time' to datetime
        df['Date & Time'] = pd.to_datetime(df['Date & Time'])

        # Step 1: Get the row with the highest AQI for each unique 'Date & Time'
        new_df = df.loc[df.groupby("Date & Time")["AQI"].idxmax()]

        # Step 2: Sort the DataFrame by index (or alternatively by 'Date & Time')
        new_df = new_df.sort_index()

        # Step 3: Calculate the AQI Ratio
        new_df["Ratio"] = (new_df["AQI"] / new_df["AQI"].max()) * 100

        # Step 4: Rename the ' Time' column to 'Time' (remove leading spaces)
        new_df.rename(columns={"Time": "Time"}, inplace=True)

        # Step 5: Limit the data (e.g., last 100 rows)
        limited_df = new_df.tail(100)

        # Step 6: Convert to dictionary for returning
        trend_data = limited_df[['Date', 'Date & Time', 'AQI', 'Ratio']].to_dict(orient='records')

        return trend_data

    except Exception as e:
        print(f"Error in trend analysis fetch function: {e}")
        return None


# Example usage
file_path = './data/data_college.csv'
trend_data = fetch_trend_data(file_path)
print(trend_data)


[{'Date & Time': Timestamp('2023-11-22 00:00:00'), 'AQI': 310, 'Ratio': 54.5774647887324}, {'Date & Time': Timestamp('2023-11-23 00:00:00'), 'AQI': 313, 'Ratio': 55.1056338028169}, {'Date & Time': Timestamp('2023-11-24 00:00:00'), 'AQI': 348, 'Ratio': 61.267605633802816}, {'Date & Time': Timestamp('2023-11-25 00:00:00'), 'AQI': 328, 'Ratio': 57.74647887323944}, {'Date & Time': Timestamp('2023-11-26 00:00:00'), 'AQI': 287, 'Ratio': 50.52816901408451}, {'Date & Time': Timestamp('2023-11-27 00:00:00'), 'AQI': 252, 'Ratio': 44.36619718309859}, {'Date & Time': Timestamp('2023-11-28 00:00:00'), 'AQI': 384, 'Ratio': 67.6056338028169}, {'Date & Time': Timestamp('2023-11-29 00:00:00'), 'AQI': 318, 'Ratio': 55.98591549295775}, {'Date & Time': Timestamp('2023-11-30 00:00:00'), 'AQI': 336, 'Ratio': 59.154929577464785}, {'Date & Time': Timestamp('2023-12-01 00:00:00'), 'AQI': 366, 'Ratio': 64.43661971830986}, {'Date & Time': Timestamp('2023-12-02 00:00:00'), 'AQI': 366, 'Ratio': 64.43661971830986},

# <b>RAG</b>

In [8]:
!pip3 install pandas numpy scikit-learn genai torch transformers


Collecting genai
  Using cached genai-2.1.0-py3-none-any.whl.metadata (6.5 kB)
Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting openai<0.28.0,>=0.27.0 (from genai)
  Using cached openai-0.27.10-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken<0.4.0,>=0.3.2 (from genai)
  Using cached tiktoken-0.3.3.tar.gz (25 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Using cached huggingface_hub-0.25.0-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Using cached genai-2.1.0-py3-none-any.whl (16 kB)
Usin

In [4]:
import pandas as pd
import faiss
import numpy as np
import genai
from genai.embeddings import EmbeddingModel  # Assuming this is available for embeddings
import torch

genai.configure(api_key="AIzaSyCyM56FJ679pGZsUZ5aYNi2gOZEtoUWxAw")

# Step 1: Initialize the Gemini model for text generation and embedding model for RAG
generative_model = genai.GenerativeModel("gemini-1.5-flash")
embedding_model = EmbeddingModel("gemini-1.5-embed")  # Assuming an embedding model is available

# Load the schedule data from CSV
def load_schedule(file_path):
    """
    Loads the schedule data from the provided CSV file.
    :param file_path: The path to the CSV file containing the schedule.
    :return: DataFrame with schedule information.
    """
    schedule_df = pd.read_csv(file_path)
    return schedule_df

# Step 2: Generate embeddings using Gemini's embedding model
def generate_embeddings(sentences):
    """
    Generate embeddings for a list of sentences using Gemini's embedding model.
    :param sentences: List of sentences to embed.
    :return: Numpy array of embeddings.
    """
    embeddings = embedding_model.embed(sentences)
    return np.array(embeddings)

# Step 3: Build a FAISS index for the schedule data
def build_faiss_index(data, embedding_dim):
    """
    Build a FAISS index for the given data embeddings.
    :param data: List of text data to index.
    :param embedding_dim: Dimensionality of the embedding space.
    :return: FAISS index and list of corresponding data.
    """
    embeddings = generate_embeddings(data)
    
    # Create a FAISS index for efficient vector search
    index = faiss.IndexFlatL2(embedding_dim)  # L2 similarity search
    index.add(embeddings)  # Add embeddings to the index
    
    return index, data

# Step 4: Search the FAISS index for relevant schedule information
def search_schedule(query, index, schedule_data, top_k=3):
    """
    Search for the most relevant schedule information based on the user's query.
    :param query: User's query.
    :param index: FAISS index to search.
    :param schedule_data: List of schedule entries.
    :param top_k: Number of top results to return.
    :return: Top-k relevant schedule entries.
    """
    query_embedding = generate_embeddings([query])[0]  # Embed the query
    distances, indices = index.search(np.array([query_embedding]), top_k)  # Search the FAISS index
    
    # Retrieve the top-k matching schedule entries
    top_results = [schedule_data[i] for i in indices[0]]
    
    return top_results

# Step 5: Generate a response using the generative model
def generate_response(query, retrieved_data):
    """
    Generate a response using Gemini's generative model, augmented by retrieved data.
    :param query: User's query.
    :param retrieved_data: Data retrieved from the FAISS index.
    :return: Generated response.
    """
    # Augment the user's query with retrieved schedule information
    augmented_prompt = f"User's query: {query}\nRelevant schedule data: {retrieved_data}\nPlease provide a detailed response based on this information."
    
    # Generate a response using the generative model
    response = generative_model.generate_content(augmented_prompt)
    
    return response

# Step 6: Chatbot function to handle user queries
def chatbot(query, schedule_df, index, schedule_data):
    """
    Main chatbot function to handle queries.
    :param query: User's question or query.
    :param schedule_df: Schedule data.
    :param index: FAISS index.
    :param schedule_data: List of schedule entries.
    :return: Chatbot's response.
    """
    # Search for relevant schedule information
    retrieved_data = search_schedule(query, index, schedule_data)
    
    # Generate a response using the retrieved data
    response = generate_response(query, retrieved_data)
    
    return response

# Load the schedule
schedule_df = load_schedule('./data/history_data.csv')
schedule_data = schedule_df['Schedule'].tolist()  # Assuming a 'Schedule' column with text data
embedding_dim = 384  # Assuming embedding dimensions

# Build the FAISS index
index, schedule_data = build_faiss_index(schedule_data, embedding_dim)

# Example chatbot usage
user_query = "What's on the schedule for 2024-09-20?"
response = chatbot(user_query, schedule_df, index, schedule_data)
print(response)


ModuleNotFoundError: No module named 'faiss'