<a href="https://colab.research.google.com/github/akasharya044/AQI_Predication_Project_Using-ML/blob/main/College_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libaries Imports Here

In [5]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [9]:
import pandas as pd
import numpy as np
import os

def load_sample_data():
    """
    Load sample AQI data for demonstration purposes

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing sample AQI data
    """
    # Create a sample dataset with realistic AQI data features
    np.random.seed(42)

    # Number of samples
    n_samples = 1000

    # Generate features related to air quality
    pm25 = np.random.gamma(shape=2.0, scale=10.0, size=n_samples)  # PM2.5 levels
    pm10 = pm25 * 1.5 + np.random.normal(0, 5, n_samples)  # PM10 levels
    so2 = np.random.gamma(shape=1.0, scale=5.0, size=n_samples)  # SO2 levels
    no2 = np.random.gamma(shape=1.5, scale=10.0, size=n_samples)  # NO2 levels
    co = np.random.gamma(shape=0.5, scale=0.5, size=n_samples)  # CO levels
    o3 = np.random.gamma(shape=1.0, scale=15.0, size=n_samples)  # O3 levels

    # Temperature, humidity and wind features
    temperature = np.random.normal(25, 10, n_samples)  # Temperature in °C
    humidity = np.random.normal(60, 15, n_samples)  # Relative humidity (%)
    wind_speed = np.random.gamma(shape=2.0, scale=2.0, size=n_samples)  # Wind speed in m/s

    # Create temporal features
    # Month (1-12)
    month = np.random.randint(1, 13, n_samples)
    # Season (1: Spring, 2: Summer, 3: Fall, 4: Winter)
    season = np.ceil(month / 3) % 4 + 1

    # Location type (urban, suburban, rural, industrial)
    location_types = ['Urban', 'Suburban', 'Rural', 'Industrial']
    location_type = np.random.choice(location_types, n_samples)

    # Create AQI buckets based on the features
    # Calculate a weighted sum as a proxy for AQI
    aqi_proxy = (pm25 * 3.0 + pm10 * 1.5 + so2 * 2.0 + no2 * 2.0 +
                co * 10.0 + o3 * 1.0 - wind_speed * 5.0 +
                np.where(season == 2, 20, 0))  # Summer penalty

    # Create AQI buckets
    conditions = [
        (aqi_proxy < 50),
        (aqi_proxy >= 50) & (aqi_proxy < 100),
        (aqi_proxy >= 100) & (aqi_proxy < 150),
        (aqi_proxy >= 150) & (aqi_proxy < 200),
        (aqi_proxy >= 200) & (aqi_proxy < 300),
        (aqi_proxy >= 300)
    ]

    aqi_buckets = [
        'Good',
        'Moderate',
        'Unhealthy for Sensitive Groups',
        'Unhealthy',
        'Very Unhealthy',
        'Hazardous'
    ]

    aqi_bucket = np.select(conditions, aqi_buckets, default='Unknown')

    # Create DataFrame
    data = pd.DataFrame({
        'PM2.5': pm25,
        'PM10': pm10,
        'SO2': so2,
        'NO2': no2,
        'CO': co,
        'O3': o3,
        'Temperature': temperature,
        'Humidity': humidity,
        'Wind_Speed': wind_speed,
        'Month': month,
        'Season': season,
        'Location_Type': location_type,
        'AQI_Bucket': aqi_bucket
    })

    # Add some missing values to make it more realistic
    for col in data.columns:
        if col != 'AQI_Bucket':  # Don't add missing values to the target column
            # Add 2% missing values
            mask = np.random.random(n_samples) < 0.02
            data.loc[mask, col] = np.nan

    return data

def save_model(model, model_name):
    """
    Save a trained model to disk

    Parameters:
    -----------
    model : object
        The trained model to save
    model_name : str
        The name to use for the saved model
    """
    import joblib

    # Create models directory if it doesn't exist
    if not os.path.exists('models'):
        os.makedirs('models')

    # Save the model
    joblib.dump(model, f'models/{model_name}.joblib')
    print(f"Model saved as models/{model_name}.joblib")

def load_model(model_name):
    """
    Load a trained model from disk

    Parameters:
    -----------
    model_name : str
        The name of the model to load

    Returns:
    --------
    object
        The loaded model
    """
    import joblib

    model_path = f'models/{model_name}.joblib'

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file {model_path} not found")

    # Load the model
    return joblib.load(model_path)

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
import plotly.express as px
import plotly.graph_objects as go

def plot_correlation_heatmap(data):
    """
    Plot a correlation heatmap for the given data

    Parameters:
    -----------
    data : pandas.DataFrame
        The data to plot the correlation heatmap for

    Returns:
    --------
    matplotlib.figure.Figure
        The figure object containing the heatmap
    """
    plt.figure(figsize=(12, 10))
    corr = data.corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        vmax=.3,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        annot=True,
        fmt=".2f"
    )
    plt.title('Feature Correlation Heatmap')

    return fig

def plot_feature_importance(model, feature_names, model_type='logistic'):
    """
    Plot feature importance for the given model

    Parameters:
    -----------
    model : object
        The trained model (LogisticRegression or XGBClassifier)
    feature_names : array-like
        The names of the features
    model_type : str
        The type of model ('logistic' or 'xgboost')

    Returns:
    --------
    matplotlib.figure.Figure
        The figure object containing the feature importance plot
    """
    plt.figure(figsize=(12, 8))

    if model_type == 'logistic':
        # For multi-class, take average of absolute coefficients across classes
        if len(model.classes_) > 2:
            importances = np.mean(np.abs(model.coef_), axis=0)
        else:
            importances = np.abs(model.coef_[0])
    else:  # XGBoost
        importances = model.feature_importances_

    # Sort features by importance
    indices = np.argsort(importances)

    fig, ax = plt.subplots(figsize=(12, 8))
    plt.barh(range(len(indices)), importances[indices], align='center')
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Feature Importance')

    if model_type == 'logistic':
        plt.title('Logistic Regression Coefficient Magnitudes')
    else:
        plt.title('XGBoost Feature Importance')

    plt.tight_layout()

    return fig

def plot_confusion_matrix(cm, classes):
    """
    Plot a confusion matrix

    Parameters:
    -----------
    cm : array-like
        The confusion matrix to plot
    classes : array-like
        The class labels

    Returns:
    --------
    matplotlib.figure.Figure
        The figure object containing the confusion matrix plot
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes
    )

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')

    return fig

def plot_roc_curve(y_true, lr_probs, xgb_probs, classes):
    """
    Plot ROC curves for both models

    Parameters:
    -----------
    y_true : array-like
        True target values
    lr_probs : array-like
        Predicted probabilities from Logistic Regression
    xgb_probs : array-like
        Predicted probabilities from XGBoost
    classes : array-like
        Class labels

    Returns:
    --------
    matplotlib.figure.Figure
        The figure object containing the ROC curve plot
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # Convert y_true to one-hot encoding for multi-class ROC
    y_true_dummies = pd.get_dummies(y_true, columns=classes).values

    # Calculate ROC curve and AUC for each class for both models
    for i, class_name in enumerate(classes):
        # Logistic Regression
        fpr_lr, tpr_lr, _ = roc_curve(y_true_dummies[:, i], lr_probs[:, i])
        roc_auc_lr = auc(fpr_lr, tpr_lr)
        ax.plot(fpr_lr, tpr_lr, lw=2, alpha=0.7,
                label=f'LR - {class_name} (AUC = {roc_auc_lr:.2f})')

        # XGBoost
        fpr_xgb, tpr_xgb, _ = roc_curve(y_true_dummies[:, i], xgb_probs[:, i])
        roc_auc_xgb = auc(fpr_xgb, tpr_xgb)
        ax.plot(fpr_xgb, tpr_xgb, lw=2, alpha=0.7, linestyle='--',
                label=f'XGB - {class_name} (AUC = {roc_auc_xgb:.2f})')

    # Plot diagonal
    ax.plot([0, 1], [0, 1], 'k--', lw=2)

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curves')
    ax.legend(loc="lower right")

    return fig

def plot_prediction_comparison(y_true, lr_preds, xgb_preds):
    """
    Plot a comparison of model predictions vs true values

    Parameters:
    -----------
    y_true : array-like
        True target values
    lr_preds : array-like
        Predictions from Logistic Regression
    xgb_preds : array-like
        Predictions from XGBoost

    Returns:
    --------
    matplotlib.figure.Figure
        The figure object containing the prediction comparison plot
    """
    # Calculate accuracy per class
    classes = np.unique(np.concatenate([y_true, lr_preds, xgb_preds]))

    # Initialize dictionaries to store class accuracies
    lr_class_acc = {}
    xgb_class_acc = {}

    for cls in classes:
        # Get indices where true class is cls
        idx = np.where(y_true == cls)[0]

        if len(idx) > 0:
            # Calculate class accuracy for Logistic Regression
            lr_class_acc[cls] = np.sum(lr_preds[idx] == cls) / len(idx)

            # Calculate class accuracy for XGBoost
            xgb_class_acc[cls] = np.sum(xgb_preds[idx] == cls) / len(idx)

    # Create a DataFrame for plotting
    df = pd.DataFrame({
        'Class': list(lr_class_acc.keys()),
        'Logistic Regression': list(lr_class_acc.values()),
        'XGBoost': list(xgb_class_acc.values())
    })

    # Melt the DataFrame for easier plotting
    df_melted = pd.melt(df, id_vars=['Class'], var_name='Model', value_name='Accuracy')

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 8))

    sns.barplot(x='Class', y='Accuracy', hue='Model', data=df_melted, ax=ax)

    plt.title('Model Accuracy by Class')
    plt.xlabel('Class')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45)
    plt.legend(title='Model')
    plt.tight_layout()

    return fig



In [14]:
[server]
headless = true
address = "0.0.0.0"
port = 5000

NameError: name 'server' is not defined

In [13]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from preprocessing import preprocess_data, handle_missing_values, encode_categorical_features
from models import train_logistic_regression, train_xgboost, evaluate_model
from visualization import (
    plot_correlation_heatmap,
    plot_feature_importance,
    plot_confusion_matrix,
    plot_roc_curve,
    plot_prediction_comparison
)
from utils import load_sample_data

# Set page configuration
st.set_page_config(
    page_title="AQI Prediction App",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Define app title and description
st.title("🌍 Air Quality Index (AQI) Prediction")
st.markdown("""
This application helps you predict Air Quality Index (AQI) using machine learning models.
Upload your data, explore it, and compare the performance of Logistic Regression and XGBoost models.
""")

# Initialize session state variables if they don't exist
if 'data' not in st.session_state:
    st.session_state.data = None
if 'preprocessed_data' not in st.session_state:
    st.session_state.preprocessed_data = None
if 'X_train' not in st.session_state


SyntaxError: expected ':' (<ipython-input-13-61c6ad8756b0>, line 44)