In [None]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Read the raw measurements into a DataFrame (point this at the real CSV file)
df = pd.read_csv(filepath_or_buffer='air_quality_data.csv')

# Step 1: Handle missing values using Moving Average imputation
def impute_missing_values(df, window=3):
    """Fill missing numeric values with a trailing moving average.

    Each NaN in a numeric column is replaced by the mean of the non-missing
    values among the current row and the ``window - 1`` rows before it.
    Non-numeric columns are returned unchanged, because a rolling mean is
    not defined for them and recent pandas versions raise instead of
    silently skipping such columns.

    Args:
        df: Input DataFrame; it is not modified.
        window: Number of rows (current row included) in the trailing window.

    Returns:
        A new DataFrame with the same index and columns as ``df``. A NaN
        whose whole trailing window is also NaN (for example a leading NaN,
        or the tail of a run of ``window`` or more consecutive NaNs) stays
        NaN.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) == 0:
        # Nothing to average; still hand back a new object like fillna does.
        return df.copy()

    # min_periods=1 lets a window with at least one real value yield a mean.
    rolling_mean = df[numeric_cols].rolling(window=window, min_periods=1).mean()

    # fillna aligns on index and columns, so only numeric NaNs are touched.
    return df.fillna(rolling_mean)

# Step 2: Detect and remove outliers using IQR method
def remove_outliers(df, factor=1.5):
    """Drop rows that contain an outlier according to the IQR rule.

    For every numeric column the first and third quartiles are computed; a
    value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. A row is removed if any of its numeric values is
    an outlier. Non-numeric columns are ignored when building the mask
    (quantiles are undefined for them) but are kept in the output.

    Args:
        df: Input DataFrame; it is not modified.
        factor: Multiplier applied to the IQR to set the fences
            (1.5 is the conventional value).

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels. NaN values never compare as outliers, so rows with NaNs are
        kept unless another value in the row is an outlier.
    """
    numeric = df.select_dtypes(include='number')
    if numeric.shape[1] == 0:
        # No column can be tested, hence no row can be flagged.
        return df.copy()

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # Comparing a DataFrame with a Series matches the Series labels to columns.
    is_outlier_row = ((numeric < lower_fence) | (numeric > upper_fence)).any(axis=1)
    return df[~is_outlier_row]

# Step 3: Logarithmic Transformation for skewed data
def log_transform(df):
    """Apply ``log(x + 1)`` to every numeric column.

    ``np.log1p`` is used so that zeros map to zero instead of ``-inf``.
    Non-numeric columns are passed through untouched, since the logarithm is
    not defined for them and NumPy raises ``TypeError`` on such data.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the same index and columns. Values below -1
        become NaN and a value of exactly -1 becomes ``-inf``, as produced
        by ``np.log1p``.
    """
    transformed = df.copy()
    for column in df.select_dtypes(include='number').columns:
        transformed[column] = np.log1p(df[column])
    return transformed

# Step 4: Check for normality using Shapiro-Wilk test
def check_normality(df):
    """Run the Shapiro-Wilk normality test on each numeric column.

    Missing values are dropped per column before testing, because a NaN in
    the sample otherwise makes the test result meaningless. Non-numeric
    columns are skipped. Columns with fewer than three non-missing
    observations cannot be tested and are reported as NaN.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A dict mapping each numeric column name to the test's p-value as a
        float (NaN when the column has too few observations).
    """
    normality_results = {}
    for column in df.select_dtypes(include='number').columns:
        sample = df[column].dropna()
        if len(sample) < 3:
            # Shapiro-Wilk needs at least three data points.
            normality_results[column] = float('nan')
            continue
        _, p_value = shapiro(sample)
        normality_results[column] = float(p_value)
    return normality_results

# Step 5: Z-score normalization
def normalize_data(df):
    """Standardize numeric columns to zero mean and unit variance (z-scores).

    The scaling is done with ``StandardScaler``. The result keeps the
    input's index, so rows stay aligned with the source frame even when
    earlier filtering left gaps in the index. Non-numeric columns cannot be
    scaled and are passed through unchanged.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``df`` in which
        every numeric column has been replaced by its standardized values.
    """
    df_normalized = df.copy()
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) == 0:
        return df_normalized

    scaler = StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(df[numeric_cols]),
        columns=numeric_cols,
        index=df.index,  # keep row labels instead of resetting to 0..n-1
    )
    for column in numeric_cols:
        df_normalized[column] = scaled[column]
    return df_normalized

# Run the pipeline: impute -> drop outliers -> log-transform -> test normality -> standardize
df_imputed = impute_missing_values(df=df)
df_clean = remove_outliers(df=df_imputed)
df_log_transformed = log_transform(df=df_clean)
normality_results = check_normality(df=df_log_transformed)
df_normalized = normalize_data(df=df_log_transformed)

# Write the standardized frame to disk without the index column
df_normalized.to_csv(path_or_buf='preprocessed_data.csv', index=False)
