In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import IPython #Need for showing EDA visual within VSCODE
import nbformat #Need for showing EDA visual within VSCODE
print("✅ All packages successfully imported!")


✅ All packages successfully imported!


In [2]:
# Load the raw pollution measurements and drop every row that has a missing value.
pollution_dataset = pd.read_csv('pollution_2000_2023.csv')
pollution_dataset = pollution_dataset.dropna()

# Convert Date to datetime so the .dt accessors can be used later on.
pollution_dataset["Date"] = pd.to_datetime(pollution_dataset["Date"])

# Convert the location columns to pandas' dedicated string dtype.
for location_col in ["Address", "State", "County", "City"]:
    pollution_dataset[location_col] = pollution_dataset[location_col].astype('string')

# Convert each pollutant's "1st Max Hour" to a categorical over the 24 hours of the day.
# The column name is built once per pollutant and used for both the read and the write,
# so every pollutant is derived from its OWN column (previously "SO2 1st Max Hour" was
# overwritten with the values of "CO 1st Max Hour").
for pollutant in ["O3", "CO", "SO2", "NO2"]:
    hour_col = f"{pollutant} 1st Max Hour"
    pollution_dataset[hour_col] = pd.Categorical(pollution_dataset[hour_col], categories=range(24))

# Convert AQI variables to integer since that's how they are always reported;
# the other two AQI columns were already int types.
for aqi_col in ["CO AQI", "SO2 AQI"]:
    pollution_dataset[aqi_col] = pollution_dataset[aqi_col].astype(int)

In [None]:
# Display the DataFrame so the result of the loading/cleaning steps can be inspected.
pollution_dataset

In [3]:
# Heat map showing the mean concentration of each pollutant by state.
mean_pollutant_cols=['O3 Mean','CO Mean','SO2 Mean','NO2 Mean']
avg_pollutant_state =pollution_dataset.groupby('State')[mean_pollutant_cols].mean().reset_index()

# Plotly's 'USA-states' location mode expects two-letter state abbreviations,
# which is why the full state names have to be mapped.
state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE','District Of Columbia':'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME',
    'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY'
}

avg_pollutant_state['State Code']=avg_pollutant_state['State'].map(state_abbrev)

# Any 'State' value that is not a key of state_abbrev ends up with a missing code.
# Report those names instead of dropping them silently, and plot only the rows that
# have a usable code so rows that cannot be drawn are not handed to plotly at all.
unmapped_states = avg_pollutant_state.loc[avg_pollutant_state['State Code'].isna(), 'State'].tolist()
if unmapped_states:
  print(f"No state abbreviation found for: {unmapped_states} (left out of the maps)")
mapped_pollutant_state = avg_pollutant_state.dropna(subset=['State Code'])

# One choropleth per pollutant column.
for i in mean_pollutant_cols:
  fig = px.choropleth(
      mapped_pollutant_state,
      locations='State Code',
      locationmode='USA-states',
      color=i,
      color_continuous_scale='Reds',
      scope='usa',
      labels={i: f'{i} Concentration'},
      title = f'Average {i} Concentration By State'
  )

  fig.show()


In [None]:
# Scale the continuous pollutant columns (mean and 1st max value of each pollutant)
# with scikit-learn's StandardScaler; the scaled values replace the originals in place.
scale_cols = [
    'O3 Mean', 'O3 1st Max Value',
    'CO Mean', 'CO 1st Max Value',
    'SO2 Mean', 'SO2 1st Max Value',
    'NO2 Mean', 'NO2 1st Max Value',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(pollution_dataset[scale_cols])
pollution_dataset[scale_cols] = scaled_values

In [None]:
# Add temporal features derived from Date.
pollution_dataset['Year'] = pollution_dataset['Date'].dt.year
pollution_dataset['Month'] = pollution_dataset['Date'].dt.month
pollution_dataset['Day'] = pollution_dataset['Date'].dt.day
# pandas numbers weekdays Monday=0 ... Sunday=6.
pollution_dataset['DayOfWeek'] = pollution_dataset['Date'].dt.dayofweek
pollution_dataset['IsWeekend'] = pollution_dataset['DayOfWeek'].isin([5, 6]).astype(int)   # Sat/Sun
pollution_dataset['IsWedThur'] = pollution_dataset['DayOfWeek'].isin([2, 3]).astype(int)   # Wed/Thu

# Pollution level aggregate: row-wise mean of the four pollutant mean columns,
# using whatever values those columns currently hold in the frame.
pollution_dataset['Pollution_Avg'] = pollution_dataset[[
    'O3 Mean','CO Mean','SO2 Mean','NO2 Mean'
]].mean(axis=1)

# Lag features: previous row's value for the same monitoring location.
# A city name on its own does not identify a location (the same name can appear in
# more than one state, and a city can have more than one address), so grouping on
# 'City' alone could shift in a value measured somewhere else. Grouping on all of
# the location columns keeps each lag inside one location. The sort is unchanged,
# so the row order of the frame and of the exported CSV stays the same; rows inside
# every group are still ordered by Date.
pollution_dataset = pollution_dataset.sort_values(['City', 'Date'])
lag_group_keys = ['State', 'County', 'City', 'Address']
for col in ['O3 Mean','CO Mean','SO2 Mean','NO2 Mean']:
    # The first row of each group has no predecessor and therefore gets a missing lag.
    pollution_dataset[f'{col}_lag1'] = pollution_dataset.groupby(lag_group_keys)[col].shift(1)

# Persist the engineered dataset without the index column.
pollution_dataset.to_csv('cleaned_pollution_data.csv', index=False)

In [None]:
# Display the DataFrame again to inspect the added temporal, aggregate and lag columns.
pollution_dataset

In [None]:
# Correlation heatmap of the pollutant feature columns listed in scale_cols.
corr_fig, corr_ax = plt.subplots(figsize=(10,6))
pollutant_corr = pollution_dataset[scale_cols].corr()
sns.heatmap(pollutant_corr, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Correlation of Pollutant Features")
plt.show()

# Time trend of a pollutant in a major city.
# NOTE(review): if the scaling cell has been run, 'O3 Mean' holds scaled values
# rather than raw concentrations - confirm this is what the chart should show.
city_name = 'Los Angeles'
in_selected_city = pollution_dataset['City'].eq(city_name)
la_data = pollution_dataset.loc[in_selected_city]
fig = px.line(
    la_data,
    x='Date',
    y='O3 Mean',
    title=f'O3 Levels Over Time in {city_name}',
)
fig.show()