In [175]:
#loads data into the Python file

import pandas as pd
# The CSV path is relative, so it is resolved against the kernel's working directory.
air_data = pd.read_csv(filepath_or_buffer='AirQuality_Daily_StudentVersion.csv')

In [176]:
#Defines Categories for humidity
def categorize_humidity(hum):
    """Bucket a single humidity reading into a coarse label.

    Parameters
    ----------
    hum : scalar
        Humidity reading, compared against the fixed cut-offs 50 and 80.

    Returns
    -------
    str or None
        'Low' when hum < 50, 'High' when 50 <= hum <= 80, 'Very High' when
        hum > 80, and None when the reading is missing (NaN / None).
    """
    # NaN fails every comparison below, so without this guard a missing
    # reading would fall through to the last branch and be counted as
    # 'Very High'. Returning None lets groupby() drop it instead.
    if pd.isna(hum):
        return None
    if hum < 50:
        return 'Low'
    if hum <= 80:
        return 'High'
    return 'Very High'

#Defines Categories for temperature
def categorize_temperature(temp):
    """Bucket a single temperature reading into a coarse label.

    Parameters
    ----------
    temp : scalar
        Temperature reading, compared against the fixed cut-offs 32, 50
        and 70.
        NOTE(review): the 'Below Freezing' label at 32 suggests degrees
        Fahrenheit - confirm against the dataset.

    Returns
    -------
    str or None
        'Below Freezing' when temp < 32, 'Cool' when 32 <= temp <= 50,
        'Warm' when 50 < temp <= 70, 'Hot' when temp > 70, and None when
        the reading is missing (NaN / None).
    """
    # NaN fails every comparison below, so without this guard a missing
    # reading would fall through to the last branch and be counted as 'Hot'.
    # Returning None lets groupby() drop it instead.
    if pd.isna(temp):
        return None
    if temp < 32:
        return 'Below Freezing'
    if temp <= 50:
        return 'Cool'
    if temp <= 70:
        return 'Warm'
    return 'Hot'
        
#Defines Categories for altitude
def categorize_altitude(alt):
    """Bucket a single sensor altitude into a coarse label.

    Parameters
    ----------
    alt : scalar
        Altitude value, compared against the fixed cut-offs 1500 and 2500.

    Returns
    -------
    str or None
        "low" when alt < 1500, "Medium" when 1500 <= alt <= 2500, "High"
        when alt > 2500, and None when the value is missing (NaN / None).
        The lowercase "low" is kept as-is so existing group labels do not
        change.
    """
    # NaN fails every comparison below, so without this guard a missing
    # altitude would fall through to the last branch and be counted as
    # "High". Returning None lets groupby() drop it instead.
    if pd.isna(alt):
        return None
    if alt < 1500:
        return "low"
    if alt <= 2500:
        return "Medium"
    return "High"



        

In [189]:
# Attach one categorical column per measured condition. The columns are added
# in the order humidity, temperature, altitude; each is produced by running
# the matching categorize_* function over every value of its source column.
air_data = air_data.assign(
    humidity_category=air_data['humidity'].apply(categorize_humidity),
    temperature_category=air_data['temperature'].apply(categorize_temperature),
    altitude_category=air_data['sensor.altitude'].apply(categorize_altitude),
)

In [190]:
#--Step 1: find the top 5 locations with highest concentrations--

# Per-sensor mean and median of each pollutant. Each resulting frame has
# two-level column labels such as ('pm2.5_atm', 'mean').
by_sensor = air_data.groupby('sensor.name')
summary_stats = ['mean', 'median']

sorted_25 = by_sensor.agg({'pm2.5_atm': summary_stats})
sorted_10 = by_sensor.agg({'pm10.0_atm': summary_stats})
sorted_voc = by_sensor.agg({'voc': summary_stats})


def _top_five(stats, pollutant, statistic):
    """Return the five rows of `stats` with the largest (pollutant, statistic) value."""
    ranked = stats.sort_values(by=(pollutant, statistic), ascending=False)
    return ranked.head(5)


# Top-5 sensors ranked by mean concentration
pm25_mean = _top_five(sorted_25, 'pm2.5_atm', 'mean')
pm10_mean = _top_five(sorted_10, 'pm10.0_atm', 'mean')
voc_mean = _top_five(sorted_voc, 'voc', 'mean')

# Top-5 sensors ranked by median concentration
pm25_median = _top_five(sorted_25, 'pm2.5_atm', 'median')
pm10_median = _top_five(sorted_10, 'pm10.0_atm', 'median')
voc_median = _top_five(sorted_voc, 'voc', 'median')

In [179]:
# Step 1 report: the three tables ranked by mean concentration.
# Each heading and its table are emitted on consecutive lines via sep="\n".
print("      Top 5 locations with highest mean concentrations")
print("Top 5 pm2.5 mean:", pm25_mean, sep="\n")
print("\nTop 5 pm10.0 mean:", pm10_mean, sep="\n")
print("\nTop 5 voc mean:", voc_mean, sep="\n")





      Top 5 locations with highest mean concentrations
Top 5 pm2.5 mean:
                                                     pm2.5_atm           
                                                          mean     median
sensor.name                                                              
Broken Bow                                          928.710593  36.050240
#16 - Richardson County Courthouse                  700.127342  11.977344
#18 - Southeast District Health Department- Tec...  613.175352  10.322875
NCDHD O'Neill #11                                   164.495078   7.251208
Swnphd-mccook                                       123.011622   4.582281

Top 5 pm10.0 mean:
                                                    pm10.0_atm           
                                                          mean     median
sensor.name                                                              
Broken Bow                                          929.678512  43.179094
#16 - Richardson Co

In [180]:
# Step 1 report: the three tables ranked by median concentration.
# Each heading and its table are emitted on consecutive lines via sep="\n".
print("      Top 5 locations with highest median concentrations")
print("Top 5 pm2.5 median:", pm25_median, sep="\n")
print("\nTop 5 pm10.0 median:", pm10_median, sep="\n")
print("\nTop 5 voc median:", voc_median, sep="\n")

      Top 5 locations with highest median concentrations
Top 5 pm2.5 median:
                                                     pm2.5_atm           
                                                          mean     median
sensor.name                                                              
Broken Bow                                          928.710593  36.050240
#16 - Richardson County Courthouse                  700.127342  11.977344
#18 - Southeast District Health Department- Tec...  613.175352  10.322875
ELVPHD Norfolk HD 4                                  13.369492   9.706229
ELVPHD Wisner HD 5                                   11.154420   8.464583

Top 5 pm10.0 median:
                                                    pm10.0_atm           
                                                          mean     median
sensor.name                                                              
Broken Bow                                          929.678512  43.179094
#16 - Richard

In [181]:
#--Step 2: find the locations and dates where the maximum concentrations occurred--

# Narrow the frame to date, sensor name and one pollutant at a time
id_columns = ['date', 'sensor.name']
filter_voc = air_data[id_columns + ['voc']]
filter_25 = air_data[id_columns + ['pm2.5_atm']]
filter_10 = air_data[id_columns + ['pm10.0_atm']]

# After a descending sort on the pollutant column, the first row holds the
# single largest reading; only that one row is kept.
pm25_max = filter_25.sort_values('pm2.5_atm', ascending=False).iloc[:1]
pm10_max = filter_10.sort_values('pm10.0_atm', ascending=False).iloc[:1]
voc_max = filter_voc.sort_values('voc', ascending=False).iloc[:1]



In [192]:
# Step 2 report: the single highest reading for each pollutant.
# Each heading and its one-row table are emitted on consecutive lines.
print("      Top locations and dates where the maximum concentrations occured")
print("Top pm2.5 day:", pm25_max, sep="\n")
print("\nTop pm10.0 day:", pm10_max, sep="\n")
print("\nTop voc day:", voc_max, sep="\n")

      Top locations and dates where the maximum concentrations occured
Top pm2.5 day:
          date                         sensor.name    pm2.5_atm
7561  02/18/25  #16 - Richardson County Courthouse  3782.823313

Top pm10.0 day:
          date                         sensor.name   pm10.0_atm
7561  02/18/25  #16 - Richardson County Courthouse  3784.682542

Top voc day:
          date      sensor.name          voc
2391  06/24/24  Swnphd-ogallala  1209.931571


In [183]:
#--Step 3: find if Temperature and Humidity have an impact on Air Quality--

# Mean particle concentration within each temperature category
by_temperature = air_data.groupby('temperature_category')
temperature_impact25 = by_temperature['pm2.5_atm'].mean()
temperature_impact10 = by_temperature['pm10.0_atm'].mean()

# Largest mean first, so the category with the highest concentration leads
temp_imp25 = temperature_impact25.sort_values(ascending=False)
temp_imp10 = temperature_impact10.sort_values(ascending=False)

# Mean particle concentration within each humidity category
by_humidity = air_data.groupby('humidity_category')
humidity_impact25 = by_humidity['pm2.5_atm'].mean()
humidity_impact10 = by_humidity['pm10.0_atm'].mean()

# Largest mean first, as for temperature
humidity_imp25 = humidity_impact25.sort_values(ascending=False)
humidity_imp10 = humidity_impact10.sort_values(ascending=False)

In [184]:
# Step 3 report: category means for pm2.5 followed by pm10.0, first by
# temperature and then by humidity. sep="\n" puts each item on its own line.
print("       Mean Air Quality in different conditions")
print("temperatures impact on air quality:\n", temp_imp25, temp_imp10, sep="\n")
print("\n humitdities impact on air quality:\n", humidity_imp25, humidity_imp10, sep="\n")

      

       Mean Air Quality in different conditions
temperatures impact on air quality:

temperature_category
Below Freezing    273.698560
Cool              141.658843
Warm               86.506629
Hot                70.145543
Name: pm2.5_atm, dtype: float64
temperature_category
Below Freezing    276.080794
Cool              143.074475
Warm               87.428866
Hot                71.821037
Name: pm10.0_atm, dtype: float64

 humitdities impact on air quality:

humidity_category
Very High    533.786399
High          80.874444
Low           76.513935
Name: pm2.5_atm, dtype: float64
humidity_category
Very High    536.520850
High          82.561133
Low           77.880520
Name: pm10.0_atm, dtype: float64


In [185]:
#--Step 4: find dates and locations where air quality levels were unhealthy--

# Narrow the frame to date, sensor name and one pollutant at a time
filter_25 = air_data.loc[:, ['date', 'sensor.name', 'pm2.5_atm']]
filter_10 = air_data.loc[:, ['date', 'sensor.name', 'pm10.0_atm']]
filter_voc = air_data.loc[:, ['date', 'sensor.name', 'voc']]

# Keep only rows whose reading is at least 101, the cut-off this notebook
# treats as unhealthy.
# NOTE(review): the earlier comment gave the unit as PPM - confirm the unit
# of the pm columns and whether 101 is meant as a raw concentration.
high_risk25 = filter_25.loc[filter_25['pm2.5_atm'].ge(101)]
high_risk10 = filter_10.loc[filter_10['pm10.0_atm'].ge(101)]




In [186]:
# Step 4 report: every row at or above the unhealthy cut-off, per pollutant.
# Each heading and its table are emitted on consecutive lines via sep="\n".
print("      Days and Locations of Unhealthy levels of concentration")
print("\n Unhealthy levels of pm2.5\n", high_risk25, sep="\n")
print("\n Unhealthy levels of pm10.0\n", high_risk10, sep="\n")





      

      Days and Locations of Unhealthy levels of concentration

 Unhealthy levels of pm2.5

          date                                        sensor.name    pm2.5_atm
2092  06/12/24                                           FCHD-YPS   363.201104
2144  06/14/24                Loup Basin Public Health Department   230.482438
2443  06/27/24                                   Swnphd-Benklemen   188.156771
3016  07/20/24                 #16 - Richardson County Courthouse  1164.533750
3041  07/21/24                 #16 - Richardson County Courthouse  1355.784542
...        ...                                                ...          ...
8274  03/23/25                                      Swnphd-mccook   981.919875
8291  03/24/25  #18 - Southeast District Health Department- Te...  1692.498333
8293  03/24/25                                  NCDHD O'Neill #11  1714.236813
8294  03/24/25                                         Broken Bow  1713.840375
8295  03/24/25                          

In [187]:
#--Step 5: find if altitude has an impact on air quality--

# Mean concentration of each pollutant within each altitude category
by_altitude = air_data.groupby('altitude_category')
altitude_impact25 = by_altitude['pm2.5_atm'].mean()
altitude_impact10 = by_altitude['pm10.0_atm'].mean()
altitude_impactvoc = by_altitude['voc'].mean()

# Largest mean first, so the category with the highest concentration leads
alt_imp25 = altitude_impact25.sort_values(ascending=False)
alt_imp10 = altitude_impact10.sort_values(ascending=False)
alt_impvoc = altitude_impactvoc.sort_values(ascending=False)



In [188]:
# Step 5 report: altitude-category means for each pollutant.
# Each heading and its table are emitted on consecutive lines via sep="\n".
print("altitudes impact on pm2.5 levels:\n", alt_imp25, sep="\n")
print("altitudes impact on pm10.0 levels:\n", alt_imp10, sep="\n")
print("altitudes impact on voc levels:\n", alt_impvoc, sep="\n")

altitudes impact on pm2.5 levels:

altitude_category
low       167.986720
High      147.867249
Medium     20.014525
Name: pm2.5_atm, dtype: float64
altitudes impact on pm10.0 levels:

altitude_category
low       169.849046
High      149.067477
Medium     21.461588
Name: pm10.0_atm, dtype: float64
altitudes impact on voc levels:

altitude_category
Medium    284.553208
High      283.223755
low       239.718255
Name: voc, dtype: float64
