# 1. Loading the Data


In [2]:
# Read the AQI CSV file into a pandas DataFrame.
import pandas as pd

# Location of the source CSV on disk
file_path = '/Users/nataliiabondarenko/Desktop/Project4/Resources/nc_aqi_2010-2022.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Print the first five rows as a quick look at the loaded frame
print(data.head(n=5))


            State     County  Year  Days with AQI  Good Days  Moderate Days  \
0  North Carolina   Alamance  2010            363        231            132   
1  North Carolina  Alexander  2010            214        132             75   
2  North Carolina      Avery  2010            362        280             75   
3  North Carolina   Buncombe  2010            364        246            114   
4  North Carolina   Caldwell  2010            212        149             59   

   Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
0                                    0               0                    0   
1                                    7               0                    0   
2                                    7               0                    0   
3                                    4               0                    0   
4                                    4               0                    0   

   Hazardous Days  Max AQI  90th Percentile AQI  M

# 2. Data Inspection
Inspect the data to understand its structure, missing values, and potential inconsistencies.

In [3]:
# Structure of the frame: column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare -- wrapping it in print() appends a stray "None" line.
data.info()

# Summary statistics (count, mean, std, min, quartiles, max)
print(data.describe())

# Number of missing values in each column
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 521 entries, 0 to 520
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   State                                521 non-null    object
 1   County                               521 non-null    object
 2   Year                                 521 non-null    int64 
 3   Days with AQI                        521 non-null    int64 
 4   Good Days                            521 non-null    int64 
 5   Moderate Days                        521 non-null    int64 
 6   Unhealthy for Sensitive Groups Days  521 non-null    int64 
 7   Unhealthy Days                       521 non-null    int64 
 8   Very Unhealthy Days                  521 non-null    int64 
 9   Hazardous Days                       521 non-null    int64 
 10  Max AQI                              521 non-null    int64 
 11  90th Percentile AQI                  521 non-

# 3. Data Cleaning
Based on the inspection, perform the necessary cleaning steps; here, that means removing duplicate rows.

In [6]:
# Remember the row count so the effect of de-duplication can actually be reported;
# the resulting shape alone does not show whether anything was dropped.
rows_before = len(data)

# Drop rows that are exact duplicates of an earlier row
data = data.drop_duplicates()

# Report how many rows were dropped, then the resulting shape
print(f"Duplicate rows removed: {rows_before - len(data)}")
print(f"Data shape after removing duplicates: {data.shape}")



Data shape after removing duplicates: (521, 18)


# 4. Data Transformation
Transform the data as needed for analysis.

In [7]:
# Store the 'State' and 'County' text columns as pandas categoricals.
for label_column in ('State', 'County'):
    data[label_column] = data[label_column].astype('category')

# Show every column's dtype to verify the conversion took effect
data.dtypes


State                                  category
County                                 category
Year                                      int64
Days with AQI                             int64
Good Days                                 int64
Moderate Days                             int64
Unhealthy for Sensitive Groups Days       int64
Unhealthy Days                            int64
Very Unhealthy Days                       int64
Hazardous Days                            int64
Max AQI                                   int64
90th Percentile AQI                       int64
Median AQI                                int64
Days CO                                   int64
Days NO2                                  int64
Days Ozone                                int64
Days PM2.5                                int64
Days PM10                                 int64
dtype: object

# 5. Data Export
After cleaning, export the data to a new CSV file, or use it directly for further analysis or machine learning.

In [8]:
# Export the cleaned data to a new CSV file.
# A separate file name is used so the raw input CSV loaded at the top of the
# notebook is never overwritten by this export.
cleaned_file_path = '/Users/nataliiabondarenko/Desktop/Project4/Resources/nc_aqi_2010-2022_cleaned.csv'
# index=False keeps the DataFrame's row index out of the file
data.to_csv(cleaned_file_path, index=False)


# 6. Database Storage 


In [9]:
from sqlalchemy import create_engine

# SQLite connection URL pointing at the file 'nc_aqi_data.db' in the local directory
sqlite_url = 'sqlite:///nc_aqi_data.db'
engine = create_engine(sqlite_url)



In [10]:
# Write the DataFrame to the 'air_quality_data' table through the engine.
# if_exists='replace' drops and recreates the table when it is already present;
# index=False leaves the DataFrame's row index out of the table.
data.to_sql(
    name='air_quality_data',
    con=engine,
    if_exists='replace',
    index=False,
)


521

In [11]:
# Round-trip check: load the 'air_quality_data' table back through the same engine
retrieved_data = pd.read_sql(sql='air_quality_data', con=engine)

# Print the first five rows of what came back
print(retrieved_data.head(n=5))


            State     County  Year  Days with AQI  Good Days  Moderate Days  \
0  North Carolina   Alamance  2010            363        231            132   
1  North Carolina  Alexander  2010            214        132             75   
2  North Carolina      Avery  2010            362        280             75   
3  North Carolina   Buncombe  2010            364        246            114   
4  North Carolina   Caldwell  2010            212        149             59   

   Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
0                                    0               0                    0   
1                                    7               0                    0   
2                                    7               0                    0   
3                                    4               0                    0   
4                                    4               0                    0   

   Hazardous Days  Max AQI  90th Percentile AQI  M

In [None]:
# Feature Engineering
# Adding new features based on the existing day-count columns.
# NOTE(review): these columns are added after the CSV export and the to_sql write
# earlier in the notebook, so they are not saved unless those cells are re-run.

# Mask zero denominators to NaN before dividing: a zero divisor would otherwise
# put inf (or NaN for 0/0) into the new columns.
moderate_days = data['Moderate Days'].where(data['Moderate Days'] != 0)
days_with_aqi = data['Days with AQI'].where(data['Days with AQI'] != 0)

# Ratio of Good to Moderate Days (NaN when a row has no Moderate Days)
data['Good_to_Moderate_Ratio'] = data['Good Days'] / moderate_days

# Percentage of Unhealthy Days (of all types), relative to 'Days with AQI'
# (NaN when 'Days with AQI' is 0)
unhealthy_days_total = (
    data['Unhealthy for Sensitive Groups Days']
    + data['Unhealthy Days']
    + data['Very Unhealthy Days']
    + data['Hazardous Days']
)
data['Unhealthy_Days_Percentage'] = unhealthy_days_total / days_with_aqi * 100

# Display the first few rows with the new features
data[['State', 'County', 'Year', 'Good_to_Moderate_Ratio', 'Unhealthy_Days_Percentage']].head()

