# 1. Loading the Data


In [1]:
# Read the North Carolina AQI CSV into a pandas DataFrame.
import pandas as pd

# Location of the raw input file on disk
file_path = '/Users/nataliiabondarenko/Desktop/GitHub/project-4/Resources/nc_aqi_1980-2022.csv'

# Parse the CSV; every later cell works on this `data` frame
data = pd.read_csv(file_path)

# Print the first five rows as a quick sanity check of the columns
print(data.head(5))


            State     County  Year  Days with AQI  Good Days  Moderate Days  \
0  North Carolina   Alamance  2000            104         37             66   
1  North Carolina  Alexander  2000            211         94             64   
2  North Carolina      Avery  2000            365        211            117   
3  North Carolina   Buncombe  2000            260        129            110   
4  North Carolina   Cabarrus  2000            115         36             78   

   Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
0                                    1               0                    0   
1                                   48               5                    0   
2                                   34               3                    0   
3                                   15               6                    0   
4                                    1               0                    0   

   Hazardous Days  Max AQI  90th Percentile AQI  M

# 2. Data Inspection
Inspect the data to understand its structure, missing values, and potential inconsistencies.

In [2]:
# Basic info about the data: column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would append a stray "None"
# line after the report.
data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1587 entries, 0 to 1586
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   State                                1587 non-null   object
 1   County                               1587 non-null   object
 2   Year                                 1587 non-null   int64 
 3   Days with AQI                        1587 non-null   int64 
 4   Good Days                            1587 non-null   int64 
 5   Moderate Days                        1587 non-null   int64 
 6   Unhealthy for Sensitive Groups Days  1587 non-null   int64 
 7   Unhealthy Days                       1587 non-null   int64 
 8   Very Unhealthy Days                  1587 non-null   int64 
 9   Hazardous Days                       1587 non-null   int64 
 10  Max AQI                              1587 non-null   int64 
 11  90th Percentile AQI                  1587 n

In [3]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
summary_stats = data.describe()
print(summary_stats)


              Year  Days with AQI    Good Days  Moderate Days  \
count  1587.000000    1587.000000  1587.000000    1587.000000   
mean   2003.705734     242.311279   164.687461      64.797732   
std      10.548201     102.244244    84.592536      46.220878   
min    1980.000000       4.000000     4.000000       0.000000   
25%    1996.000000     195.000000    98.000000      33.000000   
50%    2004.000000     235.000000   151.000000      62.000000   
75%    2012.000000     357.000000   235.000000      84.000000   
max    2022.000000     366.000000   359.000000     237.000000   

       Unhealthy for Sensitive Groups Days  Unhealthy Days  \
count                          1587.000000     1587.000000   
mean                             10.408948        2.236295   
std                              13.896111        4.984869   
min                               0.000000        0.000000   
25%                               0.000000        0.000000   
50%                               3.000000

In [4]:
# Count the null entries in each column (all zeros means nothing to impute)
missing_per_column = data.isnull().sum()
print(missing_per_column)

State                                  0
County                                 0
Year                                   0
Days with AQI                          0
Good Days                              0
Moderate Days                          0
Unhealthy for Sensitive Groups Days    0
Unhealthy Days                         0
Very Unhealthy Days                    0
Hazardous Days                         0
Max AQI                                0
90th Percentile AQI                    0
Median AQI                             0
Days CO                                0
Days NO2                               0
Days Ozone                             0
Days PM2.5                             0
Days PM10                              0
dtype: int64


# 3. Data Cleaning
Based on the inspection, perform the necessary cleaning steps.

In [5]:
# Discard rows that are exact duplicates across all columns,
# keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

# Report the resulting shape; compare the row count with the
# pre-cleaning row count to see whether anything was removed
print(f"Data shape after removing duplicates: {data.shape}")



Data shape after removing duplicates: (1587, 18)


# 4. Data Transformation
Transform the data as needed for analysis.

In [6]:
# Cast both text columns to the pandas categorical dtype in a single call
data = data.astype({'State': 'category', 'County': 'category'})

# Display the per-column dtypes to verify the conversion took effect
data.dtypes


State                                  category
County                                 category
Year                                      int64
Days with AQI                             int64
Good Days                                 int64
Moderate Days                             int64
Unhealthy for Sensitive Groups Days       int64
Unhealthy Days                            int64
Very Unhealthy Days                       int64
Hazardous Days                            int64
Max AQI                                   int64
90th Percentile AQI                       int64
Median AQI                                int64
Days CO                                   int64
Days NO2                                  int64
Days Ozone                                int64
Days PM2.5                                int64
Days PM10                                 int64
dtype: object

In [7]:
# Remove the 'State' column from the frame
data = data.drop('State', axis=1)

# The column count in the printed shape should be one lower than before
print(f"Data shape after dropping 'State' column: {data.shape}")

Data shape after dropping 'State' column: (1587, 17)


# 5. Data Export
After cleaning, export the data to a new CSV file, or use it directly for further analysis or machine learning.

In [8]:
# Exporting to a new CSV file.
# The output name carries a "_cleaned" suffix so the raw input CSV read in the
# loading cell is NOT overwritten. Writing back to the input path would strip
# the 'State' column from the source file, and a later Restart & Run All would
# then fail in the cells that still reference 'State'.
cleaned_file_path = '/Users/nataliiabondarenko/Desktop/GitHub/project-4/Resources/nc_aqi_1980-2022_cleaned.csv'

# index=False: the RangeIndex carries no information, so it is not written out
data.to_csv(cleaned_file_path, index=False)


# 6. Database Storage 


In [9]:
from sqlalchemy import create_engine

# SQLite URL with a relative path: the database file lives in the
# current working directory
database_url = 'sqlite:///nc_aqi_data.db'
engine = create_engine(database_url)



In [10]:
# Write the frame to the 'air_quality_data' table, replacing any existing table
# of that name and omitting the DataFrame index; the call's return value is
# shown as the cell output
data.to_sql(name='air_quality_data', con=engine, index=False, if_exists='replace')


1587

In [11]:
# Round-trip check: load the stored table back into a DataFrame
retrieved_data = pd.read_sql(sql='air_quality_data', con=engine)

# Print the leading rows of what came back from the database
print(retrieved_data.head(n=5))


      County  Year  Days with AQI  Good Days  Moderate Days  \
0   Alamance  2000            104         37             66   
1  Alexander  2000            211         94             64   
2      Avery  2000            365        211            117   
3   Buncombe  2000            260        129            110   
4   Cabarrus  2000            115         36             78   

   Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
0                                    1               0                    0   
1                                   48               5                    0   
2                                   34               3                    0   
3                                   15               6                    0   
4                                    1               0                    0   

   Hazardous Days  Max AQI  90th Percentile AQI  Median AQI  Days CO  \
0               0      104                   76          56        0   
1 