# 1. Loading the Data


In [1]:
# Read the raw AQI CSV into a pandas DataFrame.
import pandas as pd

# Location of the raw dataset.
# NOTE(review): this is an absolute path on one machine — confirm it before running elsewhere.
file_path = '/Users/nataliiabondarenko/Desktop/GitHub/project-4/Resources/nc_aqi_1980-2022.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Print the first five rows as plain text
print(data.head(n=5))


     County  Year  Days with AQI  Good Days  Moderate Days  \
0  Buncombe  2000          260.0      129.0          110.0   
1  Buncombe  2001          253.0      141.0          100.0   
2  Buncombe  2002          260.0      144.0           83.0   
3  Buncombe  2003          303.0      178.0          119.0   
4  Buncombe  2004          357.0      187.0          166.0   

   Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
0                                 15.0             6.0                  0.0   
1                                 11.0             1.0                  0.0   
2                                 27.0             6.0                  0.0   
3                                  6.0             0.0                  0.0   
4                                  4.0             0.0                  0.0   

   Hazardous Days  Max AQI  90th Percentile AQI  Median AQI  Days CO  \
0             0.0    179.0                 93.0        51.0      0.0   
1       

# 2. Data Inspection
Inspect the data to understand its structure, missing values, and potential inconsistencies.

In [2]:
# Basic info about the data: one line per column with its non-null count and dtype.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line after the report.
data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845 entries, 0 to 1844
Data columns (total 17 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   County                               1845 non-null   object 
 1   Year                                 1845 non-null   int64  
 2   Days with AQI                        1845 non-null   float64
 3   Good Days                            1845 non-null   float64
 4   Moderate Days                        1845 non-null   float64
 5   Unhealthy for Sensitive Groups Days  1845 non-null   float64
 6   Unhealthy Days                       1845 non-null   float64
 7   Very Unhealthy Days                  1845 non-null   float64
 8   Hazardous Days                       1845 non-null   float64
 9   Max AQI                              1845 non-null   float64
 10  90th Percentile AQI                  1845 non-null   float64
 11  Median AQI                    

In [3]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_statistics = data.describe()
print(summary_statistics)


              Year  Days with AQI    Good Days  Moderate Days  \
count  1845.000000    1845.000000  1845.000000    1845.000000   
mean   2003.327371     251.702439   169.016260      68.559350   
std      10.868527     101.860410    83.806316      48.120402   
min    1980.000000       4.000000     4.000000       0.000000   
25%    1995.000000     204.000000   103.000000      36.000000   
50%    2003.000000     247.000000   155.000000      64.000000   
75%    2012.000000     360.000000   237.000000      90.000000   
max    2022.000000     366.000000   359.000000     237.000000   

       Unhealthy for Sensitive Groups Days  Unhealthy Days  \
count                          1845.000000     1845.000000   
mean                             11.323577        2.569648   
std                              14.508802        5.495921   
min                               0.000000        0.000000   
25%                               0.000000        0.000000   
50%                               4.000000

In [4]:
# Count the missing entries in every column
missing_per_column = data.isna().sum()
print(missing_per_column)

County                                 0
Year                                   0
Days with AQI                          0
Good Days                              0
Moderate Days                          0
Unhealthy for Sensitive Groups Days    0
Unhealthy Days                         0
Very Unhealthy Days                    0
Hazardous Days                         0
Max AQI                                0
90th Percentile AQI                    0
Median AQI                             0
Days CO                                0
Days NO2                               0
Days Ozone                             0
Days PM2.5                             0
Days PM10                              0
dtype: int64


# 3. Data Cleaning
Based on the inspection, perform the necessary cleaning steps.

In [5]:
# Drop rows that exactly repeat an earlier row; the first occurrence is kept
data = data.drop_duplicates(keep='first')

# Report the resulting (rows, columns) shape
print(f"Data shape after removing duplicates: {data.shape}")



Data shape after removing duplicates: (1596, 17)


# 4. Data Transformation
Transform the data as needed for analysis.

In [6]:
# Cast 'County' (and 'State', when that column is present) to the category dtype
category_dtypes = {'County': 'category'}
if 'State' in data.columns:
    category_dtypes['State'] = 'category'
data = data.astype(category_dtypes)

# Print the per-column dtypes to verify the conversion
print(data.dtypes)

County                                 category
Year                                      int64
Days with AQI                           float64
Good Days                               float64
Moderate Days                           float64
Unhealthy for Sensitive Groups Days     float64
Unhealthy Days                          float64
Very Unhealthy Days                     float64
Hazardous Days                          float64
Max AQI                                 float64
90th Percentile AQI                     float64
Median AQI                              float64
Days CO                                 float64
Days NO2                                float64
Days Ozone                              float64
Days PM2.5                              float64
Days PM10                               float64
dtype: object


In [7]:
# Remove the 'State' column when present; errors='ignore' makes this a no-op otherwise
data = data.drop(columns=['State'], errors='ignore')

# Report the resulting (rows, columns) shape
print(f"Data shape after dropping 'State' column: {data.shape}")

Data shape after dropping 'State' column: (1596, 17)


# 5. Data Export
After cleaning, export the data to a new CSV file, or use it directly for further analysis or machine learning.

In [8]:
# Write the cleaned DataFrame to CSV, leaving out the row index
cleaned_file_path = '/Users/nataliiabondarenko/Desktop/GitHub/project-4/Resources/cleaned_data.csv'
data.to_csv(path_or_buf=cleaned_file_path, index=False)


# 6. Database Storage 


In [8]:
from sqlalchemy import create_engine

# SQLite database file; the relative URL resolves against the current working directory
database_url = 'sqlite:///nc_aqi_data1.db'
engine = create_engine(database_url)



In [9]:
# Options shared by both writes: reuse the engine, overwrite an existing table,
# and leave out the DataFrame index
to_sql_options = {'con': engine, 'if_exists': 'replace', 'index': False}

# Table 'air_quality'
data.to_sql('air_quality', **to_sql_options)

# Table 'predicted_2023'
# NOTE(review): this writes the same DataFrame as 'air_quality' — confirm that is intended
data.to_sql('predicted_2023', **to_sql_options)


1596

In [10]:
# Load both tables back from the database, in the order they were written
retrieved_data, retrieved_data1 = (
    pd.read_sql(table_name, con=engine)
    for table_name in ('air_quality', 'predicted_2023')
)

# Print the first rows of each table, one after the other, as a round-trip check
print(retrieved_data.head(), retrieved_data1.head(), sep='\n')


     County  Year  Days with AQI  Good Days  Moderate Days  \
0  Buncombe  2000          260.0      129.0          110.0   
1  Buncombe  2001          253.0      141.0          100.0   
2  Buncombe  2002          260.0      144.0           83.0   
3  Buncombe  2003          303.0      178.0          119.0   
4  Buncombe  2004          357.0      187.0          166.0   

   Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
0                                 15.0             6.0                  0.0   
1                                 11.0             1.0                  0.0   
2                                 27.0             6.0                  0.0   
3                                  6.0             0.0                  0.0   
4                                  4.0             0.0                  0.0   

   Hazardous Days  Max AQI  90th Percentile AQI  Median AQI  Days CO  \
0             0.0    179.0                 93.0        51.0      0.0   
1       

In [11]:
# Adding new features based on the existing data

# Ratio of Good to Moderate Days.
# 'Moderate Days' can be 0, and dividing by it would produce inf (or NaN for 0/0).
# Zero denominators are masked to NaN first, so every undefined ratio becomes a
# plain missing value instead of an infinity in the feature column.
moderate_days_nonzero = data['Moderate Days'].where(data['Moderate Days'] != 0)
data['Good_to_Moderate_Ratio'] = data['Good Days'] / moderate_days_nonzero

# Percentage of Unhealthy Days (of all types) relative to the days with an AQI value
unhealthy_days_total = (
    data['Unhealthy for Sensitive Groups Days']
    + data['Unhealthy Days']
    + data['Very Unhealthy Days']
    + data['Hazardous Days']
)
data['Unhealthy_Days_Percentage'] = unhealthy_days_total / data['Days with AQI'] * 100

# Display the first few rows with the new features
data[['County', 'Year', 'Good_to_Moderate_Ratio', 'Unhealthy_Days_Percentage']].head()


Unnamed: 0,County,Year,Good_to_Moderate_Ratio,Unhealthy_Days_Percentage
0,Buncombe,2000,1.172727,8.076923
1,Buncombe,2001,1.41,4.743083
2,Buncombe,2002,1.73494,12.692308
3,Buncombe,2003,1.495798,1.980198
4,Buncombe,2004,1.126506,1.120448


In [18]:
# Write the DataFrame, including the engineered feature columns, to CSV without the row index
# NOTE(review): this is the same path as the earlier cleaned-data export, so that file
# is overwritten — confirm this is intended
new_features_file_path = '/Users/nataliiabondarenko/Desktop/GitHub/project-4/Resources/cleaned_data.csv'
data.to_csv(path_or_buf=new_features_file_path, index=False)