# Linear Regression Model

In [1]:
# Imports
from pathlib import Path
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = ".\Resources\cleaned_data.csv"
df_nc_aqi = pd.read_csv(file_path)

# Review the DataFrame
df_nc_aqi.head()

Unnamed: 0,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10,Good_to_Moderate_Ratio,Unhealthy_Days_Percentage
0,Buncombe,2000,260.0,129.0,110.0,15.0,6.0,0.0,0.0,179.0,93.0,51.0,0.0,0.0,176.0,84.0,0.0,1.172727,8.076923
1,Buncombe,2001,253.0,141.0,100.0,11.0,1.0,0.0,0.0,171.0,87.0,48.0,0.0,0.0,174.0,78.0,1.0,1.41,4.743083
2,Buncombe,2002,260.0,144.0,83.0,27.0,6.0,0.0,0.0,179.0,108.0,48.0,0.0,0.0,172.0,88.0,0.0,1.73494,12.692308
3,Buncombe,2003,303.0,178.0,119.0,6.0,0.0,0.0,0.0,137.0,72.0,46.0,0.0,0.0,90.0,213.0,0.0,1.495798,1.980198
4,Buncombe,2004,357.0,187.0,166.0,4.0,0.0,0.0,0.0,133.0,74.0,49.0,0.0,0.0,82.0,275.0,0.0,1.126506,1.120448


# Identifying Independent Variables(X) and Target Variable(y)

In [3]:
# creating target and independent variables
# assign double square backets to create df for multiple features
y = df_nc_aqi["Unhealthy_Days_Percentage"]
X = df_nc_aqi[["Days CO", "Days NO2", "Days Ozone", "Days PM2.5", "Days PM10", "90th Percentile AQI"]]

# Spliting Data into Training and Testing

In [4]:
#splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Create Linear Regression Model and Fit

In [5]:
#create and train model
model = LinearRegression()
model.fit(X_train, y_train)

# Testing Predictions

In [6]:
#predictions
y_pred = model.predict(X_test)
predictions_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)

# Calculate Accuracy

In [7]:
#evaluate the model
#aiming for predictive power of .80 R-squared

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 7.319732670847785
R-squared: 0.8726487577269881


# Creating Scope

In [8]:
#creating subset for Raleigh, Charlotte, Wilmigton, Greenville, Fayettville, and Ashville 
selected_counties = ["Buncombe","Mecklenburg","Wake","New Hanover","Cumberland","Pitt"]
df_nc_aqi_subset = df_nc_aqi[df_nc_aqi['County'].isin(selected_counties)]

#make sure it loaded properly 
print(df_nc_aqi_subset.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258 entries, 0 to 257
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   County                               258 non-null    object 
 1   Year                                 258 non-null    int64  
 2   Days with AQI                        258 non-null    float64
 3   Good Days                            258 non-null    float64
 4   Moderate Days                        258 non-null    float64
 5   Unhealthy for Sensitive Groups Days  258 non-null    float64
 6   Unhealthy Days                       258 non-null    float64
 7   Very Unhealthy Days                  258 non-null    float64
 8   Hazardous Days                       258 non-null    float64
 9   Max AQI                              258 non-null    float64
 10  90th Percentile AQI                  258 non-null    float64
 11  Median AQI                      

# Making Predictions

In [9]:
#preparing features and target variable
y_subset = df_nc_aqi_subset["Unhealthy_Days_Percentage"]
X_subset = df_nc_aqi_subset[["Days CO", "Days NO2", "Days Ozone", "Days PM2.5", "Days PM10", "90th Percentile AQI"]]

#making unhealthy days predictions
predictions_subset = model.predict(X_subset)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Year


# Interpreting Results

In [None]:
#creating dataframe with the 2023 unhealthy days predicted precentage
df_predictions_subset = pd.DataFrame({"County": df_nc_aqi_subset["County"], "Predicted_Unhealthy_Days_Percentage": predictions_subset})
print(df_predictions_subset)