# Linear Regression Model

In [1]:
# Imports
from pathlib import Path
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Read the cleaned CSV file from the Resources folder into a Pandas DataFrame.
# The path is built with pathlib instead of a backslash string literal: in a
# normal (non-raw) string "\R" and "\c" are unrecognized escape sequences, and a
# backslash-separated path only resolves on Windows.
file_path = Path("Resources") / "cleaned_data.csv"
df_nc_aqi = pd.read_csv(file_path)

# Review the DataFrame
df_nc_aqi.head()

Unnamed: 0,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10,Good_to_Moderate_Ratio,Unhealthy_Days_Percentage
0,Buncombe,2000,260.0,129.0,110.0,15.0,6.0,0.0,0.0,179.0,93.0,51.0,0.0,0.0,176.0,84.0,0.0,1.172727,8.076923
1,Buncombe,2001,253.0,141.0,100.0,11.0,1.0,0.0,0.0,171.0,87.0,48.0,0.0,0.0,174.0,78.0,1.0,1.41,4.743083
2,Buncombe,2002,260.0,144.0,83.0,27.0,6.0,0.0,0.0,179.0,108.0,48.0,0.0,0.0,172.0,88.0,0.0,1.73494,12.692308
3,Buncombe,2003,303.0,178.0,119.0,6.0,0.0,0.0,0.0,137.0,72.0,46.0,0.0,0.0,90.0,213.0,0.0,1.495798,1.980198
4,Buncombe,2004,357.0,187.0,166.0,4.0,0.0,0.0,0.0,133.0,74.0,49.0,0.0,0.0,82.0,275.0,0.0,1.126506,1.120448


# Identifying Independent Variables (X) and Target Variable (y)

In [3]:
# Feature columns used as independent variables; selecting with a list
# (double square brackets) keeps X a DataFrame rather than a Series
feature_columns = [
    "Days CO",
    "Days NO2",
    "Days Ozone",
    "Days PM2.5",
    "Days PM10",
    "90th Percentile AQI",
]
X = df_nc_aqi[feature_columns]

# Target variable: a single column, selected as a Series
y = df_nc_aqi["Unhealthy_Days_Percentage"]

# Splitting Data into Training and Testing

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create Linear Regression Model and Fit

In [5]:
# Instantiate an ordinary least-squares linear regression
model = LinearRegression()

# Fit it on the training split only
model.fit(X_train, y_train)

# Testing Predictions

In [6]:
# Predict the target for the held-out test rows
y_pred = model.predict(X_test)

# Put predictions next to the actual values, then drop the original row labels
# carried over from y_test so the comparison frame is numbered from 0
predictions_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
predictions_df = predictions_df.reset_index(drop=True)

# Evaluate Model Performance (MSE and R-squared)

In [7]:
# Score the test-set predictions against the actual values.
# Goal for this model: an R-squared of at least 0.80.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 7.300552526641221
R-squared: 0.8729824605138959


# Loading 2023 Data for Charlotte (Mecklenburg County) and Raleigh (Wake County)

In [8]:
# Load the 2023 feature values for Charlotte and Raleigh; these rows are the
# inputs for predicting the 2023 unhealthy-days percentage
file_path2 = "./Resources/nc_2023_data.csv"
df_values_2023 = pd.read_csv(file_path2)

# Preview the loaded frame
df_values_2023.head()

Unnamed: 0,Year,County,90th Percentile AQI,Days CO,Days NO2,Days OZONE,Days PM2.5,Days PM10
0,2023,Mecklenburg,71.0,0,1,218,108,0
1,2023,Wake,63.0,0,2,175,148,0


# Cleaning Dataframe to Fit Model

In [9]:
# Column layout expected downstream: identifiers first, then the model's
# features in the same order they were given at training time
desired_order = ['Year', 'County', 'Days CO', 'Days NO2', 'Days Ozone', 'Days PM2.5', 'Days PM10', '90th Percentile AQI']

# One pass over the frame:
#   1. rename 'Days OZONE' to 'Days Ozone' so it matches the training column name
#   2. keep only the desired columns, in the desired order
#   3. move County and Year into the index so only feature columns remain
df_values_2023 = (
    df_values_2023
    .rename(columns={'Days OZONE': 'Days Ozone'})
    [desired_order]
    .set_index(["County", "Year"])
)

# Preview the reshaped frame
df_values_2023.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10,90th Percentile AQI
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mecklenburg,2023,0,1,218,108,0,71.0
Wake,2023,0,2,175,148,0,63.0


In [10]:
# List the feature columns selected for training, for comparison with the 2023 frame
print(X.columns)

Index(['Days CO', 'Days NO2', 'Days Ozone', 'Days PM2.5', 'Days PM10',
       '90th Percentile AQI'],
      dtype='object')


In [11]:
# The model was fit on the named columns of X, so the 2023 frame must expose the
# same feature columns in the same order. Print them for the reader, and enforce
# the match so a mismatch stops the notebook instead of relying on a visual check.
print(df_values_2023.columns)
assert list(df_values_2023.columns) == list(X.columns), "2023 feature columns do not match the training features"

Index(['Days CO', 'Days NO2', 'Days Ozone', 'Days PM2.5', 'Days PM10',
       '90th Percentile AQI'],
      dtype='object')


# Finding Predicted Values

In [13]:
# Predict the unhealthy-days percentage for each 2023 county row
predictions_2023 = model.predict(df_values_2023)

# Build the prediction frame directly on the (County, Year) index of the input
# rows, so every prediction stays paired with the row it was computed from
# (no need to copy the index levels into columns and set the index again)
df_predict = pd.DataFrame(
    {"Predicted_Unhealthy_Days_Percentage": predictions_2023},
    index=df_values_2023.index,
)

# showing the df
df_predict.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted_Unhealthy_Days_Percentage
County,Year,Unnamed: 2_level_1
Mecklenburg,2023,3.021587
Wake,2023,0.926351


In [14]:
# Attach each prediction to its input row by joining the two frames on their
# shared (County, Year) index
merged_2023_df = df_values_2023.merge(
    df_predict,
    how='outer',
    left_index=True,
    right_index=True,
)
merged_2023_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10,90th Percentile AQI,Predicted_Unhealthy_Days_Percentage
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mecklenburg,2023,0,1,218,108,0,71.0,3.021587
Wake,2023,0,2,175,148,0,63.0,0.926351


# Saving Results

In [15]:
# Save the 2023 feature values together with their predicted percentage.
# The path is built with pathlib instead of a backslash string literal: in a
# normal (non-raw) string "\R" and "\p" are unrecognized escape sequences, and a
# backslash-separated path only resolves on Windows.
csv_file_path = Path("Resources") / "predicted_2023.csv"
merged_2023_df.to_csv(csv_file_path, index=True)

# Report the destination folder taken from the actual path, not a hard-coded name
print(f"CSV file saved to: {csv_file_path.parent}")

CSV file saved to: Resources
