# Regression and Analysis

In [None]:
# Load variables from the previous notebook
# `%store -r` restores the named variables from IPython's persistent store.
# Of the three names restored here, only den_ndvi_cdc_gdf is referenced by
# the cells in this notebook.
%store -r den_ndvi_cdc_gdf den_tract_cdc_gdf all_den_ndvi_stats_df 

In [None]:
# Import the libraries used throughout this notebook

# Standard library: file paths, pattern matching, timing, warnings, and zip archives
import os # Reproducible file paths
from glob import glob # Find files by pattern
import pathlib # Find the home folder
import time # formatting time
import warnings # Filter warning messages
import zipfile # Work with zip files
from io import BytesIO # Stream binary (zip) files

# Numerical arrays, plotting, and HTTP requests
import numpy as np # adjust images 
import matplotlib.pyplot as plt # Overlay pandas and xarry plots, Overlay raster and vector data
import requests # Request data over HTTP

# Work with tabular, vector, and raster data
import cartopy.crs as ccrs # CRSs (Coordinate Reference Systems)
import geopandas as gpd # work with vector data
import geoviews as gv # holoviews extension for data visualization
import hvplot.pandas # Interactive tabular and vector data
import hvplot.xarray # Interactive raster
import pandas as pd # Group and aggregate
import pystac_client # Modify returns from API
import shapely # Perform geometric operations on spatial data
import xarray as xr # Adjust images
import rioxarray as rxr # Work with geospatial raster data
from rioxarray.merge import merge_arrays # Merge rasters

# Processing and regression related
from scipy.ndimage import convolve # Image and signal processing
from sklearn.model_selection import KFold # Cross validation
from scipy.ndimage import label # Labels connected features in an array
from sklearn.linear_model import LinearRegression # Work with linear regression models
from sklearn.model_selection import train_test_split # Split data into subsets - evaluate model
from tqdm.notebook import tqdm # Visualize progress of iterative operations

# import to be able to save plots
import holoviews as hv # be able to save hvplots

# Silence every warning for the rest of the session (mostly third-party noise)
warnings.simplefilter('ignore')

# Configure GDAL's HTTP retry settings so that a momentary network
# disruption does not abort a read
os.environ.update({
    "GDAL_HTTP_MAX_RETRY": "5",
    "GDAL_HTTP_RETRY_DELAY": "1",
})

In [None]:
# Variable selection and transformation
# Columns carried into the modelling table
model_columns = [
    'frac_veg', 'depression', 'all_mean_patch_size', 'all_edge_density',
    'geometry',
]

# Work on a copy so the restored GeoDataFrame is left untouched, keep only
# the modelling columns, and discard every row that contains a NaN
den_model_df = den_ndvi_cdc_gdf.copy()[model_columns].dropna()

# Add the natural log of depression as a new column; the log transform is
# meant to reduce skew in the outcome before fitting a linear model
den_model_df['log_depression'] = np.log(den_model_df['depression'])

# Columns to compare pairwise when checking which variables need transforming
scatter_columns = ['all_mean_patch_size', 'all_edge_density', 'log_depression']

# Scatter matrix (pair plot) of the selected columns
den_scatter_matrix = hvplot.scatter_matrix(den_model_df[scatter_columns])

# Write the plot to HTML so it can be displayed online
hv.save(den_scatter_matrix, 'den_scatter_matrix.html')

# Show the plot as the cell output
den_scatter_matrix


# Fit and Predict

In [None]:
# Select predictor and outcome variables
# Predictors (independent variables): edge density and mean patch size
X = den_model_df[['all_edge_density', 'all_mean_patch_size']]
# Outcome (dependent variable): log-transformed depression
y = den_model_df[['log_depression']]

# Split into training and testing datasets
# 33% of the rows are held out for testing; the fixed random_state makes the
# random split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Fit a linear regression to the training data
reg = LinearRegression()
reg.fit(X_train, y_train)

# Build the evaluation table for the test rows.
# .assign() returns a new DataFrame, so no columns are written into the slice
# that train_test_split returned (which would trigger SettingWithCopyWarning).
# Because y is a one-column DataFrame, reg.predict() returns an (n, 1) array;
# .ravel() flattens it so it maps onto a single column.
y_test = y_test.assign(
    # Predicted values, transformed from the log scale to the original scale
    pred_depression=np.exp(reg.predict(X_test)).ravel(),
    # Measured values, transformed from the log scale to the original scale
    depression=lambda df: np.exp(df['log_depression']),
)

# Plot measured vs. predicted depression prevalence with a 1-to-1 line

# Largest measured value in the test data; used as the upper limit of both axes
y_max = y_test['depression'].max()

# Keep the plot in a variable so it can be both saved and displayed
den_measured_v_predicted_depression = (
    y_test.hvplot.scatter(
        # Measured values on the x axis, predicted values on the y axis
        x='depression', y='pred_depression',
        xlabel='Measured Depression Prevalence',
        ylabel='Predicted Depression Prevalence',
        title='Linear Regression Performance - Testing Data'
    )
    .opts(
        # Scale both axes the same so the 1-to-1 line sits at 45 degrees
        aspect='equal',
        # Both axes span 0 to the largest measured value
        xlim=(0, y_max), ylim=(0, y_max),
        # Set size of the plot
        height=500, width=600)
    # Overlay the 1-to-1 reference line; points on it are perfect predictions
    * hv.Slope(slope=1, y_intercept=0).opts(color='black')
)

# Save the plot as html to be able to display online
hv.save(den_measured_v_predicted_depression, 'den_measured_v_predicted_depression.html')

# Display the plot
den_measured_v_predicted_depression

# Compute Error

In [None]:
def plot_chloropleth(gdf, **opts):
    """Draw the polygons of a GeoDataFrame as a chloropleth.

    This notebook never defines or imports ``plot_chloropleth`` and the
    ``%store -r`` cell does not restore it, so on a fresh kernel the call
    below would raise ``NameError``. It is defined here so the cell runs
    under Restart & Run All.
    NOTE(review): if a shared version of this helper exists in another
    notebook or module, confirm this implementation matches it.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Polygons to draw. Assumes a CRS is set so it can be reprojected
        -- TODO confirm for the restored data.
    **opts
        Plot options forwarded to ``.opts()``, e.g. ``color`` (name of the
        column used for colouring) and ``cmap``.

    Returns
    -------
    geoviews.Polygons
        The polygons reprojected to Mercator, with both axes hidden and a
        colour bar shown.
    """
    return gv.Polygons(
        gdf.to_crs(ccrs.Mercator()),
        crs=ccrs.Mercator()
    ).opts(xaxis=None, yaxis=None, colorbar=True, **opts)


# Compute model error for all census tracts
# Predict for every row, then apply the exponential function to return from
# the log scale to the original scale. reg.predict() gives an (n, 1) array
# when the model was fitted on a one-column DataFrame; .ravel() flattens it
# so it maps onto a single column.
den_model_df['pred_depression'] = np.exp(reg.predict(X)).ravel()
# Model error per row: predicted minus measured (positive = over-prediction)
den_model_df['err_depression'] = (
    den_model_df['pred_depression'] - den_model_df['depression']
)

# Keep the plot in a variable so it can be both saved and displayed
den_model_error_chloropleth = (
    # Color the chloropleth by model error
    plot_chloropleth(den_model_df, color='err_depression', cmap='RdBu')
    # Fix the colour range symmetrically around zero
    .redim.range(err_depression=(-.3, .3))
    # Customize plot
    .opts(
        # Add a title
        title= 'City of Denver - Model Errors for Predicted Depression Prevalence',
        # Add a label for color bar
        clabel= 'Model Error',
        # Adjust size of plot
        frame_width=600,
        # Equal aspect ratio helps preserve the true shapes of census tracts
        aspect='equal')
)

# Save the plot as html to be able to display online
hv.save(den_model_error_chloropleth, 'den_model_error_chloropleth.html')

# Display the plot
den_model_error_chloropleth

# Describe and Interpret Image

In [None]:
# Store variables to use in next notebook
%store 