# Data Exploration - Ouray County Parcel Risk
**Author:** Bryce A Young  
**Created:** 2025-01-17 | 
**Modified:** 2025-01-17  

#### Overview
In the notebooks contained under the `data_prep` folder of this repository, we prepared raster, vector, and tabular data for analysis through extensive cleaning. In the `hiz` folder of this repository, the notebooks contain code for finding and summarizing raster values within the HIZ of each home, then appending that to the shapefiles. In this `analysis` folder, we finally get to see what the data is capable of producing.

This notebook explores relationships between variables in order to describe them for my thesis. 

First, let's import the training data. We will import a csv for lighter computation instead of the shapefile. Later, we will join the results to the shapefile. 

In [None]:
import pandas as pd
import numpy as np

# Show up to 100 columns when a DataFrame is displayed, rather than eliding the middle ones
pd.set_option('display.max_columns', 100)

# Load the training table from CSV
train = pd.read_csv('file/path.csv')

# Report the table dimensions, then show the first rows as the cell output
print('training data shape: ', train.shape)
print('training data preview: ')
train.head()

Now let's visualize the relationship of every numeric predictor variable to the target variable.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Predictors 'f6' to 'f20' and the response variable they are plotted against
variables = [f'f{k}' for k in range(6, 21)]
target_variable = 'target'

# Size the grid from the variable list (3 panels per row) so that editing
# `variables` can never index past the last subplot. 15 variables -> 5x3.
ncols = 3
nrows = max(1, -(-len(variables) // ncols))  # ceiling division
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 4 * nrows), squeeze=False)

# Flatten the 2D array of subplots for easier indexing
axes = axes.flatten()

# One scatter plot per variable against the target
for ax, variable in zip(axes, variables):
    sns.scatterplot(x=train[variable], y=train[target_variable], ax=ax)
    ax.set_title(f'Scatter Plot for {variable} vs. {target_variable}')
    ax.set_xlabel(variable)
    ax.set_ylabel(target_variable)

# Hide any panels left over when the variable count is not a multiple of ncols
for ax in axes[len(variables):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

Let's see the distribution of the continuous data with box and whisker plots using seaborn.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Variables 'f6' to 'f20'
variables = [f'f{k}' for k in range(6, 21)]

# Size the grid from the variable list (3 panels per row) so that editing
# `variables` can never index past the last subplot. 15 variables -> 5x3.
ncols = 3
nrows = max(1, -(-len(variables) // ncols))  # ceiling division
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 4 * nrows), squeeze=False)

# Flatten the 2D array of subplots for easier indexing
axes = axes.flatten()

# One horizontal box-and-whisker plot per variable
for ax, variable in zip(axes, variables):
    sns.boxplot(x=train[variable], ax=ax)
    ax.set_xlabel(variable)

# Hide any panels left over when the variable count is not a multiple of ncols
for ax in axes[len(variables):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

Now we can visualize the distribution of categorical variables by printing out their value counts.

In [None]:
# Columns to summarise (placeholder names)
cols = ['1', '2']

for col in cols:
    # normalize=True reports each category as a proportion of the rows (0-1), not a raw count
    shares = train[col].value_counts(normalize=True)
    print(shares)

Let's make a bar chart for each of these.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Example DataFrame: three string-valued columns of seven rows each,
# used only to demonstrate the plotting function below
data = {
    'var1': ['A', 'B', 'A', 'C', 'A', 'B', 'C'],
    'var2': ['X', 'X', 'Y', 'Y', 'Z', 'X', 'Z'],
    'var3': ['D', 'E', 'D', 'F', 'F', 'E', 'D']
}
df = pd.DataFrame(data)

# Plot bar chart subplots
def plot_categorical_distributions(df):
    """Draw one bar chart of value counts per object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose ``object``-dtype columns are plotted, one subplot per
        column, stacked vertically. Columns of other dtypes are ignored.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. When ``df`` has no
        object-dtype columns a notice is printed and nothing is drawn.
    """
    cat_columns = df.select_dtypes(include='object').columns
    num_vars = len(cat_columns)

    # plt.subplots rejects nrows=0, so bail out before creating a figure
    if num_vars == 0:
        print('No object-dtype columns to plot.')
        return

    # squeeze=False always yields a 2D array of axes, so a single column
    # needs no special handling
    _, axes = plt.subplots(nrows=num_vars, ncols=1, figsize=(8, num_vars * 3), squeeze=False)

    # Plot each variable
    for ax, col in zip(axes.ravel(), cat_columns):
        df[col].value_counts().plot(kind='bar', ax=ax, color='#76c7c0', alpha=0.9)
        ax.set_title(f'Distribution of {col}')
        ax.set_ylabel('Count')
        ax.set_xlabel('Categories')

    # Adjust layout
    plt.tight_layout()
    plt.show()

# Call the function on the example DataFrame defined in this cell
plot_categorical_distributions(df)

In [None]:
import seaborn as sns

# A seaborn count plot draws one bar per category of the chosen column
# ('col' is a placeholder column name)
sns.countplot(x='col', data=train)

# OPTIONAL: pass hue to split each bar by a second variable
# sns.countplot(data=train, x='col1', hue='col2')

Now this one is going to be cool. Let's make a heatmap of the predictor variables, showing each one's correlation with every other predictor.

In [None]:
# Pairwise correlations between predictors, drawn as an annotated heat map
feature_names = [f'f{k}' for k in range(1, 21)]  # 'f1' .. 'f20'
X = train[feature_names]
y = train[['target']].copy()  # response kept separately; not used in this cell

# Correlation matrix of the predictors
corr = X.corr()

# Fixed -1..1 colour scale so the diverging palette is centred on zero
plt.figure(figsize=(12, 10))
sns.heatmap(corr, vmin=-1, vmax=1, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Heatmap with Coefficients')
plt.show()

We can verify this correlation a little more scientifically with chi-squared tests, below. We can also create a correlation matrix.

In [None]:
from scipy.stats import chi2_contingency

# Chi-squared test of independence between two categorical columns of the
# training data ('cat1' and 'cat2' are placeholder column names).
# `train` is used rather than `df`: at this point in the notebook `df` is the
# toy example frame, which has no such columns.
contingency_table = pd.crosstab(train['cat1'], train['cat2'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi2: {chi2}, p-value: {p}")

In [None]:
# Correlation matrix and heatmap for continuous data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Toy numeric DataFrame: three integer columns of five rows each
data = {
    'var1': [1, 2, 3, 4, 5],
    'var2': [5, 4, 3, 2, 1],
    'var3': [2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)

# Pairwise correlation of the columns
corr_matrix = df.corr()

# Annotated heat map of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, cmap='coolwarm', cbar=True, annot=True, fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Association heatmap for categorical data
from scipy.stats import chi2_contingency

def cramers_v(x, y):
    """Calculate Cramér's V for two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Category labels of equal length; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (min(rows, cols) - 1)))`` computed from the
        uncorrected Pearson chi-squared statistic. ``nan`` when either
        variable has fewer than two observed categories (including empty
        input), where the statistic is undefined.
    """
    contingency_table = pd.crosstab(x, y)
    smaller_dim = min(contingency_table.shape)

    # A single-category (or empty) variable would make the denominator zero
    if smaller_dim < 2:
        return np.nan

    # correction=False: the default Yates continuity correction on 2x2 tables
    # shrinks chi2, so identical binary variables would score below 1
    chi2 = chi2_contingency(contingency_table, correction=False)[0]
    n = contingency_table.sum().sum()
    return np.sqrt(chi2 / (n * (smaller_dim - 1)))

# Toy categorical DataFrame: three string-valued columns of seven rows each
data = {
    'var1': ['A', 'B', 'A', 'C', 'A', 'B', 'C'],
    'var2': ['X', 'X', 'Y', 'Y', 'Z', 'X', 'Z'],
    'var3': ['D', 'E', 'D', 'F', 'F', 'E', 'D']
}
df = pd.DataFrame(data)

# Cramér's V for every ordered pair of columns, as a square float table
categorical_cols = df.columns
matrix = pd.DataFrame(
    [[cramers_v(df[row], df[col]) for col in categorical_cols]
     for row in categorical_cols],
    index=categorical_cols,
    columns=categorical_cols,
    dtype=float,  # numeric dtype is required by the heatmap
)

# Annotated heat map of the association matrix
plt.figure(figsize=(8, 6))
sns.heatmap(matrix, cmap='coolwarm', cbar=True, annot=True, fmt=".2f")
plt.title("Cramér's V Heatmap (Categorical Variables)")
plt.show()