# Model Building and Experimentation
This notebook is used for building and experimenting with models using parcel data retrieved from the Regrid API.

# Import Required Libraries
Import main, pandas, numpy, json, and scikit-learn.

In [3]:
# Import Required Libraries
from app import main
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load and Explore Data
Use the regrid API connector in main.py to retrieve parcel data and load it into a pandas DataFrame. Perform basic exploration of the data.

In [4]:
# Load and Explore Data
# Retrieve parcel data using the regrid API connector
API = os.getenv('REGRID_API_KEY')
connector = main.RegridConnector(API)
parcel_data = connector.get_construction_data_by_address(address = '2508 Lorentz Pl N')

# Load data into a pandas DataFrame
df = pd.DataFrame(parcel_data)

# Display basic information about the dataset
print("Dataset Info:")
print(df.info())

# Display the first few rows of the dataset
print("First 5 Rows:")
print(df.head())

2025-05-08 01:49:26,743 - RegridConnector - INFO - Searching for parcels with address: 2508 Lorentz Pl N
2025-05-08 01:49:27,256 - RegridConnector - ERROR - Error in API call: 401 - {"status":"error","message":"An access token is required."}
2025-05-08 01:49:27,256 - RegridConnector - ERROR - Error in API call: 401 - {"status":"error","message":"An access token is required."}


HTTPError: 401 Client Error: Unauthorized for url: https://app.regrid.com/api/v1/search?address=2508+Lorentz+Pl+N

# Data Dictionary
The parcel data structures retrieved by the regrid API connector include the following fields:
- `parcel_id`: Unique identifier for each parcel.
- `geometry`: Geospatial data representing the parcel's shape.
- `land_use`: Categorical variable indicating the type of land use (e.g., residential, commercial).
- `area`: Numerical value representing the parcel's area in square meters.
- `value`: Numerical value representing the assessed value of the parcel.

# Data Dictionary
The following are the parcel data structures retrieved by the Regrid API connector:

- **MOCK_PARCEL_POINT**: Contains parcel ID and coordinates (latitude and longitude).
  - `parcel_id`: Unique identifier for the parcel.
  - `lat`: Latitude of the parcel.
  - `lon`: Longitude of the parcel.

- **MOCK_PARCEL_DETAILS**: Contains detailed information about the parcel.
  - `parcel_id`: Unique identifier for the parcel.
  - `zoning`: Zoning code of the parcel.
  - `zoning_description`: Description of the zoning code.
  - `land_use_code`: Land use code of the parcel.
  - `building_area_sq_ft`: Building area in square feet.
  - `land_area_sq_ft`: Land area in square feet.
  - `year_built`: Year the building was constructed.

- **MOCK_PARCEL_BOUNDARY**: Contains the boundary information of the parcel.
  - `type`: Type of the boundary (e.g., Feature).
  - `properties`: Properties of the boundary, including area.
  - `geometry`: Geometric representation of the boundary, including coordinates.

# Preprocess Data
Clean and preprocess the data, including handling missing values, encoding categorical variables, and scaling numerical features.

In [None]:
# Preprocess Data
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Handle missing values
df = df.dropna()

# Encode categorical variables
encoder = OneHotEncoder(sparse=False)
encoded_land_use = encoder.fit_transform(df[['land_use']])
encoded_land_use_df = pd.DataFrame(encoded_land_use, columns=encoder.get_feature_names_out(['land_use']))

# Scale numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['area', 'value']])
scaled_features_df = pd.DataFrame(scaled_features, columns=['scaled_area', 'scaled_value'])

# Combine preprocessed data
df_preprocessed = pd.concat([df.reset_index(drop=True), encoded_land_use_df, scaled_features_df], axis=1)
df_preprocessed = df_preprocessed.drop(['land_use', 'area', 'value'], axis=1)

print("Preprocessed Data:")
print(df_preprocessed.head())

# Build and Train Model
Use scikit-learn to build and train a machine learning model on the preprocessed data.

In [None]:
# Build and Train Model
# Split data into features and target
X = df_preprocessed.drop('parcel_id', axis=1)
y = df['value']  # Target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

print("Model training complete.")

# Evaluate Model
Evaluate the model's performance using appropriate metrics and visualize the results.

In [None]:
# Evaluate Model
# Make predictions
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Visualize results
import matplotlib.pyplot as plt

plt.scatter(y_test, y_pred)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values")
plt.show()