# Alejandro Marchini Assignment 3

In [361]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role
from time import gmtime, strftime

### Initialize SageMaker Feature Store Session

In [362]:
# S3 key prefix under which this notebook's feature group data is stored
prefix = "sagemaker-featurestore-neighborhood"

# Resolve the region once and build a single boto3 session pinned to it
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Control-plane client first, then the Feature Store runtime client
sagemaker_client, featurestore_runtime = (
    boto_session.client(service_name=service, region_name=region)
    for service in ("sagemaker", "sagemaker-featurestore-runtime")
)

# SageMaker SDK session wired to the two clients created above
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

# Execution role and the session's default bucket, both used when the feature group is created
role = get_execution_role()
default_s3_bucket_name = feature_store_session.default_bucket()

### Load Data

In [363]:
# Read the raw housing records from disk and preview the first rows
housing_csv_path = '/home/sagemaker-user/aai-540-homework/homework-3-1/housing.csv'
housing_data = pd.read_csv(housing_csv_path)
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [364]:
# Read the raw Google Maps location records from disk and preview the first rows
gmaps_csv_path = '/home/sagemaker-user/aai-540-homework/homework-3-1/housing_gmaps_data_raw.csv'
gmaps_data = pd.read_csv(gmaps_csv_path)
gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


### Data Checks

While deriving the required features I initially ran into problems caused by missing data, so the cell below counts the missing values in the columns used later.

In [365]:
# Columns that the later feature-engineering cells read from each frame
housing_check_columns = [
    'longitude', 'latitude', 'total_bedrooms', 'households',
    'median_house_value', 'housing_median_age', 'ocean_proximity',
]
gmaps_check_columns = ['longitude', 'latitude', 'neighborhood-political', 'postal_code']

# Per-column null counts for the housing frame
housing_missing_counts = housing_data[housing_check_columns].isnull().sum()
print("Missing values in housing data:")
print(housing_missing_counts)

# Per-column null counts for the Google Maps frame
gmaps_missing_counts = gmaps_data[gmaps_check_columns].isnull().sum()
print("\nMissing values in gmaps data:")
print(gmaps_missing_counts)

# Distinct categories that will be one-hot encoded later
ocean_proximity_levels = housing_data['ocean_proximity'].unique()
print("\nUnique ocean_proximity values:")
print(ocean_proximity_levels)

Missing values in housing data:
longitude               0
latitude                0
total_bedrooms        207
households              0
median_house_value      0
housing_median_age      0
ocean_proximity         0
dtype: int64

Missing values in gmaps data:
longitude                    0
latitude                     0
neighborhood-political    8413
postal_code                180
dtype: int64

Unique ocean_proximity values:
['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']


### Merge Datasets

Merge the housing data with the location data on the longitude and latitude coordinates. Every housing record appears to have a matching set of coordinates in the Google Maps data, but some Google Maps records have no neighborhood value, so we only keep records that have a valid neighborhood.

In [366]:
# Keep only the Google Maps rows that carry a neighborhood label
has_neighborhood = gmaps_data['neighborhood-political'].notna()
gmaps_with_neighborhoods = gmaps_data[has_neighborhood].copy()

# Inner join on the coordinate pair, so housing rows without a labelled match are dropped.
# NOTE(review): if the gmaps frame has repeated (longitude, latitude) pairs this join
# multiplies housing rows — uniqueness is not checked here; confirm against the data.
location_columns = ['longitude', 'latitude', 'neighborhood-political', 'postal_code']
merged_data = pd.merge(
    housing_data,
    gmaps_with_neighborhoods[location_columns],
    how='inner',
    on=['longitude', 'latitude'],
)

# Expose the neighborhood under a shorter name (the original column is kept as well)
merged_data['neighborhood'] = merged_data['neighborhood-political']

preview_columns = [
    'longitude', 'latitude', 'neighborhood', 'postal_code',
    'median_house_value', 'ocean_proximity',
]
merged_data[preview_columns].head(20)

Unnamed: 0,longitude,latitude,neighborhood,postal_code,median_house_value,ocean_proximity
0,-122.22,37.86,Merriewood,94611.0,358500.0,NEAR BAY
1,-122.24,37.85,Upper Rockridge,94618.0,352100.0,NEAR BAY
2,-122.25,37.85,Rockridge,94618.0,341300.0,NEAR BAY
3,-122.25,37.85,Rockridge,94618.0,342200.0,NEAR BAY
4,-122.25,37.85,Rockridge,94618.0,269700.0,NEAR BAY
5,-122.25,37.84,Rockridge,94618.0,299200.0,NEAR BAY
6,-122.25,37.84,Rockridge,94618.0,241400.0,NEAR BAY
7,-122.26,37.84,Shafter,94618.0,226700.0,NEAR BAY
8,-122.25,37.84,Rockridge,94618.0,261100.0,NEAR BAY
9,-122.26,37.85,Upper Telegraph,94609.0,281500.0,NEAR BAY


In [367]:
# Sanity check: the inner merge above should leave no row without a neighborhood
nan_neighborhood_count = merged_data['neighborhood'].isna().sum()
print(f"\nRecords with NaN neighborhoods: {nan_neighborhood_count}")


Records with NaN neighborhoods: 0


### One-Hot Encode Ocean Proximity

Once the records without a valid neighborhood are filtered out, no record has ISLAND as its `ocean_proximity`. An `island` column is therefore added explicitly so that the feature is not left out of the feature group.

In [368]:
# Map every known ocean_proximity level to its feature column name
# (lower case, no spaces). The list is fixed up front so each level always
# gets a column, including levels such as ISLAND that no longer occur after
# the neighborhood filter, and so a real ISLAND row would be encoded as True
# rather than being overwritten by a hard-coded False.
ocean_proximity_columns = {
    "<1H OCEAN": "lt_1h_ocean",
    "INLAND": "inland",
    "NEAR BAY": "near_bay",
    "NEAR OCEAN": "near_ocean",
    "ISLAND": "island",
}

# A categorical dtype makes get_dummies emit one column per declared category,
# whether or not that category is present in the data.
ocean_proximity_dtype = pd.CategoricalDtype(categories=list(ocean_proximity_columns))
encoded_ocean = pd.get_dummies(
    merged_data['ocean_proximity'].astype(ocean_proximity_dtype),
    dtype=bool,
)
encoded_ocean.columns = [ocean_proximity_columns[level] for level in encoded_ocean.columns]

# Drop any encoded columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns.
merged_data = pd.concat(
    [merged_data.drop(columns=list(encoded_ocean.columns), errors='ignore'), encoded_ocean],
    axis=1,
)

merged_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,neighborhood-political,postal_code,neighborhood,lt_1h_ocean,inland,near_bay,near_ocean,island
0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,Merriewood,94611.0,Merriewood,False,False,True,False,False
1,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,Upper Rockridge,94618.0,Upper Rockridge,False,False,True,False,False
2,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,Rockridge,94618.0,Rockridge,False,False,True,False,False
3,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,Rockridge,94618.0,Rockridge,False,False,True,False,False
4,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,Rockridge,94618.0,Rockridge,False,False,True,False,False


### Bedrooms per household

In [369]:
# Bedrooms-to-households ratio per record, computed from the un-imputed bedroom counts
merged_data['bedrooms_per_household_raw'] = merged_data['total_bedrooms'].div(merged_data['households'])

# Mean of that ratio within each postal code, used below to fill in missing bedroom counts
postal_code_bedrooms_avg = (
    merged_data[['postal_code', 'bedrooms_per_household_raw']]
    .groupby('postal_code')['bedrooms_per_household_raw']
    .mean()
)

# Impute missing total_bedrooms using postal code average
def impute_bedrooms(row, ratio_by_postal_code=None, fallback_ratio=None):
    """Return the row's total_bedrooms, estimating it when the value is missing.

    A missing value is estimated as ``ratio * households``, where the ratio is
    the postal-code average when one exists and is not NaN, and the overall
    fallback ratio otherwise.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'total_bedrooms', 'postal_code' and 'households'.
    ratio_by_postal_code : pandas.Series, optional
        Bedrooms-per-household ratio indexed by postal code. Defaults to the
        notebook-level ``postal_code_bedrooms_avg``.
    fallback_ratio : float, optional
        Ratio used when no usable postal-code ratio exists. Defaults to the
        mean of ``merged_data['bedrooms_per_household_raw']``.

    Returns
    -------
    The original total_bedrooms when present, otherwise the estimate.
    """
    # Nothing to impute when the bedroom count is already known
    if pd.notna(row['total_bedrooms']):
        return row['total_bedrooms']

    # Fall back to the notebook globals only when the caller supplied nothing,
    # so existing calls such as DataFrame.apply(impute_bedrooms, axis=1) still work.
    if ratio_by_postal_code is None:
        ratio_by_postal_code = postal_code_bedrooms_avg

    postal_code = row['postal_code']
    if pd.notna(postal_code) and postal_code in ratio_by_postal_code.index:
        avg_ratio = ratio_by_postal_code[postal_code]
        # A postal code whose records all lack bedrooms has a NaN average; skip it
        if pd.notna(avg_ratio):
            return avg_ratio * row['households']

    # Fallback to overall average
    if fallback_ratio is None:
        fallback_ratio = merged_data['bedrooms_per_household_raw'].mean()
    return fallback_ratio * row['households']

# Fill in the missing bedroom counts row by row
imputed_bedrooms = merged_data.apply(impute_bedrooms, axis=1)
merged_data['total_bedrooms_imputed'] = imputed_bedrooms

# Final ratio, now based on the imputed bedroom counts
merged_data['bedrooms_per_household'] = merged_data['total_bedrooms_imputed'].div(merged_data['households'])

# Report how many values were missing before and after the imputation
missing_before = merged_data['total_bedrooms'].isna().sum()
missing_after = merged_data['bedrooms_per_household'].isna().sum()
print(f"\nMissing values before imputation: {missing_before}")
print(f"Missing values after imputation: {missing_after}")
merged_data.head()


Missing values before imputation: 89
Missing values after imputation: 0


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,postal_code,neighborhood,lt_1h_ocean,inland,near_bay,near_ocean,island,bedrooms_per_household_raw,total_bedrooms_imputed,bedrooms_per_household
0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,94611.0,Merriewood,False,False,True,False,False,0.97188,1106.0,0.97188
1,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,94618.0,Upper Rockridge,False,False,True,False,False,1.073446,190.0,1.073446
2,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,...,94618.0,Rockridge,False,False,True,False,False,1.073059,235.0,1.073059
3,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,...,94618.0,Rockridge,False,False,True,False,False,1.081081,280.0,1.081081
4,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,...,94618.0,Rockridge,False,False,True,False,False,1.103627,213.0,1.103627


### Aggregate by Neighborhood

In [370]:
# Ocean proximity flags use max: a neighborhood is flagged if any of its records is flagged.
ocean_flag_columns = ['lt_1h_ocean', 'inland', 'island', 'near_bay', 'near_ocean']
# The numeric columns are averaged across the records of each neighborhood.
averaged_columns = ['median_house_value', 'housing_median_age', 'households', 'bedrooms_per_household']

aggregations = {
    **{column: 'max' for column in ocean_flag_columns},
    **{column: 'mean' for column in averaged_columns},
}

# Final column order of the feature frame
feature_columns = [
    'neighborhood',
    *ocean_flag_columns,
    'median_house_value',
    'median_house_age',
    'total_households',
    'bedrooms_per_household',
]

neighborhood_features = (
    merged_data.groupby('neighborhood')
    .agg(aggregations)
    .reset_index()
    .assign(
        # Averaged household count, rounded up to an integer.
        # NOTE(review): despite the name, this comes from the per-neighborhood MEAN
        # of households, not the sum — confirm this is the intended definition.
        total_households=lambda frame: np.ceil(frame['households']).astype(int),
        # Cap median house value at 500,000
        median_house_value=lambda frame: frame['median_house_value'].clip(upper=500000),
        # Floor the averaged house age to the start of its 10-year bucket
        median_house_age=lambda frame: (frame['housing_median_age'] // 10).astype(int) * 10,
    )
)[feature_columns]

neighborhood_features.head(10)


Unnamed: 0,neighborhood,lt_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household
0,28 Palms,True,False,False,False,False,222200.0,20,923,1.017335
1,Acorn Industrial,False,False,False,True,False,81300.0,50,147,1.659864
2,Adams Hill,True,False,False,False,False,250733.333333,30,494,1.034649
3,Agua Mansa Industrial Corridor,False,True,False,False,False,112300.0,10,516,1.102713
4,Al Tahoe,False,True,False,False,False,109180.0,20,249,1.641739
5,Alamitos Beach,False,False,False,False,True,188194.117647,30,664,1.117409
6,Alessandro,False,True,False,False,False,68075.0,30,409,1.055551
7,Alessandro Heights,False,True,False,False,False,315200.0,0,1107,1.066847
8,Alhambra Triangle,False,True,False,False,False,88850.0,40,382,1.114779
9,Alice,False,False,False,True,False,404300.0,30,483,1.020704


### Add Event Time

In [371]:
# Add event time: the current Unix timestamp in whole seconds, shared by every record
current_time_sec = int(round(time.time()))

# Assign a scalar rather than a freshly built Series. A Series is aligned on the
# index, so it would leave NaN wherever the frame's index is not exactly
# 0..n-1; a scalar is broadcast to every row regardless of the index.
# float() keeps the column dtype float64, as before.
neighborhood_features['event_time'] = float(current_time_sec)

neighborhood_features.head()

Unnamed: 0,neighborhood,lt_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,event_time
0,28 Palms,True,False,False,False,False,222200.0,20,923,1.017335,1768859000.0
1,Acorn Industrial,False,False,False,True,False,81300.0,50,147,1.659864,1768859000.0
2,Adams Hill,True,False,False,False,False,250733.333333,30,494,1.034649,1768859000.0
3,Agua Mansa Industrial Corridor,False,True,False,False,False,112300.0,10,516,1.102713,1768859000.0
4,Al Tahoe,False,True,False,False,False,109180.0,20,249,1.641739,1768859000.0


### Prep Data for Feature Store

In [372]:
def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas "string" dtype, in place.

    Each value is first turned into its ``str`` form, so non-string objects are
    stringified too. Columns of any other dtype are left untouched, and nothing
    is returned.
    """
    object_labels = [
        label for label in data_frame.columns if data_frame.dtypes[label] == "object"
    ]
    for label in object_labels:
        as_text = data_frame[label].astype("str")
        data_frame[label] = as_text.astype("string")

# Object columns (the neighborhood name) become pandas "string" dtype
cast_object_to_string(neighborhood_features)

# Boolean flags are re-encoded as 0/1 integers, one column at a time
bool_columns = neighborhood_features.select_dtypes(include=['bool']).columns
for flag_column in bool_columns:
    neighborhood_features[flag_column] = neighborhood_features[flag_column].astype('int64')

neighborhood_features.head()

Unnamed: 0,neighborhood,lt_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,event_time
0,28 Palms,1,0,0,0,0,222200.0,20,923,1.017335,1768859000.0
1,Acorn Industrial,0,0,0,1,0,81300.0,50,147,1.659864,1768859000.0
2,Adams Hill,1,0,0,0,0,250733.333333,30,494,1.034649,1768859000.0
3,Agua Mansa Industrial Corridor,0,1,0,0,0,112300.0,10,516,1.102713,1768859000.0
4,Al Tahoe,0,1,0,0,0,109180.0,20,249,1.641739,1768859000.0


### Create Feature Group

In [373]:
# Build the feature group name from a UTC day-hour-minute-second suffix
name_suffix = strftime("%d-%H-%M-%S", gmtime())
neighborhood_feature_group_name = "neighborhood-feature-group-" + name_suffix
print(f"Feature group name: {neighborhood_feature_group_name}")

# Local handle for the feature group, bound to the Feature Store session
neighborhood_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=neighborhood_feature_group_name,
)

# Derive the feature definitions from the prepared data frame
neighborhood_feature_group.load_feature_definitions(data_frame=neighborhood_features)

print("Feature definitions loaded successfully")

Feature group name: neighborhood-feature-group-19-21-51-02
Feature definitions loaded successfully


In [374]:
def wait_for_feature_group_creation_complete(
    feature_group, poll_interval_seconds=5, timeout_seconds=None
):
    """Block until ``feature_group`` leaves the "Creating" status.

    Parameters
    ----------
    feature_group
        Object exposing ``describe()`` (returning a mapping with a
        "FeatureGroupStatus" entry) and a ``name`` attribute.
    poll_interval_seconds : float, optional
        Pause between status checks. Defaults to 5.
    timeout_seconds : float, optional
        Maximum time to keep polling. ``None`` (the default) polls
        indefinitely.

    Raises
    ------
    TimeoutError
        If the status is still "Creating" once ``timeout_seconds`` has elapsed.
    RuntimeError
        If polling ends with any status other than "Created".
    """
    started_at = time.monotonic()
    status = feature_group.describe().get("FeatureGroupStatus")
    print(f"Initial status: {status}")
    while status == "Creating":
        # Give up before sleeping again once the optional deadline has passed
        if timeout_seconds is not None and time.monotonic() - started_at >= timeout_seconds:
            raise TimeoutError(
                f"Timed out after {timeout_seconds} seconds waiting for feature group "
                f"{feature_group.name} to be created."
            )
        print("Waiting for Feature Group Creation...")
        time.sleep(poll_interval_seconds)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}. Status: {status}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

# Columns that act as the record identifier and as the event timestamp
record_identifier_feature_name = "neighborhood"
event_time_feature_name = "event_time"

# S3 location handed to the feature group: default bucket plus this notebook's prefix
feature_group_s3_uri = f"s3://{default_s3_bucket_name}/{prefix}"

# Create the feature group in SageMaker with the online store enabled
neighborhood_feature_group.create(
    role_arn=role,
    s3_uri=feature_group_s3_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    enable_online_store=True,
)

# Block until creation has finished (raises if it ends in any status other than "Created")
wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)

Initial status: Creating
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
FeatureGroup neighborhood-feature-group-19-21-51-02 successfully created.


### Ingest Data into Feature Store

In [375]:
# Push every neighborhood row into the feature group and wait for the ingestion to finish
record_count = len(neighborhood_features)
print(f"Ingesting {record_count} records into feature store...")

ingest_options = {
    "data_frame": neighborhood_features,
    "max_workers": 3,
    "wait": True,
}
neighborhood_feature_group.ingest(**ingest_options)

print("Data ingestion completed successfully!")

Ingesting 1306 records into feature store...
Data ingestion completed successfully!


### Query Neighborhoods

Query the feature values for Brooktree, Fisherman's Wharf, and Los Osos neighborhoods

In [376]:
neighborhoods_to_query = ['Brooktree', "Fisherman's Wharf", 'Los Osos']

# Look up each neighborhood by its record identifier and print its features
for neighborhood in neighborhoods_to_query:
    print(f"NEIGHBORHOOD: {neighborhood}")

    record = featurestore_runtime.get_record(
        FeatureGroupName=neighborhood_feature_group_name,
        RecordIdentifierValueAsString=neighborhood,
    )

    # GetRecord returns a response without a "Record" entry when the identifier
    # is not found, so report that instead of failing with a KeyError.
    record_features = record.get('Record')
    if not record_features:
        print("\nNo record found for this neighborhood.")
        print()
        continue

    print("\nFeature Values:")

    # Feature name -> value (as string) for the current neighborhood
    features_dict = {}
    for feature in record_features:
        feature_name = feature['FeatureName']
        feature_value = feature['ValueAsString']
        features_dict[feature_name] = feature_value
        print(f"  {feature_name:30s}: {feature_value}")
    print()
    

NEIGHBORHOOD: Brooktree

Feature Values:
  neighborhood                  : Brooktree
  lt_1h_ocean                   : 1
  inland                        : 0
  island                        : 0
  near_bay                      : 0
  near_ocean                    : 0
  median_house_value            : 257400.0
  median_house_age              : 0
  total_households              : 1438
  bedrooms_per_household        : 1.0747777761942723
  event_time                    : 1768859462.0

NEIGHBORHOOD: Fisherman's Wharf

Feature Values:
  neighborhood                  : Fisherman's Wharf
  lt_1h_ocean                   : 0
  inland                        : 0
  island                        : 0
  near_bay                      : 1
  near_ocean                    : 0
  median_house_value            : 500000.0
  median_house_age              : 50
  total_households              : 250
  bedrooms_per_household        : 1.268
  event_time                    : 1768859462.0

NEIGHBORHOOD: Los Osos

Featu