In [15]:
!pip install sagemaker pandas boto3 awswrangler --quiet


[0m

In [16]:
# AWS Imports
import boto3
from botocore.client import ClientError
import sagemaker
from pyathena import connect
import awswrangler as wr
import pandas as pd
from sagemaker.feature_store.feature_group import FeatureGroup
from time import gmtime, strftime

# Data Transformation Imports
from io import StringIO

# Misc Imports
from IPython.display import display, HTML

In [17]:
# --- SageMaker session and execution context ---------------------------------
sess = sagemaker.Session()               # session object used for SageMaker calls
bucket = sess.default_bucket()           # default S3 bucket of that session
role = sagemaker.get_execution_role()    # IAM role of the current environment

# --- Caller region and account -----------------------------------------------
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# --- Service clients ---------------------------------------------------------
sm = boto3.Session().client(service_name="sagemaker", region_name=region)
s3 = boto3.client("s3")
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")

# --- S3 layout of the project data -------------------------------------------
FILE_NAME = "data.csv"
DATA_SOURCE = "db_source"
DATA_FOLDER = f"s3://{bucket}/aai-540-group-3-final-project/data/"
FILE_LOCATION = f"{DATA_FOLDER}{FILE_NAME}"    # full URI of the CSV file
DATA_PATH = f"{DATA_FOLDER}{DATA_SOURCE}/"     # folder URI passed to the feature group as s3_uri



In [18]:
# Feature group schema settings.

feature_group_name = "employee-attrition-feature-store"
# Record identifier: must be the underscore form, because the DataFrame column
# is renamed to "Employee_ID" before ingestion and that is the name passed to
# create(). Feature Store feature names cannot contain spaces.
record_identifier_name = "Employee_ID"
event_time_feature_name = "EventTime"  # Required timestamp field

# Handle to the feature group, bound to the session created above.
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sess)


In [19]:
# Key of the raw employee CSV inside the session bucket.
file_key = "aai-540-group-3-final-project/data/db_source/data.csv"

# Fetch the object from S3 and hand its "Body" straight to pandas.
response = s3.get_object(Bucket=bucket, Key=file_key)
data = pd.read_csv(response["Body"])

# Render the loaded frame in the notebook.
display(data)

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74493,16243,56,Female,42,Healthcare,7830,Poor,Medium,Average,0,...,0,Senior,Medium,60,No,No,No,Poor,Medium,Stayed
74494,47175,30,Female,15,Education,3856,Good,Medium,Average,2,...,0,Entry,Medium,20,No,No,No,Good,Medium,Left
74495,12409,52,Male,5,Education,5654,Good,Very High,Below Average,0,...,4,Mid,Small,7,No,No,No,Good,High,Left
74496,9554,18,Male,4,Education,5276,Fair,High,Average,0,...,3,Mid,Large,5,No,No,No,Poor,High,Stayed


In [20]:
# Normalize feature names: every space, hyphen and slash becomes an underscore.
data.columns = data.columns.str.replace(r"[ /-]", "_", regex=True)

# Print the resulting column index so the new names can be checked.
print(data.columns)


Index(['Employee_ID', 'Age', 'Gender', 'Years_at_Company', 'Job_Role',
       'Monthly_Income', 'Work_Life_Balance', 'Job_Satisfaction',
       'Performance_Rating', 'Number_of_Promotions', 'Overtime',
       'Distance_from_Home', 'Education_Level', 'Marital_Status',
       'Number_of_Dependents', 'Job_Level', 'Company_Size', 'Company_Tenure',
       'Remote_Work', 'Leadership_Opportunities', 'Innovation_Opportunities',
       'Company_Reputation', 'Employee_Recognition', 'Attrition'],
      dtype='object')


In [21]:
import time

from sagemaker.feature_store.feature_group import FeatureGroup
import pandas as pd
import sagemaker

# Initialize SageMaker session
sagemaker_session = sagemaker.Session()

# Feature Group Name
feature_group_name = "employee-attrition-feature-store"

# Create the feature group only when it does not exist yet, so the cell can be
# re-run safely.
try:
    existing_feature_group = sagemaker_session.sagemaker_client.describe_feature_group(
        FeatureGroupName=feature_group_name
    )
    print(f"✅ Feature Group '{feature_group_name}' already exists. Skipping creation.")
except sagemaker_session.sagemaker_client.exceptions.ResourceNotFoundException:
    print(f"🔄 Feature Group '{feature_group_name}' not found. Creating a new one...")

    # Stamp every record with the current time. The value is taken explicitly
    # in UTC because the format string appends a literal "Z" (UTC designator);
    # pd.to_datetime("now") yields local time on recent pandas versions.
    data["EventTime"] = pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%dT%H:%M:%SZ")

    # Verify EventTime is present
    print(data.columns)  # Check if 'EventTime' appears in the list

    # Make sure the identifier column and all other columns use underscores.
    data = data.rename(columns={"Employee ID": "Employee_ID"})
    data.columns = data.columns.str.replace(" ", "_")

    # Cast the identifier to string BEFORE the feature definitions are inferred,
    # so the group is created with Employee_ID as a String feature (the type the
    # later ingestion cell uses) rather than as an integer.
    data["Employee_ID"] = data["Employee_ID"].astype(str)

    # Verify DataFrame structure before loading to Feature Store
    print(data.head())

    # Initialize Feature Group
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)

    # Derive the feature definitions from the prepared DataFrame
    feature_group.load_feature_definitions(data_frame=data)

    # Create Feature Group with corrected names
    feature_group.create(
        record_identifier_name="Employee_ID",  # Ensure matches renamed column
        event_time_feature_name="EventTime",  # Ensure this column exists
        role_arn=role,
        description="Feature store for employee attrition prediction",
        s3_uri=DATA_PATH  # ✅ Required for Offline Store
    )

    # create() only starts provisioning; poll until the group leaves the
    # "Creating" state so later cells do not ingest into a group that is not
    # ready, and fail loudly if provisioning did not succeed.
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(
            f"Feature Group '{feature_group_name}' creation ended with status '{status}'."
        )

    print(f"🚀 Feature Group '{feature_group_name}' has been successfully created.")


✅ Feature Group 'employee-attrition-feature-store' already exists. Skipping creation.


In [22]:
# Fetch the feature group's description and show it as the cell output.
feature_group.describe()


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:095342792399:feature-group/employee-attrition-feature-store',
 'FeatureGroupName': 'employee-attrition-feature-store',
 'RecordIdentifierFeatureName': 'Employee_ID',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'Employee_ID',
   'FeatureType': 'String'},
  {'FeatureName': 'Age', 'FeatureType': 'Integral'},
  {'FeatureName': 'Gender', 'FeatureType': 'String'},
  {'FeatureName': 'Years_at_Company', 'FeatureType': 'Integral'},
  {'FeatureName': 'Job_Role', 'FeatureType': 'String'},
  {'FeatureName': 'Monthly_Income', 'FeatureType': 'Integral'},
  {'FeatureName': 'Work_Life_Balance', 'FeatureType': 'String'},
  {'FeatureName': 'Job_Satisfaction', 'FeatureType': 'String'},
  {'FeatureName': 'Performance_Rating', 'FeatureType': 'String'},
  {'FeatureName': 'Number_of_Promotions', 'FeatureType': 'Integral'},
  {'FeatureName': 'Overtime', 'FeatureType': 'String'},
  {'FeatureName': 'Distance_from_Home', 'FeatureT

In [23]:
# Convert Employee_ID to string
data["Employee_ID"] = data["Employee_ID"].astype(str)

# Stamp every record with the current time as an ISO-8601 string. The time is
# taken explicitly in UTC because the format appends a literal "Z" (UTC
# designator); pd.to_datetime("now") yields local time on recent pandas versions.
data["EventTime"] = pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%dT%H:%M:%SZ")

# Verify again
print("✅ Data Types After Conversion:\n", data.dtypes)
print("✅ Sample Data After Conversion:\n", data.head())


✅ Data Types After Conversion:
 Employee_ID                 object
Age                          int64
Gender                      object
Years_at_Company             int64
Job_Role                    object
Monthly_Income               int64
Work_Life_Balance           object
Job_Satisfaction            object
Performance_Rating          object
Number_of_Promotions         int64
Overtime                    object
Distance_from_Home           int64
Education_Level             object
Marital_Status              object
Number_of_Dependents         int64
Job_Level                   object
Company_Size                object
Company_Tenure               int64
Remote_Work                 object
Leadership_Opportunities    object
Innovation_Opportunities    object
Company_Reputation          object
Employee_Recognition        object
Attrition                   object
EventTime                   object
dtype: object
✅ Sample Data After Conversion:
   Employee_ID  Age  Gender  Years_at_Company  

In [24]:
# Pre-ingestion check: print the column index, the dtypes and the first rows.
for label, value in (
    ("🔍 Column Names:", data.columns),
    ("🔍 Data Types:\n", data.dtypes),
    ("🔍 First Few Rows:\n", data.head()),
):
    print(label, value)


🔍 Column Names: Index(['Employee_ID', 'Age', 'Gender', 'Years_at_Company', 'Job_Role',
       'Monthly_Income', 'Work_Life_Balance', 'Job_Satisfaction',
       'Performance_Rating', 'Number_of_Promotions', 'Overtime',
       'Distance_from_Home', 'Education_Level', 'Marital_Status',
       'Number_of_Dependents', 'Job_Level', 'Company_Size', 'Company_Tenure',
       'Remote_Work', 'Leadership_Opportunities', 'Innovation_Opportunities',
       'Company_Reputation', 'Employee_Recognition', 'Attrition', 'EventTime'],
      dtype='object')
🔍 Data Types:
 Employee_ID                 object
Age                          int64
Gender                      object
Years_at_Company             int64
Job_Role                    object
Monthly_Income               int64
Work_Life_Balance           object
Job_Satisfaction            object
Performance_Rating          object
Number_of_Promotions         int64
Overtime                    object
Distance_from_Home           int64
Education_Level        

In [25]:
# Rebuild the feature definitions from the prepared DataFrame.
feature_group.load_feature_definitions(data_frame=data)

# Write the rows into the feature group with a single worker, blocking until
# the ingestion call returns.
feature_group.ingest(
    data_frame=data,
    max_workers=1,
    wait=True,
)


IngestionManagerPandas(feature_group_name='employee-attrition-feature-store', feature_definitions={'Employee_ID': {'FeatureName': 'Employee_ID', 'FeatureType': 'String'}, 'Age': {'FeatureName': 'Age', 'FeatureType': 'Integral'}, 'Gender': {'FeatureName': 'Gender', 'FeatureType': 'String'}, 'Years_at_Company': {'FeatureName': 'Years_at_Company', 'FeatureType': 'Integral'}, 'Job_Role': {'FeatureName': 'Job_Role', 'FeatureType': 'String'}, 'Monthly_Income': {'FeatureName': 'Monthly_Income', 'FeatureType': 'Integral'}, 'Work_Life_Balance': {'FeatureName': 'Work_Life_Balance', 'FeatureType': 'String'}, 'Job_Satisfaction': {'FeatureName': 'Job_Satisfaction', 'FeatureType': 'String'}, 'Performance_Rating': {'FeatureName': 'Performance_Rating', 'FeatureType': 'String'}, 'Number_of_Promotions': {'FeatureName': 'Number_of_Promotions', 'FeatureType': 'Integral'}, 'Overtime': {'FeatureName': 'Overtime', 'FeatureType': 'String'}, 'Distance_from_Home': {'FeatureName': 'Distance_from_Home', 'FeatureT

In [26]:
# Describe the feature group again after ingestion. The output lists its ARN,
# name, record-identifier and event-time feature names, and feature definitions.
feature_group.describe()


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:095342792399:feature-group/employee-attrition-feature-store',
 'FeatureGroupName': 'employee-attrition-feature-store',
 'RecordIdentifierFeatureName': 'Employee_ID',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'Employee_ID',
   'FeatureType': 'String'},
  {'FeatureName': 'Age', 'FeatureType': 'Integral'},
  {'FeatureName': 'Gender', 'FeatureType': 'String'},
  {'FeatureName': 'Years_at_Company', 'FeatureType': 'Integral'},
  {'FeatureName': 'Job_Role', 'FeatureType': 'String'},
  {'FeatureName': 'Monthly_Income', 'FeatureType': 'Integral'},
  {'FeatureName': 'Work_Life_Balance', 'FeatureType': 'String'},
  {'FeatureName': 'Job_Satisfaction', 'FeatureType': 'String'},
  {'FeatureName': 'Performance_Rating', 'FeatureType': 'String'},
  {'FeatureName': 'Number_of_Promotions', 'FeatureType': 'Integral'},
  {'FeatureName': 'Overtime', 'FeatureType': 'String'},
  {'FeatureName': 'Distance_from_Home', 'FeatureT

In [28]:
import boto3

s3_client = boto3.client("s3")

# `bucket` is the session's default bucket resolved in the setup cell; it is
# reused here instead of a hard-coded, account-specific bucket name.
prefix = "aai-540-group-3-final-project/data/db_source/"

# List every object under the prefix. A single list_objects_v2 call returns at
# most 1000 keys, so a paginator is used to collect all pages.
paginator = s3_client.get_paginator("list_objects_v2")
keys = [
    obj["Key"]
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
    for obj in page.get("Contents", [])  # "Contents" is absent on empty pages
]

# Print out S3 files
if keys:
    print("✅ Data found in S3:")
    for key in keys:
        print(key)
else:
    print("❌ No data found in S3. Ingestion might have failed.")


✅ Data found in S3:
aai-540-group-3-final-project/data/db_source//095342792399/sagemaker/us-east-1/offline-store/employee-attrition-feature-store-1738390195/employee-attrition-feature-store2025-02-01T06:09:55.639Z.txt
aai-540-group-3-final-project/data/db_source//095342792399/sagemaker/us-east-1/offline-store/employee-attrition-feature-store-1738390382/employee-attrition-feature-store2025-02-01T06:13:02.308Z.txt
aai-540-group-3-final-project/data/db_source/095342792399/sagemaker/us-east-1/offline-store/employee-attrition-feature-store-1738390195/data/year=2025/month=02/day=01/hour=06/20250201T060955Z_1HVfxwwlVUIrRVYH.parquet
aai-540-group-3-final-project/data/db_source/095342792399/sagemaker/us-east-1/offline-store/employee-attrition-feature-store-1738390195/data/year=2025/month=02/day=01/hour=06/20250201T060955Z_4wi6W9RZG81B8Wmc.parquet
aai-540-group-3-final-project/data/db_source/095342792399/sagemaker/us-east-1/offline-store/employee-attrition-feature-store-1738390195/data/year=2025

In [29]:
import awswrangler as wr

# Run awswrangler's Athena table repair for the feature-store table in the
# "sagemaker_featurestore" database.
# NOTE(review): presumably meant to refresh the table's partition metadata - confirm.
wr.athena.repair_table(
    database="sagemaker_featurestore",
    table="employee-attrition-feature-store",
)

print("✅ Glue table repair triggered. Retry Athena query after 5-10 minutes.")


✅ Glue table repair triggered. Retry Athena query after 5-10 minutes.


In [33]:
# Columns registered as "int"; every other column is registered as "string".
integer_columns = {
    "Age",
    "Years_at_Company",
    "Monthly_Income",
    "Number_of_Promotions",
    "Distance_from_Home",
    "Number_of_Dependents",
    "Company_Tenure",
}

# Column order of the table schema.
ordered_columns = [
    "Employee_ID",
    "Age",
    "Gender",
    "Years_at_Company",
    "Job_Role",
    "Monthly_Income",
    "Work_Life_Balance",
    "Job_Satisfaction",
    "Performance_Rating",
    "Number_of_Promotions",
    "Overtime",
    "Distance_from_Home",
    "Education_Level",
    "Marital_Status",
    "Number_of_Dependents",
    "Job_Level",
    "Company_Size",
    "Company_Tenure",
    "Remote_Work",
    "Leadership_Opportunities",
    "Innovation_Opportunities",
    "Company_Reputation",
    "Employee_Recognition",
    "Attrition",
    "EventTime",
]

# Register a Parquet table in the Glue catalog that points at the offline-store
# data folder of the feature group.
wr.catalog.create_parquet_table(
    database="sagemaker_featurestore",
    table="employee-attrition-feature-store",
    path="s3://sagemaker-us-east-1-095342792399/aai-540-group-3-final-project/data/db_source/095342792399/sagemaker/us-east-1/offline-store/employee-attrition-feature-store-1738390195/data/",
    columns_types={
        column: "int" if column in integer_columns else "string"
        for column in ordered_columns
    },
)

print("✅ Glue table manually created. Retry Athena query in 5-10 minutes.")


✅ Glue table manually created. Retry Athena query in 5-10 minutes.


In [31]:
import awswrangler as wr

# arn:aws:sagemaker:us-east-1:095342792399:feature-group/employee-attrition-feature-store

# Athena query that previews the first five rows of the feature-store table.
query = f'SELECT * FROM "{feature_group_name}" LIMIT 5'

# Run the query through awswrangler; the result is assigned to athena_df.
athena_df = wr.athena.read_sql_query(query, database="sagemaker_featurestore")

# Show the result with the notebook's own rich display (imported from
# IPython.display in the import cell). `ace_tools` is not an installable
# module in this environment, so importing it would stop the cell.
display(athena_df)


In [32]:
# Fetch the whole feature-store table. The query runs through awswrangler, the
# same way as the preview query, because sagemaker.Session exposes no
# `athena_query` method.
query = f'SELECT * FROM "{feature_group_name}"'
athena_df = wr.athena.read_sql_query(query, database="sagemaker_featurestore")
athena_df


In [None]:
%%javascript

// Save a notebook checkpoint, then delete the notebook's session.
// Any error raised by these calls is caught and deliberately ignored.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

In [None]:
%%html

<!-- Shutdown notice plus a hidden button that carries the "kernelmenu:shutdown"
     command-linker attribute; the script below clicks that button. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the first element with class "sm-command-button"; any error is
// caught and ignored.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>
</script>